class JPUpdater(AsyncCrawler):
    """Real-time updater for Juanpi (卷皮, m.juanpi.com) limited-time flash-sale goods.

    Re-reads the flash-sale rows from the DB, re-crawls each goods' tab/page
    listing, and either refreshes the record or logically deletes it
    (expired sale, or goods removed from the sale page).
    """
    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/卷皮/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        # DB pipeline; created in _get_db_old_data and periodically
        # recreated through _get_new_db_conn
        self.tmp_sql_server = None
        # number of concurrent update tasks per batch
        self.concurrency = 8
        # running index of the goods currently being processed
        self.goods_index = 1
        self.delete_sql_str = jp_delete_str_3

    async def _get_pc_headers(self) -> dict:
        """Return randomized request headers pinned to host m.juanpi.com."""
        headers = await async_get_random_headers(
            upgrade_insecure_requests=False)
        headers.update({
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'host': 'm.juanpi.com',
        })

        return headers

    async def _get_db_old_data(self) -> (None, list):
        """Fetch the flash-sale rows that need updating.

        :return: list of DB rows, or None when the DB connection failed.
        """
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            # purge stale rows first, then select the rows to update
            self.tmp_sql_server._delete_table(sql_str=jp_delete_str_4, params=None)
            await async_sleep(5)
            result = list(
                self.tmp_sql_server._select_table(sql_str=jp_select_str_4))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_new_jp_obj(self, index):
        # Recreate the parser every 10 goods: the underlying object cannot
        # be shared indefinitely, otherwise driver access errors occur.
        if index % 10 == 0:
            try:
                del self.juanpi_miaosha
            except:
                # NOTE(review): bare except — attribute may simply not exist yet
                pass
            collect()
            self.juanpi_miaosha = JuanPiParse()

    async def _update_one_goods_info(self, item, index) -> tuple:
        '''
        Update a single flash-sale goods record.
        :param item: DB row: (goods_id, miaosha_time, tab_id, page)
        :param index: running goods index
        :return: (goods_id, res) where res is the update/delete success flag
        '''
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        tab_id = item[2]
        page = item[3]
        miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
            miaosha_time=miaosha_time,
            logger=self.lg,
        )
        await self._get_new_jp_obj(index=index)
        self.tmp_sql_server = await _get_new_db_conn(
            db_obj=self.tmp_sql_server,
            index=index,
            logger=self.lg,
            remainder=30)
        if self.tmp_sql_server.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_begin_time)
            if is_recent_time == 0:
                # expired -> logical delete
                res = _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=self.lg,
                    update_sql_str=jp_update_str_6,
                    sql_cli=self.tmp_sql_server,
                )
                self.lg.info('过期的goods_id为({}), 限时秒杀开始时间为({}), 逻辑删除成功!'.format(
                    goods_id, timestamp_to_regulartime(miaosha_begin_time)))
                await async_sleep(.3)
                index += 1
                self.goods_index = index

                return goods_id, res

            elif is_recent_time == 2:
                # future sale: logically delete only if its end time already passed
                if datetime_to_timestamp(
                        get_shanghai_time()) > miaosha_end_time:
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=jp_update_str_6,
                        sql_cli=self.tmp_sql_server,
                    )
                    self.lg.info(
                        '过期的goods_id为({}), 限时秒杀开始时间为({}), 逻辑删除成功!'.format(
                            goods_id,
                            timestamp_to_regulartime(miaosha_begin_time)))
                else:
                    self.lg.info('goods_id: {}, 未来时间跳过更新...'.format(goods_id))
                index += 1
                self.goods_index = index

                return goods_id, res

            else:
                # is_recent_time == 1: inside the update window
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                    format(goods_id, index))
                tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                    str(tab_id),
                    str(page),
                )
                # self.lg.info('待爬取的tab_id, page地址为: {}'.format(tmp_url))
                body = Requests.get_url_body(url=tmp_url,
                                             headers=await self._get_pc_headers(),
                                             ip_pool_type=self.ip_pool_type)
                try:
                    data = json_2_dict(body, default_res={}).get('data', {})
                    assert data != {}, 'data为空dict!'
                    data = data.get('goodslist', [])
                    assert data != [], 'tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(
                        tab_id, page)
                except AssertionError:
                    self.lg.error(msg='遇到错误:', exc_info=True)
                    index += 1
                    self.goods_index = index
                    await async_sleep(.3)

                    return goods_id, res

                miaosha_goods_list = await self._get_miaoshao_goods_info_list(
                    data=data)
                # self.lg.info(str(miaosha_goods_list))

                # all goods_ids currently listed on this tab_id/page
                miaosha_goods_all_goods_id = [
                    i.get('goods_id') for i in miaosha_goods_list
                ]
                self.lg.info(str(miaosha_goods_all_goods_id))
                if goods_id not in miaosha_goods_all_goods_id:
                    # goods no longer listed on the page
                    if miaosha_goods_all_goods_id != []:
                        # testing showed: a non-empty list means the goods is
                        # not actually off-shelf -> skip the update
                        self.lg.info(
                            '该商品[{}]未下架, 此处不进行更新跳过!!'.format(goods_id))
                    else:
                        # this tab_id/page no longer contains the goods_id
                        res = _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            logger=self.lg,
                            update_sql_str=jp_update_str_6,
                            sql_cli=self.tmp_sql_server,
                        )
                        self.lg.info(
                            '该商品[goods_id为({})]已被下架限时秒杀活动,此处将其逻辑删除'.format(
                                goods_id))
                    index += 1
                    self.goods_index = index
                    await async_sleep(.3)

                    return goods_id, res

                else:
                    # still on sale -> refresh its record
                    res = await self._one_update(
                        miaosha_goods_list=miaosha_goods_list,
                        goods_id=goods_id)
        else:
            # DB connection failed (possibly closed or under maintenance)
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')

        index += 1
        self.goods_index = index
        await async_sleep(1.2)

        return goods_id, res

    async def _update_db(self) -> None:
        '''
        Real-time update loop for the flash-sale data (runs forever).
        :return:
        '''
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                # DB read failed; just wait for the next round
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result,
                    step=self.concurrency)
                self.juanpi_miaosha = JuanPiParse()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:
                        # all batches consumed: normal exit
                        break

                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(item=item,
                                                            index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)

                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                # no updates after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(2.5 * 60)

            try:
                del self.juanpi_miaosha
            except:
                pass
            collect()

    async def _one_update(self, **kwargs) -> bool:
        '''
        Update a goods that is still on sale.
        :param kwargs: miaosha_goods_list, goods_id
        :return: update success flag
        '''
        res = False
        miaosha_goods_list = kwargs.get('miaosha_goods_list')
        goods_id = kwargs.get('goods_id')

        for item_1 in miaosha_goods_list:
            if item_1.get('goods_id', '') == goods_id:
                self.juanpi_miaosha.get_goods_data(goods_id=goods_id)
                goods_data = self.juanpi_miaosha.deal_with_data()
                if goods_data == {}:
                    # parser returned empty data -> skip
                    break
                else:
                    # parse and write back
                    goods_data['stock_info'] = item_1.get('stock_info')
                    goods_data['goods_id'] = item_1.get('goods_id')
                    # goods_data['username'] = '******'
                    if item_1.get('stock_info').get('activity_stock') > 0:
                        goods_data['price'] = item_1.get('price')  # original special price before the flash sale
                        goods_data['taobao_price'] = item_1.get(
                            'taobao_price')  # flash-sale price
                    else:
                        pass
                    goods_data['sub_title'] = item_1.get('sub_title', '')
                    goods_data['miaosha_time'] = item_1.get('miaosha_time')
                    goods_data['miaosha_begin_time'], goods_data[
                        'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                            miaosha_time=item_1.get('miaosha_time'))
                    res = self.juanpi_miaosha.to_update_juanpi_xianshimiaosha_table(
                        data=goods_data, pipeline=self.tmp_sql_server)
                    await async_sleep(.3)  # throttle: avoid going too fast
                    break
            else:
                pass

        return res

    async def _get_miaoshao_goods_info_list(self, data) -> list:
        '''
        Extract the useful flash-sale goods info.
        :param data: the raw goodslist data to parse
        :return: list of per-goods info dicts
        '''
        miaosha_goods_list = []
        for item in data:
            tmp = {}
            tmp['miaosha_time'] = {
                'miaosha_begin_time':
                timestamp_to_regulartime(int(item.get('start_time'))),
                'miaosha_end_time':
                timestamp_to_regulartime(int(item.get('end_time'))),
            }
            # NOTE(review): `stock` is assigned but never used below
            stock = item.get('stock', 0)
            tmp['goods_id'] = item.get('goods_id')
            # flash-sale stock info
            tmp['stock_info'] = {
                'activity_stock':
                int(item.get('stock', 0) * (item.get('rate', 0) / 100)),
                'stock': item.get('stock', 0),
            }
            # original price
            tmp['price'] = round(float(item.get('oprice', '0')), 2)
            tmp['taobao_price'] = round(float(item.get('cprice', '0')), 2)
            miaosha_goods_list.append(tmp)

        return miaosha_goods_list

    async def _is_recent_time(self, timestamp) -> int:
        '''
        Classify a sale-start timestamp relative to now.
        :param timestamp: unix timestamp of the sale begin time
        :return: 0: expired (restore original price)
                 1: inside the update window
                 2: future sale
        '''
        time_1 = int(timestamp)
        time_2 = int(time.time())  # current timestamp

        diff_time = time_1 - time_2
        if diff_time < -259200:
            # 72 hours (so the backend can sync the off-shelf state); only
            # goods from the past 48h and the next 2h need updating
            # if diff_time < -172800:      # (previous value) 48 hours
            return 0  # expired: restore original price
        elif diff_time > -172800 and diff_time < 50400:
            return 1
            # i.e. yesterday's and today's -> the ones to update
        else:
            return 2  # future sales need no update for now

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
class GoodsSortByShopTypeSpider2(AsyncCrawler):
    """Crawl the Tmall supermarket (tmcs) category tree and the goods under
    each leaf category, persisting the levels and the sort/goods relations
    into the common shop-sort DB tables.
    """
    def __init__(self):
        AsyncCrawler.__init__(
            self,
            user_agent_type=PHONE,
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            logger=None,
            log_save_path=MY_SPIDER_LOGS_PATH + '/goods_sort_by_shop_type/_/',
            headless=True,
        )
        self.req_num_retries = 6
        # Must be obtained from a freshly opened m-site supermarket page.
        # tm m-site supermarket category url: https://chaoshi.m.tmall.com/
        # Before each use, refresh these values:
        # copy the new cookies from the corresponding request, and set the t
        # (the t in the request params) inside cp_utils'
        # block_calculate_tb_right_sign, plus _m_h5_tk, to the values of the
        # latest request.
        # eg:
        # t, _m_h5_tk of the Tmall supermarket entry
        # t = '1590387891307'
        # _m_h5_tk = '6f594c22870353cede88c2796cc28ee9'
        self.tm_new_chrome_t = '1590545557798'
        self.tm_new_chrome_sign = 'c2d1fced4d7b1333d0f19b6b637fed9f'
        self.tm_new_chrome_cookies_str = 'hng=CN%7Czh-CN%7CCNY%7C156; cna=wRsVFTj6JEoCAXHXtCqXOzC7; lid=%E6%88%91%E6%98%AF%E5%B7%A5%E5%8F%B79527%E6%9C%AC%E4%BA%BA; enc=MXX6theE39REQu4vFae7f5vi8A8GAdt5pdcQAJY7eR3zuOxwTSUu0zQGRWpBLbzxbJUsLvdHk4vB8ZWvQR%2BjQg%3D%3D; l=eB_zn817vA2VK0x_BOfZnurza779_IRAguPzaNbMiOCPOdfH5H0fWZAGqqTMCnGVh6uk83JDb3ZQBeYBcBdKnxvOnrZgURDmn; sm4=330100; csa=0_0_0_0_0_0_0_0_0_0_0_0_0; sgcookie=EbIdqdSy36jBPHKaO%2FPZS; uc3=id2=UUplY9Ft9xwldQ%3D%3D&lg2=W5iHLLyFOGW7aA%3D%3D&vt3=F8dBxGZjLZslLqBqC3E%3D&nk2=rUtEoY7x%2Bk8Rxyx1ZtN%2FAg%3D%3D; t=c413bd0891628c3269938122b2bee15f; tracknick=%5Cu6211%5Cu662F%5Cu5DE5%5Cu53F79527%5Cu672C%5Cu4EBA; uc4=id4=0%40U2gvLJ3%2BK6kqeorNX%2B21sXN8x3lW&nk4=0%40r7rCNeQ4%2Bj7fAj%2BMcdPH4%2B0X9x%2FwQLp0Sd4%2F; lgc=%5Cu6211%5Cu662F%5Cu5DE5%5Cu53F79527%5Cu672C%5Cu4EBA; _tb_token_=ee5587773876d; cookie2=13ed682f1aa10261d267e8e5a9e8e223; _m_h5_tk=883da77eaee1f1b25a7fb1f4c95b68e6_1590554541015; _m_h5_tk_enc=d531110c50a3daed05299dbb0b6dc3f0; isg=BBISyyg5mGC0AuZBht_2zq0HY970Ixa9xZzn1dxrPkWw77LpxLNmzRgJWw32n45V'
        # top-level (level1) tmcs categories
        self.tm_first_sort_list = [
            {
                'name': '休闲零食',
                'icon_type': 'categoryxiuxianlingshi',
                'level1_id': 78,
            },
            {
                'name': '粮油米面',
                'icon_type': 'categoryliangyoumimian',
                'level1_id': 80,
            },
            {
                'name': '乳饮酒水',
                'icon_type': 'categorynaipinshuiyin',
                'level1_id': 79,
            },
            {
                'name': '日用百货',
                'icon_type': 'categorychufangriyong',
                'level1_id': 81,
            },
            {
                'name': '母婴用品',
                'icon_type': 'categorymuyingyongping',
                'level1_id': 82,
            },
            {
                'name': '个人护理',
                'icon_type': 'categorygerenhuli',
                'level1_id': 83,
            },
            {
                'name': '纸品家清',
                'icon_type': 'categoryjiaqingjiaju',
                'level1_id': 84,
            },
            {
                'name': '美容护肤',
                'icon_type': 'categorymeironghufu',
                'level1_id': 94,
            },
            {
                'name': '方便速食',
                'icon_type': 'categoryfangbiansushi',
                'level1_id': 92,
            },
            {
                'name': '中外名酒',
                'icon_type': 'categoryzhongwaimingjiu',
                'level1_id': 87,
            },
            {
                'name': '童装童鞋',
                'icon_type': 'categorytongzhuang',
                'level1_id': 138,
            },
            {
                'name': '成人用品',
                'icon_type': 'categorychengrenyongpin',
                'level1_id': 93,
            },
            {
                'name': '家纺内衣',
                'icon_type': 'categoryjiafangneiyi',
                'level1_id': 90,
            },
            {
                'name': '宠物生活',
                'icon_type': 'categorychongwuyongpin',
                'level1_id': 91,
            },
            {
                'name': '电器数码',
                'icon_type': 'category3cqipei',
                'level1_id': 95,
            },
            {
                'name': '进口好货',
                'icon_type': 'categoryjinkouhaohuo',
                'level1_id': 85,
            },
            {
                'name': '医疗保健',
                'icon_type': 'categoryzibubaojian',
                'level1_id': 89,
            },
        ]
        # category names (or substrings) that are promotional, not generic
        # product categories -> skipped during parsing
        self.tm_skip_name_tuple = (
            '好货',
            '为你推荐',
            '热销榜单',
            '每日特惠',
            '一件包邮',
            '新品尝鲜',
            '年中大赏',
            '喵九八',
            '新品推荐',
            '特惠',
            '尝新',
            '精致好货',
            '超值爆款',
            '包邮',
            '优选',
            '直播',
            '尖叫单品',
            '品牌专区',
            '大牌',
            '网红爆款',
            '新品',
            '清凉一夏',
            '热销',
            '推荐',
            '国家馆',
            '优惠',
            '折',
            '送',  # eg: 买一送一 (buy one get one)
            '精选',
            '爆款',
            '上新',
            '秒杀',
            '热门',
            '减',
            '满减',
        )
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        self._init_sql_str()
        # goods_ids already present in the main goods table
        self.db_existed_goods_id_list = []
        # recreate the DB connection every N inserts
        self.sql_cli_remainder = 20

    def _init_sql_str(self):
        """Define all SQL statements used by this spider."""
        # insert one category-level record
        self.sql_str0 = 'insert into dbo.common_shop_sort_level_table(unique_id, sort_level1_id, sort_level1_name, sort_level2_id, sort_level2_name, sort_level3_id, sort_level3_name, shop_id) values(%s, %s, %s, %s, %s, %s, %s, %s)'
        # select all leaf (level2 + level3 populated) category rows
        self.sql_str1 = """
        select unique_id, sort_level1_id, sort_level2_id, sort_level2_name, sort_level3_id, sort_level3_name
        from dbo.common_shop_sort_level_table
        where sort_level2_id != ''
        and sort_level3_id != ''
        """
        # insert one sort<->goods relation record
        self.sql_str2 = 'insert into dbo.common_shop_sort_and_goods_relation_table(create_time, unique_id, sort_unique_id, goods_id, goods_url) values(%s, %s, %s, %s, %s)'
        # existing relation unique_ids (for dedup)
        self.sql_str3 = """
        select unique_id
        from dbo.common_shop_sort_and_goods_relation_table
        """
        # goods_ids already stored in the main goods table
        self.sql_str4 = 'select GoodsID from dbo.GoodsInfoAutoGet'
        # tmcs goods_ids waiting to be stored in the main goods table
        self.sql_str5 = """
        select goods_id
        from dbo.common_shop_sort_and_goods_relation_table
        where sort_unique_id in (
        select unique_id
        from dbo.common_shop_sort_level_table
        where shop_id='tmcs'
        )
        """

    async def _fck_run(self):
        # await self.get_tm_sort_info_and_2_db()
        # process the tmcs goods_ids waiting to be stored
        await self.deal_with_tmcs_goods_sort_relation_2_goods_table()

    async def deal_with_tmcs_goods_sort_relation_2_goods_table(self):
        """Store the goods_ids from common_shop_sort_and_goods_relation_table
        that are not yet in the main goods table (loops forever).

        :return:
        """
        while True:
            # counter used to track how many goods have been added
            self.add_goods_index = 0
            try:
                result0 = list(
                    self.sql_cli._select_table(sql_str=self.sql_str4))
                assert result0 is not None
                result1 = list(
                    self.sql_cli._select_table(sql_str=self.sql_str5))
                assert result1 is not None
                self.lg.info('db 已存在的goods_id_num: {}'.format(len(result0)))
                self.db_existed_goods_id_list = [item[0] for item in result0]
                assert self.db_existed_goods_id_list != []
                # tmcs goods in the db still waiting to be stored
                self.db_wait_2_save_goods_id_list = [
                    item[0] for item in result1
                ]
                assert self.db_wait_2_save_goods_id_list != []
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                await async_sleep(15)
                continue

            try:
                del result0
                del result1
            except:
                pass
            collect()

            # process the pending tmcs goods_id list
            await self.deal_with_tmcs_goods_id_list()

    async def deal_with_tmcs_goods_id_list(self):
        """Crawl every pending tmcs goods_id and insert it into the main
        goods table, skipping ids already stored."""
        self.lg.info('即将开始抓取tmcs goods, 请耐心等待...')
        for item in self.db_wait_2_save_goods_id_list:
            # eg: '61864164616'
            goods_id = item
            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                continue

            tmall = TmallParse(logger=self.lg, is_real_times_update_call=True)
            self.sql_cli = _block_get_new_db_conn(
                db_obj=self.sql_cli,
                index=self.add_goods_index,
                logger=self.lg,
                remainder=self.sql_cli_remainder,
            )
            if self.sql_cli.is_connect_success:
                # the spm param would let get_goods_id_from_url filter the id
                # goods_url = 'https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.65a47fb1yR1OUp&id={}'.format(goods_id)
                goods_url = 'https://detail.tmall.com/item.htm?id={}'.format(
                    goods_id)
                # this goods_id becomes a [type, goods_id] list
                goods_id = tmall.get_goods_id_from_url(goods_url)
                if goods_id == []:
                    self.lg.error('@@@ 原商品的地址为: {0}'.format(goods_url))
                    continue
                else:
                    self.lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (goods_id[1], str(self.add_goods_index)))
                    # NOTE(review): `tt` is never used afterwards
                    tt = tmall.get_goods_data(goods_id)
                    data = tmall.deal_with_data()
                    goods_id = goods_id[1]
                    if data != {}:
                        data['goods_id'] = goods_id
                        data['username'] = '******'
                        data['main_goods_id'] = None
                        data[
                            'goods_url'] = tmall._from_tmall_type_get_tmall_url(
                                type=data['type'],
                                goods_id=goods_id,
                            )
                        if data['goods_url'] == '':
                            self.lg.error('该goods_url为空值! 此处跳过!')
                            continue

                        if len(data['all_img_url']) <= 1:
                            self.lg.info(
                                '[goods_id: {}]主图个数<=1, pass'.format(goods_id))
                            # NOTE(review): this aborts the WHOLE list, not just
                            # this goods — looks like it should be `continue`;
                            # confirm before changing
                            return False

                        result = tmall.old_tmall_goods_insert_into_new_table(
                            data=data, pipeline=self.sql_cli)
                        if result:
                            # avoid re-collecting it later in this run
                            self.db_existed_goods_id_list.append(goods_id)
                        else:
                            pass
                    else:
                        pass
            else:
                self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
                pass
            self.add_goods_index += 1
            collect()
            sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

        self.lg.info('tmcs已经抓取完毕!')

        return True

    def get_common_shop_sort_level_table_unique_id(self,
                                                   shop_id,
                                                   sort_level1_id='',
                                                   sort_level2_id='',
                                                   sort_level3_id='') -> str:
        """Build the unique_id of a common_shop_sort_level_table row, to
        avoid duplicate records.

        :param shop_id:
        :param sort_level1_id:
        :param sort_level2_id:
        :param sort_level3_id:
        :return: uuid3 of the joined key
        """
        # joined into a '{}::{}::{}::{}'-like structure
        target_str = shop_id
        if sort_level1_id != '':
            target_str = target_str + '::' + str(sort_level1_id)

        if sort_level2_id != '':
            target_str = target_str + '::' + str(sort_level2_id)

        if sort_level3_id != '':
            target_str = target_str + '::' + str(sort_level3_id)

        self.lg.info(target_str)

        return get_uuid3(target_str=target_str)

    async def get_tm_sort_info_and_2_db(self):
        """Fetch the Tmall category info (levels 1-3 are one-shot imports,
        kept commented out) and store the leaf-category goods relations.

        :return:
        """
        # insert the tmcs level-1 sort data into db
        # await self._tmcs_insert_into_sort_level1_2_db()

        # fetch the level-2 sort id info
        # for item in self.tm_first_sort_list:
        #     sort_level1_id = item.get('level1_id', 0)
        #     sort_level1_name = item.get('name', '')
        #     icon_type = item.get('icon_type', '')
        #     is_success = False
        #     target_data = {}
        #     try:
        #         self.lg.info('Get sort_level1_name: {}, sort_level1_id: {} ing ...'.format(
        #             sort_level1_name,
        #             sort_level1_id))
        #         target_data = await self.get_tm_second_sort_info_by_first_sort_name(
        #             first_sort_name=sort_level1_name,
        #             icon_type=icon_type,
        #             level1_id=sort_level1_id,)
        #         if target_data != {}:
        #             is_success = True
        #         else:
        #             pass
        #         assert target_data != {}
        #         # insert into db
        #         await self._tmcs_insert_into_sort_level2_2_db(data=target_data)
        #     except Exception:
        #         self.lg.error('遇到错误:', exc_info=True)
        #
        #     self.lg.info('[{}] sort_level1_name: {}, sort_level1_id: {}'.format(
        #         '+' if is_success else '-',
        #         sort_level1_name,
        #         sort_level1_id,
        #     ))
        #
        #     # then fetch its level-3 sort info
        #
        #     # test
        #     # fetch the level-3 sort info
        #     # i.e. the id of the items of the second_list above
        #     # second_id = 298
        #     # icon_type = 'categoryxiuxianlingshi'
        #     # business = 'B2C'
        #     # await self.get_tm_third_sort_info_by_second_id(
        #     #     second_id=second_id,
        #     #     icon_type=icon_type,
        #     #     business=business
        #     # )
        #
        #     if target_data == {}:
        #         continue
        #
        #     for i in target_data.get('second_list', []):
        #         is_success2 = False
        #         try:
        #             sort_level2_id = i.get('id', -1)
        #             assert sort_level2_id != -1
        #             sort_level2_name = i.get('name', '')
        #             assert sort_level2_name != ''
        #             business = i.get('business', '')
        #             assert business != ''
        #
        #             self.lg.info('Get sort_level1_name: {}, sort_level1_id: {}, sort_level2_name: {}, sort_level2_id: {} ing ...'.format(
        #                 sort_level1_name,
        #                 sort_level1_id,
        #                 sort_level2_name,
        #                 sort_level2_id))
        #             target_data2 = await self.get_tm_third_sort_info_by_second_id(
        #                 second_id=sort_level2_id,
        #                 icon_type=icon_type,
        #                 business=business,)
        #             if target_data2 != {}:
        #                 is_success2 = True
        #             else:
        #                 pass
        #             assert target_data2 != {}
        #             # insert into db
        #             await self._tmcs_insert_into_sort_level3_2_db(
        #                 sort_level1_id=sort_level1_id,
        #                 sort_level1_name=sort_level1_name,
        #                 sort_level2_name=sort_level2_name,
        #                 data=target_data2)
        #         except Exception:
        #             self.lg.error('遇到错误:', exc_info=True)
        #             continue
        #
        #         self.lg.info('[{}] sort_level2_name: {}, sort_level2_id: {}'.format(
        #             '+' if is_success2 else '-',
        #             sort_level2_name,
        #             sort_level2_id,
        #         ))

        # fetch the goods_id info of each level-3 sort and store into db
        try:
            ori_db_data = self.sql_cli._select_table(
                sql_str=self.sql_str1,
                logger=self.lg,
            )
            assert ori_db_data is not None
            # pprint(ori_db_data)
            # unique_ids already existing in common_shop_sort_and_goods_relation_table
            self.db_goods_sort_relation_unique_id_list = self.sql_cli._select_table(
                sql_str=self.sql_str3,
                logger=self.lg,
            )
            assert self.db_goods_sort_relation_unique_id_list is not None
            self.db_goods_sort_relation_unique_id_list = [
                i[0] for i in self.db_goods_sort_relation_unique_id_list
            ]
            # pprint(self.db_goods_sort_relation_unique_id_list)
            assert self.db_goods_sort_relation_unique_id_list != []
            # fetch and store the goods data for each leaf category
            await self._tmcs_insert_into_goods_info_2_db(
                ori_db_data=ori_db_data)
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)

    async def _tmcs_insert_into_goods_info_2_db(
            self,
            ori_db_data: (tuple, list),
    ):
        """Insert the goods of each leaf category into the relation table
        common_shop_sort_and_goods_relation_table.

        :param ori_db_data: leaf-category rows from sql_str1
        :return:
        """
        # test
        # fetch the level-4 info, i.e. directly the goods_id_list of the
        # last category level
        # second_id = 298
        # third_id = 0
        # icon_type = 'categoryxiuxianlingshi'
        # business = 'B2C'
        # await self.get_tm_fourth_sort_info_by_second_id_and_third_id(
        #     second_id=second_id,
        #     third_id=third_id,
        #     icon_type=icon_type,
        #     business=business
        # )

        for item in ori_db_data:
            is_success = False
            try:
                sort_unique_id, sort_level1_id = item[0], int(item[1].replace(
                    'tmcs', ''))
                sort_level2_id, sort_level2_name = int(item[2].replace(
                    'tmcs', '')), item[3]
                sort_level3_id, sort_level3_name = int(item[4].replace(
                    'tmcs', '')), item[5]
                # NOTE(review): both asserts are always true after int() —
                # an int never equals ''; presumably meant to run before the
                # conversion
                assert sort_level2_id != ''
                assert sort_level3_id != ''
                icon_type = self.get_tmcs_icon_type_by_sort_level1_id(
                    sort_level1_id=sort_level1_id,
                )
                self.lg.info(
                    'Get sort_level1_id: {}, sort_level2_name: {}, sort_level2_id: {}, sort_level3_name: {}, sort_level3_id: {} ing ...'
                    .format(
                        sort_level1_id,
                        sort_level2_name,
                        sort_level2_id,
                        sort_level3_name,
                        sort_level3_id,
                    ))
                target_data = await self.get_tm_fourth_sort_info_by_second_id_and_third_id(
                    second_id=sort_level2_id,
                    third_id=sort_level3_id,
                    icon_type=icon_type,
                    business='B2C',
                )
                assert target_data != {}
                is_success = True
                now_time = get_shanghai_time()
                # insert into db
                for i in target_data.get('goods_list', []):
                    try:
                        goods_id = i.get('goods_id', '')
                        assert goods_id != ''
                        goods_relation_unique_id = self.get_tmcs_goods_relation_unique_id(
                            sort_unique_id=sort_unique_id,
                            goods_id=goods_id,
                        )
                        if goods_relation_unique_id in self.db_goods_sort_relation_unique_id_list:
                            self.lg.info(
                                'db 已存在goods_relation_unique_id: {}, 跳过'.
                                format(goods_relation_unique_id))
                            continue

                        res = self.sql_cli._insert_into_table_2(
                            sql_str=self.sql_str2,
                            params=(
                                now_time,
                                goods_relation_unique_id,
                                sort_unique_id,
                                goods_id,
                                '',
                            ),
                            logger=self.lg,
                        )
                        if res:
                            self.db_goods_sort_relation_unique_id_list.append(
                                goods_relation_unique_id)
                        else:
                            pass
                    except Exception:
                        continue
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)

            # NOTE(review): if unpacking `item` raised above,
            # sort_level3_name/sort_level3_id are unbound here (NameError on
            # the first failing row) — confirm intended
            self.lg.info(
                '[{}] sort_level3_name: {}, sort_level3_id: {}'.format(
                    '+' if is_success else '-',
                    sort_level3_name,
                    sort_level3_id,
                ))

        return

    def get_tmcs_goods_relation_unique_id(self, sort_unique_id: str,
                                          goods_id: str):
        """Build the unique id of a (sort, goods_id) pair, used for dedup
        in the db.

        :param sort_unique_id:
        :param goods_id:
        :return:
        """
        return get_uuid3(target_str=sort_unique_id + goods_id)

    def get_tmcs_icon_type_by_sort_level1_id(self, sort_level1_id: int) -> str:
        """Look up the icon_type for a given sort_level1_id.

        :param sort_level1_id:
        :return: the icon_type string
        :raises ValueError: when the id is not in tm_first_sort_list
        """
        for item in self.tm_first_sort_list:
            if item.get('level1_id', -1) == sort_level1_id:
                return item.get('icon_type', '')
            else:
                continue

        raise ValueError(
            '获取sort_level1_id: {}, 对应的icon_type异常'.format(sort_level1_id))

    async def _tmcs_insert_into_sort_level3_2_db(
            self,
            sort_level1_id,
            sort_level1_name,
            sort_level2_name,
            data: dict,
    ):
        """Insert the tmcs level-3 category records into db.

        :param sort_level1_id:
        :param sort_level1_name:
        :param sort_level2_name:
        :param data: result of get_tm_third_sort_info_by_second_id
        :return:
        """
        try:
            sort_level2_id = data.get('second_id', -1)
            assert sort_level2_id != -1
            # convert to the level-unique tmcs ids
            sort_level1_id = self._get_unique_tmcs_sort_level_id(
                sort_level_id=sort_level1_id,
            )
            sort_level2_id = self._get_unique_tmcs_sort_level_id(
                sort_level_id=sort_level2_id,
            )
            assert sort_level1_name != ''
            assert sort_level2_name != ''
            for item in data.get('third_list', []):
                try:
                    sort_level3_id = item.get('id', -1)
                    assert sort_level3_id != -1
                    # convert to the level-unique tmcs id
                    sort_level3_id = self._get_unique_tmcs_sort_level_id(
                        sort_level_id=sort_level3_id,
                    )
                    sort_level3_name = item.get('name', '')
                    assert sort_level3_name != ''
                    self.lg.info(
                        'sort_level3_id: {}, sort_level3_name: {}'.format(
                            sort_level3_id,
                            sort_level3_name,
                        ))
                    unique_id = self.get_common_shop_sort_level_table_unique_id(
                        shop_id='tmcs',
                        sort_level1_id=sort_level1_id,
                        sort_level2_id=sort_level2_id,
                        sort_level3_id=sort_level3_id,
                    )
                    self.sql_cli._insert_into_table_2(
                        sql_str=self.sql_str0,
                        params=(
                            unique_id,
                            sort_level1_id,
                            sort_level1_name,
                            sort_level2_id,
                            sort_level2_name,
                            sort_level3_id,
                            sort_level3_name,
                            'tmcs',
                        ),
                        logger=self.lg,
                    )
                except Exception:
                    continue
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)

        return

    async def _tmcs_insert_into_sort_level2_2_db(self, data: dict):
        """Insert the tmcs level-2 category records into db.

        :param data: result of get_tm_second_sort_info_by_first_sort_name
        :return:
        """
        try:
            sort_level1_id = data.get('level1_id', -1)
            assert sort_level1_id != -1
            # convert to the level-unique tmcs id
            sort_level1_id = self._get_unique_tmcs_sort_level_id(
                sort_level_id=sort_level1_id,
            )
            sort_level1_name = data.get('first_sort_name', '')
            assert sort_level1_name != ''
            for item in data.get('second_list', []):
                try:
                    sort_level2_id = item.get('id', -1)
                    assert sort_level2_id != -1
                    # convert to the level-unique tmcs id
                    sort_level2_id = self._get_unique_tmcs_sort_level_id(
                        sort_level_id=sort_level2_id,
                    )
                    sort_level2_name = item.get('name', '')
                    assert sort_level2_name != ''
                    self.lg.info(
                        'sort_level2_id: {}, sort_level2_name: {}'.format(
                            sort_level2_id,
                            sort_level2_name,
                        ))
                    unique_id = self.get_common_shop_sort_level_table_unique_id(
                        shop_id='tmcs',
                        sort_level1_id=sort_level1_id,
                        sort_level2_id=sort_level2_id,
                    )
                    self.sql_cli._insert_into_table_2(
                        sql_str=self.sql_str0,
                        params=(
                            unique_id,
                            sort_level1_id,
                            sort_level1_name,
                            sort_level2_id,
                            sort_level2_name,
                            '',
                            '',
                            'tmcs',
                        ),
                        logger=self.lg,
                    )
                except Exception:
                    continue
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)

        return

    async def _tmcs_insert_into_sort_level1_2_db(self):
        """Insert the tmcs level-1 category records into db.

        :return:
        """
        for item in self.tm_first_sort_list:
            sort_level1_id = item.get('level1_id', '')
            # convert to the level-unique tmcs id
            sort_level1_id = self._get_unique_tmcs_sort_level_id(
                sort_level_id=sort_level1_id)
            sort_level1_name = item.get('name', '')
            self.lg.info('sort_level1_id: {}, sort_level1_name: {}'.format(
                sort_level1_id,
                sort_level1_name,
            ))
            unique_id = self.get_common_shop_sort_level_table_unique_id(
                shop_id='tmcs',
                sort_level1_id=sort_level1_id,
            )
            try:
                self.sql_cli._insert_into_table_2(
                    sql_str=self.sql_str0,
                    params=(
                        unique_id,
                        sort_level1_id,
                        sort_level1_name,
                        '',
                        '',
                        '',
                        '',
                        'tmcs',
                    ),
                    logger=self.lg,
                )
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue

        return

    def _get_unique_tmcs_sort_level_id(self, sort_level_id) -> str:
        """Build the globally-unique tmcs category id marker.

        :param sort_level_id:
        :return: 'tmcs' + id
        """
        return 'tmcs' + str(sort_level_id)

    async def get_tm_second_sort_info_by_first_sort_name(
            self, first_sort_name, icon_type: str, level1_id: int):
        """Fetch the level-2 category info for a level-1 category.

        :param first_sort_name:
        :param icon_type:
        :param level1_id:
        :return: dict with first_sort_name/icon_type/level1_id/second_list
        """
        # the pc category api needs login cookies which expire quickly, pass
        # tm m-site supermarket category url: https://chaoshi.m.tmall.com/
        # refresh the values before each use
        headers = {
            'authority': 'h5api.m.tmall.com',
            'user-agent': get_random_phone_ua(),
            'accept': '*/*',
            # 'referer': 'https://pages.tmall.com/wow/chaoshi/act/chaoshi-category?spm=a3204.12691414.201609072.d78&wh_biz=tm&wh_showError=true&iconType=categoryxiuxianlingshi&name=%E4%BC%91%E9%97%B2%E9%9B%B6%E9%A3%9F&cateId=78&version=newIcon&storeId=&disableNav=YES',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        }
        # refresh the values before each use
        # ** required cookies (tests show no sensitive cookie is needed —
        # they can be copied straight from chrome)
        # recommended method:
        # copy the fresh cookies of the request, and update t (the t of the
        # request params) in cp_utils' block_calculate_tb_right_sign plus
        # _m_h5_tk to the latest request's values
        # cookies = {
        #     # '_l_g_': 'Ug%3D%3D',
        #     # the token is returned by the official server and cannot be forged
        #     # '_m_h5_tk': 'd7ad6d69cf119b053cf88936309cfc96_1590377299017',
        #     # '_m_h5_tk_enc': '7723c851b571ed56b57a314a8446ea99',
        #     '_m_h5_tk': '9d2d854027bf7cb19b47642e47e83e6c_1590382583798',
        #     '_m_h5_tk_enc': '01ac5d6c8604ff0413af5c0ddcbfc1f3',
        #     '_tb_token_': 'e5339a70ef7f4',
        #     'cookie17': 'UUplY9Ft9xwldQ%3D%3D',
        #     'cookie2': '169a1fd5707fc53066e20184c77dd949',
        #     't': '49edda27be5a68434c5899c297529ebb',
        #     # '_nk_': '%5Cu6211%5Cu662F%5Cu5DE5%5Cu53F79527%5Cu672C%5Cu4EBA',
        #     # 'cna': 'wRsVFTj6JEoCAXHXtCqXOzC7',
        #     # 'cookie1': 'UR3Wq2iKhDJHTTOd%2FGn4oh0oxwBK8EUqK%2Bm%2Bxv62FEM%3D',
        #     # 'csa': '7870450516_0_30.180482.120.21383_0_0_0_330108_107_110_0_236686073_330108001_0',
        #     # 'csg': '0cc7d64f',
        #     # 'dnk': '%5Cu6211%5Cu662F%5Cu5DE5%5Cu53F79527%5Cu672C%5Cu4EBA',
        #     # 'enc': 'MXX6theE39REQu4vFae7f5vi8A8GAdt5pdcQAJY7eR3zuOxwTSUu0zQGRWpBLbzxbJUsLvdHk4vB8ZWvQR%2BjQg%3D%3D',
        #     # 'hng': 'CN%7Czh-CN%7CCNY%7C156',
        #     # 'isg': 'BEZGL3omZJqhYDIl6tv6ojn7lzrIp4ph0VBT8TBvMmlEM-ZNmDfacSzFDylam4J5',
        #     # 'l': 'eB_zn817vA2VKUQxBOfwourza77OSIRAguPzaNbMiOCPOd5B5-j1WZAMqxL6C3GVh649R3JDb3ZQBeYBc3K-nxvtpdcXq3Dmn',
        #     # 'lgc': '%5Cu6211%5Cu662F%5Cu5DE5%5Cu53F79527%5Cu672C%5Cu4EBA',
        #     # 'lid': '%E6%88%91%E6%98%AF%E5%B7%A5%E5%8F%B79527%E6%9C%AC%E4%BA%BA',
        #     # 'login': '******',
        #     # 'sg': '%E4%BA%BA73',
        #     # 'sm4': '330108',
        #     # 'tracknick': '%5Cu6211%5Cu662F%5Cu5DE5%5Cu53F79527%5Cu672C%5Cu4EBA',
        #     # 'uc1': 'pas',
        #     # 'uc3': 'id2',
        #     # 'uc4': 'id4',
        #     # 'unb': '2242024317',
        # }
        headers.update({
            # 'cookie': dict_cookies_2_str(cookies),
            # tests show no sensitive cookie is needed — copied from chrome
            'cookie': self.tm_new_chrome_cookies_str,
        })
        data = dumps({
            "smAreaId": 330100,
            "csaInfo": "0_0_0_0_0_0_0_0_0_0_0_0_0",
            "csa": "0_0_0_0_0_0_0_0_0_0_0_0_0",
            "iconType": icon_type,
            "level1Id": str(level1_id),
        })
        params = (
            ('jsv', '2.5.1'),
            ('appKey', '12574478'),
            ('t', self.tm_new_chrome_t),  # eg: '1590379653365'
            ('sign', self.tm_new_chrome_sign
             ),  # eg: 'f0e252789605777cb36b6d99ce41ee7c'
            ('api', 'mtop.chaoshi.aselfshoppingguide.category.level1'),
            ('v', '1.0'),
            ('type', 'jsonp'),
            ('dataType', 'jsonp'),
            ('callback', 'mtopjsonp2'),
            # ('data', '{"smAreaId":330108,"csaInfo":"7870450516_0_30.180482.120.21383_0_0_0_330108_107_110_0_236686073_330108001_0","csa":"7870450516_0_30.180482.120.21383_0_0_0_330108_107_110_0_236686073_330108001_0","iconType":"categoryxiuxianlingshi","level1Id":"78"}'),
            ('data', data),
        )
        base_url = 'https://h5api.m.tmall.com/h5/mtop.chaoshi.aselfshoppingguide.category.level1/1.0/'
        # tests show a single (first) request is enough to get the data
        result0 = await get_taobao_sign_and_body(
            base_url=base_url,
            headers=headers,
            params=tuple_or_list_params_2_dict_params(params=params),
            data=data,
            ip_pool_type=self.ip_pool_type,
            proxy_type=PROXY_TYPE_HTTPS,
            logger=self.lg,
        )
        # self.lg.info(str(result0))
        _m_h5_tk, body = result0[0], result0[2]
        assert body != ''
        # self.lg.info(_m_h5_tk)
        # self.lg.info(body)
        # assert _m_h5_tk != ''
        # _m_h5_tk, _session1, body = block_get_tb_sign_and_body(
        #     base_url=base_url,
        #     headers=headers,
        #     params=tuple_or_list_params_2_dict_params(params=params),
        #     data=data,
        #     _m_h5_tk=_m_h5_tk,
        #     session=result0[1],
        #     ip_pool_type=tri_ip_pool,
        #     # proxy_type=PROXY_TYPE_HTTPS,
        # )
        # self.lg.info(body)

        # strip the jsonp callback wrapper before parsing
        data = json_2_dict(
            json_str=re.compile('\((.*)\)').findall(body)[0],
            default_res={},
            logger=self.lg,
        ).get('data', {}).get('data', {})
        assert data != {}
        # pprint(data)
        # drop the bulky banner payload
        try:
            data['banner'] = []
        except Exception:
            pass

        second_list = []
        for item in data.get('secondList', []):
            # self.lg.info(item)
            try:
                _id = item.get('id', '')
                assert _id != ''
                name = item.get('text', '')
                assert name != ''
                # filter out non-generic (promotional) categories
                assert name not in self.tm_skip_name_tuple
                for i in self.tm_skip_name_tuple:
                    if i in name:
                        # handle substrings, eg: '特惠', '尝新'
                        raise ValueError('出现跳过字眼, pass')
                    else:
                        pass
                business = item.get('business', '')
                assert business != ''
            except Exception:
                continue

            second_list.append({
                'id': _id,
                'name': name,
                'business': business,
            })

        res = {
            'first_sort_name': first_sort_name,
            'icon_type': icon_type,
            'level1_id': level1_id,
            'second_list': second_list,
        }
        pprint(res)

        return res

    async def get_tm_third_sort_info_by_second_id(
            self,
            second_id: int,
            icon_type: str,
            business: str = 'B2C',
    ):
        """Fetch the level-3 category info for a level-2 category id.

        :param second_id:
        :param icon_type:
        :param business:
        :return: dict with second_id/third_list
        """
        headers = {
            'authority': 'h5api.m.tmall.com',
            'user-agent': get_random_phone_ua(),
            'accept': '*/*',
            # 'referer': 'https://pages.tmall.com/wow/chaoshi/act/chaoshi-category?spm=a3204.12691414.201609072.d78&wh_biz=tm&wh_showError=true&iconType=categoryxiuxianlingshi&name=%E4%BC%91%E9%97%B2%E9%9B%B6%E9%A3%9F&cateId=78&version=newIcon&storeId=&disableNav=YES',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            # tests show no sensitive cookie is needed — copied from chrome
            'cookie': self.tm_new_chrome_cookies_str,
        }
        data = dumps({
            "smAreaId": 330100,
            "csaInfo": "0_0_0_0_0_0_0_0_0_0_0_0_0",
            "csa": "0_0_0_0_0_0_0_0_0_0_0_0_0",
            "iconType": icon_type,
            "level2Id": str(second_id),
            "business": business,
        })
        params = (
            ('jsv', '2.5.1'),
            ('appKey', '12574478'),
            ('t', self.tm_new_chrome_t),  # eg: '1590379653365'
            ('sign', self.tm_new_chrome_sign
             ),  # eg: 'f0e252789605777cb36b6d99ce41ee7c'
            ('api', 'mtop.chaoshi.aselfshoppingguide.category.level2'),
            ('v', '1.0'),
            ('type', 'jsonp'),
            ('dataType', 'jsonp'),
            ('callback', 'mtopjsonp7'),
            # ('data', '{"smAreaId":330108,"csaInfo":"7870450516_0_30.180482.120.21383_0_0_0_330108_107_110_0_236686073_330108001_0","csa":"7870450516_0_30.180482.120.21383_0_0_0_330108_107_110_0_236686073_330108001_0","iconType":"categoryxiuxianlingshi","level1Id":"78"}'),
            ('data', data),
        )
        base_url = 'https://h5api.m.tmall.com/h5/mtop.chaoshi.aselfshoppingguide.category.level2/1.0/'
        # tests show a single (first) request is enough to get the data
        result0 = await get_taobao_sign_and_body(
            base_url=base_url,
            headers=headers,
            params=tuple_or_list_params_2_dict_params(params=params),
            data=data,
            ip_pool_type=self.ip_pool_type,
            proxy_type=PROXY_TYPE_HTTPS,
            logger=self.lg,
        )
        # self.lg.info(str(result0))
        _m_h5_tk, body = result0[0], result0[2]
        assert body != ''
        # self.lg.info(_m_h5_tk)
        # self.lg.info(body)
        # assert _m_h5_tk != ''
        # _m_h5_tk, _session1, body = block_get_tb_sign_and_body(
        #     base_url=base_url,
        #     headers=headers,
        #     params=tuple_or_list_params_2_dict_params(params=params),
        #     data=data,
        #     _m_h5_tk=_m_h5_tk,
        #     session=result0[1],
        #     ip_pool_type=tri_ip_pool,
        #     # proxy_type=PROXY_TYPE_HTTPS,
        # )
        # self.lg.info(body)

        # strip the jsonp callback wrapper before parsing
        data = json_2_dict(
            json_str=re.compile('\((.*)\)').findall(body)[0],
            default_res={},
            logger=self.lg,
        ).get('data', {}).get('data', {})
        assert data != {}
        # pprint(data)
        # drop the bulky banner payload
        try:
            data['banner'] = []
        except Exception:
            pass

        third_list = []
        # NOTE: 'thrirdList' is the api's own (misspelled) key — keep as-is
        for item in data.get('thrirdList', []):
            # self.lg.info(item)
            try:
                _id = item.get('id', '')
                assert _id != ''
                name = item.get('text', '')
                assert name != ''
                # filter out non-generic (promotional) categories
                assert name not in self.tm_skip_name_tuple
                for i in self.tm_skip_name_tuple:
                    # self.lg.info(name)
                    if i in name:
                        # handle substrings, eg: '特惠', '尝新'
                        raise ValueError('出现跳过字眼, pass')
                    else:
                        pass
            except Exception:
                continue

            third_list.append({
                'id': _id,
                'name': name,
            })

        res = {
            'second_id': second_id,
            'third_list': third_list,
        }
        pprint(res)

        return res

    async def get_tm_fourth_sort_info_by_second_id_and_third_id(
            self,
            second_id: int,
            third_id: int,
            icon_type: str,
            business: str = 'B2C',
    ):
        """Fetch the goods list of a leaf (level-2 + level-3) category.

        :param second_id:
        :param third_id:
        :param icon_type:
        :param business:
        :return: dict with second_id/third_id/goods_list
        """
        headers = {
            'authority': 'h5api.m.tmall.com',
            'user-agent': get_random_phone_ua(),
            'accept': '*/*',
            # 'referer': 'https://pages.tmall.com/wow/chaoshi/act/chaoshi-category?spm=a3204.12691414.201609072.d78&wh_biz=tm&wh_showError=true&iconType=categoryxiuxianlingshi&name=%E4%BC%91%E9%97%B2%E9%9B%B6%E9%A3%9F&cateId=78&version=newIcon&storeId=&disableNav=YES',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            # tests show no sensitive cookie is needed — copied from chrome
            'cookie': self.tm_new_chrome_cookies_str,
        }
        data = dumps({
            "smAreaId": 330100,
            "csaInfo": "0_0_0_0_0_0_0_0_0_0_0_0_0",
            "csa": "0_0_0_0_0_0_0_0_0_0_0_0_0",
            "iconType": icon_type,
            "level2Id": str(second_id),
            'level3Id': str(third_id),
            'index': 50,
            'pageSize': 20,
            "business": business,
        })
        params = (
            ('jsv', '2.5.1'),
            ('appKey', '12574478'),
            ('t', self.tm_new_chrome_t),  # eg: '1590379653365'
            ('sign', self.tm_new_chrome_sign
             ),  # eg: 'f0e252789605777cb36b6d99ce41ee7c'
            ('api', 'mtop.chaoshi.aselfshoppingguide.category.level3'),
            ('v', '1.0'),
            ('type', 'jsonp'),
            ('dataType', 'jsonp'),
            ('callback', 'mtopjsonp7'),
            # ('data', '{"smAreaId":330108,"csaInfo":"7870450516_0_30.180482.120.21383_0_0_0_330108_107_110_0_236686073_330108001_0","csa":"7870450516_0_30.180482.120.21383_0_0_0_330108_107_110_0_236686073_330108001_0","iconType":"categoryxiuxianlingshi","level1Id":"78"}'),
            ('data', data),
        )
        base_url = 'https://h5api.m.tmall.com/h5/mtop.chaoshi.aselfshoppingguide.category.level3/1.0/'
        # tests show a single (first) request is enough to get the data
        result0 = await get_taobao_sign_and_body(
            base_url=base_url,
            headers=headers,
            params=tuple_or_list_params_2_dict_params(params=params),
            data=data,
            ip_pool_type=self.ip_pool_type,
            proxy_type=PROXY_TYPE_HTTPS,
            logger=self.lg,
        )
        # self.lg.info(str(result0))
        _m_h5_tk, body = result0[0], result0[2]
        assert body != ''
        # self.lg.info(_m_h5_tk)
        # self.lg.info(body)
        # assert _m_h5_tk != ''
        # _m_h5_tk, _session1, body = block_get_tb_sign_and_body(
        #     base_url=base_url,
        #     headers=headers,
        #     params=tuple_or_list_params_2_dict_params(params=params),
        #     data=data,
        #     _m_h5_tk=_m_h5_tk,
        #     session=result0[1],
        #     ip_pool_type=tri_ip_pool,
        #     # proxy_type=PROXY_TYPE_HTTPS,
        # )
        # self.lg.info(body)

        # strip the jsonp callback wrapper before parsing
        data = json_2_dict(
            json_str=re.compile('\((.*)\)').findall(body)[0],
            default_res={},
            logger=self.lg,
        ).get('data', {}).get('data', {})
        assert data != {}
        # pprint(data)
        # drop the bulky banner payload
        try:
            data['banner'] = []
        except Exception:
            pass

        goods_list = []
        for item in data.get('itemList', {}).get('itemAndContentList', []):
            # self.lg.info(item)
            try:
                goods_id = item.get('itemId', '')
                assert goods_id != ''
                title = item.get('shortTitle', '')
                assert title != ''
            except Exception:
                continue

            goods_list.append({
                'goods_id': str(goods_id),
                'title': title,
            })

        res = {
            'second_id': second_id,
            'third_id': third_id,
            'goods_list': goods_list,
        }
        pprint(res)

        return res

    def __del__(self):
        try:
            del self.lg
            del self.sql_cli
            del self.db_existed_goods_id_list
        except:
            pass
        collect()
def run_forever():
    """Endless realtime-update loop for JD-family goods rows.

    Each pass selects every row in dbo.GoodsInfoAutoGet whose SiteID is one
    of 7/8/9/10 and whose ModfiyTime is more than 3 days old, re-crawls each
    goods via JdParse, and writes the fresh data back through
    SqlServerMyPageInfoSaveItemPipeline.  Never returns; sleeps 5.5h after
    midnight (Shanghai time), otherwise 5s between passes.
    """
    while True:
        #### realtime update pass
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = 'select SiteID, GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice from dbo.GoodsInfoAutoGet where (SiteID=7 or SiteID=8 or SiteID=9 or SiteID=10) and GETDATE() - ModfiyTime > 3'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            # FIX: back off before retrying.  The original `continue` had no
            # sleep, so a down/maintenance DB was hammered in a tight loop.
            sleep(10)
            continue

        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

        index = 1
        # Declared once here (not per item) and recycled periodically below
        # to keep memory bounded.
        jd = JdParse()
        for item in result:
            if index % 5 == 0:
                # Recreate the parser every 5 goods to release its resources.
                try:
                    del jd
                except:
                    pass
                gc.collect()
                jd = JdParse()

            if index % 50 == 0:
                # Reconnect every 50 goods so a stale long-lived connection
                # doesn't start failing silently.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[1], index))
                # Map the stored SiteID to the parser's site index:
                # 7/8 -> 0, 9 -> 1, 10 -> 2; then append the goods id.
                tmp_item = []
                if item[0] == 7 or item[0] == 8:
                    tmp_item.append(0)
                elif item[0] == 9:
                    tmp_item.append(1)
                elif item[0] == 10:
                    tmp_item.append(2)
                tmp_item.append(item[1])

                jd.get_goods_data(goods_id=tmp_item)
                data = jd.deal_with_data(goods_id=tmp_item)
                if data != {}:
                    data['goods_id'] = item[1]
                    data['my_shelf_and_down_time'], data['delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                        tmp_data=data,
                        is_delete=item[2],
                        MyShelfAndDownTime=item[3])
                    data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                        old_price=item[4],
                        old_taobao_price=item[5],
                        new_price=data['price'],
                        new_taobao_price=data['taobao_price'])
                    jd.to_right_and_update_data(data, pipeline=tmp_sql_server)
                else:
                    # empty crawl result -> nothing to write back
                    pass
            else:
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass

            index += 1
            gc.collect()
            sleep(1.5)

        print('全部数据更新完毕'.center(100, '#'))
        try:
            del jd
        except:
            pass
        if get_shanghai_time().hour == 0:
            # after midnight: long pause, no updates wanted
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
'''
@author = super_fazai
@File : test_sql_str.py
@Time : 2018/6/14 07:41
@connect : [email protected]
'''

import sys
sys.path.append('..')

from pprint import pprint

from my_pipeline import SqlServerMyPageInfoSaveItemPipeline
from fzutils.sql_utils import pretty_table

# Ad-hoc console helper: run a sample SELECT through the SqlServer
# pipeline and pretty-print the resulting cursor.
sql_cli = SqlServerMyPageInfoSaveItemPipeline()
query = '''
select top 20 id, head_img_url
from dbo.sina_weibo
where sina_type = 'bilibili'
'''
pretty_table(cursor=sql_cli._get_one_select_cursor(sql_str=query, params=None))

# update example
# sql_str_2 = 'UPDATE dbo.daren_recommend set share_img_url_list=NULL, goods_id_list=NULL, share_goods_base_info=%s where MainID=579;'
# result = _._update_table(sql_str=sql_str_2, params=params)
# print(result)

# delete example
# delete_sql = 'delete from dbo.sina_weibo where id=%s'
# while True:
class DbTimingScript(AsyncCrawler):
    """Periodic DB maintenance script.

    Repairs inconsistent shelf/delete state in dbo.GoodsInfoAutoGet and
    marks expired flash-sale / group-buy goods as off-shelf across several
    per-site tables.  Runs forever via ``_fck_run``, one pass every
    ``self.sleep_time`` seconds.
    """
    def __init__(self):
        AsyncCrawler.__init__(
            self,
        )
        # pause between maintenance passes, in seconds (2 minutes)
        self.sleep_time = 2. * 60
        self.init_sql_str()

    def init_sql_str(self):
        """Prepare every select/update SQL statement used by the passes."""
        # Anomaly: goods marked off-shelf (IsDelete=1) whose delete_time
        # predates shelf_time (backend could not update them).
        self.sql_str0 = '''
        select top 100 GoodsID, SiteID
        from dbo.GoodsInfoAutoGet
        where MainGoodsID is not NULL
        and IsDelete=1
        and delete_time < shelf_time
        '''
        self.sql_str1 = '''
        update dbo.GoodsInfoAutoGet set ModfiyTime=%s, delete_time=%s
        where GoodsID=%s
        '''
        # Off-shelf goods whose delete_time is NULL (backend could not
        # transition on-shelf -> off-shelf).
        self.sql_str2 = '''
        select top 100 GoodsID, SiteID
        from dbo.GoodsInfoAutoGet
        where MainGoodsID is not null
        and IsDelete=1
        and delete_time is null
        '''
        self.sql_str3 = '''
        update dbo.GoodsInfoAutoGet set delete_time=%s
        where GoodsID=%s
        '''
        # On-shelf goods (IsDelete=0) where shelf_time < delete_time
        # (backend could not transition off-shelf -> on-shelf).
        self.sql_str4 = '''
        select top 100 GoodsID, SiteID
        from dbo.GoodsInfoAutoGet
        where MainGoodsID is not NUll
        and IsDelete=0
        and shelf_time < delete_time
        '''
        self.sql_str5 = '''
        update dbo.GoodsInfoAutoGet set ModfiyTime=%s, shelf_time=%s
        where GoodsID=%s
        '''
        # taobao tiantiantejia: flash sale past its end time -> off-shelf
        self.sql_str6 = '''
        select top 500 goods_id, site_id
        from dbo.taobao_tiantiantejia
        where MainGoodsID is not null
        and is_delete=0
        and miaosha_end_time < GETDATE()
        '''
        # zhe800 flash sale past its end time -> off-shelf
        self.sql_str7 = '''
        select top 500 goods_id, site_id
        from dbo.zhe_800_xianshimiaosha
        where MainGoodsID is not null
        and is_delete=0
        and miaosha_end_time <= GETDATE()
        '''
        # zhe800 group-buy past its end time -> off-shelf
        self.sql_str8 = '''
        select top 500 goods_id, site_id
        from dbo.zhe_800_pintuan
        where MainGoodsID is not null
        and is_delete=0
        and miaosha_end_time <= GETDATE()
        '''
        # mia group-buy past its end time -> off-shelf
        self.sql_str9 = '''
        select top 500 goods_id, site_id
        from dbo.mia_pintuan
        where MainGoodsID is not null
        and is_delete=0
        and miaosha_end_time <= GETDATE()
        '''
        # jumeiyoupin group-buy past its end time -> off-shelf
        self.sql_str10 = '''
        select top 500 goods_id, site_id
        from dbo.jumeiyoupin_pintuan
        where MainGoodsID is not null
        and is_delete=0
        and miaosha_end_time <= GETDATE()
        '''

    async def _fck_run(self):
        """Main loop: reconnect, run every maintenance pass, sleep, repeat.

        Any exception in a pass is printed and swallowed so the loop never
        dies; the sleep happens in ``finally`` on both paths.
        """
        while True:
            try:
                print('now_time: {}'.format(get_shanghai_time()))
                # fresh connection every pass
                self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
                if not self.sql_cli.is_connect_success:
                    raise SqlServerConnectionException
                else:
                    pass
                await self.db_script0(
                    select_sql_str=self.sql_str0,
                    update_sql_str=self.sql_str1,
                    func_get_params=self.get_params0,
                )
                await self.db_script0(
                    select_sql_str=self.sql_str2,
                    update_sql_str=self.sql_str3,
                    func_get_params=self.get_params1,
                )
                await self.db_script0(
                    select_sql_str=self.sql_str4,
                    update_sql_str=self.sql_str5,
                    func_get_params=self.get_params0,
                )
                # taobao tiantiantejia
                await self.db_script0(
                    select_sql_str=self.sql_str6,
                    update_sql_str=tb_update_str_5,
                    func_get_params=self.get_params2,
                )
                # zhe800 flash sale
                await self.db_script0(
                    select_sql_str=self.sql_str7,
                    update_sql_str=z8_update_str_6,
                    func_get_params=self.get_params2,
                )
                # zhe800 group-buy
                await self.db_script0(
                    select_sql_str=self.sql_str8,
                    update_sql_str=z8_update_str_4,
                    func_get_params=self.get_params2,
                )
                # mia group-buy
                await self.db_script0(
                    select_sql_str=self.sql_str9,
                    update_sql_str=mia_update_str_7,
                    func_get_params=self.get_params2,
                )
                # jumeiyoupin group-buy
                await self.db_script0(
                    select_sql_str=self.sql_str10,
                    update_sql_str=jm_update_str_5,
                    func_get_params=self.get_params2,
                )
            except Exception as e:
                print(e)
            finally:
                print('休眠{}s ...'.format(self.sleep_time))
                await async_sleep(self.sleep_time)

    async def db_script0(self,
                         select_sql_str: str,
                         update_sql_str: str,
                         func_get_params,):
        """Generic pass: select target rows, then run one parameterized
        update per row.

        :param select_sql_str: SELECT returning the rows to fix.
        :param update_sql_str: parameterized UPDATE applied per row.
        :param func_get_params: callable mapping one selected row ``k`` to
            the UPDATE's params tuple.
        """
        get_current_func_info_by_traceback(self=self)
        db_res = self.sql_cli._select_table(
            sql_str=select_sql_str,
        )
        # _select_table may yield None on failure; normalize to []
        db_res = [] if db_res is None else db_res
        if db_res == []:
            print('目标db_res为空list! 跳过此次!')
            return None

        for item in db_res:
            params = func_get_params(k=item)
            self.sql_cli._update_table(
                sql_str=update_sql_str,
                params=params,
            )
        try:
            del db_res
        except:
            pass
        return None

    def get_params0(self, k) -> tuple:
        """Params for updates taking (ModfiyTime, <time>, GoodsID)."""
        now_time = str(get_shanghai_time())
        goods_id = k[0]
        site_id = k[1]
        print('goods_id: {}, site_id: {}'.format(goods_id, site_id))
        return tuple([
            now_time,
            now_time,
            goods_id,
        ])

    def get_params1(self, k) -> tuple:
        """Params for updates taking (<time>, GoodsID)."""
        now_time = str(get_shanghai_time())
        goods_id = k[0]
        site_id = k[1]
        print('goods_id: {}, site_id: {}'.format(goods_id, site_id))
        return tuple([
            now_time,
            goods_id,
        ])

    def get_params2(self, k) -> tuple:
        """Params for the per-site off-shelf updates: (<time>, goods_id)."""
        now_time = str(get_shanghai_time())
        goods_id = k[0]
        site_id = k[1]
        print('goods_id: {}, site_id: {}'.format(goods_id, site_id))
        return tuple([
            now_time,
            goods_id,
        ])

    def __del__(self):
        # nothing to release explicitly; just prod the collector
        try:
            pass
        except:
            pass
        collect()
async def deal_with_all_goods_id(self) -> bool:
    '''
    Fetch the goods of every detailed category and insert the new ones
    into the taobao tiantiantejia table.

    Goods already present in the db are skipped, unless their stored end
    time has passed — then they are treated as "regular goods turned
    tiantiantejia again": deleted and re-inserted.

    :return: True (always; errors are only logged)
    '''
    sort_data = await self.get_all_goods_list()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    # my_pipeline = SqlPools()
    index = 1
    if my_pipeline.is_connect_success:
        # plain sql_server connection (no result set returned past 3000 rows)
        self.my_lg.info('正在获取天天特价db原有goods_id, 请耐心等待...')
        db_ = list(my_pipeline._select_table(sql_str=tb_select_str_6))
        # NOTE(review): row layout assumed [goods_id, ?, end_time] — index 0
        # and 2 are kept below; confirm against tb_select_str_6.
        db_goods_id_list = [[item[0], item[2]] for item in db_]
        self.my_lg.info('获取完毕!!!')
        # print(db_goods_id_list)
        db_all_goods_id = [i[0] for i in db_goods_id_list]
        for item in sort_data:
            tejia_goods_list = await self.get_tiantiantejia_goods_list(
                data=item.get('data', []))
            self.my_lg.info(str(tejia_goods_list))
            for tmp_item in tejia_goods_list:
                if tmp_item.get(
                        'goods_id', ''
                ) in db_all_goods_id:  # goods_id already stored in the db
                    try:
                        # end time recorded for this goods_id, if any
                        tmp_end_time = [
                            i[1] for i in db_goods_id_list
                            if tmp_item.get('goods_id', '') == i[0]
                        ][0]
                        # print(tmp_end_time)
                    except:
                        tmp_end_time = ''
                    # NOTE(review): compares the db end-time against a naive
                    # local datetime — assumes both are naive/same tz; verify.
                    if tmp_end_time != '' and tmp_end_time < datetime.datetime.now(
                    ):
                        '''
                        * 处理由常规商品又转换为天天特价商品 *
                        '''
                        self.my_lg.info('##### 该商品由常规商品又转换为天天特价商品! #####')
                        # delete first, then re-insert fresh
                        _ = await my_pipeline.delete_taobao_tiantiantejia_expired_goods_id(
                            goods_id=tmp_item.get('goods_id', ''),
                            logger=self.my_lg)
                        if _ is False:
                            continue
                        index = await self.insert_into_table(
                            tmp_item=tmp_item,
                            category=item['category'],
                            current_page=item['current_page'],
                            my_pipeline=my_pipeline,
                            index=index,
                        )
                        await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                    else:
                        self.my_lg.info('该goods_id已经存在于数据库中, 此处跳过')
                        pass
                else:
                    # brand-new goods_id
                    if index % 50 == 0:  # reconnect every 50 inserts so a long
                        # single connection doesn't go unresponsive
                        self.my_lg.info('正在重置,并与数据库建立新连接中...')
                        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                        # my_pipeline = SqlPools()
                        self.my_lg.info('与数据库的新连接成功建立...')
                    if my_pipeline.is_connect_success:
                        index = await self.insert_into_table(
                            tmp_item=tmp_item,
                            category=item['category'],
                            current_page=item['current_page'],
                            my_pipeline=my_pipeline,
                            index=index,
                        )
                        await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                    else:
                        self.my_lg.error('数据库连接失败!')
                        pass
    else:
        self.my_lg.error('数据库连接失败!')
        pass
    gc.collect()
    return True
def run_forever():
    """Endless realtime-update loop for 1688 (SiteID=2) goods rows.

    Each pass selects rows in dbo.GoodsInfoAutoGet last modified more than
    a day ago, re-crawls each via ALi1688LoginAndParse, and writes the data
    back through SqlServerMyPageInfoSaveItemPipeline.  Never returns; sleeps
    5.5h after midnight (Shanghai time), otherwise 5s between passes.
    """
    while True:
        #### realtime update pass
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = '''
        select GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice
        from dbo.GoodsInfoAutoGet
        where SiteID=2 and GETDATE()-ModfiyTime>1
        '''
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            # db unavailable: skip this pass, fall through to the sleep below
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            index = 1
            # declared once and recycled every 5 goods to keep memory bounded
            ali_1688 = ALi1688LoginAndParse()
            for item in result:
                if index % 5 == 0:
                    ali_1688 = ALi1688LoginAndParse()
                if index % 50 == 0:
                    # reconnect every 50 goods to avoid a stale connection
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))
                    data = ali_1688.get_ali_1688_data(item[0])
                    if isinstance(data, int):
                        # server returned an int status (tt == 4041): skip it.
                        # FIX: still advance index first — the original
                        # `continue` skipped `index += 1`, stalling the
                        # %5 parser-recycle / %50 reconnect cadence.
                        index += 1
                        gc.collect()
                        continue

                    if data.get('is_delete') == 1:
                        # goods that was already off-shelf when first inserted
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data[
                            'delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                MyShelfAndDownTime=item[2])
                        ali_1688.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                        sleep(1.5)  # don't hit the server too often
                        index += 1
                        gc.collect()
                        continue

                    data = ali_1688.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data[
                            'delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                MyShelfAndDownTime=item[2])
                        # price/taobao_price must stay as first crawled; any
                        # change is recorded in _price_change_info so back-office
                        # staff can decide whether to re-price.
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[3],
                                old_taobao_price=item[4],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])
                        ali_1688.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                        sleep(.3)  # don't hit the server too often
                    else:
                        # empty crawl result -> nothing to write back
                        pass
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass

                index += 1
                gc.collect()
                sleep(2.2)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # after midnight: long pause, no updates wanted
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
class ZWMSpider(AsyncCrawler): def __init__(self): AsyncCrawler.__init__( self, ip_pool_type=IP_POOL_TYPE, log_print=True, log_save_path=MY_SPIDER_LOGS_PATH + '/zwm/_/', ) self.init_zwm_pwd() self.concurrency = 20 self.num_retries = 6 self.max_transaction_details_page_num = 20 # 交易截止抓取页 self.max_business_settlement_records_page_num = 20 # 商户结算记录截止抓取页 self.max_business_manage_page_num = 80 # 商户及门店管理截止抓取页(单数据也超过此数量就得进行修改) self.login_cookies_dict = {} self.sleep_time = 5 def init_zwm_pwd(self): ori_data = '' with open(ZWM_PWD_PATH, 'r') as f: for line in f: ori_data += line.replace('\n', '').replace(' ', '') data = json_2_dict( json_str=ori_data, logger=self.lg, default_res={},) self.zwm_username, self.zwm_pwd = data['username'], data['pwd'] assert self.zwm_username != '' and self.zwm_pwd != '' async def _fck_run(self) -> None: while True: try: login_res = await self._login() assert login_res is True, '登录失败, 退出后续同步操作!' # 获取所有交易明细(自己有接口, 不需要了) # all_transaction_details = await self._get_all_transaction_details() # pprint(all_transaction_details) # self.lg.info('len_all_transaction_details: {}'.format(len(all_transaction_details))) # await self._wash_and_save_all_transaction_details(target_list=all_transaction_details) # 获取所有商户结算记录 self.lg.info('获取所有商户结算记录...') all_business_settlement_records = await self._get_all_business_settlement_records_by_something() # pprint(all_business_settlement_records) self.lg.info('len_now_business_settlement_records: {}'.format(len(all_business_settlement_records))) await self._wash_save_all_business_settlement_records(target_list=all_business_settlement_records) self.lg.info('\n') # 获取所有商户及门店管理记录 self.lg.info('获取所有商户及门店管理记录 ...') all_business_manage_records = await self._get_all_business_manage_records_by_something() # pprint(all_business_manage_records) self.lg.info('len_all_business_manage_records: {}'.format(len(all_business_manage_records))) await self._wash_save_all_business_manage_records(target_list=all_business_manage_records) 
self.lg.info('\n') except Exception: self.lg.error('遇到错误:', exc_info=True) self.lg.info('## 同步完成 ##') self.lg.info('休眠 {} minutes ...'.format(self.sleep_time)) # 定时 await async_sleep(60 * self.sleep_time) async def _login(self) -> bool: """ 登录 :return: """ headers = await self._get_random_pc_headers() headers.update({ 'Referer': 'https://agent.yrmpay.com/JHAdminConsole/loginNew.jsp', }) file_load = { 'loginName': self.zwm_username, 'userPassword': self.zwm_pwd, } m = MultipartEncoder(fields=file_load) # self.lg.info(m) headers.update({ 'Content-Type': m.content_type }) login_url = 'https://agent.yrmpay.com/JHAdminConsole/foreigncard/permissionsLogin.do' with session() as _session: try: response = _session.post( url=login_url, headers=headers, data=m, proxies=self._get_proxies(),) login_res = json_2_dict( json_str=response.text, default_res={}, logger=self.lg, ).get('message', '') assert login_res == '登录成功', '登录失败!' self.lg.info(login_res) self.login_cookies_dict = response.cookies.get_dict() assert self.login_cookies_dict != {}, 'self.login_cookies_dict != 空dict!' 
# pprint(self.login_cookies_dict) except Exception: self.lg.error('遇到错误:', exc_info=True) return False return True async def _wash_save_all_business_manage_records(self, target_list: list): """ 清洗并存储所有未存储的 or 更新所有已存储的business manage records :param target_list: :return: """ all_res = [] for item in target_list: try: now_time = get_shanghai_time() create_time, modify_time, approval_status_change_time = now_time, now_time, now_time agent_name = item['agentName'] top_agent_name = item['topAgentName'] shop_type = item['merType'] is_high_quality_shop = item['isHighQualityMer'] if is_high_quality_shop == '否': is_high_quality_shop = 0 elif is_high_quality_shop == '是': is_high_quality_shop = 1 else: raise ValueError('is_high_quality_shop value: {} 异常!'.format(is_high_quality_shop)) shop_id = item.get('jhmid', '') assert shop_id != '' shop_chat_name = item.get('merchantName', '') assert shop_chat_name != '' phone_num = item.get('phone', '') assert phone_num != '' shop_chant_num = int(item['merchantNum']) sale = item['sale'] is_real_time = 0 if item['isRealTime'] == '未开通' else 1 approve_date = date_parse(item['approveDate']) rate = Decimal(item['rate']).__round__(4) account_type = item['accType'] apply_time = date_parse(item['applyTime']) # 可为空值 process_context = item.get('processContext', '') is_non_contact = 0 if item['isNonContact'] == '未开通' else 1 approval_status = item['approvalStatus'] if approval_status == '待审核': approval_status = 1 elif approval_status == '审核通过': approval_status = 0 elif approval_status == '退回': approval_status = 2 else: raise ValueError('approval_status value: {} 异常'.format(approval_status)) # 用其原值为定值不变, 且唯一 unique_id = item['id'] except Exception: self.lg.error('遇到错误:', exc_info=True) continue zwm_item = ZWMBusinessManageRecordItem() zwm_item['unique_id'] = unique_id zwm_item['create_time'] = create_time zwm_item['modify_time'] = modify_time zwm_item['agent_name'] = agent_name zwm_item['top_agent_name'] = top_agent_name zwm_item['shop_type'] = 
shop_type zwm_item['is_high_quality_shop'] = is_high_quality_shop zwm_item['shop_id'] = shop_id zwm_item['shop_chat_name'] = shop_chat_name zwm_item['phone_num'] = phone_num zwm_item['shop_chant_num'] = shop_chant_num zwm_item['sale'] = sale zwm_item['is_real_time'] = is_real_time zwm_item['approve_date'] = approve_date zwm_item['rate'] = rate zwm_item['account_type'] = account_type zwm_item['apply_time'] = apply_time zwm_item['process_context'] = process_context zwm_item['is_non_contact'] = is_non_contact zwm_item['approval_status'] = approval_status zwm_item['approval_status_change_time'] = approval_status_change_time all_res.append(dict(zwm_item)) # 查看 # if shop_id == 'YRMPAY100038574': # if phone_num == '18192242001': # if shop_chat_name == '哇哇叫': # pprint(dict(zwm_item)) # pprint(all_res) await self._insert_or_update_shop_manage_records_table(all_res=all_res) try: del all_res except: pass return None async def _insert_or_update_shop_manage_records_table(self, all_res: list): """ 插入or update原数据 :param all_res: :return: """ self.sql_cli = SqlServerMyPageInfoSaveItemPipeline() try: db_data = self.sql_cli._select_table( sql_str=zwm_select_str_2, params=None, logger=self.lg, ) # pprint(db_data) db_unique_id_list = [item[0] for item in db_data] assert db_unique_id_list != [], 'db_unique_id_list != []' self.lg.info('len_db_unique_id_list: {}'.format(len(db_unique_id_list))) except Exception: self.sql_cli = SqlServerMyPageInfoSaveItemPipeline() self.lg.error('遇到错误:', exc_info=True) return None new_add_count = 0 for item in all_res: unique_id = item['unique_id'] if unique_id not in db_unique_id_list: # 插入 self.lg.info('inserting unique_id: {} ...'.format(unique_id)) params = await self._get_insert_item_params2(item=item) try: res = self.sql_cli._insert_into_table_2( sql_str=zwm_insert_str_2, params=params, logger=self.lg) if res: new_add_count += 1 except Exception: self.lg.error('遇到错误:', exc_info=True) continue else: db_old_approval_status, 
db_old_approval_status_change_time = await self._get_dd_old_approval_status_and_approval_status_change_time( db_data=db_data, unique_id=unique_id,) item['approval_status_change_time'] = await self._get_new_approval_status_change_time( db_old_approval_status=db_old_approval_status, db_old_approval_status_change_time=db_old_approval_status_change_time, new_approval_status=item['approval_status'], new_approval_status_change_time=item['approval_status_change_time']) # 更新 self.lg.info('updating unique_id: {} ...'.format(unique_id)) params = await self._get_update_item_params(item=item) try: res = self.sql_cli._update_table_2( sql_str=zwm_update_str_1, params=params, logger=self.lg) except Exception: self.lg.error('遇到错误:', exc_info=True) continue if not self.sql_cli.is_connect_success: self.sql_cli = SqlServerMyPageInfoSaveItemPipeline() else: pass try: del db_data del db_unique_id_list except: pass self.lg.info('table.zwm_buss_manage_records新增个数: {}'.format(new_add_count)) async def _get_new_approval_status_change_time(self, db_old_approval_status, db_old_approval_status_change_time, new_approval_status, new_approval_status_change_time): """ 获取新的approval_status_change_time :return: """ if db_old_approval_status_change_time is not None: new_approval_status_change_time = db_old_approval_status_change_time \ if db_old_approval_status == new_approval_status \ else get_shanghai_time() else: pass return new_approval_status_change_time async def _get_dd_old_approval_status_and_approval_status_change_time(self, db_data: list, unique_id: str) -> tuple: """ 获取db 原先的approval_status :param db_data: :param unique_id: :return: """ for item in db_data: if unique_id == item[0]: return item[1], item[2] else: continue async def _get_all_business_manage_records_by_something(self,): """ 获取所有商户及门店管理记录 :return: """ async def get_tasks_params_list(max_business_manage_page_num) -> list: """获取tasks_params_list""" tasks_params_list = [] for page_num in range(1, max_business_manage_page_num): 
tasks_params_list.append({ 'page_num': page_num, }) return tasks_params_list def get_create_task_msg(k) -> str: return 'create task[where page_num: {}]...'.format(k['page_num']) def get_now_args(k) -> list: return [ k['page_num'], ] res = await get_or_handle_target_data_by_task_params_list( loop=self.loop, tasks_params_list=await get_tasks_params_list( max_business_manage_page_num=self.max_business_manage_page_num), func_name_where_get_create_task_msg=get_create_task_msg, func_name=self._get_one_page_business_manage_records_by_something, func_name_where_get_now_args=get_now_args, func_name_where_handle_one_res=None, func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res, one_default_res=[], step=self.concurrency, logger=self.lg, get_all_res=True,) return res @catch_exceptions_with_class_logger(default_res=[]) def _get_one_page_business_manage_records_by_something(self, page_num: int, start_date: str = None, end_date: str = None,): """ 获取单页商户及门店管理记录 :param page_num: :param start_date: 默认设置前一个月27号, eg: '2019-01-27 00:00' :param end_date: eg: '2019-07-20 09:39' :return: """ # todo 获取最开始->至今的, 即采集所有, 避免老店铺的审核状态变动, 而后台无法同步状态, 审核时间 # start_date = str(self.get_1_on_the_month() if start_date is None else start_date).split(' ')[0] + ' 00:00' start_date = '2018-01-01 00:00' end_date = (str(get_shanghai_time()) if end_date is None else end_date)[0:16] self.lg.info('start_date: {}, end_date: {}'.format(start_date, end_date)) headers = self.get_random_pc_headers() headers.update({ 'Accept': '*/*', 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'Referer': 'https://agent.yrmpay.com/JHAdminConsole/merchantMaterial/page.do', 'X-Requested-With': 'XMLHttpRequest', }) params = ( ('_dc', get_now_13_bit_timestamp()), ) data = { 'merchantCode': '', 'accType': '', 'phone': '', 'approveDate': '', 'merchantName': '', 'processStatus': '', 'startTime': start_date, 'endTime': end_date, 'agentName': '', 'page': str(page_num), 'start': str((page_num - 1) * 100), 
# 开始位置0, 100, 200 'limit': '100', } url = 'https://agent.yrmpay.com/JHAdminConsole/merchantMaterial/materialList.do' body = Requests.get_url_body( method='post', url=url, headers=headers, params=params, cookies=self.login_cookies_dict, data=data, ip_pool_type=self.ip_pool_type, num_retries=self.num_retries,) assert body != '', 'body不为空值!' res = json_2_dict( json_str=body, logger=self.lg, default_res={}).get('materialList', []) self.lg.info('[{}] page_num: {}'.format( '+' if res != [] else '-', page_num, )) return res async def _wash_save_all_business_settlement_records(self, target_list): """ 清洗并存储 未被存储的所有商户结算记录 :param target_list: :return: """ self.sql_cli = SqlServerMyPageInfoSaveItemPipeline() try: db_data = self.sql_cli._select_table( sql_str=zwm_select_str_1, params=None, logger=self.lg,) # pprint(db_data) db_unique_id_list = [item[0] for item in db_data] assert db_unique_id_list != [], 'db_unique_id_list != []' self.lg.info('len_db_unique_id_list: {}'.format(len(db_unique_id_list))) except Exception: self.sql_cli = SqlServerMyPageInfoSaveItemPipeline() self.lg.error('遇到错误:', exc_info=True) return None all_res = [] for item in target_list: # pprint(item) try: create_time = get_shanghai_time() shop_name = item.get('merName', '') assert shop_name != '' shop_id = item.get('mid', '') assert shop_id != '' agent_name = item['agentName'] top_agent_name = item['topAgentName'] date_settle_type = item['settleType'] trans_amount = item.get('transAmt', '') assert trans_amount != '' trans_amount = Decimal(trans_amount).__round__(2) service_charge = Decimal(item['mda']).__round__(2) accounting_amount = Decimal(item['mnamt']).__round__(2) # 正常情况为: '20190704', 异常为'20190824-20190824' txn_day = item['txnDay'] if re.compile('-').findall(txn_day) != []: txn_day = txn_day.split('-')[0] else: pass trans_date = date_parse(txn_day) trans_status = item['status'] if trans_status == '已结算': trans_status = 0 else: raise ValueError('trans_status: {}, 未知交易状态!'.format(trans_status)) 
settle_type = item['type'] settle_date = date_parse(item['minDay']) # 生成唯一标识码 unique_id = get_uuid3( target_str=shop_id + str(date_settle_type) + str(trans_amount) + \ str(service_charge) + str(trans_date) + \ str(settle_type) + str(settle_date),) except Exception: self.lg.error('遇到错误:', exc_info=True) continue if unique_id in db_unique_id_list: # self.lg.info('该record[unique_id: {}]已存在!'.format(unique_id)) continue settle_record_item = ZWMBusinessSettlementRecordItem() settle_record_item['unique_id'] = unique_id settle_record_item['create_time'] = create_time settle_record_item['shop_name'] = shop_name settle_record_item['shop_id'] = shop_id settle_record_item['agent_name'] = agent_name settle_record_item['top_agent_name'] = top_agent_name settle_record_item['date_settle_type'] = date_settle_type settle_record_item['trans_amount'] = trans_amount settle_record_item['service_charge'] = service_charge settle_record_item['accounting_amount'] = accounting_amount settle_record_item['trans_date'] = trans_date settle_record_item['trans_status'] = trans_status settle_record_item['settle_type'] = settle_type settle_record_item['settle_date'] = settle_date all_res.append(dict(settle_record_item)) # pprint(all_res) self.lg.info('未存储个数: {}'.format(len(all_res))) await self._save_all_business_settlement_records(all_res=all_res) try: del all_res except: pass return None async def _save_all_business_settlement_records(self, all_res) -> None: """ 存储新增的商家提现记录 :param all_res: :return: """ new_add_count = 0 for item in all_res: # 处理未存储的新数据 unique_id = item['unique_id'] self.lg.info('saving unique_id: {} ...'.format(unique_id)) params = await self._get_insert_item_params(item=item) try: res = self.sql_cli._insert_into_table_2( sql_str=zwm_insert_str_1, params=params, logger=self.lg) if res: new_add_count += 1 except Exception: self.lg.error('遇到错误:', exc_info=True) continue if not self.sql_cli.is_connect_success: self.sql_cli = SqlServerMyPageInfoSaveItemPipeline() else: pass 
self.lg.info('新增个数: {}'.format(new_add_count)) return None async def _get_insert_item_params(self, item) -> tuple: """ 待插入对象 :param item: :return: """ return tuple([ item['unique_id'], item['create_time'], item['shop_name'], item['shop_id'], item['agent_name'], item['top_agent_name'], item['date_settle_type'], item['trans_amount'], item['service_charge'], item['accounting_amount'], item['trans_date'], item['trans_status'], item['settle_type'], item['settle_date'], ]) async def _get_insert_item_params2(self, item) -> tuple: """ 待插入对象, zwm_buss_manage_records table :param item: :return: """ return tuple([ item['unique_id'], item['create_time'], item['modify_time'], item['agent_name'], item['top_agent_name'], item['shop_type'], item['is_high_quality_shop'], item['shop_id'], item['shop_chat_name'], item['phone_num'], item['shop_chant_num'], item['sale'], item['is_real_time'], item['approve_date'], item['rate'], item['account_type'], item['apply_time'], item['process_context'], item['is_non_contact'], item['approval_status'], item['approval_status_change_time'], ]) async def _get_update_item_params(self, item: dict) -> tuple: """ 更新对象, zwm_buss_manage_records table :param item: :return: """ return tuple([ item['modify_time'], item['agent_name'], item['top_agent_name'], item['shop_type'], item['is_high_quality_shop'], item['shop_id'], item['shop_chat_name'], item['phone_num'], item['shop_chant_num'], item['sale'], item['is_real_time'], item['approve_date'], item['rate'], item['account_type'], item['apply_time'], item['process_context'], item['is_non_contact'], item['approval_status'], item['approval_status_change_time'], item['unique_id'], ]) async def _wash_and_save_all_transaction_details(self, target_list: list): """ 清洗并存储所有交易明细 :param target_list: :return: """ pass async def _get_all_business_settlement_records_by_something(self): """ 获取所有商户结算记录 :return: """ async def get_tasks_params_list(max_business_settlement_records_page_num) -> list: """获取tasks_params_list""" 
tasks_params_list = [] for page_num in range(1, max_business_settlement_records_page_num): tasks_params_list.append({ 'page_num': page_num, }) return tasks_params_list def get_create_task_msg(k) -> str: return 'create task[where page_num: {}]...'.format(k['page_num']) def get_now_args(k) -> list: return [ k['page_num'], ] res = await get_or_handle_target_data_by_task_params_list( loop=self.loop, tasks_params_list=await get_tasks_params_list( max_business_settlement_records_page_num=self.max_business_settlement_records_page_num), func_name_where_get_create_task_msg=get_create_task_msg, func_name=self._get_one_page_business_settlement_records_by_something, func_name_where_get_now_args=get_now_args, func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res, one_default_res=[], step=self.concurrency, logger=self.lg, get_all_res=True,) return res @catch_exceptions_with_class_logger(default_res=[]) def _get_one_page_business_settlement_records_by_something(self, page_num :int, start_date: str=None, end_date: str=None, mid: str='', agent_name: str='') -> list: """ 得到单页商户结算记录 :param page_num: :param start_date: 默认设置前一个月27号, eg: '2019-07-01' :param end_date: eg: '2019-07-16' :param mid: 商户编号 :param agent_name: 顶级机构名称 :return: """ start_date = str(self.get_1_on_the_month() if start_date is None else start_date).split(' ')[0] # start_date = '2018-01-01' end_date = (str(get_shanghai_time()) if end_date is None else end_date).split(' ')[0] self.lg.info('start_date: {}, end_date: {}'.format(start_date, end_date)) headers = self.get_random_pc_headers() headers.update({ 'Accept': '*/*', 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'Referer': 'https://agent.yrmpay.com/JHAdminConsole/merSettle/querySettleJsp.do', 'X-Requested-With': 'XMLHttpRequest', }) params = ( ('_dc', get_now_13_bit_timestamp()), ) data = { 'startDate': start_date, 'endDate': end_date, 'mid': mid, 'agentName': agent_name, 'loginAgentId': self.zwm_username[0:8], # 前8位 'page': 
str(page_num), 'start': str((page_num - 1) * 100), # 开始位置, 0, 100, 200 'limit': '100', } url = 'https://agent.yrmpay.com/JHAdminConsole/merSettle/queryMerSettleList.do' body = Requests.get_url_body( method='post', url=url, headers=headers, params=params, cookies=self.login_cookies_dict, data=data, ip_pool_type=self.ip_pool_type, num_retries=self.num_retries,) # self.lg.info(body) assert body != '', 'body不为空值!' res = json_2_dict( json_str=body, logger=self.lg, default_res={}).get('data', []) self.lg.info('[{}] page_num: {}'.format( '+' if res != [] else '-', page_num, )) return res async def _get_all_transaction_details(self) -> list: """ 获取所有交易流水 :return: """ async def _get_tasks_params_list() -> list: """获取tasks_params_list""" tasks_params_list = [] for page_num in range(1, self.max_transaction_details_page_num): tasks_params_list.append({ 'page_num': page_num, }) return tasks_params_list tasks_params_list = await _get_tasks_params_list() tasks_params_list_obj = TasksParamsListObj( tasks_params_list=tasks_params_list, step=self.concurrency,) all_res = [] while True: try: slice_params_list = tasks_params_list_obj.__next__() except AssertionError: break tasks = [] for k in slice_params_list: page_num = k['page_num'] self.lg.info('create task[where page_num: {}]...'.format(page_num)) func_args = [ page_num, ] tasks.append(self.loop.create_task( unblock_func( func_name=self._get_one_page_transaction_details_by_something, func_args=func_args, logger=self.lg,))) one_res = await async_wait_tasks_finished(tasks=tasks) try: del tasks except: pass for i in one_res: for j in i: all_res.append(j) return all_res @catch_exceptions_with_class_logger(default_res=[]) def _get_one_page_transaction_details_by_something(self, page_num: int, start_date: str=None, end_date: str=None, transaction_status: str='', mer_name: str='', order_no: str='', mid: str='', agent_name: str='', pay_channel: str ='', sale_name: str='',) -> list: """ 获取单页交易流水 :param page_num: 开始页面, eg: 1, 2, 3 :param 
start_date: eg: '2019-07-16 00:00' :param end_data: eg: '2019-07-16 10:02' :param transaction_status: 交易状态 | 选择全部: '' or 交易成功: '1' or 退款成功: '3' :param mer_name: 待查询的商户名称 :param order_no: 订单号 :param mid: 商户编号 :param agent_name: 顶级机构名称 :param pay_channel: 支付渠道 | 请选择: '' or 微信: '50' or 支付宝: '51' or 微信条码: '55' or 支付宝条码: '56' or 微信小程序: '67' :param sale_name: 销售名称 :return: """ res = [] start_date = self.get_0_00_on_the_day() if start_date is None else start_date end_date = str(get_shanghai_time()) if end_date is None else end_date headers = self.get_random_pc_headers() headers.update({ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'Accept': '*/*', 'Referer': 'https://agent.yrmpay.com/JHAdminConsole/limafuReport/transflow.do', 'X-Requested-With': 'XMLHttpRequest', }) params = ( ('_dc', get_now_13_bit_timestamp()), ) data = { 'startDate': start_date, 'endDate': end_date, 'type': '2', 'status': transaction_status, 'payChannel': pay_channel, 'orderNo': order_no, 'merName': mer_name, 'mid': mid, 'agentName': agent_name, 'saleName': sale_name, 'page': str(page_num), 'start': str((page_num - 1) * 20), # 开始位置, 0, 20, 40 'limit': '20', } url = 'https://agent.yrmpay.com/JHAdminConsole/limafuReport/querylimafuTransFlow.do' body = Requests.get_url_body( method='post', url=url, headers=headers, params=params, cookies=self.login_cookies_dict, data=data, ip_pool_type=self.ip_pool_type, num_retries=self.num_retries,) assert body != '', 'body不为空值!' 
res = json_2_dict( json_str=body, logger=self.lg, default_res={}).get('data', []) self.lg.info('[{}] page_num: {}'.format( '+' if res != [] else '-', page_num, )) return res def get_0_00_on_the_day(self) -> str: """ 获取当天的0点 :return: """ now_time = get_shanghai_time() return str(datetime( year=now_time.year, month=now_time.month, day=now_time.day)) def get_1_on_the_month(self) -> str: """ 获取当月的第一天 :return: """ now_time = get_shanghai_time() # 避免月底流水无法获取 day = 5 now_month = now_time.month if now_month > 1: now_month -= 1 else: # now_month为1月份 now_month = 12 return str(datetime( year=now_time.year, month=now_month, day=day,)) def _get_proxies(self) -> dict: """ 获取代理 :return: """ proxies = Requests._get_proxies(ip_pool_type=self.ip_pool_type, ) assert proxies != {}, 'proxies不为空dict!' return proxies async def _get_random_pc_headers(self) -> dict: """ :return: """ return self.get_random_pc_headers() @staticmethod def get_random_pc_headers() -> dict: headers = get_random_headers( upgrade_insecure_requests=False, cache_control='',) headers.update({ 'Origin': 'https://agent.yrmpay.com', 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8', # 'Content-Type': 'multipart/form-data; boundary=----WebKitFormBoundarytSJCAoaErjNY4IbM', 'accept': 'text/plain, */*; q=0.01', 'X-Requested-With': 'XMLHttpRequest', }) return headers def __del__(self): try: del self.lg del self.login_cookies_dict except: pass try: del self.loop except: pass collect()
async def _wash_save_all_business_settlement_records(self, target_list):
    """
    Wash and persist the merchant settlement records that are not stored yet.

    Loads the existing unique_ids from the DB first, then builds one
    ZWMBusinessSettlementRecordItem per unseen record and hands the batch
    to _save_all_business_settlement_records.

    :param target_list: raw record dicts fetched from the remote API
    :return: None
    """
    self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    try:
        db_data = self.sql_cli._select_table(
            sql_str=zwm_select_str_1,
            params=None,
            logger=self.lg,)
        # pprint(db_data)
        db_unique_id_list = [item[0] for item in db_data]
        assert db_unique_id_list != [], 'db_unique_id_list != []'
        self.lg.info('len_db_unique_id_list: {}'.format(len(db_unique_id_list)))
    except Exception:
        # can't know what is already stored -> reconnect and bail out
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        self.lg.error('遇到错误:', exc_info=True)
        return None

    all_res = []
    for item in target_list:
        # pprint(item)
        try:
            create_time = get_shanghai_time()
            shop_name = item.get('merName', '')
            assert shop_name != ''
            shop_id = item.get('mid', '')
            assert shop_id != ''
            agent_name = item['agentName']
            top_agent_name = item['topAgentName']
            date_settle_type = item['settleType']
            trans_amount = item.get('transAmt', '')
            assert trans_amount != ''
            trans_amount = Decimal(trans_amount).__round__(2)
            service_charge = Decimal(item['mda']).__round__(2)
            accounting_amount = Decimal(item['mnamt']).__round__(2)
            # normal form is '20190704'; abnormal form is '20190824-20190824'
            txn_day = item['txnDay']
            if re.compile('-').findall(txn_day) != []:
                txn_day = txn_day.split('-')[0]
            else:
                pass
            trans_date = date_parse(txn_day)
            trans_status = item['status']
            if trans_status == '已结算':
                trans_status = 0
            else:
                raise ValueError('trans_status: {}, 未知交易状态!'.format(trans_status))

            settle_type = item['type']
            settle_date = date_parse(item['minDay'])
            # build the de-dup key from the record's business fields
            unique_id = get_uuid3(
                target_str=shop_id + str(date_settle_type) + str(trans_amount) + \
                str(service_charge) + str(trans_date) + \
                str(settle_type) + str(settle_date),)
        except Exception:
            # one bad record must not kill the batch
            self.lg.error('遇到错误:', exc_info=True)
            continue

        if unique_id in db_unique_id_list:
            # already persisted -> skip
            # self.lg.info('该record[unique_id: {}]已存在!'.format(unique_id))
            continue

        settle_record_item = ZWMBusinessSettlementRecordItem()
        settle_record_item['unique_id'] = unique_id
        settle_record_item['create_time'] = create_time
        settle_record_item['shop_name'] = shop_name
        settle_record_item['shop_id'] = shop_id
        settle_record_item['agent_name'] = agent_name
        settle_record_item['top_agent_name'] = top_agent_name
        settle_record_item['date_settle_type'] = date_settle_type
        settle_record_item['trans_amount'] = trans_amount
        settle_record_item['service_charge'] = service_charge
        settle_record_item['accounting_amount'] = accounting_amount
        settle_record_item['trans_date'] = trans_date
        settle_record_item['trans_status'] = trans_status
        settle_record_item['settle_type'] = settle_type
        settle_record_item['settle_date'] = settle_date
        all_res.append(dict(settle_record_item))

    # pprint(all_res)
    self.lg.info('未存储个数: {}'.format(len(all_res)))
    await self._save_all_business_settlement_records(all_res=all_res)
    try:
        del all_res
    except:
        pass

    return None
def run_forever(self):
    '''
    Update the jumeiyoupin flash-sale goods data in real time.

    Reads the pending goods rows from the DB, fetches fresh cookies via
    phantomjs, then walks every row: logically expired goods are deleted,
    future ones skipped, current ones re-crawled and updated.
    :return: None (False if cookie fetch fails)
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    sql_str = r'select goods_id, miaosha_time, page, goods_url from dbo.jumeiyoupin_xianshimiaosha where site_id=26'
    try:
        result = list(tmp_sql_server._select_table(sql_str=sql_str))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')

        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1

        jumeiyoupin_spike = JuMeiYouPinSpike()
        # fetch session cookies through phantomjs
        my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
        cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(
            url='https://h5.jumei.com/')
        try:
            del my_phantomjs
        except:
            pass
        if cookies == '':
            print('!!! 获取cookies失败 !!!')
            return False

        print('获取cookies成功!')
        self.headers.update(Cookie=cookies)
        for item in result:
            # item: (goods_id, miaosha_time(json), page, goods_url)
            miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
            # '%Y-%m-%d %H:%M:%S' -> 10-digit unix timestamp
            miaosha_end_time = int(
                str(
                    time.mktime(
                        time.strptime(miaosha_end_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_end_time)
            data = {}
            # declared (and re-created) inside the loop, then deleted, to
            # keep memory usage down
            jumeiyoupin_miaosha = JuMeiYouPinParse()
            if index % 50 == 0:
                # reconnect every 50 rows to avoid a stale long-lived connection
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(miaosha_end_time) == 0:
                    # expired -> delete the row
                    tmp_sql_server._delete_table(
                        sql_str=self.delete_sql_str, params=(item[0]))
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 限时秒杀结束时间为(%s), 删除成功!'
                        % json.loads(item[1]).get('miaosha_end_time'))
                elif self.is_recent_time(miaosha_end_time) == 2:
                    # break
                    # must be pass, not break: goods_ids from the DB are
                    # not guaranteed to be ordered by end time
                    pass
                else:
                    # is_recent_time == 1: inside the update window
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))
                    data['goods_id'] = item[0]
                    this_page_all_goods_list = self.get_one_page_all_goods_list(
                        item[2])

                    if this_page_all_goods_list == '网络错误!':
                        print('网络错误!先跳过')
                        continue

                    elif this_page_all_goods_list == []:
                        print('#### 该page对应得到的this_page_all_goods_list为空[]!')
                        print('** 该商品已被下架限时秒杀活动, 此处将其删除')
                        tmp_sql_server._delete_table(
                            sql_str=self.delete_sql_str, params=(item[0]))
                        print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                        pass

                    else:
                        """
                        由于不会内部提前下架,所以在售卖时间内的全部进行相关更新
                        """
                        # miaosha_goods_all_goods_id = [item_1.get('goods_id', '') for item_1 in this_page_all_goods_list]
                        #
                        # if item[0] not in miaosha_goods_all_goods_id:  # 内部已经下架的
                        #     print('该商品已被下架限时秒杀活动,此处将其删除')
                        #     tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                        #     print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                        #     pass
                        #
                        # else:  # 未下架的
                        tmp_r = jumeiyoupin_miaosha.get_goods_id_from_url(
                            item[3])
                        jumeiyoupin_miaosha.get_goods_data(goods_id=tmp_r)
                        goods_data = jumeiyoupin_miaosha.deal_with_data()

                        if goods_data == {}:
                            # empty parse result -> skip this row
                            pass
                        else:
                            goods_data['goods_id'] = str(item[0])
                            goods_data['miaosha_time'] = {
                                'miaosha_begin_time':
                                goods_data['schedule'].get('begin_time', ''),
                                'miaosha_end_time':
                                goods_data['schedule'].get('end_time', ''),
                            }
                            goods_data['miaosha_begin_time'], goods_data[
                                'miaosha_end_time'] = jumeiyoupin_spike.get_miaosha_begin_time_and_miaosha_end_time(
                                    miaosha_time=goods_data['miaosha_time'])

                            # print(goods_data)
                            jumeiyoupin_miaosha.update_jumeiyoupin_xianshimiaosha_table(
                                data=goods_data, pipeline=tmp_sql_server)
                            sleep(JUMEIYOUPIN_SLEEP_TIME)  # throttle

            else:
                # DB connection down for this row
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:
        # don't update after midnight
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
async def _insert_or_update_shop_manage_records_table(self, all_res: list):
    """
    Insert new rows into, or update existing rows of, the
    zwm_buss_manage_records table.

    :param all_res: washed record dicts, each carrying a 'unique_id'
    :return: None
    """
    self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    try:
        db_data = self.sql_cli._select_table(
            sql_str=zwm_select_str_2,
            params=None,
            logger=self.lg,
        )
        # pprint(db_data)
        db_unique_id_list = [item[0] for item in db_data]
        assert db_unique_id_list != [], 'db_unique_id_list != []'
        self.lg.info('len_db_unique_id_list: {}'.format(len(db_unique_id_list)))
    except Exception:
        # can't know what exists -> reconnect and bail out
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        self.lg.error('遇到错误:', exc_info=True)
        return None

    new_add_count = 0
    for item in all_res:
        unique_id = item['unique_id']
        if unique_id not in db_unique_id_list:
            # unseen record -> insert
            self.lg.info('inserting unique_id: {} ...'.format(unique_id))
            params = await self._get_insert_item_params2(item=item)
            try:
                res = self.sql_cli._insert_into_table_2(
                    sql_str=zwm_insert_str_2, params=params, logger=self.lg)
                if res:
                    new_add_count += 1
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue
        else:
            # existing record -> recompute the approval-status change time
            # from the old DB values before updating
            db_old_approval_status, db_old_approval_status_change_time = await self._get_dd_old_approval_status_and_approval_status_change_time(
                db_data=db_data,
                unique_id=unique_id,)
            item['approval_status_change_time'] = await self._get_new_approval_status_change_time(
                db_old_approval_status=db_old_approval_status,
                db_old_approval_status_change_time=db_old_approval_status_change_time,
                new_approval_status=item['approval_status'],
                new_approval_status_change_time=item['approval_status_change_time'])
            # update
            self.lg.info('updating unique_id: {} ...'.format(unique_id))
            params = await self._get_update_item_params(item=item)
            try:
                res = self.sql_cli._update_table_2(
                    sql_str=zwm_update_str_1, params=params, logger=self.lg)
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue

        # reconnect on a broken connection so later items still persist
        if not self.sql_cli.is_connect_success:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        else:
            pass

    try:
        del db_data
        del db_unique_id_list
    except:
        pass

    self.lg.info('table.zwm_buss_manage_records新增个数: {}'.format(new_add_count))
def get_spike_hour_goods_info(self):
    '''
    Crawl zhe800 flash-sale sessions by enumerating session ids and store
    every goods item that is not in the DB yet.

    Session ids are scanned in steps of 2 from BASE_SESSION_ID up to
    MAX_SESSION_ID; non-existent or out-of-window sessions are skipped.
    :return: None
    '''
    base_session_id = BASE_SESSION_ID
    while base_session_id < MAX_SESSION_ID:
        print('待抓取的session_id为: ', base_session_id)
        data = self._get_one_session_id_data(
            base_session_id=base_session_id)
        sleep(.5)
        if data.get('data', {}).get('blocks', []) == []:
            # session_id does not exist -> try the next one
            base_session_id += 2
            continue

        try:
            begin_times_timestamp = self._get_begin_times_timestamp(data)
        except Exception as e:
            print('遇到严重错误: ', e)
            base_session_id += 2
            continue

        print('秒杀时间为: ', timestamp_to_regulartime(begin_times_timestamp))
        is_recent_time = self.is_recent_time(
            timestamp=begin_times_timestamp)
        if not is_recent_time:
            # flash-sale date outside the wanted window
            base_session_id += 2
            continue

        try:
            data = [
                item_s.get('deal', {})
                for item_s in data.get('data', {}).get('blocks', [])
            ]
        except Exception as e:
            print('遇到严重错误: ', e)
            base_session_id += 2
            continue

        # pprint(data)
        if data != []:
            # session has goods data
            miaosha_goods_list = self.get_miaoshao_goods_info_list(
                data=data)
            # pprint(miaosha_goods_list)

            zhe_800 = Zhe800Parse()
            my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            if my_pipeline.is_connect_success:
                db_goods_id_list = self._get_db_goods_id_list(my_pipeline)
                for item in miaosha_goods_list:
                    if item.get('zid', '') in db_goods_id_list:
                        print('该goods_id已经存在于数据库中, 此处跳过')
                        pass
                    else:
                        tmp_url = 'https://shop.zhe800.com/products/' + str(
                            item.get('zid', ''))
                        goods_id = zhe_800.get_goods_id_from_url(tmp_url)

                        zhe_800.get_goods_data(goods_id=goods_id)
                        goods_data = zhe_800.deal_with_data()
                        if goods_data == {}:
                            # empty parse result -> skip
                            pass
                        else:
                            # enrich the parsed data with session fields,
                            # then insert
                            goods_data['stock_info'] = item.get(
                                'stock_info')
                            goods_data['goods_id'] = str(item.get('zid'))
                            goods_data['spider_url'] = tmp_url
                            goods_data['username'] = '******'
                            goods_data['price'] = item.get('price')
                            goods_data['taobao_price'] = item.get(
                                'taobao_price')
                            goods_data['sub_title'] = item.get('sub_title')
                            # goods_data['is_baoyou'] = item.get('is_baoyou')
                            goods_data['miaosha_time'] = item.get(
                                'miaosha_time')
                            goods_data['miaosha_begin_time'], goods_data[
                                'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                    miaosha_time=item.get('miaosha_time'))
                            goods_data['session_id'] = str(base_session_id)
                            # print(goods_data)
                            res = zhe_800.insert_into_zhe_800_xianshimiaosha_table(
                                data=goods_data, pipeline=my_pipeline)
                            if res:
                                if goods_id not in db_goods_id_list:
                                    db_goods_id_list.append(goods_id)

                        sleep(ZHE_800_SPIKE_SLEEP_TIME)  # throttle
                sleep(4)
            else:
                pass
            try:
                del zhe_800
            except:
                pass
            gc.collect()
        else:
            # session exists but carries no goods data
            print('该sessionid没有相关key为jsons的数据')
            pass
        base_session_id += 2
def deal_with_data(self, *params): ''' 处理并存储相关拼团商品的数据 :param params: 待传参数 :return: ''' goods_list = params[0] mogujie = MoGuJieParse() my_pipeline = SqlServerMyPageInfoSaveItemPipeline() if my_pipeline.is_connect_success: _ = list(my_pipeline._select_table(sql_str=mg_select_str_1)) db_goods_id_list = [item[0] for item in _] print(db_goods_id_list) for item in goods_list: if item.get('goods_id', '') in db_goods_id_list: print('该goods_id已经存在于数据库中, 此处跳过') pass else: goods_id = str(item.get('goods_id', '')) tmp_url = 'https://shop.mogujie.com/detail/' + str( goods_id) mogujie.get_goods_data(goods_id=str(goods_id)) goods_data = mogujie.deal_with_data() if goods_data == {}: # 返回的data为空则跳过 pass else: # 否则就解析并且插入 # 规范化 goods_data[ 'price_info_list'] = _get_mogujie_pintuan_price_info_list( goods_data['price_info_list']) goods_data['goods_url'] = tmp_url goods_data['goods_id'] = str(goods_id) goods_data['pintuan_time'] = item.get( 'pintuan_time', {}) goods_data['pintuan_begin_time'], goods_data[ 'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time( miaosha_time=item.get('pintuan_time', {})) goods_data['all_sell_count'] = item.get( 'all_sell_count', '') goods_data['fcid'] = str(item.get('fcid')) goods_data['page'] = str(item.get('page')) goods_data['sort'] = str(item.get('sort', '')) # pprint(goods_data) # print(goods_data) _r = mogujie.insert_into_mogujie_pintuan_table( data=goods_data, pipeline=my_pipeline) if _r: # 更新 if goods_id not in db_goods_id_list: db_goods_id_list.append(goods_id) sleep(MOGUJIE_SLEEP_TIME) # 放慢速度 else: print('数据库连接失败,此处跳过!') pass try: del mogujie except: pass collect()
class TMUpdater(AsyncCrawler):
    """Tmall real-time goods updater."""
    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(self,
                              *params,
                              **kwargs,
                              log_print=True,
                              log_save_path=MY_SPIDER_LOGS_PATH + '/天猫/实时更新/')
        # db pipeline, lazily (re)created in _get_db_old_data / _get_new_db_conn
        self.sql_cli = None
        self.crawl_type = CRAWL_TYPE_ASYNCIO
        # concurrency; NOTE(review): the original comment asked for 50 to
        # avoid packet loss when updating is_delete=1, but the value is 100
        self.concurrency = 100
        self.concurrent_type = CONCURRENT_TYPE
        # source of the to-update rows: 0 sqlserver | 1 new_my_server | 2 redis
        self.db_res_from = 2
        if 'armv7l-with-debian' in platform.platform():
            self.server_ip = 'http://0.0.0.0:80'
        else:
            self.server_ip = 'http://118.31.39.97'
        # self.server_ip = 'http://0.0.0.0:5000'

    async def _update_db(self):
        """Main loop: fetch pending rows, update them in batches, sleep, repeat."""
        while True:
            # 长期运行报: OSError: [Errno 24] Too many open files, 故不采用每日一志
            # self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result,
                    step=self.concurrency)
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                    except AssertionError:
                        # all slices consumed -> normal exit
                        break

                    one_res, index = await self._get_one_res(
                        slice_params_list=slice_params_list,
                        index=index)
                    await self._except_sleep(res=one_res)

                self.lg.info('全部数据更新完毕'.center(100, '#'))
            # sleep(60*60)
            if get_shanghai_time().hour == 0:
                # don't update after midnight
                await async_sleep(60 * 60 * .5)
            else:
                await async_sleep(5.)
            try:
                # del self.lg
                del result
            except:
                pass
            collect()

    async def _get_db_old_data(self) -> (list, None):
        """
        Fetch the rows that need updating, from the source selected by
        self.db_res_from (0 sqlserver | 1 server api | 2 redis).

        :return: list of rows, or None on DB failure
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            if self.db_res_from == 0:
                result = list(
                    self.sql_cli._select_table(sql_str=tm_select_str_3))
            elif self.db_res_from == 1:
                result = await get_waited_2_update_db_data_from_server(
                    server_ip=self.server_ip,
                    _type='tm',
                    child_type=0,
                )
            elif self.db_res_from == 2:
                # take a large slice so that failures in the first rows do
                # not stall the rest of the batch
                result = get_waited_2_update_db_data_from_redis_server(
                    spider_name='tm0',
                    logger=self.lg,
                    slice_num=800,
                )
            else:
                raise ValueError('self.db_res_from value异常!')
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    def _get_tmp_item(self, site_id, goods_id):
        """
        Map a DB site_id (3/4/6) to the parser's type code (0/1/2) and pair
        it with the goods_id.
        NOTE(review): an unexpected site_id yields a 1-element list — confirm
        upstream always passes 3, 4 or 6.
        """
        tmp_item = []
        # translate the stored site_id into the crawler-internal code
        if site_id == 3:
            tmp_item.append(0)
        elif site_id == 4:
            tmp_item.append(1)
        elif site_id == 6:
            tmp_item.append(2)

        tmp_item.append(goods_id)

        return tmp_item

    async def _get_one_res(self, slice_params_list, index) -> tuple:
        """
        Crawl one slice of goods and write the results back to the DB.

        :param slice_params_list: rows of this batch
        :param index: running row counter
        :return: (list, int) — per-goods results and the (possibly advanced) index
        """
        def get_tasks_params_list(slice_params_list: list, index: int) -> list:
            """Build the task params for each row of the slice."""
            # NOTE(review): `index` here is a local copy; the asyncio branch
            # below returns the caller's original index unchanged — confirm
            # whether that is intended (the celery branch does advance it).
            tasks_params_list = []
            for item in slice_params_list:
                db_goods_info_obj = TMDbGoodsInfoObj(item=item, logger=self.lg)
                tmp_item = self._get_tmp_item(
                    site_id=db_goods_info_obj.site_id,
                    goods_id=db_goods_info_obj.goods_id,
                )
                tasks_params_list.append({
                    'db_goods_info_obj': db_goods_info_obj,
                    'index': index,
                    'tmp_item': tmp_item,
                })
                index += 1

            return tasks_params_list

        def get_create_task_msg(k) -> str:
            return 'create task[where is goods_id: {}, index: {}] ...'.format(
                k['db_goods_info_obj'].goods_id,
                k['index'],
            )

        def get_now_args(k) -> list:
            return [
                'tm',
                k['tmp_item'],
                k['index'],
                self.lg,
            ]

        async def handle_one_res(one_res: list):
            """
            Post-process one_res: match each crawl result back to its source
            row, then persist sequentially (blocking) to avoid DB deadlocks
            under high concurrency.

            :param one_res: raw crawl results
            :return: list of [goods_id, bool] persistence results
            """
            nonlocal slice_params_list

            # pair each DB row with its crawl result by goods_id
            new_slice_params_list = []
            for item in slice_params_list:
                goods_id = item[1]
                for i in one_res:
                    # self.lg.info(str(i))
                    try:
                        goods_id2 = i[1]
                        index = i[2]
                        if goods_id == goods_id2:
                            new_slice_params_list.append({
                                'index': index,
                                'before_goods_data': i[3],
                                'end_goods_data': i[4],
                                'item': item,
                            })
                            break
                        else:
                            continue
                    except IndexError:
                        continue

            # store sequentially to avoid mass deadlocks in the DB
            tasks = []
            for k in new_slice_params_list:
                item = k['item']
                index = k['index']
                db_goods_info_obj = TMDbGoodsInfoObj(item=item, logger=self.lg)
                self.lg.info(
                    'create task[where is goods_id: {}, index: {}]...'.format(
                        db_goods_info_obj.goods_id, index))
                tasks.append(
                    self.loop.create_task(
                        self._update_one_goods_info_in_db(
                            db_goods_info_obj=db_goods_info_obj,
                            index=index,
                            before_goods_data=k['before_goods_data'],
                            end_goods_data=k['end_goods_data'],
                        )))
            one_res = await _get_async_task_result(tasks=tasks, logger=self.lg)
            # pprint(one_res)
            try:
                del new_slice_params_list
            except:
                pass

            return one_res

        tasks = []
        if self.crawl_type == CRAWL_TYPE_ASYNCIO:
            """asyncio"""
            # # method 1
            # for item in slice_params_list:
            #     index += 1
            #     db_goods_info_obj = TMDbGoodsInfoObj(item=item, logger=self.lg)
            #     self.lg.info('创建 task goods_id: {}'.format(db_goods_info_obj.goods_id))
            #     tasks.append(self.loop.create_task(self._update_one_goods_info(
            #         db_goods_info_obj=db_goods_info_obj,
            #         index=index,)))
            # res = await _get_async_task_result(tasks=tasks, logger=self.lg)

            # method 2
            one_res = await get_or_handle_target_data_by_task_params_list(
                loop=self.loop,
                tasks_params_list=get_tasks_params_list(
                    slice_params_list=slice_params_list,
                    index=index,
                ),
                func_name_where_get_create_task_msg=get_create_task_msg,
                func_name=block_get_one_goods_info_task_by_external_type,
                func_name_where_get_now_args=get_now_args,
                func_name_where_handle_one_res=None,
                func_name_where_add_one_res_2_all_res=
                default_add_one_res_2_all_res2,
                one_default_res=(),
                step=self.concurrency,
                logger=self.lg,
                get_all_res=True,
                concurrent_type=self.concurrent_type,
            )
            # pprint(one_res)
            res = await handle_one_res(one_res=one_res)

        elif self.crawl_type == CRAWL_TYPE_CELERY:
            """celery"""
            for item in slice_params_list:
                index += 1
                db_goods_info_obj = TMDbGoodsInfoObj(item=item, logger=self.lg)
                self.lg.info('创建 task goods_id: {}'.format(
                    db_goods_info_obj.goods_id))
                tmp_item = self._get_tmp_item(
                    site_id=db_goods_info_obj.site_id,
                    goods_id=db_goods_info_obj.goods_id,
                )
                try:
                    async_obj = await self._create_celery_obj(
                        goods_id=tmp_item,
                        index=index,
                    )
                    tasks.append(async_obj)
                except:
                    continue
            one_res = await _get_celery_async_results(tasks=tasks)
            res = await handle_one_res(one_res=one_res)

        else:
            raise NotImplemented

        return (res, index)

    async def _create_celery_obj(self, **kwargs):
        """
        Create a celery async task for one goods.

        :param kwargs: goods_id (the tmp_item list), index
        :return: the celery AsyncResult-like object
        """
        goods_id = kwargs.get('goods_id', [])
        index = kwargs['index']
        async_obj = _get_tm_one_goods_info_task.apply_async(
            args=[
                goods_id,
                index,
            ],
            expires=5 * 60,     # drop the task if not run within 5 minutes
            retry=False,
        )

        return async_obj

    async def _update_one_goods_info_in_db(self,
                                           db_goods_info_obj,
                                           index,
                                           before_goods_data,
                                           end_goods_data):
        """
        Write one crawled goods back to the DB.

        :param db_goods_info_obj: DB-side info wrapper of the goods
        :param index: running row counter (drives periodic reconnects)
        :param before_goods_data: data before the crawl (used for is_delete)
        :param end_goods_data: freshly crawled data ({} means crawl failed)
        :return: [goods_id, bool]
        """
        res = False
        self.sql_cli = await _get_new_db_conn(db_obj=self.sql_cli,
                                              index=index,
                                              logger=self.lg,
                                              remainder=25)
        if self.sql_cli.is_connect_success:
            self.lg.info('*' * 20 +
                         ' updating goods_id: {}, index: {} ...'.format(
                             db_goods_info_obj.goods_id,
                             index,
                         ))
            # read is_delete up-front to avoid the sleep branch on parse errors
            before_goods_data_is_delete = before_goods_data.get('is_delete', 0)
            if end_goods_data != {}:
                data = get_goods_info_change_data(
                    target_short_name='tm',
                    logger=self.lg,
                    data=end_goods_data,
                    db_goods_info_obj=db_goods_info_obj,
                )
                res = to_right_and_update_tm_data(data=data,
                                                  pipeline=self.sql_cli,
                                                  logger=self.lg)

            else:
                # crawl returned no data
                if before_goods_data_is_delete == 1:
                    # already marked off-shelf -> count as success
                    res = True
                else:
                    self.lg.info('goods_id: {}, 阻塞休眠7s中...'.format(
                        db_goods_info_obj.goods_id,
                    ))
                    await async_sleep(delay=7., loop=self.loop)
                    # 改为阻塞进程, 机器会挂
                    # sleep(7.)

        else:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
            await async_sleep(delay=5, loop=self.loop)

        await async_sleep(TMALL_REAL_TIMES_SLEEP_TIME)
        collect()

        return [db_goods_info_obj.goods_id, res]

    async def _update_one_goods_info(self, db_goods_info_obj, index):
        """
        Crawl and update one goods in a single call (blocking crawl).

        :param db_goods_info_obj: DB-side info wrapper of the goods
        :param index: running row counter (drives periodic reconnects)
        :return: [goods_id, bool]
        """
        res = False
        tmall = TmallParse(logger=self.lg, is_real_times_update_call=True)
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
            remainder=50,
        )
        if self.sql_cli.is_connect_success:
            self.lg.info(
                '------>>>| 正在更新的goods_id为({}) | --------->>>@ 索引值为({})'.
                format(db_goods_info_obj.goods_id, index))
            tmp_item = self._get_tmp_item(site_id=db_goods_info_obj.site_id,
                                          goods_id=db_goods_info_obj.goods_id)
            # self.lg.info(str(tmp_item))

            # ** 阻塞方式运行
            oo = tmall.get_goods_data(goods_id=tmp_item)
            # ** 非阻塞方式运行
            # oo = await unblock_func(
            #     func_name=tmall.get_goods_data,
            #     func_args=[
            #         tmp_item,
            #     ],
            #     default_res={},
            #     logger=self.lg,)

            # read is_delete up-front to avoid the sleep branch on parse errors
            before_goods_data_is_delete = oo.get('is_delete', 0)
            # blocking parse
            data = tmall.deal_with_data()
            if data != {}:
                data = get_goods_info_change_data(
                    target_short_name='tm',
                    logger=self.lg,
                    data=data,
                    db_goods_info_obj=db_goods_info_obj,
                )
                res = to_right_and_update_tm_data(data=data,
                                                  pipeline=self.sql_cli,
                                                  logger=self.lg)

            else:
                if before_goods_data_is_delete == 1:
                    # already marked off-shelf -> count as success
                    res = True
                else:
                    self.lg.info('------>>>| 阻塞休眠7s中...')
                    await async_sleep(delay=7., loop=self.loop)
                    # 改为阻塞进程, 机器会挂
                    # sleep(7.)

        else:
            # DB connection down
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
            await async_sleep(delay=5, loop=self.loop)

        try:
            del tmall
        except:
            pass
        collect()
        await async_sleep(TMALL_REAL_TIMES_SLEEP_TIME)

        return [
            db_goods_info_obj.goods_id,
            res,
        ]

    async def _except_sleep(self, res):
        """
        Sleep according to the failure rate of the last batch.

        :param res: list of [goods_id, bool] results
        :return: None
        """
        count = 0
        # was 40.; long sleep applied only when nearly the whole batch failed
        all_count_fail_sleep_time = 100.

        # partial-failure sleep is currently disabled (0s)
        sleep_time = 0.
        # pprint(res)
        for item in res:
            try:
                if not item[1]:
                    count += 1
            except IndexError:
                pass
        self.lg.info('Fail count: {}个, 并发量: {}个'.format(
            count, self.concurrency))

        if count / self.concurrency >= .96:
            # almost everything failed -> long sleep
            self.lg.info('抓取异常!! 休眠{}s中...'.format(all_count_fail_sleep_time))
            await async_sleep(all_count_fail_sleep_time)
        else:
            if count >= int(self.concurrency / 5):
                self.lg.info('抓取异常!! 休眠{}s中...'.format(sleep_time))
                await async_sleep(sleep_time)

        return None

    def __del__(self):
        # best-effort cleanup; attributes may already be gone at interpreter exit
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
def run_forever():
    """
    Real-time updater for xiaomi youpin goods: loops forever, re-crawling
    every pending DB row, updating prices/sku info and flagging off-shelf
    goods.
    """
    while True:
        # ** 不能写成全局变量并放在循环中, 否则会一直记录到同一文件中
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/小米有品/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)

        #### 实时更新数据
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(
                tmp_sql_server._select_table(sql_str=yp_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info(
                '--------------------------------------------------------')
            my_lg.info('总计待更新个数: {0}'.format(len(result)))

            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            yp = YouPinParse(logger=my_lg)
            for item in result:
                # item: (?, goods_id, is_delete, price, taobao_price,
                #        shelf_time, delete_time, price_info_list(json),
                #        _is_price_change) — inferred from the index usage
                # below; TODO confirm against yp_select_str_1
                if index % 5 == 0:
                    # re-create the parser every 5 rows to free its resources
                    try:
                        del yp
                    except:
                        pass
                    yp = YouPinParse(logger=my_lg)
                    gc.collect()

                if index % 10 == 0:
                    # reconnect every 10 rows to avoid a stale long connection
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(item[1]), str(index)))
                    yp._get_target_data(goods_id=item[1])

                    data = yp._handle_target_data()
                    if data != {}:
                        data['goods_id'] = item[1]
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[2],
                                shelf_time=item[5],
                                delete_time=item[6])

                        if data.get('is_delete') == 1:
                            # off-shelf goods get a dedicated update path
                            my_lg.info('@@@ 该商品已下架...')
                            tmp_sql_server._update_table_2(
                                sql_str=yp_update_str_2,
                                params=(item[1], ),
                                logger=my_lg)
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue

                        else:
                            data['_is_price_change'], data[
                                '_price_change_info'] = _get_price_change_info(
                                    old_price=item[3],
                                    old_taobao_price=item[4],
                                    new_price=data['price'],
                                    new_taobao_price=data['taobao_price'])

                            try:
                                old_sku_info = format_price_info_list(
                                    price_info_list=json_2_dict(item[7]),
                                    site_id=31)
                            except AttributeError:
                                # already formatted earlier -> use as-is
                                old_sku_info = item[7]
                            # NOTE(review): this overwrites the
                            # _is_price_change computed just above — confirm
                            # that is intended
                            data['_is_price_change'], data[
                                'sku_info_trans_time'] = get_sku_info_trans_record(
                                    old_sku_info=old_sku_info,
                                    new_sku_info=format_price_info_list(
                                        data['price_info_list'], site_id=31),
                                    is_price_change=item[8]
                                    if item[8] is not None else 0)

                            yp._to_right_and_update_data(
                                data, pipeline=tmp_sql_server)

                    else:
                        # empty crawl result -> back off
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)

                else:
                    # DB connection down
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                gc.collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:
            # don't update after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        gc.collect()
def get_goods_data(self, goods_id: str) -> dict:
    """Fetch and assemble the data dict for a 蘑菇街 rush-detail (限时秒杀) page.

    Overrides the base data-fetching method.

    :param goods_id: a full ``https://shop.mogujie.com/rushdetail/...`` url;
        the numeric goods id is extracted from it.
    :return: the assembled goods-data dict, or ``{}`` on any failure
        (``self.result_data`` is reset to ``{}`` in every failure path so a
        failed fetch cannot leak into the next crawl).
    """
    if goods_id == '':
        self.result_data = {}  # reset so a failed fetch does not affect the next one
        return {}

    if re.compile(r'/rushdetail/').findall(goods_id) != []:
        tmp_url = goods_id
        print('------>>>| 原pc地址为: ', tmp_url)
        # pull the numeric id out of the rushdetail url
        goods_id = re.compile(
            r'https://shop.mogujie.com/rushdetail/(.*?)\?.*?').findall(
                goods_id)[0]
        print('------>>>| 得到的蘑菇街商品id为:', goods_id)
    else:
        print('获取到的蘑菇街买哦啥地址错误!请检查')
        self.result_data = {}
        return {}

    data = {}
    body = MyRequests.get_url_body(url=tmp_url,
                                   headers=self.headers,
                                   had_referer=True)
    if body == '':
        print('获取到的body为空str!')
        self.result_data = {}
        return {}

    try:
        # the page embeds all goods info in `var detailInfo = {...};`
        goods_info = re.compile(
            r'var detailInfo = (.*?);</script>').findall(body)[0]
        item_info = re.compile(
            r'itemInfo:(.*?) ,priceRuleImg').findall(goods_info)[0]
        sku_info = re.compile(r'skuInfo:(.*?),pinTuanInfo').findall(
            goods_info)[0]
        shop_info = re.compile(r'shopInfo:(.*?),skuInfo').findall(
            goods_info)[0]

        item_info = json.loads(item_info)
        sku_info = json.loads(sku_info)
        shop_info = json.loads(shop_info)

        data['title'] = item_info.get('title', '')
        if data['title'] == '':
            print('title为空!')
            raise Exception

        data['sub_title'] = ''
        data['shop_name'] = shop_info.get('name', '')

        # sample images shown on the detail page
        all_img_url = [{
            'img_url': item
        } for item in item_info.get('topImages', [])]
        data['all_img_url'] = all_img_url

        # p_info (goods property list) comes from a separate ajax api
        p_info_api_url = 'https://shop.mogujie.com/ajax/mgj.pc.detailinfo/v1?_ajax=1&itemId=' + str(
            goods_id)
        tmp_p_info_body = MyRequests.get_url_body(url=p_info_api_url,
                                                  headers=self.headers,
                                                  had_referer=True)
        if tmp_p_info_body == '':
            print('获取到的tmp_p_info_body为空值, 请检查!')
            raise Exception

        # an empty p_info is accepted: some goods legitimately have none
        data['p_info'] = self.get_goods_p_info(
            tmp_p_info_body=tmp_p_info_body)

        # per-goods html description
        div_desc = self.get_goods_div_desc(tmp_p_info_body=tmp_p_info_body)
        if div_desc == '':
            print('获取到的div_desc为空str, 请检查!')
            self.result_data = {}
            return {}
        data['div_desc'] = div_desc

        detail_name_list = self.get_goods_detail_name_list(sku_info=sku_info)
        if detail_name_list == '':
            print('获取detail_name_list出错, 请检查!')
            self.result_data = {}
            return {}
        data['detail_name_list'] = detail_name_list

        # per-sku price and stock
        price_info_list = self.get_price_info_list(sku_info=sku_info)
        if price_info_list == '':
            raise Exception
        data['price_info_list'] = price_info_list
        if price_info_list == []:
            # sold out: flag the row as logically deleted
            print('该商品已售完,此处将商品状态改为1')
            my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            try:
                sql_str = r'update dbo.mogujie_xianshimiaosha set is_delete=1 where goods_id = %s'
                # BUGFIX: params must be a tuple — `(goods_id)` is just a str
                my_pipeline._update_table(sql_str=sql_str,
                                          params=(goods_id, ))
            except:
                print('将该商品逻辑删除时出错!')
                pass
            print('| +++ 该商品状态已被逻辑is_delete = 1 +++ |')
            self.result_data = {}
            return {}

        # highest sku price -> price, lowest -> taobao_price
        try:
            tmp_price_list = sorted([
                round(float(item.get('detail_price', '')), 2)
                for item in data['price_info_list']
            ])
            price = Decimal(tmp_price_list[-1]).__round__(2)
            taobao_price = Decimal(tmp_price_list[0]).__round__(2)
        except IndexError:
            print('获取price和taobao_price时出错! 请检查')
            raise Exception
        data['price'] = price
        data['taobao_price'] = taobao_price

    except Exception as e:
        print('遇到错误: ', e)
        self.result_data = {}
        return {}

    if data != {}:
        self.result_data = data
        return data
    else:
        print('data为空!')
        self.result_data = {}
        return {}
class ALUpdater(AsyncCrawler):
    """Real-time updater for regular 1688 goods rows (1688常规商品数据更新)."""

    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/1688/实时更新/',
        )
        # db pipeline, (re)created lazily in _get_db_old_data
        self.sql_cli = None
        # running counter of processed goods, shared across tasks
        self.goods_index = 1
        # concurrency level (number of tasks per slice)
        self.concurrency = 10

    async def _get_db_old_data(self) -> (list, None):
        """Select the rows pending update; return a list or None on db failure."""
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            result = list(self.sql_cli._select_table(sql_str=al_select_str_6))
        except TypeError:
            # _select_table returns None on a dead connection
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        await _print_db_old_data(logger=self.lg, result=result)
        return result

    async def _get_new_ali_obj(self, index) -> None:
        """Recreate the 1688 parser every 10 items — the driver cannot be
        shared indefinitely without misbehaving."""
        if index % 10 == 0:
            try:
                del self.ali_1688
            except:
                pass
            collect()
            self.ali_1688 = ALi1688LoginAndParse(logger=self.lg)

    async def _update_one_goods_info(self, db_goods_info_obj, index) -> list:
        """Update a single goods row.

        :param db_goods_info_obj: db-row wrapper for the goods
        :param index: slice-global index (drives parser/db recycling)
        :return: [goods_id, bool success]
        """
        res = False
        await self._get_new_ali_obj(index=index)
        self.sql_cli = await _get_new_db_conn(db_obj=self.sql_cli,
                                              index=index,
                                              logger=self.lg)
        if self.sql_cli.is_connect_success:
            self.lg.info(
                '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                format(db_goods_info_obj.goods_id, index))
            data = self.ali_1688.get_ali_1688_data(
                goods_id=db_goods_info_obj.goods_id)
            if isinstance(data, int):
                # crawler signalled a status code (eg. tt == 4041) — skip
                self.goods_index += 1
                return [db_goods_info_obj.goods_id, res]

            if data.get('is_delete') == 1:
                # goods was already off-shelf when first inserted
                data['goods_id'] = db_goods_info_obj.goods_id
                data['shelf_time'], data[
                    'delete_time'] = get_shelf_time_and_delete_time(
                        tmp_data=data,
                        is_delete=db_goods_info_obj.is_delete,
                        shelf_time=db_goods_info_obj.shelf_time,
                        delete_time=db_goods_info_obj.delete_time,
                    )
                try:
                    self.ali_1688.to_right_and_update_data(
                        data, pipeline=self.sql_cli)
                except Exception:
                    # BUGFIX: Logger.error requires a msg argument —
                    # `self.lg.error(exc_info=True)` raised TypeError here
                    self.lg.error('遇到错误:', exc_info=True)
                await async_sleep(1.5)
                self.goods_index += 1
                res = True
                return [db_goods_info_obj.goods_id, res]

            data = self.ali_1688.deal_with_data()
            if data != {}:
                data = get_goods_info_change_data(
                    target_short_name='al',
                    logger=self.lg,
                    data=data,
                    db_goods_info_obj=db_goods_info_obj,
                )
                res = self.ali_1688.to_right_and_update_data(
                    data, pipeline=self.sql_cli)
                await async_sleep(.3)
            else:
                # crawler returned empty data — nothing to write
                pass
        else:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')

        index += 1
        self.goods_index = index
        collect()
        # slow down to avoid the proxy being detected
        await async_sleep(2.)
        return [db_goods_info_obj.goods_id, res]

    async def _update_db(self):
        """Main loop: select pending rows, fan them out in slices of
        `self.concurrency` concurrent tasks, then sleep until the next pass."""
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result, step=self.concurrency)
                self.ali_1688 = ALi1688LoginAndParse(logger=self.lg)
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                    except AssertionError:
                        # all slices consumed — normal exit
                        break
                    tasks = []
                    for item in slice_params_list:
                        goods_id = item[1]
                        db_goods_info_obj = ALDbGoodsInfoObj(item=item,
                                                             logger=self.lg)
                        self.lg.info('创建 task goods_id: {}'.format(goods_id))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(
                                    db_goods_info_obj=db_goods_info_obj,
                                    index=index)))
                        index += 1
                    if tasks != []:
                        await _get_async_task_result(tasks=tasks,
                                                     logger=self.lg)
                    else:
                        pass
                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                # no updates after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(5.5)
            try:
                del self.ali_1688
            except:
                pass
            collect()

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
print('Changing ' + cmd_str) system(cmd_str) except Exception as e: print('遇到错误:', e) # 消费券队列 coupon_queue = Queue() # 存储goods_id, 领券点击地址的dict子元素的list goods_id_and_coupon_url_list = [] # 待处理的goods_id, 对应coupon_url的子元素的队列 goods_id_and_coupon_url_queue = Queue() # 已存储的唯一优惠券的list unique_coupon_id_list = [] print('正在获取db_unique_coupon_id_list ...') # 从db中获取已存在的id sql_cli = SqlServerMyPageInfoSaveItemPipeline() try: db_res = list( sql_cli._select_table( sql_str='select unique_id from dbo.coupon_info')) unique_coupon_id_list = [item[0] for item in db_res] print('unique_coupon_id_list_len: {}'.format( len(unique_coupon_id_list))) except Exception as e: print(e) finally: try: del sql_cli except: pass print('获取完毕!')
def run_forever(self):
    """Real-time updater for 蘑菇街 限时秒杀 (flash-sale) rows.

    One pass: purge stale rows, select the pending flash-sale goods, and
    for each one either logically delete it (sale over / goods removed
    from the event) or re-crawl and rewrite its row.
    """
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        tmp_sql_server._delete_table(sql_str=mg_delete_str_4)
        result = list(
            tmp_sql_server._select_table(sql_str=mg_select_str_3))
    except TypeError:
        # _select_table returns None on a dead connection
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        for item in result:
            # item[1] is a json blob holding the flash-sale time window
            miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
            miaosha_end_time = int(
                str(
                    time.mktime(
                        time.strptime(miaosha_end_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            # declared per-iteration then released, to keep memory low
            data = {}
            mogujie_miaosha = MoGuJieMiaoShaParse()

            if index % 50 == 0:
                # reconnect every 50 items so a single long-lived db
                # connection cannot go stale and hang
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                # is_recent_time: 0 = expired, 2 = future, 1 = updatable
                if self.is_recent_time(miaosha_end_time) == 0:
                    tmp_sql_server._delete_table(
                        sql_str=self.delete_sql_str, params=(item[0], ))
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 限时秒杀开始时间为(%s), 删除成功!'
                        % json.loads(item[1]).get('miaosha_begin_time'))
                elif self.is_recent_time(miaosha_end_time) == 2:
                    # future sale: pass (NOT break) — ids are not ordered
                    pass
                else:
                    # inside the update window
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))
                    data['goods_id'] = item[0]
                    item_list = self.get_item_list(event_time=str(item[2]))
                    if item_list == '':
                        # likely a network failure — skip for now
                        pass
                    elif item_list == []:
                        # goods removed from the flash-sale event
                        print('该商品已被下架限时秒杀活动,此处将其逻辑删除')
                        tmp_sql_server._update_table(
                            sql_str=mg_update_str_1, params=(item[0], ))
                        print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                        pass
                    else:
                        # all goods ids currently in this event
                        miaosha_goods_all_goods_id = [
                            item_1.get('iid', '') for item_1 in item_list
                        ]
                        if item[0] not in miaosha_goods_all_goods_id:
                            # delisted inside the event
                            print('该商品已被下架限时秒杀活动,此处将其逻辑删除')
                            tmp_sql_server._update_table(
                                sql_str=mg_update_str_1,
                                params=(item[0], ))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                            pass
                        else:
                            # still listed — re-crawl and rewrite
                            for item_2 in item_list:
                                if item_2.get('iid', '') == item[0]:
                                    spider_url = item[3]
                                    mogujie_miaosha.get_goods_data(
                                        goods_id=spider_url)
                                    goods_data = mogujie_miaosha.deal_with_data(
                                    )
                                    if goods_data == {}:
                                        # empty crawl result — skip
                                        pass
                                    else:
                                        goods_data['goods_id'] = str(
                                            item[0])
                                        # price is set to the original
                                        # (highest normal) price
                                        try:
                                            tmp_price_list = sorted([
                                                round(
                                                    float(
                                                        item_4.get(
                                                            'normal_price',
                                                            '')), 2)
                                                for item_4 in goods_data[
                                                    'price_info_list']
                                            ])
                                            price = Decimal(
                                                tmp_price_list[-1]
                                            ).__round__(2)
                                            goods_data['price'] = price
                                        except:
                                            print('设置price为原价时出错!请检查')
                                            continue
                                        goods_data['miaosha_time'] = {
                                            'miaosha_begin_time':
                                            timestamp_to_regulartime(
                                                int(
                                                    item_2.get(
                                                        'startTime', 0))),
                                            'miaosha_end_time':
                                            timestamp_to_regulartime(
                                                int(
                                                    item_2.get(
                                                        'endTime', 0))),
                                        }
                                        goods_data[
                                            'miaosha_begin_time'], goods_data[
                                                'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                    miaosha_time=goods_data[
                                                        'miaosha_time'])
                                        print(goods_data['title'])
                                        mogujie_miaosha.update_mogujie_xianshimiaosha_table(
                                            data=goods_data,
                                            pipeline=tmp_sql_server)
                                        # slow down between writes
                                        sleep(MOGUJIE_SLEEP_TIME)
                                else:
                                    pass
            else:
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))

    if get_shanghai_time().hour == 0:
        # no updates after midnight
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
async def get_coupon_url_list_by_goods_id_list(self, slice_params_list) -> list: """ 根据给与的goods_id_list来获取对应的coupon_url_list :return: """ def get_create_task_msg(k) -> str: return 'create task[where goods_id: {}, site_id: {}] ...'.format( k['goods_id'], k['site_id'], ) def get_now_args(k) -> list: return [ k['goods_id'], ] all_res = await get_or_handle_target_data_by_task_params_list( loop=self.loop, tasks_params_list=slice_params_list, func_name_where_get_create_task_msg=get_create_task_msg, func_name=self.get_tm_coupon_url_from_lq5u, func_name_where_get_now_args=get_now_args, func_name_where_handle_one_res=None, func_name_where_add_one_res_2_all_res= default_add_one_res_2_all_res2, one_default_res='', step=self.concurrency, logger=self.lg, concurrent_type=self.concurrent_type, func_timeout=25, ) res = [] for item in all_res: if item != '': res.append(item) # 修改对应的goods_id的coupon_check_time sql_str = 'update dbo.GoodsInfoAutoGet set coupon_check_time=%s where GoodsID=%s' sql_cli = SqlServerMyPageInfoSaveItemPipeline() for item in slice_params_list: goods_id = item['goods_id'] coupon_check_time_change_res = False try: coupon_check_time_change_res = sql_cli._update_table_2( sql_str=sql_str, params=( get_shanghai_time(), goods_id, ), logger=self.lg, ) except Exception: self.lg.error('遇到错误:', exc_info=True) self.lg.info('[{}] update goods_id: {} coupon_check_time'.format( '+' if coupon_check_time_change_res else '-', goods_id, )) try: del sql_cli except: pass try: del all_res except: pass collect() return res
async def run_forever(self):
    """Real-time updater for 聚美优品 拼团 (group-buy) goods rows.

    One pass: select all pending group-buy goods, and for each one either
    delete the expired row, skip future ones, or re-crawl the event page
    (caching each tab/index page in ``self.api_all_goods_id``) and update
    the row.
    """
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        result = await tmp_sql_server.select_jumeiyoupin_pintuan_all_goods_id(
            logger=self.my_lg)
    except TypeError:
        self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is None:
        pass
    else:
        self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        self.my_lg.info(result)
        self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        for item in result:
            # item[1] is a json blob holding the group-buy time window
            pintuan_end_time = json.loads(item[1]).get('end_time')
            pintuan_end_time = int(
                str(
                    time.mktime(
                        time.strptime(pintuan_end_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            data = {}
            if index % 50 == 0:
                # reconnect every 50 items so a single long-lived db
                # connection cannot go stale and hang
                self.my_lg.info('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                self.my_lg.info('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                # is_recent_time: 0 = expired, 2 = future, 1 = updatable
                time_number = await self.is_recent_time(pintuan_end_time)
                if time_number == 0:
                    await tmp_sql_server.delete_jumeiyoupin_pintuan_expired_goods_id(
                        goods_id=item[0], logger=self.my_lg)
                    # NOTE(review): the message template says 结束时间
                    # (end time) but formats begin_time — confirm intended.
                    self.msg = '过期的goods_id为(%s)' % item[
                        0] + ', 拼团结束时间为(%s), 删除成功!' % str(
                            json.loads(item[1]).get('begin_time'))
                    self.my_lg.info(self.msg)
                elif time_number == 2:
                    # future sale: pass (NOT break) — ids are not ordered
                    pass
                else:
                    # inside the update window
                    self.msg = '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                        item[0], str(index))
                    self.my_lg.info(self.msg)
                    data['goods_id'] = item[0]
                    jumeiyoupin_2 = JuMeiYouPinPinTuan(logger=self.my_lg)
                    # cache key, format: 'coutuan_baby-1' (tab-index)
                    _ = item[2] + '-' + str(item[3])
                    # reuse the page listing when this tab/index was
                    # already fetched during this pass
                    item_list = self.api_all_goods_id.get(_, [])
                    if item_list == []:
                        my_phantomjs = MyPhantomjs()
                        item_list = await jumeiyoupin_2.get_one_page_goods_list(
                            my_phantomjs=my_phantomjs,
                            tab=item[2],
                            index=item[3])
                        try:
                            del my_phantomjs
                        except:
                            pass
                    if item_list == []:
                        # network failure — skip for now
                        self.my_lg.info('获取到的body为空str, 网络原因, 此处先跳过!')
                        pass
                    else:
                        if self.api_all_goods_id.get(_) is None:
                            self.api_all_goods_id[_] = item_list
                        pintuan_goods_all_goods_id = [
                            item_1.get('goods_id', '')
                            for item_1 in item_list
                        ]
                        jumeiyoupin_pintuan = JuMeiYouPinPinTuanParse(
                            logger=self.my_lg)
                        # goods delisted inside the event (rare: the site
                        # was not observed to delist event goods early)
                        if item[0] not in pintuan_goods_all_goods_id:
                            await self.update_data_2(
                                jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                                jumei_pintuan_url=item[4],
                                goods_id=item[0],
                                pipeline=tmp_sql_server)
                        else:
                            # still listed — normal update path
                            await self.update_data_1(
                                jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                                jumeiyoupin_2=jumeiyoupin_2,
                                jumei_pintuan_url=item[4],
                                goods_id=item[0],
                                item_list=item_list,
                                pipeline=tmp_sql_server)
            else:
                self.my_lg.error('数据库连接失败,此处跳过!')
                pass
            index += 1
            gc.collect()
        self.my_lg.info('全部数据更新完毕'.center(100, '#'))

    if get_shanghai_time().hour == 0:
        # no updates after midnight
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
    return None
class GoodsCouponSpider(AsyncCrawler):
    """Spider that finds Tmall/Taobao coupons for stored goods.

    Flow: select goods with no stored coupon (get_db_res) -> look up each
    goods id on m.quanyoubuy.com for a coupon url
    (get_tm_coupon_url_from_lq5u) -> open each coupon url in headless
    chromium to trigger/intercept the coupon api (intercept_target_api).
    """

    def __init__(self):
        AsyncCrawler.__init__(
            self,
            user_agent_type=PHONE,
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            logger=None,
            log_save_path=MY_SPIDER_LOGS_PATH + '/coupon/_/',
            headless=True,
        )
        # lookup concurrency — keep modest or the site notices
        self.concurrency = 10
        # browser concurrency — too many headless instances freezes the box
        self.concurrency2 = 3
        self.req_num_retries = 7
        self.proxy_type = PROXY_TYPE_HTTPS
        self.driver_load_images = DRIVER_LOAD_IMAGES
        # concurrent_type 0: thread mode long-运行 hit "too many open files"
        self.concurrent_type = 0
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        self.init_sql_str()

    async def _fck_run(self):
        """Main loop: fetch pending goods, resolve coupon urls in slices,
        then visit each coupon url in a small browser pool."""
        while True:
            try:
                if get_shanghai_time().hour == 0:
                    # pause after midnight
                    await async_sleep(60 * 60 * 3.5)
                    continue
                self.db_res = await self.get_db_res()
                all_tasks_params_list_obj = await self.get_all_tasks_params_list_obj(
                )
                tasks_params_list_obj = TasksParamsListObj(
                    tasks_params_list=all_tasks_params_list_obj,
                    step=self.concurrency,
                    slice_start_index=0,
                )
                while True:
                    try:
                        slice_params_list = tasks_params_list_obj.__next__()
                    except AssertionError:
                        # all slices consumed
                        break
                    coupon_url_list = await self.get_coupon_url_list_by_goods_id_list(
                        slice_params_list=slice_params_list)
                    # (a fixed uland.taobao.com coupon_url + matching
                    # goods_id_and_coupon_url_queue entry can be injected
                    # here for testing)
                    if coupon_url_list == []:
                        collect()
                        self.lg.info('coupon_url_list为空list, 跳过!')
                        random_sleep_time = random_uniform(3., 6.)
                        self.lg.info('休眠{}s ...'.format(random_sleep_time))
                        await async_sleep(random_sleep_time)
                        continue

                    # sub-slice coupon_url_list so the browser pool stays
                    # small and memory does not blow up
                    tasks_params_list_obj2 = TasksParamsListObj(
                        tasks_params_list=coupon_url_list,
                        step=self.concurrency2,
                        slice_start_index=0,
                    )
                    while True:
                        try:
                            slice_params_list2 = tasks_params_list_obj2.__next__(
                            )
                        except AssertionError:
                            break
                        tasks = []
                        for coupon_url in slice_params_list2:
                            self.lg.info(
                                'create task[where coupon_url: {}] ...'.format(
                                    coupon_url))
                            tasks.append(
                                self.loop.create_task(
                                    self.intercept_target_api(
                                        coupon_url=coupon_url)))
                        try:
                            one_res = await wait_for(
                                fut=async_wait_tasks_finished(tasks=tasks),
                                timeout=60 * 2,
                            )
                        except AsyncTimeoutError:
                            self.lg.error('遇到错误:', exc_info=True)
                            continue
                        # count successes for the log
                        success_count = 0
                        for item in one_res:
                            if item:
                                success_count += 1
                        self.lg.info('成功个数: {}, 成功概率: {:.3f}'.format(
                            success_count,
                            success_count / self.concurrency2))
                        collect()
                collect()
                self.lg.info('一次大循环结束!!')
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                await async_sleep(30)
            finally:
                self.lg.info('休眠6s...')
                await async_sleep(6.)
                collect()

    async def get_all_tasks_params_list_obj(self) -> list:
        """Build the task param dicts from self.db_res, skipping goods whose
        coupon unique id is already stored (single-item coupons only, so the
        unique id is derived from the goods id alone)."""
        global unique_coupon_id_list
        all_tasks_params_list_obj = []
        for item in self.db_res:
            goods_id = item[0]
            coupon_unique_id = str(get_uuid3(target_str=goods_id))
            if coupon_unique_id in unique_coupon_id_list:
                self.lg.info(
                    'coupon_info 表中已存在coupon_unique_id: {}, goods_id: {}, pass'
                    .format(
                        coupon_unique_id,
                        goods_id,
                    ))
                continue
            all_tasks_params_list_obj.append({
                'goods_id': goods_id,
                'site_id': item[1],
            })
        return all_tasks_params_list_obj

    async def get_coupon_url_list_by_goods_id_list(self,
                                                   slice_params_list) -> list:
        """Resolve coupon urls for one slice of goods tasks, then stamp
        coupon_check_time on every goods row in the slice."""
        def get_create_task_msg(k) -> str:
            # message logged when a task is created
            return 'create task[where goods_id: {}, site_id: {}] ...'.format(
                k['goods_id'],
                k['site_id'],
            )

        def get_now_args(k) -> list:
            # positional args handed to get_tm_coupon_url_from_lq5u
            return [
                k['goods_id'],
            ]

        all_res = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=slice_params_list,
            func_name_where_get_create_task_msg=get_create_task_msg,
            func_name=self.get_tm_coupon_url_from_lq5u,
            func_name_where_get_now_args=get_now_args,
            func_name_where_handle_one_res=None,
            func_name_where_add_one_res_2_all_res=
            default_add_one_res_2_all_res2,
            one_default_res='',
            step=self.concurrency,
            logger=self.lg,
            concurrent_type=self.concurrent_type,
            func_timeout=25,
        )
        # keep only the non-empty coupon urls
        res = []
        for item in all_res:
            if item != '':
                res.append(item)

        # stamp coupon_check_time so these rows fall to the back of the
        # next top-800 selection
        sql_str = 'update dbo.GoodsInfoAutoGet set coupon_check_time=%s where GoodsID=%s'
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        for item in slice_params_list:
            goods_id = item['goods_id']
            coupon_check_time_change_res = False
            try:
                coupon_check_time_change_res = sql_cli._update_table_2(
                    sql_str=sql_str,
                    params=(
                        get_shanghai_time(),
                        goods_id,
                    ),
                    logger=self.lg,
                )
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
            self.lg.info('[{}] update goods_id: {} coupon_check_time'.format(
                '+' if coupon_check_time_change_res else '-',
                goods_id,
            ))
        try:
            del sql_cli
        except:
            pass
        try:
            del all_res
        except:
            pass
        collect()
        return res

    async def get_db_res(self) -> list:
        """Select the next batch of goods ids to check for coupons.

        Also purges coupons expired 3+ days ago first.  Asserts the batch
        is non-empty (an empty batch aborts this pass via the caller's
        except handler)."""
        get_current_func_info_by_traceback(self=self, logger=self.lg)
        db_res = []
        try:
            self.lg.info('清除过期优惠券ing ...')
            # purge coupons whose end_time is 3+ days past
            self.sql_cli._delete_table(
                sql_str=
                'delete from dbo.coupon_info where GETDATE()-end_time >= 3',
                params=None,
            )
            self.lg.info('休眠15s ...')
            await async_sleep(15)
            self.lg.info('获取新待检测的goods数据ing...')
            db_res = list(self.sql_cli._select_table(sql_str=self.sql_tr0, ))
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)
            # connection presumed broken — rebuild it for the next pass
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        assert db_res != []
        self.lg.info('db_res_len: {}'.format(len(db_res)))
        return db_res

    async def intercept_target_api(self, coupon_url: str):
        """Open coupon_url in headless chromium with request/response
        interception armed; return True when a non-empty page body was
        captured.

        :param coupon_url: uland.taobao.com coupon landing page
        """
        chromium_puppeteer = ChromiumPuppeteer(
            load_images=self.driver_load_images,
            executable_path=PYPPETEER_CHROMIUM_DRIVER_PATH,
            ip_pool_type=self.ip_pool_type,
            headless=self.headless,
            user_agent_type=self.user_agent_type,
        )
        driver = await chromium_puppeteer.create_chromium_puppeteer_browser()
        page = await driver.newPage()
        await bypass_chrome_spiders_detection(page=page)
        # request AND response hooks must both be installed for the
        # interception to work (see puppeteer Page event api)
        await page.setRequestInterception(True)
        network_interceptor = NetworkInterceptorTest()
        page.on(event='request', f=network_interceptor.intercept_request)
        page.on(event='response', f=network_interceptor.intercept_response)
        page.on(event='requestfailed', f=network_interceptor.request_failed)

        res = False
        try:
            await goto_plus(
                page=page,
                url=coupon_url,
                options={
                    'timeout': 1000 * 45,  # unit: ms
                    'waitUntil': [
                        # page loaded AND network idle
                        'domcontentloaded',
                        'networkidle0',
                    ]
                },
                num_retries=2,
            )
            # (screenshot / element-screenshot / iframe access helpers were
            # prototyped here and removed; see pyppeteer Page docs)
            body = Requests._wash_html(await page.content())
            res = True if body != '' else res
        except (WebsocketsConnectionClosed, InvalidStateError):
            # transient websocket breakage — treated as a silent failure
            pass
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)

        # best-effort teardown (double-try mirrors the original defensive
        # style: a second attempt after a first failure)
        try:
            await driver.close()
        except:
            try:
                await driver.close()
            except:
                pass
        try:
            del page
        except:
            try:
                del page
            except:
                pass
        try:
            del chromium_puppeteer
        except:
            try:
                del chromium_puppeteer
            except:
                pass
        collect()
        return res

    @catch_exceptions_with_class_logger(default_res='')
    def get_tm_coupon_url_from_lq5u(
            self,
            goods_id='',
            goods_name_or_m_url: str = '',
    ) -> str:
        """Look a goods id up on 全优惠网 (m.quanyoubuy.com) and return its
        coupon claim url ('' when no coupon exists).

        Matching urls also get pushed onto goods_id_and_coupon_url_queue.

        :param goods_id: preferred lookup key
        :param goods_name_or_m_url: goods name or mobile url (unused by the
            current implementation)
        :return: the coupon claim url, or ''
        """
        global goods_id_and_coupon_url_queue

        # NOTE: earlier iterations queried lq5u.com / i075.com /
        # quan.mmfad.com via POST search and ajax_get_auction_code, and a
        # pc-site qrcode-src parse; all were replaced by the direct
        # m.quanyoubuy.com item page below (no search step needed — the
        # goods_id maps straight to the item url).
        headers = get_random_headers(
            user_agent_type=1,
            connection_status_keep_alive=False,
            cache_control='',
        )
        headers.update({
            'authority': 'm.quanyoubuy.com',
        })
        url = 'https://m.quanyoubuy.com/item/index/iid/{}.html'.format(
            goods_id)
        body = Requests.get_url_body(
            url=url,
            headers=headers,
            ip_pool_type=self.ip_pool_type,
            proxy_type=self.proxy_type,
            num_retries=self.req_num_retries,
        )
        assert body != ''

        # m-site selector for the coupon claim link
        coupon_url_sel = {
            'method': 'css',
            'selector': 'div.goods_quan a.getGoodsLink ::attr("href")',
        }
        coupon_url = parse_field(
            parser=coupon_url_sel,
            target_obj=body,
            logger=self.lg,
            is_print_error=False,
        )
        # only uland.taobao.com links are genuine claim urls
        if 'uland.taobao.com' not in coupon_url:
            coupon_url = ''
        else:
            pass

        if coupon_url != '':
            self.lg.info('[+] 该goods_id: {} 含 有优惠券, coupon领取地址: {}'.format(
                goods_id,
                coupon_url,
            ))
            # hand the pair to the saver via the module-level queue
            goods_id_and_coupon_url_queue.put({
                'goods_id': goods_id,
                'coupon_url': coupon_url,
            })
        else:
            self.lg.info('[-] 该goods_id: {} 不含 有优惠券'.format(goods_id))

        try:
            del body
        except:
            pass
        collect()
        return coupon_url

    def init_sql_str(self):
        # top 800 goods (sites 1/3/4/6) with no stored coupon, oldest
        # coupon_check_time first
        self.sql_tr0 = '''
        select top 800 GoodsID, SiteID
        from dbo.GoodsInfoAutoGet
        where MainGoodsID is not null
        and IsDelete=0
        and (SiteID=1 or SiteID=3 or SiteID=4 or SiteID=6)
        and GoodsID not in (select goods_id from dbo.coupon_info)
        -- and MainGoodsID=143509
        -- and GoodsID='18773718545'
        order by coupon_check_time asc
        '''

    def __del__(self):
        try:
            del self.concurrency
            del self.loop
        except:
            pass
        collect()
def run_forever(): while True: #### 实时更新数据 tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline() try: result = list(tmp_sql_server.select_zhe_800_pintuan_all_goods_id()) except TypeError as e: print('TypeError错误, 原因数据库连接失败...(可能维护中)') result = None if result is None: pass else: print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------') print(result) print('--------------------------------------------------------') print('即将开始实时更新数据, 请耐心等待...'.center(100, '#')) index = 1 for item in result: # 实时更新数据 data = {} # 释放内存,在外面声明就会占用很大的,所以此处优化内存的方法是声明后再删除释放 zhe_800_pintuan = Zhe800PintuanParse() if index % 50 == 0: # 每50次重连一次,避免单次长连无响应报错 print('正在重置,并与数据库建立新连接中...') # try: # del tmp_sql_server # except: # pass # gc.collect() tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline() print('与数据库的新连接成功建立...') if tmp_sql_server.is_connect_success: print( '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index)) zhe_800_pintuan.get_goods_data(goods_id=item[0]) data = zhe_800_pintuan.deal_with_data() if data != {}: data['goods_id'] = item[0] if item[1] == 1: tmp_sql_server.delete_zhe_800_pintuan_expired_goods_id( goods_id=item[0]) print('该goods_id[{0}]已过期,删除成功!'.format(item[0])) zhe_800_pintuan.to_right_and_update_data( data=data, pipeline=tmp_sql_server) else: # 表示返回的data值为空值 pass else: # 表示返回的data值为空值 print('数据库连接失败,数据库可能关闭或者维护中') pass index += 1 # try: # del zhe_800_pintuan # except: # pass gc.collect() sleep(.7) print('全部数据更新完毕'.center(100, '#')) # sleep(60*60) if get_shanghai_time().hour == 0: # 0点以后不更新 sleep(60 * 60 * 5.5) else: sleep(5) # del ali_1688 gc.collect()
def run(self):
    """Consumer loop: drain `coupon_queue`, parse taobao coupon JSONP payloads,
    persist new coupons into dbo.coupon_info, and (once per goods_id) mark the
    matching goods' prices down by the coupon value.

    Globals used:
        coupon_queue: queue of raw JSONP strings captured elsewhere.
        goods_id_and_coupon_url_list: goods_id -> coupon claim-url records.
        unique_coupon_id_list: in-memory dedup of already-processed coupons.
    """
    global coupon_queue, goods_id_and_coupon_url_list, unique_coupon_id_list
    while True:
        sql_cli = None
        try:
            if coupon_queue.qsize() >= 1:
                # TODO some claim urls are pre-sale (deposit) goods; not handled here
                coupon_item = coupon_queue.get()
                # payload is JSONP: strip the callback wrapper, then parse the json
                ori_coupon_list = json_2_dict(
                    json_str=re.compile('\((.*)\)').findall(coupon_item)[0],
                    default_res={},
                ).get('data', {}).get('resultList', [])
                assert ori_coupon_list != []
                # pprint(ori_coupon_list)
                # TODO: testing shows multi-item coupons live in the field
                # 'nCouponInfoMap'; only single-item coupons are supported here
                coupon_list = []
                for item in ori_coupon_list:
                    try:
                        goods_id = str(item.get('itemId', ''))
                        assert goods_id != ''
                        # one account may use a given coupon only once
                        # display name, eg: '优惠券'
                        coupon_display_name = '优惠券'
                        # coupon value, i.e. how many yuan off
                        ori_coupon_value = item.get('couponAmount', '')
                        assert ori_coupon_value != ''
                        coupon_value = str(float(ori_coupon_value).__round__(2))
                        # spend threshold to activate the coupon
                        ori_thresold = item.get('couponStartFee', '')
                        assert ori_thresold != ''
                        threshold = str(float(ori_thresold).__round__(2))
                        # timestamps arrive in milliseconds; [0:10] keeps seconds
                        begin_time = str(timestamp_to_regulartime(int(item.get('couponEffectiveStartTime', '')[0:10])))
                        end_time = str(timestamp_to_regulartime(int(item.get('couponEffectiveEndTime', '')[0:10])))
                        # human-readable usage rule
                        use_method = '满{}元, 减{}元'.format(threshold, coupon_value)
                        if string_to_datetime(end_time) <= get_shanghai_time():
                            print('该券已过期[goods_id: {}]'.format(goods_id))
                            # expired coupon
                            continue
                        if datetime_to_timestamp(string_to_datetime(end_time)) - datetime_to_timestamp(string_to_datetime(begin_time)) \
                                <= 60 * 60 * 36:
                            # validity window shorter than 1.5 days -> skip
                            print('该券小于1.5天[goods_id: {}], pass'.format(goods_id))
                            continue
                        # TODO testing shows the same goods may carry the same coupon
                        # across different activity windows, i.e. several coupon rows
                        # per goods, which would subtract the price repeatedly.
                        # unique_id = str(get_uuid3(
                        #     target_str=goods_id \
                        #     + coupon_value \
                        #     + threshold \
                        #     + str(datetime_to_timestamp(string_to_datetime(begin_time)))[0:10]\
                        #     + str(datetime_to_timestamp(string_to_datetime(end_time)))[0:10]))
                        # Resolution: keep only ONE coupon row per goods_id, so the
                        # unique id is derived from goods_id alone (store only the
                        # strongest coupon; price then cannot be reduced twice).
                        unique_id = str(get_uuid3(target_str=goods_id))
                        # coupon claim url, matched from the shared in-memory list
                        # pprint(goods_id_and_coupon_url_list)
                        coupon_url = ''
                        for j in goods_id_and_coupon_url_list:
                            tmp_goods_id = j['goods_id']
                            tmp_coupon_url = j['coupon_url']
                            if goods_id == tmp_goods_id:
                                print('@@@ 成功匹配到goods_id: {} 的领券地址: {}!!'.format(goods_id, tmp_coupon_url))
                                coupon_url = tmp_coupon_url
                                break
                            else:
                                continue
                        assert coupon_url != ''
                        coupon_list.append({
                            'unique_id': unique_id,
                            'goods_id': goods_id,
                            'coupon_url': coupon_url,
                            'coupon_display_name': coupon_display_name,
                            'coupon_value': coupon_value,
                            'threshold': threshold,
                            'begin_time': begin_time,
                            'end_time': end_time,
                            'use_method': use_method,
                        })
                    except Exception as e:
                        # one bad coupon record must not kill the whole batch
                        print(e)
                        continue
                # pprint(coupon_list)
                if coupon_list != []:
                    # persist
                    sql_cli = SqlServerMyPageInfoSaveItemPipeline()
                    if not sql_cli.is_connect_success:
                        raise SqlServerConnectionException
                    for item in coupon_list:
                        unique_id = item['unique_id']
                        goods_id = item['goods_id']
                        if unique_id not in unique_coupon_id_list:
                            save_res = sql_cli._insert_into_table(
                                sql_str='insert into dbo.coupon_info(unique_id, create_time, goods_id, coupon_url, coupon_display_name, coupon_value, threshold, begin_time, end_time, use_method) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
                                params=(
                                    unique_id,
                                    str(get_shanghai_time()),
                                    goods_id,
                                    item['coupon_url'],
                                    item['coupon_display_name'],
                                    Decimal(item['coupon_value']).__round__(2),
                                    Decimal(item['threshold']).__round__(2),
                                    item['begin_time'],
                                    item['end_time'],
                                    item['use_method'],
                                ),
                                repeat_insert_default_res=False,  # avoid re-changing the price on duplicate insert
                            )
                            if save_res:
                                # TODO adjust the price only once; dedup in memory
                                unique_coupon_id_list.append(unique_id)
                                # propagate the coupon into the regular goods table
                                sql_str = '''
                                select top 1 Price, TaoBaoPrice, SKUInfo
                                from dbo.GoodsInfoAutoGet
                                where GoodsID=%s
                                '''
                                db_res = []
                                try:
                                    db_res = list(sql_cli._select_table(
                                        sql_str=sql_str,
                                        params=(goods_id,),
                                    ))
                                except Exception as e:
                                    print(e)
                                if db_res != []:
                                    # flag the price change caused by the coupon
                                    try:
                                        # subtract the coupon price
                                        coupon_value = float(item['coupon_value'])
                                        threshold = float(item['threshold'])
                                        # strip the stored profit margin to recover the raw price
                                        db_price = float(db_res[0][0]) * (1 - CP_PROFIT)
                                        db_taobao_price = float(db_res[0][1]) * (1 - CP_PROFIT)
                                        # subtract the coupon (when the threshold is met),
                                        # then re-apply CP_PROFIT -> final stored price
                                        new_price = ((db_price - coupon_value if db_price >= threshold else db_price) * (1 + CP_PROFIT)).__round__(2)
                                        new_taobao_price = ((db_taobao_price - coupon_value if db_taobao_price >= threshold else db_taobao_price) * (1 + CP_PROFIT)).__round__(2)
                                        new_sku_info = get_new_sku_info_from_old_sku_info_subtract_coupon_and_add_cp_profit(
                                            old_sku_info=json_2_dict(
                                                json_str=db_res[0][2],
                                                default_res=[],
                                            ),
                                            threshold=threshold,
                                            coupon_value=coupon_value,
                                        )
                                        sql_str2 = '''
                                        update dbo.GoodsInfoAutoGet
                                        set Price=%s, TaoBaoPrice=%s, SKUInfo=%s, ModfiyTime=%s, sku_info_trans_time=%s, IsPriceChange=1, PriceChangeInfo=SKUInfo
                                        where GoodsID=%s
                                        '''
                                        now_time = get_shanghai_time()
                                        sql_cli._update_table(
                                            sql_str=sql_str2,
                                            params=(
                                                Decimal(new_price).__round__(2),
                                                Decimal(new_taobao_price).__round__(2),
                                                dumps(new_sku_info, ensure_ascii=False),
                                                now_time,
                                                now_time,
                                                goods_id,
                                            ),
                                        )
                                    except Exception as e:
                                        print(e)
                                else:
                                    pass
                            else:
                                continue
                        else:
                            continue
                else:
                    continue
        except IndexError:
            # identical interface payloads make the regex findall miss -> skip
            continue
        except Exception as e:
            print(e)
        finally:
            try:
                del sql_cli
            except:
                pass
def run_forever(self):
    """Single full refresh pass over the mogujie group-buy (pintuan) table.

    For every stored (goods_id, miaosha_time, fcid, page) row of site_id=23:
      * window already over  -> delete the row,
      * window in the future -> skip (NOT break: ids are unordered),
      * window active        -> re-scrape the listing page and update the row.

    Bug fix: both ``_delete_table`` calls previously passed
    ``params=(item[0])`` — a parenthesised string, not a tuple — so the DB
    driver received a plain string instead of a parameter sequence; they now
    pass ``(item[0],)``.
    """
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    sql_str = r'select goods_id, miaosha_time, fcid, page from dbo.mogujie_pintuan where site_id=23'
    try:
        result = list(tmp_sql_server._select_table(sql_str=sql_str))
    except TypeError:
        # the pipeline yields a non-iterable when the DB connection failed
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
        for item in result:  # refresh row by row
            # miaosha_time is stored as json: {'begin_time': ..., 'end_time': ...}
            pintuan_end_time = json.loads(item[1]).get('end_time')
            pintuan_end_time = int(str(time.mktime(time.strptime(pintuan_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_end_time)
            data = {}
            mogujie_pintuan = MoGuJieParse()
            if index % 8 == 0:
                # restart phantomjs periodically: a long-lived driver misbehaves
                try:
                    del self.my_phantomjs
                except Exception:
                    pass
                gc.collect()
                self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
            if index % 50 == 0:  # reconnect every 50 items to avoid a stale long connection
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(pintuan_end_time) == 0:
                    # expired -> remove (FIX: params must be a 1-tuple, not a bare string)
                    tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                    print('过期的goods_id为(%s)' % item[0],
                          ', 拼团开始时间为(%s), 删除成功!' % json.loads(item[1]).get('begin_time'))
                elif self.is_recent_time(pintuan_end_time) == 2:
                    # future window: must be `pass`, NOT `break` — the goods_ids
                    # returned by the DB are not ordered by time
                    pass
                else:  # == 1: inside the update window
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                    data['goods_id'] = item[0]
                    tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(item[3], item[2])
                    # print(tmp_url)
                    # plain requests fails here (certificate issues) -> use phantomjs
                    # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
                    body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
                    # print(body)
                    if body == '':
                        print('获取到的body为空值! 此处跳过')
                    else:
                        try:
                            body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
                            tmp_data = json.loads(body)
                            # pprint(tmp_data)
                        except Exception:
                            print('json.loads转换body时出错, 请检查')
                            tmp_data = {}
                        # hoisted: the docs list was previously fetched twice
                        docs = tmp_data.get('result', {}).get('wall', {}).get('docs', [])
                        if docs == []:
                            print('得到的docs为[]!')
                            print('该商品已被下架限时秒杀活动,此处将其删除')
                            # taken down -> remove (FIX: params must be a 1-tuple)
                            tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                        else:
                            tmp_item_list = docs
                            # pprint(tmp_item_list)
                            begin_time_timestamp = int(time.time())  # group-buy start = now
                            item_list = [{
                                'goods_id': item_0.get('tradeItemId', ''),
                                'pintuan_time': {
                                    'begin_time': timestamp_to_regulartime(timestamp=begin_time_timestamp),
                                    'end_time': timestamp_to_regulartime(
                                        self.get_pintuan_end_time(begin_time_timestamp, item_0.get('leftTimeOrg', ''))),
                                },
                                'all_sell_count': str(item_0.get('salesVolume', 0)),
                            } for item_0 in tmp_item_list]
                            # print(item_list)
                            pintuan_goods_all_goods_id = [item_1.get('goods_id', '') for item_1 in item_list]
                            # print(pintuan_goods_all_goods_id)
                            # goods "removed" from the listing are usually still on
                            # sale -> update their data only, not the shelf times
                            if item[0] not in pintuan_goods_all_goods_id:
                                mogujie_pintuan.get_goods_data(goods_id=item[0])
                                goods_data = mogujie_pintuan.deal_with_data()
                                if goods_data == {}:
                                    pass
                                else:
                                    # normalise before storage
                                    print('+++ 内部下架,其实还在售卖的商品更新')
                                    tmp_price_info_list = goods_data['price_info_list']
                                    price_info_list = [{
                                        'spec_value': item_4.get('spec_value'),
                                        'pintuan_price': item_4.get('detail_price'),
                                        'detail_price': '',
                                        'normal_price': item_4.get('normal_price'),
                                        'img_url': item_4.get('img_url'),
                                        'rest_number': item_4.get('rest_number'),
                                    } for item_4 in tmp_price_info_list]
                                    goods_data['goods_id'] = item[0]
                                    goods_data['price_info_list'] = price_info_list
                                    # pprint(goods_data)
                                    mogujie_pintuan.update_mogujie_pintuan_table_2(data=goods_data, pipeline=tmp_sql_server)
                                    sleep(MOGUJIE_SLEEP_TIME)  # throttle
                            else:  # still listed
                                for item_2 in item_list:
                                    if item_2.get('goods_id', '') == item[0]:
                                        mogujie_pintuan.get_goods_data(goods_id=item[0])
                                        goods_data = mogujie_pintuan.deal_with_data()
                                        if goods_data == {}:
                                            pass
                                        else:
                                            # normalise before storage
                                            tmp_price_info_list = goods_data['price_info_list']
                                            price_info_list = [{
                                                'spec_value': item_4.get('spec_value'),
                                                'pintuan_price': item_4.get('detail_price'),
                                                'detail_price': '',
                                                'normal_price': item_4.get('normal_price'),
                                                'img_url': item_4.get('img_url'),
                                                'rest_number': item_4.get('rest_number'),
                                            } for item_4 in tmp_price_info_list]
                                            goods_data['goods_id'] = item[0]
                                            goods_data['price_info_list'] = price_info_list
                                            goods_data['pintuan_time'] = item_2.get('pintuan_time', {})
                                            goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = \
                                                self.get_pintuan_begin_time_and_pintuan_end_time(pintuan_time=goods_data['pintuan_time'])
                                            goods_data['all_sell_count'] = item_2.get('all_sell_count', '')
                                            # pprint(goods_data)
                                            mogujie_pintuan.update_mogujie_pintuan_table(data=goods_data, pipeline=tmp_sql_server)
                                            sleep(MOGUJIE_SLEEP_TIME)  # throttle
                                    else:
                                        pass
            else:
                print('数据库连接失败,此处跳过!')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:  # no updates after midnight
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
class GX8899Spider(object):
    """Crawler for m.gx8899.com avatar images.

    Walks every configured category listing, collects the image urls from
    each article page, and assigns each image to a random still-pending row
    of dbo.sina_weibo (column head_img_url).

    Bug fix: ``except TypeError or IndexError`` evaluates the boolean
    expression first and therefore only ever caught ``TypeError``; it is now
    the correct tuple form ``except (TypeError, IndexError)``.
    """

    def __init__(self, logger=None):
        self._set_sort_type_name()
        self._set_logger(logger)
        self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        self.update_sql = 'update dbo.sina_weibo set head_img_url=%s, modify_time=%s where id=%s'
        self.phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH, logger=self.my_lg)
        self.id_list = []        # pending sina_weibo ids still awaiting an avatar
        self.update_index = 0    # count of successfully updated rows

    def _set_logger(self, logger):
        """Use the caller-supplied logger, or build a dated file/console logger."""
        if logger is None:
            self.my_lg = set_logger(
                log_file_name=MY_SPIDER_LOGS_PATH + '/gx8899/_/' + str(get_shanghai_time())[0:10] + '.txt',
                console_log_level=INFO,
                file_log_level=ERROR)
        else:
            self.my_lg = logger

    def _set_sort_type_name(self):
        """Set the category slugs to crawl."""
        self.sort_type_name_list = [
            # 'weixin',
            # 'nansheng',
            # 'nvsheng',
            'fengjing',
            'jingxuan',
            'wupin',
            'oumei',
            'weimei',
            'heibai',
            'baqi',
            'xiaoqingxin',
            'yijing',
            'beiying',
            'chouyan',
            'sumiao',
            'gexing',
            'xiaohai',
            'qiche',
            'zhiwu',
            'shouhui',
            'weshen',
            'mingxing',
            'jianzhu',
            'renwu',
        ]

    def _get_gx8899_all_img_url(self):
        """Crawl every category and return the combined image-url list."""
        self.my_lg.info('即将开始采集gx8899...')
        fz = []
        for sort_type_name in self.sort_type_name_list:
            tmp = self._get_one_sort_type_name_page_info(sort_type_name)
            if tmp != []:
                fz += tmp
        self.my_lg.info('@@@ 全部头像抓取完毕!')
        self.fz = fz
        return fz

    def _get_new_wait_2_handle_id_list(self):
        """Return (re-fetching from the DB when empty) the pending id list."""
        sql_str = '''
        select top 1000 id
        from dbo.sina_weibo
        where sina_type = 'bilibili' and modify_time is null
        '''
        if self.id_list == []:
            self.my_lg.info('@@@ 重新获取id_list...')
            self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            try:
                wait = self.my_pipeline._select_table(sql_str=sql_str)
                self.id_list = [i[0] for i in wait]
            # FIX: was `except TypeError or IndexError:` which only caught TypeError
            except (TypeError, IndexError):
                sleep(8)
                return []
        return self.id_list

    @fz_set_timeout(6)
    def oo(self, id, img_url):
        """Write one avatar url to the given row; True on success."""
        try:
            self.my_pipeline._update_table_2(
                sql_str=self.update_sql,
                params=(img_url, get_shanghai_time(), id),
                logger=self.my_lg)
        except Exception:
            return False
        return True

    def _get_one_sort_type_name_page_info(self, sort_type_name):
        """Collect the image urls of every page of one category."""
        base_url = 'http://m.gx8899.com/{0}/'.format(sort_type_name)
        headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': get_random_pc_ua(),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            # 'Referer': 'http://m.gx8899.com/weixin/',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }
        index = 0
        res = []
        while True:
            if index == 0:
                url = base_url
                index += 1  # paging starts at index_2 for the second page
            else:
                url = base_url + 'index_{0}.html'.format(index)
            self.my_lg.info('正在抓取{0}'.format(url))
            # requests was too slow here -> phantomjs instead
            # body = self._get_loop_run_result(url=url, headers=headers)
            if index % 15 == 0:
                # periodic driver restart
                try:
                    del self.phantomjs
                except Exception:
                    pass
                gc.collect()
                self.phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH, logger=self.my_lg)
                self.my_lg.info('[+] phantomjs已重置!')
            body = self.phantomjs.use_phantomjs_to_get_url_body(url=url)
            # self.my_lg.info(str(body))
            if re.compile(r'<title>404 - 找不到文件或目录。</title>').findall(body) != []:
                # 404 page marks the end of the category's pagination
                break
            need = Selector(text=body).css('div#con_tabone_1 li.last a:last-child ::attr(href)').extract()
            pprint(need)
            if need == []:
                self.my_lg.error('获取到的need为空list!出错地址:{0}'.format(url))
                # NOTE(review): `continue` without bumping `index` retries the same
                # url forever if the selector keeps matching nothing — confirm intended
                continue
            for article_url in need:
                _ = self._get_one_article_page_info(article_url)
                if _ != []:
                    res += _
            self.my_lg.info('#### 已更新{0}个id !'.format(self.update_index))
            index += 1
        return res

    def _get_one_article_page_info(self, url):
        """Return all image urls of one article page, assigning each to a
        random pending sina_weibo id on the way."""
        headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }
        # body = self._get_loop_run_result(url=url, headers=headers)
        body = self.phantomjs.use_phantomjs_to_get_url_body(url=url)
        if body == '':
            self.my_lg.info('获取到img list为空list!出错地址:{}'.format(url))
            return []
        need = Selector(text=body).css('div.content p img ::attr(src)').extract()
        # pprint(need)
        # self.my_lg.info(str(need))
        if need != []:
            self.my_lg.info('[+] crawl子地址success')
        else:
            self.my_lg.info('[-] crawl子地址fail')
        # write each image to a random pending row
        for img_url in need:
            try:
                # raises ValueError when the pending list is empty
                random_id_index = randint(0, len(self._get_new_wait_2_handle_id_list()) - 1)
            except Exception:
                sleep(5)
                continue
            res = self.oo(
                id=self.id_list[random_id_index],
                img_url=img_url,
            )
            if res:
                self.id_list.pop(random_id_index)
                self.update_index += 1
        return need

    async def _get_one_page_body(self, url, headers):
        """Async fetch of one page body."""
        body = await MyAiohttp.aio_get_url_body(url=url, headers=headers)
        return body

    def _get_loop_run_result(self, **kwargs):
        """Run the async fetch synchronously on the current event loop."""
        loop = get_event_loop()
        result = loop.run_until_complete(
            self._get_one_page_body(url=kwargs.get('url', ''),
                                    headers=kwargs.get('headers', {})))
        return result

    def __del__(self):
        try:
            del self.phantomjs
            del self.my_lg
        except Exception:
            pass
        gc.collect()
def __init__(self):
    """Crawler init for goods-sorted-by-shop-type (tmall m-site supermarket).

    Most of the payload below is static request material (t / sign / cookies)
    that must be refreshed by hand from a fresh browser session before each
    run -- see the notes inline.
    """
    AsyncCrawler.__init__(
        self,
        user_agent_type=PHONE,
        ip_pool_type=IP_POOL_TYPE,
        log_print=True,
        logger=None,
        log_save_path=MY_SPIDER_LOGS_PATH + '/goods_sort_by_shop_type/_/',
        headless=True,
    )
    # max request retries
    self.req_num_retries = 6
    # Must be captured from a freshly opened m-site supermarket page.
    # tm m-site supermarket url: https://chaoshi.m.tmall.com/
    # Before every tm run: refresh these values --
    # copy the new cookies of the matching api request directly, and update
    # block_calculate_tb_right_sign in cp_utils with the new t (the t found in
    # the request params) and the latest _m_h5_tk of that request.
    # eg: for the tmall supermarket entry:
    # t = '1590387891307'
    # _m_h5_tk = '6f594c22870353cede88c2796cc28ee9'
    self.tm_new_chrome_t = '1590545557798'
    self.tm_new_chrome_sign = 'c2d1fced4d7b1333d0f19b6b637fed9f'
    self.tm_new_chrome_cookies_str = 'hng=CN%7Czh-CN%7CCNY%7C156; cna=wRsVFTj6JEoCAXHXtCqXOzC7; lid=%E6%88%91%E6%98%AF%E5%B7%A5%E5%8F%B79527%E6%9C%AC%E4%BA%BA; enc=MXX6theE39REQu4vFae7f5vi8A8GAdt5pdcQAJY7eR3zuOxwTSUu0zQGRWpBLbzxbJUsLvdHk4vB8ZWvQR%2BjQg%3D%3D; l=eB_zn817vA2VK0x_BOfZnurza779_IRAguPzaNbMiOCPOdfH5H0fWZAGqqTMCnGVh6uk83JDb3ZQBeYBcBdKnxvOnrZgURDmn; sm4=330100; csa=0_0_0_0_0_0_0_0_0_0_0_0_0; sgcookie=EbIdqdSy36jBPHKaO%2FPZS; uc3=id2=UUplY9Ft9xwldQ%3D%3D&lg2=W5iHLLyFOGW7aA%3D%3D&vt3=F8dBxGZjLZslLqBqC3E%3D&nk2=rUtEoY7x%2Bk8Rxyx1ZtN%2FAg%3D%3D; t=c413bd0891628c3269938122b2bee15f; tracknick=%5Cu6211%5Cu662F%5Cu5DE5%5Cu53F79527%5Cu672C%5Cu4EBA; uc4=id4=0%40U2gvLJ3%2BK6kqeorNX%2B21sXN8x3lW&nk4=0%40r7rCNeQ4%2Bj7fAj%2BMcdPH4%2B0X9x%2FwQLp0Sd4%2F; lgc=%5Cu6211%5Cu662F%5Cu5DE5%5Cu53F79527%5Cu672C%5Cu4EBA; _tb_token_=ee5587773876d; cookie2=13ed682f1aa10261d267e8e5a9e8e223; _m_h5_tk=883da77eaee1f1b25a7fb1f4c95b68e6_1590554541015; _m_h5_tk_enc=d531110c50a3daed05299dbb0b6dc3f0; isg=BBISyyg5mGC0AuZBht_2zq0HY970Ixa9xZzn1dxrPkWw77LpxLNmzRgJWw32n45V'
    # first-level tmall supermarket categories: display name, api icon type,
    # and the level-1 category id used by the listing api
    self.tm_first_sort_list = [
        {
            'name': '休闲零食',
            'icon_type': 'categoryxiuxianlingshi',
            'level1_id': 78,
        },
        {
            'name': '粮油米面',
            'icon_type': 'categoryliangyoumimian',
            'level1_id': 80,
        },
        {
            'name': '乳饮酒水',
            'icon_type': 'categorynaipinshuiyin',
            'level1_id': 79,
        },
        {
            'name': '日用百货',
            'icon_type': 'categorychufangriyong',
            'level1_id': 81,
        },
        {
            'name': '母婴用品',
            'icon_type': 'categorymuyingyongping',
            'level1_id': 82,
        },
        {
            'name': '个人护理',
            'icon_type': 'categorygerenhuli',
            'level1_id': 83,
        },
        {
            'name': '纸品家清',
            'icon_type': 'categoryjiaqingjiaju',
            'level1_id': 84,
        },
        {
            'name': '美容护肤',
            'icon_type': 'categorymeironghufu',
            'level1_id': 94,
        },
        {
            'name': '方便速食',
            'icon_type': 'categoryfangbiansushi',
            'level1_id': 92,
        },
        {
            'name': '中外名酒',
            'icon_type': 'categoryzhongwaimingjiu',
            'level1_id': 87,
        },
        {
            'name': '童装童鞋',
            'icon_type': 'categorytongzhuang',
            'level1_id': 138,
        },
        {
            'name': '成人用品',
            'icon_type': 'categorychengrenyongpin',
            'level1_id': 93,
        },
        {
            'name': '家纺内衣',
            'icon_type': 'categoryjiafangneiyi',
            'level1_id': 90,
        },
        {
            'name': '宠物生活',
            'icon_type': 'categorychongwuyongpin',
            'level1_id': 91,
        },
        {
            'name': '电器数码',
            'icon_type': 'category3cqipei',
            'level1_id': 95,
        },
        {
            'name': '进口好货',
            'icon_type': 'categoryjinkouhaohuo',
            'level1_id': 85,
        },
        {
            'name': '医疗保健',
            'icon_type': 'categoryzibubaojian',
            'level1_id': 89,
        },
    ]
    # tab names to skip: promo/recommendation tabs, not real categories
    self.tm_skip_name_tuple = (
        '好货',
        '为你推荐',
        '热销榜单',
        '每日特惠',
        '一件包邮',
        '新品尝鲜',
        '年中大赏',
        '喵九八',
        '新品推荐',
        '特惠',
        '尝新',
        '精致好货',
        '超值爆款',
        '包邮',
        '优选',
        '直播',
        '尖叫单品',
        '品牌专区',
        '大牌',
        '网红爆款',
        '新品',
        '清凉一夏',
        '热销',
        '推荐',
        '国家馆',
        '优惠',
        '折',
        '送',  # eg: buy one get one ('买一送一')
        '精选',
        '爆款',
        '上新',
        '秒杀',
        '热门',
        '减',
        '满减',
    )
    self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    self._init_sql_str()
    # goods ids already present in the db (filled elsewhere)
    self.db_existed_goods_id_list = []
    # reconnect the db connection every N operations
    self.sql_cli_remainder = 20
async def deal_with_data(self):
    """Crawl and persist the jumeiyoupin group-buy (pintuan) goods.

    Fetches the candidate goods list, skips ids already stored in the
    pintuan table, parses and inserts the rest, and paces the crawl so each
    item takes at least JUMEIYOUPIN_SLEEP_TIME seconds overall.

    :return: None
    """
    goods_list = await self.get_pintuan_goods_info()
    pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if not pipeline.is_connect_success:
        self.my_lg.error('数据库连接失败,此处跳过!')
        gc.collect()
        return None

    known_rows = list(await pipeline.select_jumeiyoupin_pintuan_all_goods_id(logger=self.my_lg))
    db_goods_id_list = [row[0] for row in known_rows]
    # self.my_lg.info(str(db_goods_id_list))
    loop_no = 1
    for goods_item in goods_list:
        # renew the db connection every 20 items
        if loop_no % 20 == 0:
            pipeline = SqlServerMyPageInfoSaveItemPipeline()
        current_id = goods_item.get('goods_id', '')
        if current_id in db_goods_id_list:
            self.my_lg.info('该goods_id已经存在于数据库中, 此处跳过')
        else:
            detail_url = 'https://s.h5.jumei.com/yiqituan/detail?item_id={0}&type={1}'.format(
                current_id, goods_item.get('type', ''))
            started_at = time.time()
            parser = JuMeiYouPinPinTuanParse(logger=self.my_lg)
            goods_data = await parser.deal_with_data(jumei_pintuan_url=detail_url)
            if goods_data != {} and goods_data.get('is_delete', 0) != 1:
                # normalise before storage
                goods_data['goods_id'] = current_id
                goods_data['pintuan_time'] = goods_item.get('pintuan_time', {})
                goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = \
                    await self.get_pintuan_begin_time_and_pintuan_end_time(
                        pintuan_time=goods_item.get('pintuan_time', {}))
                goods_data['sort'] = goods_item.get('sort')
                goods_data['page'] = goods_item.get('page')
                goods_data['tab'] = goods_item.get('tab')
                # pprint(goods_data)
                await parser.insert_into_jumeiyoupin_pintuan_table(
                    data=goods_data, pipeline=pipeline, logger=self.my_lg)
            # pace the crawl, but skip sleeping when parsing already took long enough
            elapsed = time.time() - started_at
            if elapsed <= JUMEIYOUPIN_SLEEP_TIME:
                await asyncio.sleep(JUMEIYOUPIN_SLEEP_TIME - elapsed)
        loop_no += 1
    gc.collect()
    return None
def build_tmall_goods_url(goods_id):
    """Build the canonical tmall goods url from a parsed goods_id record.

    ``goods_id`` comes from ``TmallParse.get_goods_id_from_url()``:
        [0, id]        -> plain tmall item
        [1, id]        -> tmall supermarket (chaoshi)
        [2, id, base]  -> other tmall variant; ``base`` is the site prefix url

    :return: the clean goods url, or None for an unknown site flag.
    """
    if goods_id[0] == 0:
        return 'https://detail.tmall.com/item.htm?id=' + goods_id[1]
    if goods_id[0] == 1:
        return 'https://chaoshi.detail.tmall.com/item.htm?id=' + goods_id[1]
    if goods_id[0] == 2:
        # FIX: this logic was duplicated inline twice and the second copy
        # dropped the '?id=' separator, producing a broken url; unified here.
        return str(goods_id[2]) + '?id=' + goods_id[1]
    return None


def run_forever():
    """Endless loop migrating/refreshing old-table tmall goods into the new table.

    Each pass re-crawls every goods of the old table; deleted goods are still
    inserted (flagged is_delete), live goods get fully parsed data.  Pauses
    5.5h after midnight (Shanghai time), otherwise 5s between passes.
    """
    while True:
        #### real-time data refresh
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select SiteID, GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice from dbo.GoodsInfoAutoGet where SiteID=3 or SiteID=4 or SiteID=6 order by ID desc'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
            result_2 = list(tmp_sql_server.select_old_table_all_goods_id())
        except TypeError as e:
            # the pipeline yields a non-iterable when the DB connection failed
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
            result_2 = []
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result_2)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # memory optimisation: recreate the parser periodically
            tmall = TmallParse()
            for item in result_2:  # refresh one goods at a time
                data = {}
                if index % 5 == 0:
                    tmall = TmallParse()
                    gc.collect()
                if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived connection
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')
                if tmp_sql_server.is_connect_success:
                    goods_id = tmall.get_goods_id_from_url(item[0])
                    if goods_id == []:
                        print('@@@ 原地址为: ', item[0])
                        continue
                    else:
                        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (goods_id[1], index))
                        data = tmall.get_goods_data(goods_id=goods_id)
                        if isinstance(data, int):
                            # crawler returned an error code instead of data
                            continue
                        if data.get('is_delete') == 1:
                            # goods already taken down: still insert it, flagged
                            data['goods_id'] = goods_id[1]
                            wait_to_deal_with_url = build_tmall_goods_url(goods_id)
                            if wait_to_deal_with_url is None:
                                continue
                            data['goods_url'] = wait_to_deal_with_url
                            data['username'] = '******'
                            data['main_goods_id'] = item[1]
                            # print('------>>>| 爬取到的数据为: ', data)
                            result = tmall.old_tmall_goods_insert_into_new_table(data, pipeline=tmp_sql_server)
                            if result is False:
                                print('出错商品的地址为: ', item[0])
                            index += 1
                            gc.collect()
                            sleep(1.2)
                            continue
                        data = tmall.deal_with_data()
                        if data != {}:
                            data['goods_id'] = goods_id[1]
                            wait_to_deal_with_url = build_tmall_goods_url(goods_id)
                            if wait_to_deal_with_url is None:
                                continue
                            data['goods_url'] = wait_to_deal_with_url
                            data['username'] = '******'
                            data['main_goods_id'] = item[1]
                            # print('------>>>| 爬取到的数据为: ', data)
                            tmall.old_tmall_goods_insert_into_new_table(data, pipeline=tmp_sql_server)
                        else:
                            # parser returned an empty dict -> nothing to store
                            pass
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
                sleep(2)
            print('全部数据更新完毕'.center(100, '#'))
            # sleep(60*60)
        if get_shanghai_time().hour == 0:  # after midnight: long pause, no updating
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        # del ali_1688
        gc.collect()
class TaoBaoWeiTaoShareParse(AsyncCrawler): def __init__( self, logger=None, *params, **kwargs, ): AsyncCrawler.__init__( self, *params, **kwargs, logger=logger, ip_pool_type=IP_POOL_TYPE, log_print=True, log_save_path=MY_SPIDER_LOGS_PATH + '/淘宝/微淘/', ) self._set_headers() self.msg = '' self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline() def _set_headers(self): self.headers = { 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9', 'user-agent': get_random_pc_ua(), 'accept': '*/*', 'referer': 'https://market.m.taobao.com/apps/market/content/index.html?ut_sk=1.VmYadv9DXkkDAFZm0VV4JBNq_21380790_1527298517854.Copy.33¶ms=%7B%22csid%22%3A%2254a52aea54b7c29d289a0e36b2bf2f51%22%7D&wh_weex=true&contentId=200668154273&source=weitao_2017_nocover&data_prefetch=true&suid=3D763077-A7BF-43BC-9092-C17B35E896F9&wx_navbar_transparent=false&wx_navbar_hidden=false&sourceType=other&un=bc80c9f324602d31384c4a342af87869&share_crt_v=1&sp_tk=o6R2Q0ZDMHZvaDBlS6Ok&cpp=1&shareurl=true&spm=a313p.22.68.948703884987&short_name=h.WAjz5RP&app=chrome', 'authority': 'h5api.m.taobao.com', # cookie得注释掉, 否则为非法请求 # 'cookie': '' } async def _get_target_url_and_content_id_and_csid(self, taobao_short_url): ''' 根据给与的淘宝分享短链接, 得到target_url, content_id, csid :param taobao_short_url: :return: ''' if re.compile(r'contentId').findall(taobao_short_url) != []: # 先检查是否已为目标地址 target_url = taobao_short_url else: body = Requests.get_url_body( url=taobao_short_url, headers=self.headers, ip_pool_type=self.ip_pool_type, ) # self.lg.info(str(body)) if body == '': self.lg.error('获取到的body为空值, 出错短链接地址: {0}'.format( str(taobao_short_url))) return '', '', '' try: # 获取短连接的目标地址 target_url = re.compile('var url = \'(.*?)\';').findall( body)[0] self.lg.info('获取到原始连接: {}'.format(target_url)) except IndexError: self.lg.error('获取target_url的时候IndexError! 
出错短链接地址: {0}'.format( str(taobao_short_url))) target_url = '' try: # 得到contentId content_id = re.compile('contentId=(\d+)').findall(target_url)[0] self.lg.info(content_id) except IndexError: self.lg.error('获取content_id时IndexError! 出错短链接地址: {0}'.format( str(taobao_short_url))) content_id = '' try: # 得到csid csid = re.compile('csid%22%3A%22(.*?)%22%7D').findall( target_url)[0] # self.lg.info(csid) except IndexError: self.lg.info('此链接为无csid情况的链接...') # self.lg.error('获取csid时IndexError! 出错短链接地址: {0}'.format(str(taobao_short_url))) csid = '' try: tag_name = re.compile('tagName=(.*?)&').findall(target_url)[0] except IndexError: tag_name = '' try: tag = re.compile('tag=(.*?)&').findall(target_url)[0] except IndexError: tag = '' return target_url, content_id, csid, tag_name, tag async def _get_api_body(self, taobao_short_url): ''' 获取该页面api返回的文件 :param taobao_short_url: :return: body 类型 str ''' base_url = 'https://h5api.m.taobao.com/h5/mtop.taobao.beehive.detail.contentservicenewv2/1.0/' try: target_url, content_id, csid, tag_name, tag = await self._get_target_url_and_content_id_and_csid( taobao_short_url) except ValueError: self.lg.error('遇到ValueError!', exc_info=True) return '' if content_id == '' and csid == '': # 异常退出 return '' data = dumps({ 'businessSpm': '', 'business_spm': '', 'contentId': content_id, 'params': dumps({ "csid": csid, }) if csid != '' else '', # 没有csid时,就不传这个参数 'source': 'weitao_2017_nocover', 'tagName': tag_name, # 这个是我自己额外加的用于获取tags的api接口 'track_params': '', 'type': 'h5', }) params = { 'AntiCreep': 'true', 'AntiFlood': 'true', 'api': 'mtop.taobao.beehive.detail.contentservicenewv2', 'appKey': '12574478', 'callback': 'mtopjsonp1', # 'data': '{"contentId":"200668154273","source":"weitao_2017_nocover","type":"h5","params":"{\\"csid\\":\\"54a52aea54b7c29d289a0e36b2bf2f51\\"}","businessSpm":"","business_spm":"","track_params":""}', 'data': data, 'dataType': 'jsonp', 'data_2': '', 'jsv': '2.4.11', # 'sign': 'e8cb623e58bab0ceb10e9edffdacd5b2', # 't': 
'1527300457911', 'type': 'jsonp', 'v': '1.0' } # TODO 新版 # 必传参数(无cookies, sign正确也无结果!) # 而且登录后的cookies, 但是继续采集, tb会报: 亲,访问被拒绝了哦!请检查是否使用了代理软件或VPN哦~ result_1 = await get_taobao_sign_and_body( base_url=base_url, headers=self.headers, params=params, data=data, logger=self.lg, ip_pool_type=self.ip_pool_type) _m_h5_tk = result_1[0] if _m_h5_tk == '': self.lg.error( '获取到的_m_h5_tk为空str! 出错短链接地址: {0}'.format(taobao_short_url)) # 带上_m_h5_tk, 和之前请求返回的session再次请求得到需求的api数据 result_2 = await get_taobao_sign_and_body( base_url=base_url, headers=self.headers, params=params, data=data, _m_h5_tk=_m_h5_tk, session=result_1[1], logger=self.lg, ip_pool_type=self.ip_pool_type) body = result_2[2] return body async def _deal_with_api_info(self, taobao_short_url): ''' 处理api返回的信息, 并结构化存储 :param taobao_short_url: :return: ''' data = await self._get_api_body(taobao_short_url) if data == '': self.lg.error('获取到的api数据为空值!') return {} try: data = re.compile('mtopjsonp1\((.*)\)').findall(data)[0] except IndexError: self.lg.error( 're获取主信息失败, IndexError, 出错短链接地址:{0}'.format(taobao_short_url)) data = {} try: data = await self._wash_api_info(loads(data)) # pprint(data) except Exception as e: self.lg.error('出错短链接地址:{0}'.format(taobao_short_url)) self.lg.exception(e) return {} article = await self._get_article(data=data, taobao_short_url=taobao_short_url) pprint(article) if article != {} and article.get('share_id', '') != '': '''采集该文章推荐的商品''' await self._crawl_and_save_these_goods( goods_url_list=article.get('goods_url_list', [])) '''存储该文章info''' await self._save_this_article(article=article) return True else: self.lg.info('获取到的文章失败! 
article为空dict!') return False async def _crawl_and_save_these_goods(self, goods_url_list): ''' 采集该文章推荐的商品 :param goods_url_list: :return: ''' sql_str = 'select GoodsID from dbo.GoodsInfoAutoGet where SiteID=1 or SiteID=3 or SiteID=4 or SiteID=6' try: result = self.my_pipeline._select_table(sql_str=sql_str) except TypeError: result = [] self.lg.info('即将开始抓取该文章的goods, 请耐心等待...') index = 1 db_all_goods_id_list = [item[0] for item in result] for item in goods_url_list: try: goods_id = re.compile(r'id=(\d+)').findall( item.get('goods_url', ''))[0] except IndexError: self.lg.error('re获取goods_id时出错, 请检查!') continue if goods_id in db_all_goods_id_list: self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id)) continue else: taobao = TaoBaoLoginAndParse(logger=self.lg) if index % 50 == 0: # 每50次重连一次,避免单次长连无响应报错 self.lg.info('正在重置,并与数据库建立新连接中...') self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline() self.lg.info('与数据库的新连接成功建立...') if self.my_pipeline.is_connect_success: goods_id = taobao.get_goods_id_from_url( item.get('goods_url', '')) if goods_id == '': self.lg.info('@@@ 原商品的地址为: {0}'.format( item.get('goods_url', ''))) continue else: self.lg.info( '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id, str(index))) tt = taobao.get_goods_data(goods_id) data = taobao.deal_with_data(goods_id=goods_id) if data != {}: data['goods_id'] = goods_id data[ 'goods_url'] = 'https://item.taobao.com/item.htm?id=' + str( goods_id) data['username'] = '******' data['main_goods_id'] = None # print('------>>>| 爬取到的数据为: ', data) taobao.old_taobao_goods_insert_into_new_table( data, pipeline=self.my_pipeline) else: pass else: # 表示返回的data值为空值 self.lg.info('数据库连接失败,数据库可能关闭或者维护中') pass index += 1 gc.collect() await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME) self.lg.info('该文章的商品已经抓取完毕!') return True async def _save_this_article(self, article): ''' 存储该文章info :param article: :return: ''' sql_str = 'select share_id from dbo.daren_recommend' db_share_id = [ j[0] for j in 
list(self.my_pipeline._select_table(sql_str=sql_str)) ] if article.get('share_id') in db_share_id: self.lg.info('该share_id({})已存在于数据库中, 此处跳过!'.format( article.get('share_id', ''))) return True else: self.lg.info('即将开始存储该文章...') if self.my_pipeline.is_connect_success: params = await self._get_db_insert_params(item=article) # pprint(params) sql_str = r'insert into dbo.daren_recommend(nick_name, head_url, profile, share_id, gather_url, title, comment_content, share_goods_base_info, div_body, create_time, site_id) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)' self.my_pipeline._insert_into_table_2(sql_str=sql_str, params=params, logger=self.lg) return True else: self.lg.error('db连接失败!存储失败! 出错article地址:{0}'.format( article.get('gather_url', ''))) return False async def _get_db_insert_params(self, item): params = ( item['nick_name'], item['head_url'], item['profile'], item['share_id'], item['gather_url'], item['title'], item['comment_content'], # dumps(item['share_img_url_list'], ensure_ascii=False), # dumps(item['goods_id_list'], ensure_ascii=False), dumps(item['share_goods_base_info'], ensure_ascii=False), item['div_body'], item['create_time'], item['site_id'], ) return params async def _get_article(self, data, taobao_short_url): ''' 得到该文章的需求信息 :param data: :return: ''' try: nick_name = data.get('data', {}).get('models', {}).get('account', {}).get('name', '') assert nick_name != '', '获取到的nick_name为空值!' head_url = await self._get_head_url(data=data) # 推荐人的简介或者个性签名 tmp_profile = data.get('data', {}).get('models', {}).get('account', {}).get('accountDesc', '') profile = tmp_profile if tmp_profile is not None else '' title = self._wash_sensitive_info( data.get('data', {}).get('models', {}).get('content', {}).get('title', '')) # self.lg.info(title) assert title != '', '获取到的title为空值!请检查!' 
# 达人的评论,可用于荐好首页的文字信息 comment_content = self._wash_sensitive_info( data.get('data', {}).get('models', {}).get('content', {}).get('summary', '')) '''微淘抓包的接口: 图片,商品依次对应''' tmp_goods_list = data.get('data', {}).get('models', {}).get( 'content', {}).get('drawerList', []) assert tmp_goods_list != [], '获取到的goods_id_list为空list! 请检查! 可能该文章推荐商品为空[]!' share_img_url_list = [{ 'img_url': 'https:' + item.get('itemImages', [])[0].get('picUrl', '') } for item in tmp_goods_list] goods_id_list = [{ 'goods_id': item.get('itemId', '') } for item in tmp_goods_list] # 由于微淘的图片跟商品信息一一对应,so直接存一个字段, 清除重复的推荐商品(list去重,并保持原来的顺序) share_goods_base_info = list_duplicate_remove([{ 'img_url': 'https:' + item.get('itemImages', [])[0].get('picUrl', ''), 'goods_id': item.get('itemId', ''), } for item in tmp_goods_list]) # div_body div_body = self._wash_sensitive_info( await self._get_div_body(rich_text=data.get('data', {}).get( 'models', {}).get('content', {}).get('richText', []))) # print(div_body) # 待抓取的商品地址, 统一格式为淘宝的,如果是tmall地址, 浏览器会重定向到天猫 goods_url_list = [{ 'goods_url': 'https://item.taobao.com/item.htm?id=' + item.get('goods_id', '') } for item in goods_id_list] _ = ( await self._get_target_url_and_content_id_and_csid(taobao_short_url)) gather_url = _[0] share_id = _[1] # 即content_id create_time = get_shanghai_time() site_id = 2 # 淘宝微淘 # tags 额外的文章地址 tags = await self._get_tags(data=data) # pprint(tags) except Exception as e: self.lg.error('出错短链接地址:{0}'.format(taobao_short_url)) self.lg.exception(e) return {} article = WellRecommendArticle() article['nick_name'] = nick_name article['head_url'] = head_url article['profile'] = profile article['share_id'] = share_id article['title'] = title article['comment_content'] = comment_content article['share_img_url_list'] = share_img_url_list article['goods_id_list'] = goods_id_list article['div_body'] = div_body article['gather_url'] = gather_url article['create_time'] = create_time article['site_id'] = site_id article['goods_url_list'] = goods_url_list 
article['tags'] = tags article['share_goods_base_info'] = share_goods_base_info return article async def _get_head_url(self, data): ''' 获取头像地址 :param data: :return: ''' tmp_head_url = data.get('data', {}).get('models', {}).get('account', {}).get('accountPic', {}).get('picUrl', '') if tmp_head_url != '': if re.compile('http').findall(tmp_head_url) == []: head_url = 'https:' + tmp_head_url else: head_url = tmp_head_url else: head_url = '' return head_url def _wash_sensitive_info(self, data): ''' 清洗敏感信息 :param data: :return: ''' data = re.compile('淘宝|天猫|taobao|tmall|TAOBAO|TMALL').sub('', data) return data async def _get_tags(self, data): ''' 获得额外文章的信息 :param data: :return: ''' tags = data.get('data', {}).get('models', {}).get('tags', []) tags = [{ 'url': unquote(item.get('url', '')), 'name': item.get('name', ''), } for item in tags] return tags async def _get_div_body(self, rich_text): ''' 处理得到目标文章 :param rich_text: 待处理的原文章 :return: ''' div_body = '' for item in rich_text: if item.get('resource') is None: continue for resource_item in item.get('resource', []): # 可能是多个 # resource = item.get('resource', [])[0] text = resource_item.get('text', '') # 介绍的文字 picture = resource_item.get('picture', {}) # 介绍的图片 _goods = resource_item.get('item', {}) # 一个商品 if text != '': text = '<p style="height:auto;width:100%">' + text + '</p>' + '<br>' div_body += text continue if picture != {}: # 得到该图片的宽高,并得到图片的<img>标签 _ = r'<img src="{0}" style="height:{1}px;width:{2}px;"/>'.format( 'https:' + picture.get('picUrl', ''), picture.get('picHeight', ''), picture.get('picWidth', '')) _ = _ + '<br>' div_body += _ continue if _goods != {}: _hiden_goods_id = r'<p style="display:none;">此处有个商品[goods_id]: {0}</p>'.format( _goods.get('itemId', '')) + '<br>' div_body += _hiden_goods_id continue return '<div>' + div_body + '</div>' if div_body != '' else '' async def _wash_api_info(self, data): ''' 清洗接口 :param data: :return: ''' try: data['data']['assets'] = [] data['data']['models']['config'] = {} 
data['data']['modules'] = [] except Exception: pass return data def __del__(self): try: del self.lg del self.msg del self.my_pipeline except: pass gc.collect()