def goods_name_check_and_do_something():
    """
    Contraband-goods check: scan dbo.GoodsInfoAutoGet for goods whose name
    matches any keyword in CONTRABAND_GOODS_KEY_TUPLE and logically delete
    them (IsDelete=1, delete_time stamped).

    :return: None
    """
    sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    sql_str0 = '''
    -- select count(*)
    select GoodsID, GoodsName, ConvertTime, IsDelete, delete_time, MainGoodsID
    from dbo.GoodsInfoAutoGet
    where '''
    sql_str1 = 'update dbo.GoodsInfoAutoGet set IsDelete=1, ModfiyTime=%s, delete_time=%s where GoodsID=%s'
    # one "GoodsName like %s" placeholder per contraband keyword
    # (generator + '_': the old code built a throwaway list over
    # range(0, len(...)) with an unused index)
    sql_str0 += ' or '.join('GoodsName like %s' for _ in CONTRABAND_GOODS_KEY_TUPLE)
    print(sql_str0)
    params = ['%{}%'.format(item) for item in CONTRABAND_GOODS_KEY_TUPLE]
    pprint(params)
    try:
        res = sql_cli._select_table(
            sql_str=sql_str0,
            params=params,
        )
        pprint(res)
        # logically delete every match
        assert res is not None
        now_time = get_shanghai_time()
        for item in res:
            goods_id = item[0]
            print('goods_id: {}'.format(goods_id))
            sql_cli._update_table(
                sql_str=sql_str1,
                params=(now_time, now_time, goods_id))
    finally:
        # release the db pipeline even when select/update raised
        # (the old code only cleaned up on the success path, via bare except)
        try:
            del sql_cli
        except NameError:
            pass
def deal_with_data(self):
    '''
    Parse ``self.result_data`` (filled in by ``get_goods_data``) into the
    normalized goods dict via the ``self._get_*`` helpers defined elsewhere
    in this class.

    :return: dict of normalized fields, or {} when ``self.result_data`` is
             an empty dict / the goods info is unusable.
    '''
    data = self.result_data
    if data != {}:
        shop_name = self._get_shop_name(data=data)
        # shopkeeper account is not collected on this platform
        account = ''
        title = self._get_title(data=data)
        sub_title = ''
        detail_name_list = self._get_detail_name_list(data=data)
        # print(detail_name_list)
        '''单独处理下架的情况'''
        # The str sentinel 'is_delete=1' from _get_detail_name_list means the
        # goods is off-shelf: mark it deleted in the db.
        # NOTE(review): after marking, the method falls through and keeps
        # parsing instead of returning -- confirm that is intended.
        if isinstance(detail_name_list, str):
            if detail_name_list == 'is_delete=1':
                print('该商品已下架...')
                sql_str = jp_update_str_1
                params = (self.result_data.get('goods_id', ''), )
                _ = SqlServerMyPageInfoSaveItemPipeline()
                result = _._update_table(sql_str=sql_str, params=params)
                if result:
                    print('### 该商品已经is_delete=1 ###')
                else:
                    print('is_delete=1标记失败!')
        # NOTE(review): detail_name_list is presumably a str or list here, so
        # `== {}` looks unreachable -- confirm what _get_detail_name_list
        # returns on failure.
        if detail_name_list == {}:
            return self._data_error_init()
        price_info_list, price, taobao_price = self._get_price_info_list_and_price_and_taobao_price(
            data=data)
        all_img_url = self._get_all_img_url(data=data)
        p_info = self._get_p_info(data=data)
        div_desc = self._get_div_desc(data=data)
        # goods sale schedule (time slots)
        schedule = self._get_goods_schedule(data=data)
        # pprint(schedule)
        is_delete = self._get_is_delete(data=data, schedule=schedule)
        if price == 0 or taobao_price == 0:
            # no price could be extracted -> treat the goods as off-shelf
            is_delete = 1
        parent_dir = data.get('parent_dir', '')
        result = {
            'shop_name': shop_name,                 # shop name
            'account': account,                     # shopkeeper
            'title': title,                         # goods title
            'sub_title': sub_title,                 # subtitle
            'price': price,                         # highest price
            'taobao_price': taobao_price,           # lowest price
            # 'goods_stock': goods_stock,           # stock
            'detail_name_list': detail_name_list,   # spec label names
            # 'detail_value_list': detail_value_list,  # values per spec label
            'price_info_list': price_info_list,     # price/stock per spec combo
            'all_img_url': all_img_url,             # sample image urls
            'p_info': p_info,                       # property name/value pairs
            'div_desc': div_desc,                   # description html
            'is_delete': is_delete,                 # off-shelf flag
            'schedule': schedule,                   # sale schedule
            'parent_dir': parent_dir,
        }
        # pprint(result)
        gc.collect()
        return result
    else:
        print('待处理的data为空的dict')
        return {}
def get_goods_data(self, goods_id):
    '''
    Fetch one juanpi pintuan goods page and assemble its data dict.

    Renders the mobile page through phantomjs (plain requests used to get
    "not Found" on this endpoint), extracts the ``__PRELOADED_STATE__``
    json, then calls the sku-data api and merges both results.

    :param goods_id: juanpi goods id; '' is treated as an error
    :return: merged goods-data dict (also stored in ``self.result_data``),
             or the ``self._data_error_init()`` sentinel on any failure
    '''
    if goods_id == '':
        return self._data_error_init()

    tmp_url = 'https://web.juanpi.com/pintuan/shop/' + str(goods_id)
    print('------>>>| 得到的商品手机版的地址为: ', tmp_url)

    # render via phantomjs (do not use a vpn/proxy here)
    body = self.driver.use_phantomjs_to_get_url_body(url=tmp_url)
    # print(body)
    if re.compile(r'<span id="t-index">页面丢失ing</span>').findall(body) != []:
        # page no longer exists -> logically delete the goods
        _ = SqlServerMyPageInfoSaveItemPipeline()
        if _.is_connect_success:
            _._update_table(sql_str=jp_update_str_1, params=(goods_id, ))
        try:
            del _
        except NameError:
            pass
        print('@@@ 逻辑删除该商品[{0}] is_delete = 1'.format(goods_id))
        return self._data_error_init()

    if body == '':
        print('获取到的body为空str!请检查!')
        return self._data_error_init()

    data = re.compile(
        r'__PRELOADED_STATE__ = (.*);</script> <style ').findall(body)

    # sku-data api (the old getOtherInfo endpoint was dropped upstream)
    skudata_url = 'https://webservice.juanpi.com/api/getMemberAboutInfo?goods_id=' + str(
        goods_id)
    # BUGFIX: copy instead of alias -- the old code did
    # `self.skudata_headers = self.headers`, so the update() below leaked
    # the 'Host' header into self.headers for every later request.
    self.skudata_headers = dict(self.headers)
    self.skudata_headers.update({'Host': 'webservice.juanpi.com'})
    skudata_body = Requests.get_url_body(
        url=skudata_url,
        headers=self.skudata_headers,
        high_conceal=True,
        ip_pool_type=self.ip_pool_type)
    if skudata_body == '':
        print('获取到的skudata_body为空str!请检查!')
        return self._data_error_init()

    skudata = re.compile(r'(.*)').findall(skudata_body)
    if skudata == []:
        print('skudata为空!')
        return self._data_error_init()

    skudata = json_2_dict(json_str=skudata[0]).get('skudata', {})
    if skudata == {}:
        return self._data_error_init()
    # pprint(skudata)
    try:
        # a valid skudata must carry an 'info' key; .get raises
        # AttributeError when skudata is not a dict
        if skudata.get('info') is None:
            print('skudata中info的key为None, 返回空dict')
            return self._data_error_init()
    except AttributeError as e:
        print('遇到错误如下(先跳过!): ', e)
        return self._data_error_init()

    if data == []:
        print('data为空!')
        return self._data_error_init()

    main_data = json_2_dict(json_str=data[0])
    if main_data == {}:
        return self._data_error_init()
    if main_data.get('detail') is None:
        print('data中detail的key为None, 返回空dict')
        return self._data_error_init()

    main_data = self._wash_main_data(main_data.get('detail', {}))
    main_data['skudata'] = skudata
    main_data['goods_id'] = goods_id
    main_data['parent_dir'] = _jp_get_parent_dir(
        phantomjs=self.driver, goods_id=goods_id)
    self.result_data = main_data
    # pprint(main_data)
    return main_data
def get_goods_data(self, goods_id: str) -> '重载获取数据的方法':
    '''
    Fetch and parse one mia.com (pintuan) goods page.

    :param goods_id: mia goods id; '' is treated as an error
    :return: parsed goods-data dict ({} on any failure); also stored in
             ``self.result_data``
    '''
    if goods_id == '':
        self.result_data = {}  # reset so stale data can't leak into later crawls
        return {}

    data = {}
    # mobile goods url (pc form: 'https://www.mia.com/item-<id>.html')
    goods_url = 'https://m.mia.com/item-' + str(goods_id) + '.html'
    print('------>>>| 待抓取的地址为: ', goods_url)
    body = MyRequests.get_url_body(url=goods_url,
                                   headers=self.headers,
                                   had_referer=True)
    # print(body)
    if body == '':
        self.result_data = {}
        return {}

    # an off-shelf pintuan goods is redirected to the mia mobile home page,
    # recognized by its banner text
    is_mia_mian_page = Selector(
        text=body).css('div.item-center::text').extract_first()
    # print(is_mia_mian_page)
    if isinstance(is_mia_mian_page, str) and is_mia_mian_page == '进口母婴正品特卖':
        print('++++++ 该拼团商品已下架,被定向到蜜芽主页, 此处将其逻辑删除!')
        self.result_data = {}
        tmp_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'update dbo.mia_pintuan set is_delete=1 where goods_id = %s'
        # BUGFIX: params must be a tuple -- `(goods_id)` is just goods_id
        tmp_pipeline._update_table(sql_str=sql_str, params=(goods_id,))
        print('| +++ 该商品状态已被逻辑is_delete = 1 +++ |')
        gc.collect()
        return {}

    # follow a possible redirect; is_hk flags a global-buy goods
    body, sign_direct_url, is_hk = self.get_jump_to_url_and_is_hk(body=body)
    try:
        data['title'], data['sub_title'] = self.get_title_and_sub_title(
            body=body)

        # sample images ('' signals failure, [] is handled further below)
        all_img_url = self.get_all_img_url(goods_id=goods_id, is_hk=is_hk)
        if all_img_url == '':
            self.result_data = {}
            return {}

        # p_info: the "name:value" property rows
        tmp_p_info = Selector(
            text=body).css('div.showblock div p').extract_first()
        if tmp_p_info == '':
            print('获取到的tmp_p_info为空值, 请检查!')
            self.result_data = {}
            return {}
        tmp_p_info = re.compile('<p>|</p>').sub('', tmp_p_info)
        tmp_p_info = re.compile(r'<!--思源品牌,隐藏品牌-->').sub('', tmp_p_info)
        # BUGFIX: split on the first ':' only, so values that themselves
        # contain ':' (urls, timestamps) are no longer truncated
        data['p_info'] = [{
            'p_name': item.split(':', 1)[0],
            'p_value': item.split(':', 1)[1],
        } for item in tmp_p_info.split('<br>') if item != '']
        # pprint(data['p_info'])

        # description html
        div_desc = self.get_goods_div_desc(body=body)
        if div_desc == '':
            print('获取到的div_desc为空值! 请检查')
            self.result_data = {}
            return {}
        data['div_desc'] = div_desc

        # per-spec goods_id / spec name / img_url, consumed below
        sku_info = self.get_tmp_sku_info(body, goods_id, sign_direct_url,
                                         is_hk)
        if sku_info == {}:
            return {}

        # price / spec / stock per sku.
        # BUGFIX: call get_true_sku_info once -- the old code called it twice
        # (emptiness check + unpack), doubling the network work
        true_sku_res = self.get_true_sku_info(sku_info=sku_info)
        if true_sku_res == {}:
            return {}
        true_sku_info, i_s, pintuan_time, all_sell_count = true_sku_res
        data['price_info_list'] = true_sku_info
        data['pintuan_time'] = pintuan_time
        data['all_sell_count'] = all_sell_count
        # pprint(true_sku_info)

        data['detail_name_list'] = self.get_detail_name_list(i_s=i_s)

        # fall back to the first sku image when no sample image was found
        if all_img_url == []:
            all_img_url = [{'img_url': true_sku_info[0].get('img_url')}]
        data['all_img_url'] = all_img_url
        # pprint(all_img_url)

        # prefer the redirected url when there was one
        if sign_direct_url != '':
            goods_url = sign_direct_url
        data['goods_url'] = goods_url
    except Exception as e:
        print('遇到错误如下: ', e)
        self.result_data = {}
        return {}

    if data != {}:
        # pprint(data)
        self.result_data = data
        return data
    print('data为空!')
    self.result_data = {}
    return {}
def get_goods_data(self, goods_id: str) -> '重载获取数据的方法':
    '''
    Fetch and parse one mogujie rush (限时秒杀) goods page.

    :param goods_id: the full rushdetail pc url; the numeric goods id is
                     extracted from it
    :return: parsed goods-data dict ({} on any failure); also stored in
             ``self.result_data``
    '''
    if goods_id == '':
        self.result_data = {}  # reset so stale data can't leak into later crawls
        return {}

    # the caller passes the rushdetail pc url; pull the real goods id out
    if re.compile(r'/rushdetail/').findall(goods_id) != []:
        tmp_url = goods_id
        print('------>>>| 原pc地址为: ', tmp_url)
        # raw string so '\?' is an explicit escaped '?'
        # (avoids the invalid-escape warning; identical pattern text)
        goods_id = re.compile(
            r'https://shop.mogujie.com/rushdetail/(.*?)\?.*?').findall(
                goods_id)[0]
        print('------>>>| 得到的蘑菇街商品id为:', goods_id)
    else:
        print('获取到的蘑菇街买哦啥地址错误!请检查')
        self.result_data = {}
        return {}

    data = {}
    body = MyRequests.get_url_body(url=tmp_url,
                                   headers=self.headers,
                                   had_referer=True)
    # print(body)
    if body == '':
        print('获取到的body为空str!')
        self.result_data = {}
        return {}

    try:
        # the page embeds `var detailInfo = {...};` with item/sku/shop json
        goods_info = re.compile(
            r'var detailInfo = (.*?);</script>').findall(body)[0]
        # print(goods_info)
        item_info = re.compile(r'itemInfo:(.*?) ,priceRuleImg').findall(
            goods_info)[0]
        sku_info = re.compile(r'skuInfo:(.*?),pinTuanInfo').findall(
            goods_info)[0]
        shop_info = re.compile(r'shopInfo:(.*?),skuInfo').findall(
            goods_info)[0]

        item_info = json_2_dict(json_str=item_info)
        sku_info = json_2_dict(json_str=sku_info)
        shop_info = json_2_dict(json_str=shop_info)
        # pprint(item_info); pprint(sku_info); pprint(shop_info)

        data['title'] = item_info.get('title', '')
        if data['title'] == '':
            print('title为空!')
            raise Exception
        data['sub_title'] = ''
        data['shop_name'] = shop_info.get('name', '')
        # print(data['shop_name'])

        # sample images
        all_img_url = [{
            'img_url': item
        } for item in item_info.get('topImages', [])]
        # pprint(all_img_url)
        data['all_img_url'] = all_img_url

        # p_info from the detail-info ajax api
        p_info_api_url = 'https://shop.mogujie.com/ajax/mgj.pc.detailinfo/v1?_ajax=1&itemId=' + str(
            goods_id)
        tmp_p_info_body = MyRequests.get_url_body(url=p_info_api_url,
                                                  headers=self.headers,
                                                  had_referer=True)
        # print(tmp_p_info_body)
        if tmp_p_info_body == '':
            print('获取到的tmp_p_info_body为空值, 请检查!')
            raise Exception
        # p_info may legitimately be [] (some goods have no properties)
        data['p_info'] = self.get_goods_p_info(
            tmp_p_info_body=tmp_p_info_body)

        # description html
        div_desc = self.get_goods_div_desc(tmp_p_info_body=tmp_p_info_body)
        # print(div_desc)
        if div_desc == '':
            print('获取到的div_desc为空str, 请检查!')
            self.result_data = {}
            return {}
        data['div_desc'] = div_desc

        # spec label names
        detail_name_list = self.get_goods_detail_name_list(sku_info=sku_info)
        # print(detail_name_list)
        if detail_name_list == '':
            print('获取detail_name_list出错, 请检查!')
            self.result_data = {}
            return {}
        data['detail_name_list'] = detail_name_list

        # price / stock per spec
        price_info_list = self.get_price_info_list(sku_info=sku_info)
        if price_info_list == '':
            raise Exception
        # pprint(price_info_list)
        data['price_info_list'] = price_info_list
        if price_info_list == []:
            # sold out -> logically delete
            print('该商品已售完,此处将商品状态改为1')
            my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            try:
                sql_str = r'update dbo.mogujie_xianshimiaosha set is_delete=1 where goods_id = %s'
                # BUGFIX: params must be a tuple -- `(goods_id)` is just
                # goods_id, which made the %s substitution misbehave
                my_pipeline._update_table(sql_str=sql_str,
                                          params=(goods_id,))
            except:
                print('将该商品逻辑删除时出错!')
                pass
            print('| +++ 该商品状态已被逻辑is_delete = 1 +++ |')
            self.result_data = {}
            return {}

        # overall high/low price
        try:
            tmp_price_list = sorted([
                round(float(item.get('detail_price', '')), 2)
                for item in data['price_info_list']
            ])
            price = Decimal(tmp_price_list[-1]).__round__(2)    # highest
            taobao_price = Decimal(tmp_price_list[0]).__round__(2)  # lowest
            # print('商品的最高价: ', price, ' 最低价: ', taobao_price)
        except IndexError:
            print('获取price和taobao_price时出错! 请检查')
            raise Exception
        data['price'] = price
        data['taobao_price'] = taobao_price
    except Exception as e:
        print('遇到错误: ', e)
        self.result_data = {}
        return {}

    if data != {}:
        # pprint(data)
        self.result_data = data
        return data
    print('data为空!')
    self.result_data = {}
    return {}
class DbTimingScript(AsyncCrawler):
    """Periodic db maintenance script: runs fix-up sql statements in a loop."""

    def __init__(self):
        AsyncCrawler.__init__(self, )
        # seconds to sleep between maintenance rounds
        self.sleep_time = 2. * 60
        self.init_sql_str()

    def init_sql_str(self):
        """Prepare the select/update sql pairs used by ``_fck_run``."""
        # goods re-shelved but still flagged deleted
        # (IsDelete=1 while delete_time < shelf_time); the backend cannot
        # update these itself
        self.sql_str0 = '''
        select top 100 GoodsID, SiteID
        from dbo.GoodsInfoAutoGet
        where MainGoodsID is not NULL
        and IsDelete=1
        and delete_time < shelf_time
        '''
        self.sql_str1 = '''
        update dbo.GoodsInfoAutoGet
        set ModfiyTime=%s, delete_time=%s
        where GoodsID=%s
        '''
        # deleted goods whose delete_time was never stamped
        # (the backend cannot flip on-shelf -> off-shelf itself)
        self.sql_str2 = '''
        select top 100 GoodsID, SiteID
        from dbo.GoodsInfoAutoGet
        where MainGoodsID is not null
        and IsDelete=1
        and delete_time is null
        '''
        self.sql_str3 = '''
        update dbo.GoodsInfoAutoGet
        set delete_time=%s
        where GoodsID=%s
        '''
        # on-shelf goods with delete_time > shelf_time
        # (the backend cannot flip off-shelf -> on-shelf itself)
        self.sql_str4 = '''
        select top 100 GoodsID, SiteID
        from dbo.GoodsInfoAutoGet
        where MainGoodsID is not NUll
        and IsDelete=0
        and shelf_time < delete_time
        '''
        self.sql_str5 = '''
        update dbo.GoodsInfoAutoGet
        set ModfiyTime=%s, shelf_time=%s
        where GoodsID=%s
        '''
        # tb tiantiantejia: take expired deals off-shelf
        self.sql_str6 = '''
        select top 500 goods_id, site_id
        from dbo.taobao_tiantiantejia
        where MainGoodsID is not null
        and is_delete=0
        and miaosha_end_time < GETDATE()
        '''
        # zhe800 miaosha: flag finished flash sales as deleted
        self.sql_str7 = '''
        select top 500 goods_id, site_id
        from dbo.zhe_800_xianshimiaosha
        where MainGoodsID is not null
        and is_delete=0
        and miaosha_end_time <= GETDATE()
        '''
        # zhe800 pintuan: expired group deals off-shelf
        self.sql_str8 = '''
        select top 500 goods_id, site_id
        from dbo.zhe_800_pintuan
        where MainGoodsID is not null
        and is_delete=0
        and miaosha_end_time <= GETDATE()
        '''
        # mia pintuan
        self.sql_str9 = '''
        select top 500 goods_id, site_id
        from dbo.mia_pintuan
        where MainGoodsID is not null
        and is_delete=0
        and miaosha_end_time <= GETDATE()
        '''
        # jumeiyoupin pintuan
        self.sql_str10 = '''
        select top 500 goods_id, site_id
        from dbo.jumeiyoupin_pintuan
        where MainGoodsID is not null
        and is_delete=0
        and miaosha_end_time <= GETDATE()
        '''
        # periodically flag the most recently modified goods for
        # spec / price re-sync
        self.sql_str11 = '''
        select top 200 GoodsID
        from dbo.GoodsInfoAutoGet
        where MainGoodsID is not null
        and IsDelete=0
        ORDER BY ModfiyTime desc
        '''
        self.sql_str12 = '''
        update dbo.GoodsInfoAutoGet
        set is_spec_change=1, spec_trans_time=%s, ModfiyTime=%s, IsPriceChange=1, sku_info_trans_time=%s, PriceChangeInfo=SKUInfo
        where GoodsID=%s
        '''

    async def _fck_run(self):
        """Endless maintenance loop: open a db connection, run every fix-up
        pass, then sleep ``self.sleep_time`` seconds and repeat."""
        while True:
            try:
                print('now_time: {}'.format(get_shanghai_time()))
                self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
                if not self.sql_cli.is_connect_success:
                    raise SqlServerConnectionException
                else:
                    pass
                await self.db_script0(
                    select_sql_str=self.sql_str0,
                    update_sql_str=self.sql_str1,
                    func_get_params=self.get_params0,
                )
                await self.db_script0(
                    select_sql_str=self.sql_str2,
                    update_sql_str=self.sql_str3,
                    func_get_params=self.get_params1,
                )
                await self.db_script0(
                    select_sql_str=self.sql_str4,
                    update_sql_str=self.sql_str5,
                    func_get_params=self.get_params0,
                )
                # tb tiantiantejia
                await self.db_script0(
                    select_sql_str=self.sql_str6,
                    update_sql_str=tb_update_str_5,
                    func_get_params=self.get_params2,
                )
                # zhe800 miaosha
                await self.db_script0(
                    select_sql_str=self.sql_str7,
                    update_sql_str=z8_update_str_6,
                    func_get_params=self.get_params2,
                )
                # zhe800 pintuan
                await self.db_script0(
                    select_sql_str=self.sql_str8,
                    update_sql_str=z8_update_str_4,
                    func_get_params=self.get_params2,
                )
                # mia pintuan
                await self.db_script0(
                    select_sql_str=self.sql_str9,
                    update_sql_str=mia_update_str_7,
                    func_get_params=self.get_params2,
                )
                # jumeiyoupin pintuan
                await self.db_script0(
                    select_sql_str=self.sql_str10,
                    update_sql_str=jm_update_str_5,
                    func_get_params=self.get_params2,
                )
                # flag recently modified goods for spec/price re-sync
                await self.db_script0(
                    select_sql_str=self.sql_str11,
                    update_sql_str=self.sql_str12,
                    func_get_params=self.get_params3,
                )
            except Exception as e:
                print(e)
            finally:
                print('休眠{}s ...'.format(self.sleep_time))
                await async_sleep(self.sleep_time)

    async def db_script0(
            self,
            select_sql_str: str,
            update_sql_str: str,
            func_get_params,):
        """Select the target rows, then run the update once per row.

        :param select_sql_str: query yielding the rows to fix
        :param update_sql_str: update run for each selected row
        :param func_get_params: callable row -> params tuple for the update
        :return: None
        """
        get_current_func_info_by_traceback(self=self)
        db_res = self.sql_cli._select_table(sql_str=select_sql_str, )
        # _select_table returns None on failure -> treat as empty
        db_res = [] if db_res is None else db_res
        if db_res == []:
            print('目标db_res为空list! 跳过此次!')
            return None
        for item in db_res:
            params = func_get_params(k=item)
            self.sql_cli._update_table(
                sql_str=update_sql_str,
                params=params,
            )
        try:
            del db_res
        except:
            pass
        return None

    def get_params0(self, k) -> tuple:
        """(now, now, goods_id) for updates stamping two time columns."""
        now_time = str(get_shanghai_time())
        goods_id = k[0]
        site_id = k[1]
        print('goods_id: {}, site_id: {}'.format(goods_id, site_id))
        return tuple([
            now_time,
            now_time,
            goods_id,
        ])

    def get_params1(self, k) -> tuple:
        """(now, goods_id) for single-timestamp updates."""
        now_time = str(get_shanghai_time())
        goods_id = k[0]
        site_id = k[1]
        print('goods_id: {}, site_id: {}'.format(goods_id, site_id))
        return tuple([
            now_time,
            goods_id,
        ])

    def get_params2(self, k) -> tuple:
        """(now, goods_id) for the miaosha/pintuan tables.

        NOTE(review): identical to get_params1 -- kept separate, presumably
        for per-table clarity; consider merging.
        """
        now_time = str(get_shanghai_time())
        goods_id = k[0]
        site_id = k[1]
        print('goods_id: {}, site_id: {}'.format(goods_id, site_id))
        return tuple([
            now_time,
            goods_id,
        ])

    def get_params3(self, k) -> tuple:
        """(now, now, now, goods_id) for the spec/price re-sync update."""
        now_time = str(get_shanghai_time())
        goods_id = k[0]
        print('goods_id: {}'.format(goods_id))
        return tuple([
            now_time,
            now_time,
            now_time,
            goods_id,
        ])

    def __del__(self):
        # NOTE(review): the try/pass is a no-op placeholder; only the gc
        # collect() does anything here
        try:
            pass
        except:
            pass
        collect()
# NOTE(review): dead code -- this triple-quoted string holds an old one-off
# migration script (rewrites SiteID=6 GoodsUrl values into the '?id=' form).
# It is never executed; kept verbatim below. Consider deleting it outright.
"""
import sys, json, re
sys.path.append('..')
from pprint import pprint
from my_pipeline import SqlServerMyPageInfoSaveItemPipeline

_ = SqlServerMyPageInfoSaveItemPipeline()
sql_str = r'select GoodsID, SiteID, GoodsUrl from dbo.GoodsInfoAutoGet where SiteID=6 order by ID desc'
_s = _._select_table(sql_str=sql_str)
# print(_s)

import re
tmp = _s
tmp = [list(item) for item in tmp]
for item in tmp:
    if re.compile('\?id=').findall(item[2]) == []:
        a = re.compile('(.*htm)').findall(item[2])[0]
        b = re.compile('.*htm(.*)').findall(item[2])[0]
        c = a + '?id=' + b
        item[2] = c
# print(tmp)
tmp = [{'goods_id': item[0], 'goods_url': item[2]} for item in tmp]
# print(tmp)

sql_str = r'update dbo.GoodsInfoAutoGet set GoodsUrl=%s where GoodsID = %s'
for item in tmp:
    _._update_table(sql_str=sql_str, params=(item['goods_url'], item['goods_id']))
class Z8Updater(AsyncCrawler):
    """Async updater for the zhe800 flash-sale (限时秒杀) table: expires old
    deals, re-crawls live ones, and logically deletes off-shelf goods."""

    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/折800/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        self.tmp_sql_server = None
        self.goods_index = 1
        # concurrency: number of goods updated per task batch
        self.concurrency = 8
        self.delete_sql_str = z8_delete_str_3

    async def _get_db_old_data(self):
        """Purge stale rows, then select the goods rows to update.

        :return: list of rows, or None when the db connection failed
        """
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            self.tmp_sql_server._delete_table(sql_str=z8_delete_str_4, params=None)
            await async_sleep(5)
            result = list(
                self.tmp_sql_server._select_table(sql_str=z8_select_str_4))
        except TypeError:
            # _select_table returned None (db down / in maintenance)
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        await _print_db_old_data(logger=self.lg, result=result)
        return result

    async def _get_miaosha_begin_time(self, miaosha_time) -> int:
        """Extract the flash-sale start time from the stored json and return
        it as a 10-digit unix timestamp (int)."""
        miaosha_begin_time = json_2_dict(miaosha_time).get(
            'miaosha_begin_time')
        miaosha_begin_time = int(
            str(
                time.mktime(
                    time.strptime(miaosha_begin_time,
                                  '%Y-%m-%d %H:%M:%S')))[0:10])
        return miaosha_begin_time

    async def _get_new_z8_obj(self, index):
        """Recreate the Zhe800Spike crawler every 10 goods -- sharing one
        object long-term makes its driver access fail."""
        if index % 10 == 0:
            try:
                del self.zhe_800_spike
            except:
                pass
            collect()
            self.zhe_800_spike = Zhe800Spike()

    async def _update_is_delete(self, goods_id) -> bool:
        '''
        Logically delete an off-shelf goods (is_delete=1).

        :param goods_id: the zhe800 goods id
        :return: result of the update (truthy on success)
        '''
        delete_str = 'update dbo.zhe_800_xianshimiaosha set is_delete=1 where goods_id=%s'
        res = self.tmp_sql_server._update_table(sql_str=delete_str,
                                                params=(goods_id, ))
        await async_sleep(.3)
        return res

    async def _update_one_goods_info(self, item, index) -> tuple:
        '''
        Update a single goods row.

        :param item: db row (goods_id, miaosha_time json, session_id)
        :param index: running task index (also mirrored to self.goods_index)
        :return: (goods_id, success_flag)
        '''
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        session_id = item[2]
        miaosha_begin_time = await self._get_miaosha_begin_time(miaosha_time)
        # self.lg.info(str(miaosha_begin_time))
        await self._get_new_z8_obj(index=index)
        self.tmp_sql_server = await _get_new_db_conn(
            db_obj=self.tmp_sql_server,
            index=index,
            logger=self.lg,
            remainder=30)
        if self.tmp_sql_server.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_begin_time)
            if is_recent_time == 0:
                # expired -> logically delete
                res = await self._update_is_delete(goods_id=goods_id)
                self.lg.info(
                    '过期的goods_id为({0}), 限时秒杀开始时间为({1}), 逻辑删除成功!'.format(
                        goods_id,
                        json.loads(item[1]).get('miaosha_begin_time')))
                index += 1
                self.goods_index = index
                # NOTE(review): res is overwritten with True regardless of
                # the update outcome above -- confirm that is intended
                res = True
                await async_sleep(.3)
                return goods_id, res
            elif is_recent_time == 2:
                # future (or mid-band, see _is_recent_time) -> skip for now
                self.lg.info('未来时间暂时不更新! {}'.format(
                    timestamp_to_regulartime(miaosha_begin_time)))
                index += 1
                self.goods_index = index
                return goods_id, res
            else:  # 1: inside the update window
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.format(
                        goods_id, index))
                try:
                    tmp_data = self.zhe_800_spike._get_one_session_id_data(
                        base_session_id=str(session_id))
                except Exception:
                    self.lg.error(msg='遇到错误:', exc_info=True)
                    index += 1
                    self.goods_index = index
                    return goods_id, res
                try:
                    tmp_data = tmp_data.get('data', {}).get('blocks', [])
                    assert tmp_data != [], '该session_id不存在,此处跳过'
                except AssertionError:
                    # session gone -> delete its flash-sale goods
                    self.lg.error(msg='遇到错误:', exc_info=True)
                    res = await self._update_is_delete(goods_id)
                    self.lg.info(
                        msg='该sessionid没有相关key为jsons的数据! 过期的goods_id为({0}), 限时秒杀开始时间为({1}), 删除成功!'.format(
                            goods_id, miaosha_begin_time))
                    index += 1
                    self.goods_index = index
                    await async_sleep(1.2)
                    return goods_id, res
                tmp_data = [item_s.get('deal', {}) for item_s in tmp_data]
                # pprint(tmp_data)
                try:
                    miaosha_goods_list = await self._get_miaoshao_goods_info_list(
                        data=tmp_data)
                    # pprint(miaosha_goods_list)
                except ValueError:
                    await async_sleep(2)
                    index += 1
                    self.goods_index = index
                    return goods_id, res
                # all zids currently live in this session
                miaosha_goods_all_goods_id = [
                    i.get('zid') for i in miaosha_goods_list
                ]
                if goods_id not in miaosha_goods_all_goods_id:
                    # officially removed from the flash sale
                    res = await self._update_is_delete(goods_id)
                    self.lg.info(
                        '该商品已被官方下架限秒活动! 下架的goods_id为({0}), 逻辑删除成功!'.format(
                            goods_id))
                    index += 1
                    self.goods_index = index
                    return goods_id, res
                else:
                    # still on sale -> full re-crawl and table update
                    res = await self._one_update(
                        miaosha_goods_list=miaosha_goods_list,
                        goods_id=goods_id)
        else:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
        index += 1
        self.goods_index = index
        collect()
        await async_sleep(1.5)
        return goods_id, res

    async def _one_update(self, **kwargs) -> bool:
        '''
        Re-crawl and update one still-on-sale goods.

        :param miaosha_goods_list: parsed deals of the goods' session
        :param goods_id: target goods id
        :return: True when the table update succeeded
        '''
        miaosha_goods_list = kwargs.get('miaosha_goods_list')
        goods_id = kwargs.get('goods_id')
        zhe_800_miaosha = Zhe800Parse()
        res = False
        for item_1 in miaosha_goods_list:
            if item_1.get('zid', '') == goods_id:
                zhe_800_miaosha.get_goods_data(goods_id=goods_id)
                goods_data = zhe_800_miaosha.deal_with_data()
                if goods_data == {}:
                    # parse failed -> skip this round
                    break
                else:
                    goods_data['stock_info'] = item_1.get('stock_info')
                    goods_data['goods_id'] = str(item_1.get('zid'))
                    if item_1.get('stock_info').get('activity_stock') > 0:
                        # self.lg.info(item_1.get('price'))
                        # self.lg.info(item_1.get('taobao_price'))
                        goods_data['price'] = item_1.get('price')
                        goods_data['taobao_price'] = item_1.get('taobao_price')
                    else:
                        # zero activity stock -> logically delete instead
                        self.lg.info('该商品参与活动的对应库存为0')
                        await self._update_is_delete(goods_id=goods_id)
                        break
                    goods_data['sub_title'] = item_1.get('sub_title')
                    goods_data['miaosha_time'] = item_1.get('miaosha_time')
                    goods_data['miaosha_begin_time'], goods_data[
                        'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                            miaosha_time=item_1.get('miaosha_time'))
                    if goods_data.get('is_delete', 0) == 1:
                        self.lg.info('该商品[{0}]已售罄...'.format(goods_id))
                    # self.lg.info(str(goods_data['stock_info']))
                    # self.lg.info(str(goods_data['miaosha_time']))
                    res = zhe_800_miaosha.to_update_zhe_800_xianshimiaosha_table(
                        data=goods_data, pipeline=self.tmp_sql_server)
                    break
            else:
                pass
        collect()
        return res

    async def _is_recent_time(self, timestamp) -> int:
        '''
        Classify a flash-sale start time relative to now.

        :param timestamp: unix timestamp of the sale start
        :return: 0 expired (restore/off-shelf), 1 inside the update window,
                 2 future (not updated yet)
        '''
        time_1 = int(timestamp)
        time_2 = datetime_to_timestamp(get_shanghai_time())  # current unix ts
        diff_time = time_1 - time_2
        # 72h cut-off (was 48h) so the backend has time to sync deletions
        if diff_time < -259200:
            return 0  # expired
        # NOTE(review): starts between -259200 and -172800 fall through to
        # the final branch and are reported as "future" (2) -- confirm that
        # mid-band behavior is intended.
        elif diff_time > -172800 and diff_time < 7200:
            return 1  # within past 48h .. next 2h -> update now
        else:
            return 2  # future, skip for now

    async def _update_db(self):
        '''
        Endless real-time update loop over the flash-sale table.

        :return: never returns
        '''
        while True:
            # fresh logger per round
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result, step=self.concurrency)
                self.zhe_800_spike = Zhe800Spike()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:
                        # all batches consumed -> normal exit
                        break
                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(item=item,
                                                            index=index)))
                        index += 1
                    await _get_async_task_result(tasks=tasks, logger=self.lg)
                self.lg.info('全部数据更新完毕'.center(100, '#'))
            # after midnight: long sleep, no updates
            if get_shanghai_time().hour == 0:
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(2.5 * 60)
            try:
                del self.zhe_800_spike
            except:
                pass
            collect()

    async def _get_miaoshao_goods_info_list(self, data) -> list:
        '''
        Extract the useful flash-sale fields from the raw deal dicts.

        :param data: list of raw 'deal' dicts
        :return: list of normalized dicts (miaosha_time/zid/stock_info/...)
        '''
        miaosha_goods_list = []
        # pprint(data)
        for item in data:
            if item == {}:
                continue
            # pprint(item)
            tmp = {}
            tmp['miaosha_time'] = {
                'miaosha_begin_time':
                timestamp_to_regulartime(int(
                    str(item.get('begin_time'))[0:10])),
                'miaosha_end_time':
                timestamp_to_regulartime(int(str(item.get('end_time'))[0:10])),
            }
            # zhe800 goods id
            tmp['zid'] = item.get('zid')
            # flash-sale stock: activity_stock is what's left in the sale,
            # stock is the total
            tmp['stock_info'] = {
                'activity_stock': item.get('activity_stock', 0),
                'stock': item.get('stock', 0),
            }
            # original price / flash-sale price, both floats
            tmp['price'] = float(item.get('list_price'))
            tmp['taobao_price'] = float(item.get('price'))
            tmp['sub_title'] = item.get('description', '')
            miaosha_goods_list.append(tmp)
        # pprint(miaosha_goods_list)
        return miaosha_goods_list

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        try:
            del self.zhe_800_spike
        except:
            pass
        collect()
def deal_with_data(self):
    '''
    Parse ``self.result_data`` into the normalized goods dict.

    :return: dict of normalized fields, or {} when ``self.result_data`` is
             an empty dict / the goods info is unusable.
    '''
    data = self.result_data
    if data != {}:
        # shop name
        shop_name = self._get_shop_name(data=data)
        # shopkeeper account is not collected on this platform
        account = ''
        # goods title
        title = data.get('baseInfo', {}).get('title', '')
        # subtitle
        sub_title = ''
        # spec label names
        detail_name_list = self._get_detail_name_list(data=data)
        # The str sentinel 'is_delete=1' means the goods is off-shelf: mark
        # it deleted in the db.
        # NOTE(review): after marking, the method falls through and keeps
        # parsing instead of returning -- confirm that is intended.
        if isinstance(detail_name_list, str):
            if detail_name_list == 'is_delete=1':
                print('该商品已下架...')
                sql_str = 'update dbo.GoodsInfoAutoGet set IsDelete=1 where GoodsID=%s'
                params = (self.result_data.get('goods_id', ''), )
                _ = SqlServerMyPageInfoSaveItemPipeline()
                result = _._update_table(sql_str=sql_str, params=params)
                if result:
                    print('### 该商品已经is_delete=1 ###')
                else:
                    print('is_delete=1标记失败!')
        # NOTE(review): detail_name_list is presumably a str or list here,
        # so `== {}` looks unreachable -- confirm the failure value of
        # _get_detail_name_list.
        if detail_name_list == {}:
            self.result_data = {}
            return {}
        # print(detail_name_list)
        # price/stock per spec combo, plus overall high/low price
        price_info_list, price, taobao_price = self._get_price_info_list_and_price_and_taobao_price(
            data=data)
        # print('最高价为: ', price)
        # print('最低价为: ', taobao_price)
        # pprint(price_info_list)
        # sample image urls
        # NOTE(review): data.get('goodImages') has no default -- a missing
        # key would raise TypeError in the comprehension; confirm the key is
        # always present.
        all_img_url = [{
            'img_url': item
        } for item in data.get('goodImages')]
        # print(all_img_url)
        # detail property name/value pairs
        p_info = self._get_p_info(data=data)
        # pprint(p_info)
        # description html
        div_desc = self._get_div_desc(data=data)
        # print(div_desc)
        # goods sale schedule (time slots)
        schedule = self._get_goods_schedule(data=data)
        # pprint(schedule)
        is_delete = self._get_is_delete(data=data, schedule=schedule)
        if price == 0 or taobao_price == 0:
            # no price could be extracted -> treat the goods as off-shelf
            is_delete = 1
        # print('is_delete = ', is_delete)
        result = {
            'shop_name': shop_name,                 # shop name
            'account': account,                     # shopkeeper
            'title': title,                         # goods title
            'sub_title': sub_title,                 # subtitle
            'price': price,                         # highest price
            'taobao_price': taobao_price,           # lowest price
            # 'goods_stock': goods_stock,           # stock
            'detail_name_list': detail_name_list,   # spec label names
            # 'detail_value_list': detail_value_list,  # values per spec label
            'price_info_list': price_info_list,     # price/stock per spec combo
            'all_img_url': all_img_url,             # sample image urls
            'p_info': p_info,                       # property name/value pairs
            'div_desc': div_desc,                   # description html
            'is_delete': is_delete,                 # off-shelf flag
            'schedule': schedule,                   # sale schedule
        }
        # pprint(result)
        gc.collect()
        return result
    else:
        print('待处理的data为空的dict')
        return {}
def get_goods_data(self, goods_id: str) -> dict:
    """
    Fetch and parse a mogujie "rushdetail" goods page.

    :param goods_id: a full rushdetail url (the numeric id is extracted
                     from it); '' is rejected.
    :return: parsed data dict; ``self._data_error()`` result on any failure.
    """
    if goods_id == '':
        return self._data_error()

    if re.compile(r'/rushdetail/').findall(goods_id) != []:
        tmp_url = goods_id
        print('------>>>| 原pc地址为: ', tmp_url)
        # Extract the numeric goods id from the rushdetail url.
        # (raw string: '\?' in a non-raw literal is an invalid escape)
        goods_id = re.compile(
            r'https://shop.mogujie.com/rushdetail/(.*?)\?.*?').findall(
                goods_id)[0]
        print('------>>>| 得到的蘑菇街商品id为:', goods_id)
    else:
        print('获取到的蘑菇街买哦啥地址错误!请检查')
        return self._data_error()

    data = {}
    body = Requests.get_url_body(url=tmp_url,
                                 headers=self.headers,
                                 had_referer=True,
                                 ip_pool_type=self.ip_pool_type)
    # print(body)
    if body == '':
        print('获取到的body为空str!')
        return self._data_error()

    try:
        # The page embeds everything in a js var; slice out each section.
        goods_info = re.compile(
            r'var detailInfo = (.*?);</script>').findall(body)[0]
        item_info = re.compile(r'itemInfo:(.*?) ,priceRuleImg').findall(
            goods_info)[0]
        sku_info = re.compile(r'skuInfo:(.*?),pinTuanInfo').findall(
            goods_info)[0]
        shop_info = re.compile(r'shopInfo:(.*?),skuInfo').findall(
            goods_info)[0]

        item_info = json_2_dict(json_str=item_info)
        sku_info = json_2_dict(json_str=sku_info)
        shop_info = json_2_dict(json_str=shop_info)
        # pprint(item_info)

        data['title'] = self._get_title(item_info=item_info)
        data['sub_title'] = ''
        data['shop_name'] = self._get_shop_name(shop_info=shop_info)
        data['all_img_url'] = self._get_all_img_url(item_info=item_info)
        data['p_info'], tmp_p_info_body = self._get_p_info(goods_id=goods_id)
        data['div_desc'] = self._get_div_desc(tmp_p_info_body)
        data['detail_name_list'] = self._get_detail_name_list(sku_info)

        # price/stock for every sku spec
        price_info_list = self.get_price_info_list(sku_info=sku_info)
        assert price_info_list != '', 'price_info_list为空值!'
        # pprint(price_info_list)
        data['price_info_list'] = price_info_list
        if price_info_list == []:
            # Sold out: logically delete the goods in the db (best effort).
            print('该商品已售完, 此处将商品状态改为1')
            my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            try:
                # BUGFIX: params must be a sequence -- '(goods_id)' is just
                # a parenthesized str, not a 1-tuple.
                my_pipeline._update_table(sql_str=mg_update_str_1,
                                          params=(goods_id,))
            except Exception:
                print('将该商品逻辑删除时出错!')
            print('| +++ 该商品状态已被逻辑is_delete = 1 +++ |')
            return self._data_error()

        # price: highest spec price; taobao_price: lowest spec price
        try:
            tmp_price_list = sorted([
                round(float(item.get('detail_price', '')), 2)
                for item in data['price_info_list']
            ])
            price = Decimal(tmp_price_list[-1]).__round__(2)
            taobao_price = Decimal(tmp_price_list[0]).__round__(2)
            # print('商品的最高价: ', price, ' 最低价: ', taobao_price)
        except IndexError:
            print('获取price和taobao_price时出错! 请检查')
            raise Exception
        data['price'] = price
        data['taobao_price'] = taobao_price
    except Exception as e:
        print('遇到错误: ', e)
        return self._data_error()

    self.result_data = data
    return data
class JPUpdater(AsyncCrawler):
    # Real-time updater for juanpi (卷皮) flash-sale ("miaosha") goods records.

    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/卷皮/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        self.tmp_sql_server = None      # db pipeline, (re)created lazily
        self.concurrency = 8            # number of concurrent update tasks
        self.goods_index = 1            # running index over processed goods
        self.delete_sql_str = jp_delete_str_3

    async def _get_pc_headers(self) -> dict:
        """Request headers for m.juanpi.com with a randomized UA."""
        return {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'm.juanpi.com',
            'User-Agent': get_random_pc_ua(),  # random user agent
        }

    async def _get_db_old_data(self) -> (None, list):
        """Purge stale rows, then select the rows to update (None on db failure)."""
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            self.tmp_sql_server._delete_table(sql_str=jp_delete_str_4, params=None)
            await async_sleep(5)
            result = list(self.tmp_sql_server._select_table(sql_str=jp_select_str_4))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_miaosha_begin_time(self, miaosha_time) -> int:
        """Extract the flash-sale begin time as a 10-digit unix timestamp."""
        miaosha_begin_time = json_2_dict(miaosha_time).get('miaosha_begin_time')
        miaosha_begin_time = int(str(time.mktime(time.strptime(miaosha_begin_time, '%Y-%m-%d %H:%M:%S')))[0:10])

        return miaosha_begin_time

    async def _get_new_jp_obj(self, index):
        if index % 10 == 0:
            # The parser object must not be shared forever (its driver access
            # becomes faulty) -- recreate it every 10 items.
            try:
                del self.juanpi_miaosha
            except:
                pass
            collect()
            self.juanpi_miaosha = JuanPiParse()

    async def _update_one_goods_info(self, item, index) -> tuple:
        """
        Update a single goods record.

        :param item: db row: (goods_id, miaosha_time, tab_id, page)
        :param index: running task index
        :return: (goods_id, res) where res is the update success flag
        """
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        tab_id = item[2]
        page = item[3]
        miaosha_begin_time = await self._get_miaosha_begin_time(miaosha_time)
        # self.lg.info(str(miaosha_begin_time))
        await self._get_new_jp_obj(index=index)
        self.tmp_sql_server = await _get_new_db_conn(db_obj=self.tmp_sql_server, index=index, logger=self.lg, remainder=30)
        if self.tmp_sql_server.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_begin_time)
            if is_recent_time == 0:
                # expired -> logical delete
                res = self.tmp_sql_server._update_table(sql_str=jp_update_str_6, params=(goods_id,))
                self.lg.info('过期的goods_id为({}), 限时秒杀开始时间为({}), 逻辑删除成功!'.format(goods_id, miaosha_begin_time))
                await async_sleep(.3)
                index += 1
                self.goods_index = index

                return goods_id, res

            elif is_recent_time == 2:
                # future sale -> skip for now
                self.lg.info('goods_id: {}, 未来时间跳过更新...'.format(goods_id))
                index += 1
                self.goods_index = index

                return goods_id, res

            else:
                # is_recent_time == 1: inside the update window
                self.lg.info('------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.format(goods_id, index))
                tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                    str(tab_id),
                    str(page),
                )
                # self.lg.info('待爬取的tab_id, page地址为: {}'.format(tmp_url))
                body = Requests.get_url_body(url=tmp_url, headers=await self._get_pc_headers(), ip_pool_type=self.ip_pool_type)
                try:
                    data = json_2_dict(body, default_res={}).get('data', {})
                    assert data != {}, 'data为空dict!'
                    data = data.get('goodslist', [])
                    assert data != [], 'tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(tab_id, page)
                except AssertionError:
                    self.lg.error(msg='遇到错误:', exc_info=True)
                    index += 1
                    self.goods_index = index
                    await async_sleep(.3)

                    return goods_id, res

                miaosha_goods_list = await self._get_miaoshao_goods_info_list(data=data)
                # self.lg.info(str(miaosha_goods_list))
                # all goods_ids currently present on this tab_id/page
                miaosha_goods_all_goods_id = [i.get('goods_id') for i in miaosha_goods_list]
                self.lg.info(str(miaosha_goods_all_goods_id))
                if goods_id not in miaosha_goods_all_goods_id:
                    if miaosha_goods_all_goods_id != []:
                        # page still lists goods -> ours merely absent, skip
                        self.lg.info('该商品[{}]未下架, 此处不进行更新跳过!!'.format(goods_id))
                    else:
                        # page has no goods at all -> removed from the sale
                        res = self.tmp_sql_server._update_table(sql_str=jp_update_str_6, params=(goods_id,))
                        self.lg.info('该商品[goods_id为({})]已被下架限时秒杀活动,此处将其逻辑删除'.format(goods_id))
                    index += 1
                    self.goods_index = index
                    await async_sleep(.3)

                    return goods_id, res

                else:
                    # still on sale -> full update
                    res = await self._one_update(miaosha_goods_list=miaosha_goods_list, goods_id=goods_id)
        else:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')

        index += 1
        self.goods_index = index
        await async_sleep(1.2)

        return goods_id, res

    async def _update_db(self) -> None:
        """
        Main loop: fetch pending rows, fan out concurrent update tasks.
        :return:
        """
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(tasks_params_list=result, step=self.concurrency)
                self.juanpi_miaosha = JuanPiParse()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:
                        # all batches consumed -- normal exit
                        break

                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(self.loop.create_task(self._update_one_goods_info(item=item, index=index)))
                        index += 1
                    await _get_async_task_result(tasks=tasks, logger=self.lg)
                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                # no updates after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(2.5 * 60)
            try:
                del self.juanpi_miaosha
            except:
                pass
            collect()

    async def _one_update(self, **kwargs) -> bool:
        """
        Update a goods that is still on sale.

        :param kwargs: miaosha_goods_list, goods_id
        :return: update success flag
        """
        res = False
        miaosha_goods_list = kwargs.get('miaosha_goods_list')
        goods_id = kwargs.get('goods_id')
        for item_1 in miaosha_goods_list:
            if item_1.get('goods_id', '') == goods_id:
                self.juanpi_miaosha.get_goods_data(goods_id=goods_id)
                goods_data = self.juanpi_miaosha.deal_with_data()
                if goods_data == {}:
                    # empty data -> skip
                    break
                else:
                    # parse succeeded -> merge sale info and persist
                    goods_data['stock_info'] = item_1.get('stock_info')
                    goods_data['goods_id'] = item_1.get('goods_id')
                    # goods_data['username'] = '******'
                    if item_1.get('stock_info').get('activity_stock') > 0:
                        goods_data['price'] = item_1.get('price')               # pre-sale original price
                        goods_data['taobao_price'] = item_1.get('taobao_price')  # flash-sale price
                    else:
                        pass
                    goods_data['sub_title'] = item_1.get('sub_title', '')
                    goods_data['miaosha_time'] = item_1.get('miaosha_time')
                    goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=item_1.get('miaosha_time'))
                    res = self.juanpi_miaosha.to_update_juanpi_xianshimiaosha_table(
                        data=goods_data, pipeline=self.tmp_sql_server)
                    await async_sleep(.3)  # throttle
                    break
            else:
                pass

        return res

    async def _get_miaoshao_goods_info_list(self, data) -> list:
        """
        Extract the useful flash-sale fields from the raw api list.

        :param data: raw goodslist to parse
        :return: list of per-goods dicts
        """
        miaosha_goods_list = []
        for item in data:
            tmp = {}
            tmp['miaosha_time'] = {
                'miaosha_begin_time': timestamp_to_regulartime(int(item.get('start_time'))),
                'miaosha_end_time': timestamp_to_regulartime(int(item.get('end_time'))),
            }
            stock = item.get('stock', 0)  # NOTE(review): unused local
            tmp['goods_id'] = item.get('goods_id')
            # flash-sale stock info
            tmp['stock_info'] = {
                'activity_stock': int(item.get('stock', 0)*(item.get('rate', 0)/100)),
                'stock': item.get('stock', 0),
            }
            # original price / flash-sale price
            tmp['price'] = round(float(item.get('oprice', '0')), 2)
            tmp['taobao_price'] = round(float(item.get('cprice', '0')), 2)
            miaosha_goods_list.append(tmp)

        return miaosha_goods_list

    async def _is_recent_time(self, timestamp) -> int:
        """
        Classify a sale-begin timestamp against now.

        :param timestamp: unix timestamp
        :return: 0: expired (price restored)  1: within update window  2: future
        """
        time_1 = int(timestamp)
        time_2 = int(time.time())  # current timestamp
        diff_time = time_1 - time_2
        if diff_time < -259200:
            # 72h window (so the backend can delist in sync); only the past
            # 48h plus the next ~2h relative to now get updated
            # if diff_time < -172800:  # (original) 48h window
            return 0    # expired, price restored
        elif diff_time > -172800 and diff_time < 50400:
            return 1    # yesterday/today -> pending update
        else:
            return 2    # future sale, no update yet

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
def get_ali_1688_data(self, goods_id):
    """
    Fetch and parse an ali-1688 goods page (mobile site).

    :param goods_id: 1688 numeric goods id as str; '' returns {}.
    :return: washed data dict, a pull-off-shelves marker dict (with key
             'before'), or {} on failure.
    """
    if goods_id == '':
        self.result_data = {}
        return {}

    # mobile page, eg: https://m.1688.com/offer/559836312862.html
    wait_to_deal_with_url = 'https://m.1688.com/offer/' + str(goods_id) + '.html'
    print('------>>>| 待处理的阿里1688地址为: ', wait_to_deal_with_url)
    body = self.my_phantomjs.use_phantomjs_to_get_url_body(
        url=wait_to_deal_with_url, css_selector='div.d-content')
    # print(body)
    if body == '':
        print('获取到的body为空str!请检查!')
        self.result_data = {}
        return {}

    tmp_body = body
    try:
        pull_off_shelves = Selector(text=body).css('div.d-content p.info::text').extract_first()
    except Exception:
        pull_off_shelves = ''

    if pull_off_shelves == '该商品无法查看或已下架':
        # Goods is delisted: still flag/record it in the db.
        try:
            tmp_my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            sql_str = 'select GoodsID from dbo.GoodsInfoAutoGet where SiteID=2 and GoodsID=%s'
            is_in_db = tmp_my_pipeline._select_table(sql_str=sql_str, params=(str(goods_id),))
            # print(is_in_db)
        except Exception as e:
            print('遇到错误:', e)
            print('数据库连接失败!')
            self.result_data = {}
            return {}

        if is_in_db != []:
            # goods_id already in the db: just flip is_delete
            sql_str = 'update dbo.GoodsInfoAutoGet set IsDelete=1 where GoodsID=%s'
            # BUGFIX: params must be a sequence -- '(goods_id)' is just a
            # parenthesized str, not a 1-tuple.
            tmp_my_pipeline._update_table(sql_str=sql_str, params=(goods_id,))
            print('@@@ 该商品goods_id原先存在于db中, 此处将其is_delete=1')
            tmp_data_s = self.init_pull_off_shelves_goods()  # default delisted-goods attrs
            tmp_data_s['before'] = True   # goods existed in db beforehand
            self.result_data = {}
            return tmp_data_s
        else:
            # goods_id not in the db yet: return the insertable marker dict
            print('@@@ 该商品已下架[但未存在于db中], ** 此处将其插入到db中...')
            tmp_data_s = self.init_pull_off_shelves_goods()
            tmp_data_s['before'] = False
            self.result_data = {}
            return tmp_data_s

    # regular goods: the data is embedded as a json blob starting at beginAmount
    body = re.compile(r'{"beginAmount"(.*?)</script></div></div>').findall(body)
    if body != []:
        body = r'{"beginAmount"' + body[0]
        # print(body)
        body = json.loads(body)
        # pprint(body)
        if body.get('discountPriceRanges') is not None:
            self.result_data = self._wash_discountPriceRanges(body=body)
            return self.result_data
        else:
            print('data为空!')
            self.result_data = {}  # reset, so stale data never leaks into the next crawl
            return {}
    else:
        # activity ("火拼") goods use a different embedded blob
        print('解析ing..., 该商品正在参与火拼, 此处为火拼价, 为短期活动价格!')
        body = re.compile(r'{"activityId"(.*?)</script></div></div>').findall(tmp_body)
        if body != []:
            body = r'{"activityId"' + body[0]
            # print(body)
            body = json.loads(body)
            # pprint(body)
            if body.get('discountPriceRanges') is not None:
                self.result_data = self._wash_discountPriceRanges(body=body)
                self.is_activity_goods = True
                return self.result_data
            else:
                print('data为空!')
                self.result_data = {}  # reset, see above
                return {}
        else:
            print('这个商品对应活动属性未知, 此处不解析, 设置为跳过!')
            self.result_data = {}  # reset, see above
            return {}
class CCUpdater(AsyncCrawler):
    # Real-time updater for chuchujie (楚楚街) flash-sale goods records.

    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/楚楚街/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        self.tmp_sql_server = None      # db pipeline, (re)created lazily
        self.concurrency = 8            # number of concurrent update tasks
        self.goods_index = 1            # running index over processed goods
        self.delete_sql_str = cc_delete_str_1

    async def _get_pc_headers(self):
        """Request headers for the chuchujie api with a randomized UA."""
        return {
            'Accept': 'application/json,text/javascript,*/*;q=0.01',
            # 'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'api.chuchujie.com',
            'Referer': 'https://m.chuchujie.com/?module=99',
            'Cache-Control': 'max-age=0',
            'User-Agent': get_random_pc_ua(),
        }

    async def _get_db_old_data(self) -> (list, None):
        """Purge stale rows, then select the rows to update (None on db failure)."""
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            self.tmp_sql_server._delete_table(sql_str=cc_delete_str_2)
            await async_sleep(5)
            result = list(
                self.tmp_sql_server._select_table(sql_str=cc_select_str_1))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_miaosha_end_time(self, miaosha_time) -> int:
        """Extract the flash-sale end time as a 10-digit unix timestamp."""
        miaosha_end_time = json_2_dict(miaosha_time).get('miaosha_end_time')
        miaosha_end_time = int(
            str(
                time.mktime(
                    time.strptime(miaosha_end_time,
                                  '%Y-%m-%d %H:%M:%S')))[0:10])

        return miaosha_end_time

    async def _get_new_cc_obj(self, index):
        if index % 10 == 0:
            # The parser object must not be shared forever (its driver access
            # becomes faulty) -- recreate it every 10 items.
            try:
                del self.chuchujie_miaosha
            except:
                pass
            collect()
            self.chuchujie_miaosha = ChuChuJie_9_9_Parse()

        return

    async def _update_is_delete(self, goods_id) -> bool:
        """
        Logically delete one goods record.

        :param goods_id:
        :return: update success flag
        """
        res = self.tmp_sql_server._update_table(sql_str=cc_update_str_2,
                                                params=(goods_id, ))

        return res

    async def _update_one_goods_info(self, item, index):
        """
        Update a single goods record.

        :param item: db row: (goods_id, miaosha_time, gender, page)
        :param index: running task index
        :return: (goods_id, res) where res is the update success flag
        """
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        gender = item[2]
        page = item[3]
        miaosha_end_time = await self._get_miaosha_end_time(miaosha_time)
        # self.lg.info(str(miaosha_end_time))
        await self._get_new_cc_obj(index=index)
        self.tmp_sql_server = await _get_new_db_conn(
            db_obj=self.tmp_sql_server,
            index=index,
            logger=self.lg,
            remainder=25)
        if self.tmp_sql_server.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_end_time)
            if is_recent_time == 0:
                # expired -> logical delete
                res = await self._update_is_delete(goods_id=goods_id)
                self.lg.info('过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'.format(
                    goods_id, miaosha_end_time))
                await async_sleep(.3)
                index += 1
                self.goods_index = index

                return goods_id, res

            elif is_recent_time == 2:
                # future sale -> skip for now
                index += 1
                self.goods_index = index

                return goods_id, res

            else:
                # is_recent_time == 1: inside the update window.
                # (objects are declared late and deleted afterwards to keep
                # memory usage down)
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                    format(goods_id, index))
                body = await self._get_one_page_goods_info(gender, page)
                if body == '':
                    index += 1
                    self.goods_index = index
                    await async_sleep(.3)

                    return goods_id, res

                json_body = json_2_dict(body, default_res={})
                try:
                    this_page_total_count = json_body.get('data', {}).get(
                        'groupList', [])[0].get('totalCount', 0)
                except IndexError:
                    self.lg.error('获取this_page_total_count时出错, 请检查!')
                    this_page_total_count = 0

                item_list = await self._get_item_list(
                    this_page_total_count=this_page_total_count,
                    json_body=json_body)
                if item_list == []:
                    # page is empty -> goods removed from the flash sale
                    self.lg.info(
                        '#### 该gender, page对应得到的item_list为空[]!\n该商品已被下架限时秒杀活动,此处将其删除'
                    )
                    res = await self._update_is_delete(goods_id=item[0])
                    self.lg.info('下架的goods_id为({}), 删除成功!'.format(goods_id))
                    await async_sleep(.3)
                    index += 1
                    self.goods_index = index

                    return goods_id, res

                else:
                    # still on sale -> full update
                    res = await self._one_update(goods_id=goods_id,
                                                 item_list=item_list)
        else:
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
            pass

        index += 1
        self.goods_index = index
        collect()
        await async_sleep(CHUCHUJIE_SLEEP_TIME)

        return goods_id, res

    async def _update_db(self) -> None:
        """
        Main loop: fetch pending rows, fan out concurrent update tasks.
        :return:
        """
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result, step=self.concurrency)
                self.chuchujie_miaosha = ChuChuJie_9_9_Parse()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:
                        # all batches consumed -- normal exit
                        break

                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(item=item,
                                                            index=index)))
                        index += 1
                    await _get_async_task_result(tasks=tasks, logger=self.lg)
                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                # no updates after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(2.5 * 60)
            try:
                del self.chuchujie_miaosha
            except:
                pass
            collect()

    async def _get_item_list(self, **kwargs) -> list:
        """
        Goods list for the given gender/page.

        :return: [] when the page reports a zero total count
        """
        this_page_total_count = kwargs.get('this_page_total_count')
        json_body = kwargs.get('json_body')
        tmp_goods_list = json_body.get('data', {}).get('groupList',
                                                       [])[0].get('dataList', [])
        item_list = [{
            'goods_id': str(item_s.get('chuchuId', '')),
            'sub_title': item_s.get('description', ''),
        } for item_s in tmp_goods_list] if this_page_total_count != 0 else []

        return item_list

    async def _one_update(self, **kwargs):
        """
        Update a goods that is still on sale.

        :param kwargs: goods_id, item_list
        :return: update success flag
        """
        res = False
        goods_id = kwargs.get('goods_id')
        item_list = kwargs.get('item_list')
        # Goods are not delisted early internally, so everything inside the
        # sale window gets updated; the old membership check is kept below
        # for reference only.
        # miaosha_goods_all_goods_id = [item_1.get('goods_id', '') for item_1 in item_list]
        # if item[0] not in miaosha_goods_all_goods_id:  # delisted internally
        #     self.lg.info('该商品已被下架限时秒杀活动,此处将其删除')
        #     tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(goods_id))
        #     self.lg.info('下架的goods_id为({}), 删除成功!'.format(goods_id))
        #     pass
        # else:  # still on sale
        # (sale time and sub_title are NOT updated, only the other fields)
        # for item_2 in item_list:
        #     if item_2.get('goods_id', '') == goods_id:
        self.chuchujie_miaosha.get_goods_data(goods_id=goods_id)
        goods_data = self.chuchujie_miaosha.deal_with_data()
        if goods_data == {}:
            # empty data -> skip
            pass
        else:
            goods_data['goods_id'] = str(goods_id)
            # goods_data['sub_title'] = item_2.get('sub_title', '')
            # print(goods_data)
            res = self.chuchujie_miaosha.update_chuchujie_xianshimiaosha_table(
                data=goods_data, pipeline=self.tmp_sql_server)

        return res

    async def _get_one_page_goods_info(self, *params) -> str:
        """
        Fetch one api page of goods.

        :param params: (gender, page)
        :return: '{}' or the response body str
        """
        gender, page = params
        tmp_url = 'https://api.chuchujie.com/api/'
        client = {
            "ageGroup": "AG_0to24",
            "channel": "QD_web_webkit",
            "deviceId": "0",
            "gender": gender,  # '0' -> female | '1' -> male
            "imei": "0",
            "packageName": "com.culiu.purchase",
            "platform": "wap",
            "sessionId": "0",
            "shopToken": "0",
            "userId": "0",
            "version": "1.0",
            "xingeToken": ""
        }
        query = {"group": 4, "module": "99", "page": page, "tab": "all"}
        # NOTE: the query-string parameters are sent encoded like this
        # directly; posted data would need the post method instead.
        data = {
            'client': json.dumps(client),
            'query': json.dumps(query),
            'page': page
        }
        body = Requests.get_url_body(url=tmp_url,
                                     headers=self.headers,
                                     params=data,
                                     ip_pool_type=self.ip_pool_type)

        return body

    async def _is_recent_time(self, timestamp) -> int:
        """
        Classify a sale-end timestamp against now.

        :param timestamp: unix timestamp
        :return: 0: expired (price restored)  1: within update window  2: future
        """
        time_1 = int(timestamp)
        time_2 = datetime_to_timestamp(get_shanghai_time())  # current timestamp
        diff_time = time_1 - time_2
        # if diff_time < -86400:  # 24h (so the backend can delist in sync)
        if diff_time < -100000:   # kept larger so goods still selling are not dropped
        # if diff_time < 0:       # (original) end time already passed
            return 0    # expired, price restored
        elif diff_time > 0:
            return 1    # yesterday/today -> pending update
        else:
            # expired but still waiting: not deleted yet (deleted once <= 24h)
            return 2

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        try:
            del self.chuchujie_miaosha
        except:
            pass
        collect()
class JMYPUpdater(AsyncCrawler):
    # Real-time updater for jumei youpin (聚美优品) flash-sale goods records.

    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/聚美优品/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        self.tmp_sql_server = None      # db pipeline, (re)created lazily
        self.delete_sql_str = jm_delete_str_1
        self.goods_index = 1            # running index over processed goods
        self.concurrency = 10           # number of concurrent update tasks

    async def _get_pc_headers(self):
        """Request headers for h5.jumei.com with a randomized UA."""
        return {
            'Accept': 'application/json,text/javascript,text/plain,*/*;q=0.01',
            # 'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            # 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'h5.jumei.com',
            'Referer': 'https://h5.jumei.com/',
            'Cache-Control': 'max-age=0',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': get_random_pc_ua(),  # random user agent
        }

    async def _get_db_old_data(self) -> (list, None):
        """
        Rows pending update (None on db failure).
        :return:
        """
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            self.tmp_sql_server._delete_table(sql_str=jm_delete_str_2)
            await async_sleep(5)
            result = list(
                self.tmp_sql_server._select_table(sql_str=jm_select_str_1))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_cookies(self) -> str:
        """
        Cookies needed by the api requests, fetched via a phantomjs session.
        :return: cookies str ('' on failure)
        """
        my_phantomjs = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH,
                                  ip_pool_type=self.ip_pool_type)
        cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(
            url='https://h5.jumei.com/')
        try:
            del my_phantomjs
        except:
            pass
        if cookies == '':
            self.lg.error('!!! 获取cookies失败 !!!')
        # NOTE(review): the success line below is logged even when
        # cookies == '' -- the failure branch does not return early. Confirm.
        self.lg.info('获取cookies成功!')

        return cookies

    async def _get_miaosha_end_time(self, miaosha_time) -> int:
        """
        Extract the flash-sale end time as a 10-digit unix timestamp.
        :return:
        """
        miaosha_end_time = json.loads(miaosha_time).get('miaosha_end_time')
        miaosha_end_time = int(
            str(
                time.mktime(
                    time.strptime(miaosha_end_time,
                                  '%Y-%m-%d %H:%M:%S')))[0:10])

        return miaosha_end_time

    async def _get_new_jumei_obj(self, index):
        if index % 10 == 0:
            # The parser object must not be shared forever (its driver access
            # becomes faulty) -- recreate it every 10 items.
            try:
                del self.jumeiyoupin_miaosha
            except:
                pass
            collect()
            self.jumeiyoupin_miaosha = JuMeiYouPinParse()

    async def _update_is_delete(self, goods_id):
        """
        Logically delete one goods record.

        :param goods_id:
        :return: update success flag
        """
        res = self.tmp_sql_server._update_table(sql_str=jm_update_str_4,
                                                params=(goods_id, ))

        return res

    async def _get_one_page_all_goods_list(self, *params) -> (list, str):
        """
        All goods on one listing page.

        :param params: (page,)
        :return: error message str on network failure, else a list of dicts
        """
        page = params[0]
        all_goods_list = []
        tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=formal&page_key=1521336720'.format(
            str(page))
        # print('正在抓取的page为:', page, ', 接口地址为: ', tmp_url)
        json_body = json_2_dict(Requests.get_url_body(
            url=tmp_url,
            headers=self.headers,
            ip_pool_type=self.ip_pool_type),
                                default_res={},
                                logger=self.lg)
        if json_body == {}:
            return '网络错误!'

        this_page_item_list = json_body.get('item_list', [])
        if this_page_item_list == []:
            return []

        for item in this_page_item_list:
            # de-duplicate by item_id
            if item.get('item_id', '') not in [
                    item_1.get('item_id', '') for item_1 in all_goods_list
            ]:
                item['page'] = page
                all_goods_list.append(item)
        all_goods_list = [{
            'goods_id': str(item.get('item_id', '')),
            'type': item.get('type', ''),
            'page': item.get('page')
        } for item in all_goods_list if item.get('item_id') is not None]

        return all_goods_list

    async def _update_one_goods_info(self, item, index):
        """
        Update a single goods record.

        :param item: db row: (goods_id, miaosha_time, page, goods_url)
        :param index: running task index
        :return: [goods_id, res] where res is the update success flag
        """
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        page = item[2]
        goods_url = item[3]
        miaosha_end_time = await self._get_miaosha_end_time(miaosha_time)
        # self.lg.info(str(miaosha_end_time))
        await self._get_new_jumei_obj(index=index)
        self.tmp_sql_server = await _get_new_db_conn(
            db_obj=self.tmp_sql_server,
            index=index,
            logger=self.lg,
        )
        if self.tmp_sql_server.is_connect_success:
            is_recent_time_res = await self._is_recent_time(miaosha_end_time)
            if is_recent_time_res == 0:
                # expired -> logical delete
                res = await self._update_is_delete(goods_id)
                self.lg.info('过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'.format(
                    goods_id,
                    json.loads(miaosha_time).get('miaosha_end_time')))
                await async_sleep(.3)
            elif is_recent_time_res == 2:
                # future sale -> skip for now
                pass
            else:
                # is_recent_time_res == 1: inside the update window
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                    format(goods_id, index))
                this_page_all_goods_list = await self._get_one_page_all_goods_list(
                    page)
                if isinstance(this_page_all_goods_list, str):
                    # str return value means a network error -- skip
                    self.lg.error('网络错误!先跳过')
                    await async_sleep(1.5)

                    return res

                elif this_page_all_goods_list == []:
                    # page is empty -> goods removed from the flash sale
                    res = await self._update_is_delete(goods_id=goods_id)
                    self.lg.error(
                        '#### 该page对应得到的this_page_all_goods_list为空[]!')
                    self.lg.error(
                        '** 该商品已被下架限时秒杀活动, 此处将其逻辑删除, goods_id:{}'.format(
                            goods_id))
                    await async_sleep(.3)
                else:
                    # Goods are not delisted early internally, so everything
                    # inside the sale window gets updated; the old membership
                    # check is kept below for reference only.
                    # miaosha_goods_all_goods_id = [item_1.get('goods_id', '') for item_1 in this_page_all_goods_list]
                    #
                    # if item[0] not in miaosha_goods_all_goods_id:  # delisted internally
                    #     self.lg.info('该商品已被下架限时秒杀活动,此处将其删除')
                    #     res = await self._update_is_delete(goods_id=goods_id)
                    #     self.lg.info('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                    #     pass
                    # else:  # still on sale
                    tmp_r = self.jumeiyoupin_miaosha.get_goods_id_from_url(
                        goods_url)
                    self.jumeiyoupin_miaosha.get_goods_data(goods_id=tmp_r)
                    goods_data = self.jumeiyoupin_miaosha.deal_with_data()
                    if goods_data == {}:
                        # empty data -> skip
                        pass
                    else:
                        goods_data['goods_id'] = goods_id
                        goods_data['miaosha_time'] = {
                            'miaosha_begin_time':
                            goods_data['schedule'].get('begin_time', ''),
                            'miaosha_end_time':
                            goods_data['schedule'].get('end_time', ''),
                        }
                        goods_data['miaosha_begin_time'], goods_data[
                            'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                miaosha_time=goods_data['miaosha_time'])
                        res = self.jumeiyoupin_miaosha.update_jumeiyoupin_xianshimiaosha_table(
                            data=goods_data, pipeline=self.tmp_sql_server)
        else:
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
            pass

        index += 1
        self.goods_index = index
        collect()
        await async_sleep(JUMEIYOUPIN_SLEEP_TIME)

        return [goods_id, res]

    async def _update_db(self):
        """
        Main loop: fetch pending rows, fan out concurrent update tasks.
        :return:
        """
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result,
                    step=self.concurrency)
                cookies = await self._get_cookies()
                self.headers = await self._get_pc_headers()
                self.headers.update({
                    'Cookie': cookies,
                })
                self.jumeiyoupin_miaosha = JuMeiYouPinParse()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:
                        # all batches consumed -- normal exit
                        break

                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(item=item,
                                                            index=index)))
                        index += 1
                    await _get_async_task_result(tasks=tasks, logger=self.lg)
                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                # no updates after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(10)
            try:
                del self.jumeiyoupin_miaosha
            except:
                pass
            collect()

    async def _is_recent_time(self, timestamp):
        """
        Classify a sale-end timestamp against now.

        :param timestamp: unix timestamp
        :return: 0: expired (price restored)  1: within update window  2: future
        """
        time_1 = int(timestamp)
        time_2 = int(datetime_to_timestamp(get_shanghai_time()))
        diff_time = time_1 - time_2
        if diff_time < -86400:
            # 24h window (so the backend can delist in sync)
            # if diff_time < 0:  # (original) end time already passed
            return 0    # expired, price restored
        elif diff_time > 0:
            return 1    # yesterday/today -> pending update
        else:
            # expired but still waiting: not deleted yet (deleted once <= 24h)
            return 2

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        try:
            del self.jumeiyoupin_miaosha
        except:
            pass
        collect()
def run(self):
    # Consume jsonp coupon payloads from coupon_queue, persist new coupons,
    # and apply the coupon to the goods' stored prices (exactly once per
    # goods_id, guarded by unique_coupon_id_list).
    global coupon_queue, goods_id_and_coupon_url_list, unique_coupon_id_list
    while True:
        sql_cli = None
        try:
            if coupon_queue.qsize() >= 1:
                # todo some coupon urls belong to pre-sale deposit goods; not handled here
                coupon_item = coupon_queue.get()
                # payload is jsonp: strip the '(...)' wrapper, then parse
                ori_coupon_list = json_2_dict(
                    json_str=re.compile('\((.*)\)').findall(coupon_item)
                    [0],
                    default_res={},
                ).get('data', {}).get('resultList', [])
                assert ori_coupon_list != []
                # pprint(ori_coupon_list)
                # todo: multi-item coupons arrive in the 'nCouponInfoMap'
                # field; only single-item coupons are supported here
                coupon_list = []
                for item in ori_coupon_list:
                    try:
                        goods_id = str(item.get('itemId', ''))
                        assert goods_id != ''
                        # each account may use the coupon only once
                        # display name, eg: '优惠券'
                        coupon_display_name = '优惠券'
                        # discount amount in yuan
                        ori_coupon_value = item.get('couponAmount', '')
                        assert ori_coupon_value != ''
                        coupon_value = str(
                            float(ori_coupon_value).__round__(2))
                        # usage threshold (minimum spend)
                        ori_thresold = item.get('couponStartFee', '')
                        assert ori_thresold != ''
                        threshold = str(float(ori_thresold).__round__(2))
                        begin_time = str(
                            timestamp_to_regulartime(
                                int(
                                    item.get('couponEffectiveStartTime',
                                             '')[0:10])))
                        end_time = str(
                            timestamp_to_regulartime(
                                int(
                                    item.get('couponEffectiveEndTime',
                                             '')[0:10])))
                        # human-readable usage rule
                        use_method = '满{}元, 减{}元'.format(
                            threshold, coupon_value)
                        if string_to_datetime(
                                end_time) <= get_shanghai_time():
                            # already expired
                            print('该券已过期[goods_id: {}]'.format(goods_id))
                            continue

                        if datetime_to_timestamp(string_to_datetime(end_time)) - datetime_to_timestamp(string_to_datetime(begin_time)) \
                                <= 60 * 60 * 36:
                            # validity window too short (< 1.5 days)
                            print('该券小于1.5天[goods_id: {}], pass'.format(
                                goods_id))
                            continue

                        # todo the same goods may carry the same coupon over
                        # different activity windows, which once caused the
                        # price to be reduced repeatedly; the old per-window
                        # uuid is kept below for reference:
                        # unique_id = str(get_uuid3(
                        #     target_str=goods_id \
                        #     + coupon_value \
                        #     + threshold \
                        #     + str(datetime_to_timestamp(string_to_datetime(begin_time)))[0:10]\
                        #     + str(datetime_to_timestamp(string_to_datetime(end_time)))[0:10]))
                        # todo therefore the table now keeps only ONE coupon
                        # per goods_id (i.e. only the strongest/first coupon),
                        # so the uuid is derived from goods_id alone
                        unique_id = str(get_uuid3(target_str=goods_id))

                        # coupon claim url, matched from the shared list
                        # pprint(goods_id_and_coupon_url_list)
                        coupon_url = ''
                        for j in goods_id_and_coupon_url_list:
                            tmp_goods_id = j['goods_id']
                            tmp_coupon_url = j['coupon_url']
                            if goods_id == tmp_goods_id:
                                print('@@@ 成功匹配到goods_id: {} 的领券地址: {}!!'.
                                      format(goods_id, tmp_coupon_url))
                                coupon_url = tmp_coupon_url
                                break
                            else:
                                continue
                        assert coupon_url != ''

                        coupon_list.append({
                            'unique_id': unique_id,
                            'goods_id': goods_id,
                            'coupon_url': coupon_url,
                            'coupon_display_name': coupon_display_name,
                            'coupon_value': coupon_value,
                            'threshold': threshold,
                            'begin_time': begin_time,
                            'end_time': end_time,
                            'use_method': use_method,
                        })
                    except Exception as e:
                        print(e)
                        continue

                # pprint(coupon_list)
                if coupon_list != []:
                    # persist the collected coupons
                    sql_cli = SqlServerMyPageInfoSaveItemPipeline()
                    if not sql_cli.is_connect_success:
                        raise SqlServerConnectionException

                    for item in coupon_list:
                        unique_id = item['unique_id']
                        goods_id = item['goods_id']
                        if unique_id not in unique_coupon_id_list:
                            save_res = sql_cli._insert_into_table(
                                sql_str=
                                'insert into dbo.coupon_info(unique_id, create_time, goods_id, coupon_url, coupon_display_name, coupon_value, threshold, begin_time, end_time, use_method) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
                                params=(
                                    unique_id,
                                    str(get_shanghai_time()),
                                    goods_id,
                                    item['coupon_url'],
                                    item['coupon_display_name'],
                                    Decimal(
                                        item['coupon_value']).__round__(2),
                                    Decimal(
                                        item['threshold']).__round__(2),
                                    item['begin_time'],
                                    item['end_time'],
                                    item['use_method'],
                                ),
                                repeat_insert_default_res=False,  # avoid repeated price changes
                            )
                            if save_res:
                                # todo update the price only once -- a second
                                # pass would corrupt the stored price
                                # de-duplicate in memory
                                unique_coupon_id_list.append(unique_id)
                                # propagate the coupon into the regular goods table
                                sql_str = '''
                                select top 1 Price, TaoBaoPrice, SKUInfo 
                                from dbo.GoodsInfoAutoGet 
                                where GoodsID=%s
                                '''
                                db_res = []
                                try:
                                    db_res = list(
                                        sql_cli._select_table(
                                            sql_str=sql_str,
                                            params=(goods_id, ),
                                        ))
                                except Exception as e:
                                    print(e)

                                if db_res != []:
                                    # mark the price change caused by the coupon
                                    try:
                                        # subtract the coupon value
                                        coupon_value = float(
                                            item['coupon_value'])
                                        threshold = float(
                                            item['threshold'])
                                        # restore the original (pre-profit) price first
                                        db_price = float(
                                            db_res[0][0]) * (1 - CP_PROFIT)
                                        db_taobao_price = float(
                                            db_res[0][1]) * (1 - CP_PROFIT)
                                        # subtract the coupon (if the threshold
                                        # is met) and re-apply CP_PROFIT to get
                                        # the final stored prices
                                        new_price = (
                                            (db_price -
                                             coupon_value if db_price >=
                                             threshold else db_price) *
                                            (1 + CP_PROFIT)).__round__(2)
                                        new_taobao_price = (
                                            (db_taobao_price - coupon_value
                                             if db_taobao_price >= threshold
                                             else db_taobao_price) *
                                            (1 + CP_PROFIT)).__round__(2)
                                        new_sku_info = get_new_sku_info_from_old_sku_info_subtract_coupon_and_add_cp_profit(
                                            old_sku_info=json_2_dict(
                                                json_str=db_res[0][2],
                                                default_res=[],
                                            ),
                                            threshold=threshold,
                                            coupon_value=coupon_value,
                                        )
                                        sql_str2 = '''
                                        update dbo.GoodsInfoAutoGet 
                                        set Price=%s, TaoBaoPrice=%s, SKUInfo=%s, ModfiyTime=%s, sku_info_trans_time=%s, IsPriceChange=1, PriceChangeInfo=SKUInfo 
                                        where GoodsID=%s
                                        '''
                                        now_time = get_shanghai_time()
                                        sql_cli._update_table(
                                            sql_str=sql_str2,
                                            params=(
                                                Decimal(new_price
                                                        ).__round__(2),
                                                Decimal(new_taobao_price).
                                                __round__(2),
                                                dumps(new_sku_info,
                                                      ensure_ascii=False),
                                                now_time,
                                                now_time,
                                                goods_id,
                                            ),
                                        )
                                    except Exception as e:
                                        print(e)
                                else:
                                    pass
                            else:
                                continue
                        else:
                            continue
                else:
                    continue
        except IndexError:
            # same-api index errors are expected -- skip
            continue
        except Exception as e:
            print(e)
        finally:
            try:
                del sql_cli
            except:
                pass
def get_goods_data(self, goods_id):
    """
    Fetch and assemble the raw data dict for one goods_id.

    Renders the mobile goods page with the phantomjs driver, extracts the
    ``__PRELOADED_STATE__`` json blob from the page body, fetches the sku
    data from the juanpi web-service api, then merges both (plus the
    parent dir) into ``self.result_data``.

    :param goods_id: goods id string; '' short-circuits to the error result
    :return: washed goods-data dict, or ``self._data_error_init()`` on any failure
    """
    if goods_id == '':
        return self._data_error_init()
    else:
        tmp_url = 'https://web.juanpi.com/pintuan/shop/' + str(goods_id)
        print('------>>>| 得到的商品手机版的地址为: ', tmp_url)

        # 2. rendered via phantomjs; NOTE: must not run behind a proxy/VPN
        # body = self.driver.get_url_body(url=tmp_url, css_selector='div.sc-kgoBCf.bTQvTk')  # css of the mobile title block
        body = self.driver.get_url_body(url=tmp_url)
        # print(body)
        if re.compile(r'<span id="t-index">页面丢失ing</span>').findall(body) != []:
            # "page lost" placeholder => the goods is gone; logically delete it (is_delete=1)
            _ = SqlServerMyPageInfoSaveItemPipeline()
            if _.is_connect_success:
                _._update_table(sql_str=jp_update_str_1, params=(goods_id, ))
            try:
                del _
            except NameError:
                # narrowed from a bare except: only an unbound name can fail here
                pass
            print('@@@ 逻辑删除该商品[{0}] is_delete = 1'.format(goods_id))

            return self._data_error_init()

        if body == '':
            print('获取到的body为空str!请检查!')
            return self._data_error_init()

        # greedy match: capture the whole preloaded-state json payload
        data = re.compile(
            r'__PRELOADED_STATE__ = (.*);</script> <style ').findall(body)

        # sku data
        # legacy skudata endpoint 1 (officially abandoned):
        # skudata_url = 'https://webservice.juanpi.com/api/getOtherInfo?goods_id=' + str(goods_id)
        # current skudata endpoint 2:
        skudata_url = 'https://webservice.juanpi.com/api/getMemberAboutInfo?goods_id=' + str(
            goods_id)
        headers = get_random_headers(upgrade_insecure_requests=False, )
        headers.update({'Host': 'webservice.juanpi.com'})
        skudata_body = Requests.get_url_body(
            url=skudata_url,
            headers=headers,
            ip_pool_type=self.ip_pool_type, )
        if skudata_body == '':
            print('获取到的skudata_body为空str!请检查!')
            return self._data_error_init()

        skudata = re.compile(r'(.*)').findall(skudata_body)  # greedy match over the whole body
        if skudata != []:
            skudata = json_2_dict(json_str=skudata[0]).get('skudata', {})
            if skudata == {}:
                return self._data_error_init()
            # pprint(skudata)
            try:
                if skudata.get('info') is not None:
                    pass  # valid skudata obtained
                else:
                    # missing 'info' key => treat as failure
                    print('skudata中info的key为None, 返回空dict')
                    return self._data_error_init()
            except AttributeError as e:
                # skudata unexpectedly not a dict
                print('遇到错误如下(先跳过!): ', e)
                return self._data_error_init()
        else:
            print('skudata为空!')
            return self._data_error_init()

        if data != []:
            main_data = json_2_dict(json_str=data[0])
            if main_data == {}:
                return self._data_error_init()

            if main_data.get('detail') is not None:
                main_data = self._wash_main_data(main_data.get('detail', {}))
                main_data['skudata'] = skudata
                main_data['goods_id'] = goods_id
                main_data['parent_dir'] = _jp_get_parent_dir(
                    phantomjs=self.driver, goods_id=goods_id)
                self.result_data = main_data
                # pprint(main_data)
                return main_data
            else:
                print('data中detail的key为None, 返回空dict')
                return self._data_error_init()
        else:
            print('data为空!')
            return self._data_error_init()
def run_forever(self):
    '''
    Real-time update pass over the mogujie flash-sale (miaosha) goods table.

    Deletes stale rows, selects every candidate row, and for each one either
    deletes/flags it (expired or withdrawn from the sale) or re-crawls it and
    writes the fresh price and sale-window data back.
    :return:
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        tmp_sql_server._delete_table(sql_str=mg_delete_str_4)
        result = list(
            tmp_sql_server._select_table(sql_str=mg_select_str_3))
    except TypeError:
        # a failed db connection makes _select_table return None, so list(None) raises
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        for item in result:  # real-time update, row by row
            # NOTE(review): row layout appears to be (goods_id, miaosha_time_json,
            # event_time, spider_url) judging from the accesses below — confirm
            # against mg_select_str_3.
            miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
            # convert 'YYYY-mm-dd HH:MM:SS' into a 10-digit unix timestamp
            miaosha_end_time = int(
                str(
                    time.mktime(
                        time.strptime(miaosha_end_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_end_time)

            data = {}  # declared per-iteration then released, to keep memory low
            mogujie_miaosha = MoGuJieMiaoShaParse()
            if index % 50 == 0:  # reconnect every 50 rows to avoid a stale long-lived connection
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(miaosha_end_time) == 0:
                    # 0 => sale already over: hard-delete the row
                    tmp_sql_server._delete_table(
                        sql_str=self.delete_sql_str,
                        params=(item[0], ))
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 限时秒杀开始时间为(%s), 删除成功!'
                        % json.loads(item[1]).get('miaosha_begin_time'))
                elif self.is_recent_time(miaosha_end_time) == 2:
                    # break  # would exit the loop
                    # must be pass, not break: goods_ids from the db are not ordered
                    pass
                else:  # returns 1 => inside the to-be-updated window
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))
                    data['goods_id'] = item[0]
                    item_list = self.get_item_list(event_time=str(item[2]))
                    if item_list == '':  # probably a network hiccup, skip for now
                        pass
                    elif item_list == []:
                        # goods was withdrawn from the flash sale: logically delete it
                        print('该商品已被下架限时秒杀活动,此处将其逻辑删除')
                        # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                        tmp_sql_server._update_table(
                            sql_str=mg_update_str_1, params=(item[0], ))
                        print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                        pass
                    else:
                        # all goods_ids currently listed for this event_time
                        miaosha_goods_all_goods_id = [
                            item_1.get('iid', '') for item_1 in item_list
                        ]
                        if item[0] not in miaosha_goods_all_goods_id:
                            # withdrawn inside the event: logically delete it
                            print('该商品已被下架限时秒杀活动,此处将其逻辑删除')
                            # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                            tmp_sql_server._update_table(
                                sql_str=mg_update_str_1, params=(item[0], ))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                            pass
                        else:  # still on sale: re-crawl and update
                            for item_2 in item_list:
                                if item_2.get('iid', '') == item[0]:
                                    spider_url = item[3]
                                    mogujie_miaosha.get_goods_data(
                                        goods_id=spider_url)
                                    goods_data = mogujie_miaosha.deal_with_data(
                                    )
                                    if goods_data == {}:  # empty data => skip
                                        pass
                                    else:
                                        goods_data['goods_id'] = str(
                                            item[0])
                                        # set price to the original (highest normal) price
                                        try:
                                            tmp_price_list = sorted([
                                                round(
                                                    float(
                                                        item_4.get(
                                                            'normal_price',
                                                            '')), 2)
                                                for item_4 in goods_data[
                                                    'price_info_list']
                                            ])
                                            price = Decimal(
                                                tmp_price_list[-1]
                                            ).__round__(2)  # original goods price
                                            goods_data['price'] = price
                                        except:
                                            print('设置price为原价时出错!请检查')
                                            continue
                                        goods_data['miaosha_time'] = {
                                            'miaosha_begin_time':
                                            timestamp_to_regulartime(
                                                int(
                                                    item_2.get(
                                                        'startTime', 0))),
                                            'miaosha_end_time':
                                            timestamp_to_regulartime(
                                                int(
                                                    item_2.get(
                                                        'endTime', 0))),
                                        }
                                        goods_data[
                                            'miaosha_begin_time'], goods_data[
                                                'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                    miaosha_time=goods_data[
                                                        'miaosha_time'])
                                        # print(goods_data['title'])
                                        # pprint(goods_data)
                                        # print(goods_data)
                                        mogujie_miaosha.update_mogujie_xianshimiaosha_table(
                                            data=goods_data,
                                            pipeline=tmp_sql_server)
                                        sleep(MOGUJIE_SLEEP_TIME)  # throttle the crawl
                                else:
                                    pass
            else:  # db connection lost
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            gc.collect()

    print('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:  # no updates after midnight
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
def run_forever():
    '''
    Endless real-time update loop for the juanpi pintuan goods table.

    Each pass deletes stale rows, selects every candidate row and either
    deletes/flags it (expired, sold out, or with broken pintuan data) or
    re-crawls it via JuanPiParse and writes the fresh data back, then
    sleeps before the next pass (longer after midnight).
    '''
    while True:
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=jp_delete_str_1)
            result = list(
                tmp_sql_server._select_table(sql_str=jp_select_str_2))
        except TypeError:
            # a failed db connection makes _select_table return None, so list(None) raises
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            # declared here and periodically recreated below to keep memory low
            juanpi_pintuan = JuanPiParse()
            for item in result:  # real-time update, row by row
                # NOTE(review): row layout appears to be (goods_id,
                # pintuan_time_json, is_delete_flag) judging from the
                # accesses below — confirm against jp_select_str_2.
                if index % 6 == 0:
                    # recreate the parser every 6 rows to release its memory
                    try:
                        del juanpi_pintuan
                    except:
                        pass
                    gc.collect()
                    juanpi_pintuan = JuanPiParse()

                if index % 50 == 0:  # reconnect every 50 rows to avoid a stale long-lived connection
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    try:
                        pintuan_end_time = json.loads(
                            item[1])[0].get('end_time')
                    except IndexError:
                        # empty pintuan-time list => broken row, flag it deleted
                        print('获取pintuan_end_time时索引异常!出错goods_id:{0}'.format(
                            item[0]))
                        print('此处将其标记为is_delete=1')
                        tmp_sql_server._update_table(sql_str=jp_update_str_5,
                                                     params=(item[0], ))
                        continue

                    # convert 'YYYY-mm-dd HH:MM:SS' into a 10-digit unix timestamp
                    pintuan_end_time = int(
                        str(
                            time.mktime(
                                time.strptime(pintuan_end_time,
                                              '%Y-%m-%d %H:%M:%S')))[0:10])
                    # print(pintuan_end_time)

                    if item[2] == 1 or pintuan_end_time < int(
                            datetime_to_timestamp(get_shanghai_time())):
                        # already flagged deleted, or the pintuan has ended: drop the row
                        tmp_sql_server._delete_table(sql_str=jp_delete_str_2,
                                                     params=(item[0], ))
                        print('该goods_id[{0}]已过期或者售完,删除成功!'.format(item[0]))
                    else:
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (item[0], index))
                        juanpi_pintuan.get_goods_data(goods_id=item[0])
                        data = juanpi_pintuan.deal_with_data()
                        if data != {}:
                            data['goods_id'] = item[0]
                            juanpi_pintuan.to_right_and_update_pintuan_data(
                                data=data, pipeline=tmp_sql_server)
                        else:  # empty data returned, skip
                            pass
                else:  # db connection lost
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
                sleep(1.2)
            print('全部数据更新完毕'.center(100, '#'))

        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()