def get_spike_hour_goods_info(self):
    '''
    Build the listing urls and collect all recent flash-sale (限时秒杀) goods.

    Walks every known tab_id and up to 50 pages per tab, fetches the
    goodslist JSON, and inserts each goods_id that is not already present
    in the xianshimiaosha table.
    :return: None (results are persisted through the pipeline)
    '''
    tab_id_list = [11, 12, 13, 21, 22, 23, 31, 32, 33]  # notice
    for tab_id in tab_id_list:
        for index in range(0, 50):
            tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                str(tab_id), str(index))
            print('待抓取的限时秒杀地址为: ', tmp_url)
            data = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
            if data == '':
                # empty body -> stop paging this tab
                break
            try:
                data = json.loads(data)
                data = data.get('data', {})
                # print(data)
            except:
                break
            if data.get('goodslist') == []:
                print('tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(
                    tab_id, index))
                break
            else:
                data = data.get('goodslist', [])
                # print(data)
                if data == []:
                    print('goodslist为[], 此处跳过')
                    pass
                else:
                    miaosha_goods_list = self.get_miaoshao_goods_info_list(
                        data=data)
                    print(miaosha_goods_list)
                    juanpi = JuanPiParse()
                    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    if my_pipeline.is_connect_success:
                        # NOTE(review): jp_select_str_5 is executed twice here
                        # (None-probe, then fetch) — presumably acceptable cost,
                        # confirm against the pipeline implementation.
                        if my_pipeline._select_table(
                                sql_str=jp_select_str_5) is None:
                            db_goods_id_list = []
                        else:
                            db_goods_id_list = [
                                item[0] for item in list(
                                    my_pipeline._select_table(
                                        sql_str=jp_select_str_5))
                            ]
                        for item in miaosha_goods_list:
                            if item.get('goods_id', '') in db_goods_id_list:
                                print('该goods_id已经存在于数据库中, 此处跳过')
                                pass
                            else:
                                tmp_url = 'http://shop.juanpi.com/deal/' + item.get(
                                    'goods_id')
                                juanpi.get_goods_data(
                                    goods_id=item.get('goods_id'))
                                goods_data = juanpi.deal_with_data()
                                if goods_data == {}:  # parser returned nothing -> skip
                                    pass
                                else:  # otherwise enrich the parsed record and insert it
                                    goods_data['stock_info'] = item.get(
                                        'stock_info')
                                    goods_data['goods_id'] = item.get(
                                        'goods_id')
                                    goods_data['spider_url'] = tmp_url
                                    goods_data['username'] = '******'
                                    goods_data['price'] = item.get(
                                        'price')  # original special price before the flash sale
                                    goods_data['taobao_price'] = item.get(
                                        'taobao_price')  # flash-sale price
                                    goods_data['sub_title'] = item.get(
                                        'sub_title', '')
                                    goods_data['miaosha_time'] = item.get(
                                        'miaosha_time')
                                    goods_data[
                                        'miaosha_begin_time'], goods_data[
                                            'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                miaosha_time=item.get(
                                                    'miaosha_time'))
                                    goods_data['tab_id'] = tab_id
                                    goods_data['page'] = index
                                    # print(goods_data)
                                    juanpi.insert_into_juanpi_xianshimiaosha_table(
                                        data=goods_data, pipeline=my_pipeline)
                                    sleep(.4)  # brief pause to avoid tripping the site
                        sleep(.65)
                    else:
                        pass
                    try:
                        del juanpi
                    except:
                        pass
                    gc.collect()
def run_forever():
    '''
    Endless real-time updater for group-buy (拼团) rows in dbo.juanpi_pintuan.

    Each cycle: select candidate rows, delete rows that are sold out or
    whose group-buy window has ended, re-crawl and update the rest, then
    sleep (long sleep after midnight Shanghai time).
    '''
    while True:
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select goods_id, schedule, is_delete from dbo.juanpi_pintuan where site_id=18'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError as e:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # Parser is declared here and periodically re-created below to
            # release memory held by long-lived parser instances.
            juanpi_pintuan = JuanPiParse()
            for item in result:  # real-time update per row
                data = {}
                if index % 6 == 0:
                    # recycle the parser object every 6 items
                    try:
                        del juanpi_pintuan
                    except:
                        pass
                    gc.collect()
                    juanpi_pintuan = JuanPiParse()
                if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived connection
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')
                if tmp_sql_server.is_connect_success:
                    # item[1] (schedule) is a JSON list; take the first entry's end_time
                    pintuan_end_time = json.loads(item[1])[0].get('end_time')
                    pintuan_end_time = int(
                        str(
                            time.mktime(
                                time.strptime(pintuan_end_time,
                                              '%Y-%m-%d %H:%M:%S')))[0:10])
                    # print(pintuan_end_time)
                    # is_delete == 1 or window already over -> hard-delete the row
                    if item[2] == 1 or pintuan_end_time < int(time.time()):
                        sql_str = 'delete from dbo.juanpi_pintuan where goods_id=%s'
                        tmp_sql_server._delete_table(sql_str=sql_str,
                                                     params=(item[0],))
                        print('该goods_id[{0}]已过期或者售完,删除成功!'.format(item[0]))
                    else:
                        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                              % (item[0], index))
                        juanpi_pintuan.get_goods_data(goods_id=item[0])
                        data = juanpi_pintuan.deal_with_data()
                        if data != {}:
                            data['goods_id'] = item[0]
                            juanpi_pintuan.to_right_and_update_pintuan_data(
                                data=data, pipeline=tmp_sql_server)
                        else:  # parser returned empty data
                            pass
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
                sleep(1.2)
            print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
def run_forever():
    '''
    Endless real-time updater for regular goods (SiteID=12) in
    dbo.GoodsInfoAutoGet: re-crawls each goods, recomputes shelf/down and
    price-change metadata, and writes the result back.
    '''
    while True:
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice from dbo.GoodsInfoAutoGet where SiteID=12'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # Parser recreated periodically below to keep memory bounded.
            juanpi = JuanPiParse()
            for item in result:  # real-time update per row
                if index % 5 == 0:
                    # refresh the parser object every 5 items
                    juanpi = JuanPiParse()
                if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived connection
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')
                if tmp_sql_server.is_connect_success:
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                          % (item[0], index))
                    juanpi.get_goods_data(goods_id=item[0])
                    data = juanpi.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]
                        # derive shelf/down + delete timestamps from the old DB row
                        data['my_shelf_and_down_time'], data[
                            'delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                MyShelfAndDownTime=item[2])
                        # compare old vs new price to record a price-change flag
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[3],
                                old_taobao_price=item[4],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])
                        # print('------>>>| 爬取到的数据为: ', data)
                        juanpi.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                    else:  # parser returned empty data
                        pass
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
                sleep(1.2)
            print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
class JPUpdater(AsyncCrawler):
    """Async real-time updater for regular juanpi goods (卷皮常规商品实时更新)."""
    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/卷皮/实时更新/')
        # DB pipeline, lazily created in _get_db_old_data
        self.sql_cli = None
        # running index of the goods currently being processed
        self.goods_index = 1
        # concurrency level (tasks per batch)
        self.concurrency = 10

    async def _get_db_old_data(self):
        '''
        Fetch the rows to update from the DB.
        :return: list of rows, or None on DB connection failure
        '''
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            result = list(self.sql_cli._select_table(sql_str=jp_select_str_3))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        await _print_db_old_data(logger=self.lg, result=result)
        return result

    async def _get_new_jp_obj(self, index) -> None:
        '''Recreate the parser every 10 items to bound memory use.'''
        if index % 10 == 0:
            try:
                del self.juanpi
            except:
                pass
            collect()
            self.juanpi = JuanPiParse(is_real_times_update_call=True)

    async def _update_one_goods_info(self, db_goods_info_obj, index):
        '''
        Update a single goods record.
        :param db_goods_info_obj: wrapper around the old DB row
        :param index: running index (used for parser/DB-conn recycling)
        :return: [goods_id, bool success]
        '''
        res = False
        await self._get_new_jp_obj(index=index)
        # periodically swap in a fresh DB connection
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli, index=index, logger=self.lg)
        if self.sql_cli.is_connect_success:
            self.lg.info(
                '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                format(db_goods_info_obj.goods_id, index))
            self.juanpi.get_goods_data(goods_id=db_goods_info_obj.goods_id)
            data = self.juanpi.deal_with_data()
            if data != {}:
                # merge the freshly crawled data with change-tracking fields
                data = get_goods_info_change_data(
                    target_short_name='jp',
                    logger=self.lg,
                    data=data,
                    db_goods_info_obj=db_goods_info_obj,
                )
                res = self.juanpi.to_right_and_update_data(
                    data, pipeline=self.sql_cli)
            else:  # parser returned empty data
                pass
        else:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
        index += 1
        self.goods_index = index
        collect()
        await async_sleep(1.2)
        return [db_goods_info_obj.goods_id, res]

    async def _update_db(self):
        '''
        Main endless loop: fetch stale rows, update them in concurrent
        batches of self.concurrency, then sleep until the next cycle.
        '''
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result, step=self.concurrency)
                self.juanpi = JuanPiParse(is_real_times_update_call=True)
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:
                        # all batches consumed -> normal exit
                        break
                    tasks = []
                    for item in slice_params_list:
                        db_goods_info_obj = JPDbGoodsInfoObj(item=item,
                                                             logger=self.lg)
                        self.lg.info('创建 task goods_id: {}'.format(
                            db_goods_info_obj.goods_id))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(
                                    db_goods_info_obj=db_goods_info_obj,
                                    index=index,
                                )))
                        index += 1
                    await _get_async_task_result(tasks=tasks, logger=self.lg)
                self.lg.info('全部数据更新完毕'.center(100, '#'))
            if get_shanghai_time().hour == 0:
                # no updates after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(10.)
            try:
                del self.juanpi
            except:
                pass
            collect()

    def __del__(self):
        # best-effort cleanup of logger / event loop references
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
def run_forever():
    '''
    Endless real-time updater for group-buy (拼团) rows (v2): rows that are
    sold out / expired are logically deleted via
    _handle_goods_shelves_in_auto_goods_table instead of hard-deleted.
    '''
    while True:
        #### real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=jp_delete_str_1)
            result = list(sql_cli._select_table(sql_str=jp_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result)
            index = 1
            # Parser recreated periodically below to keep memory bounded.
            juanpi_pintuan = JuanPiParse()
            for item in result:  # real-time update per row
                goods_id = item[0]
                if index % 6 == 0:
                    # recycle the parser object every 6 items
                    try:
                        del juanpi_pintuan
                    except:
                        pass
                    gc.collect()
                    juanpi_pintuan = JuanPiParse()
                # swaps in a fresh DB connection every 50 items
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 remainder=50)
                if sql_cli.is_connect_success:
                    try:
                        # item[1] (schedule) is a JSON list; empty list -> IndexError
                        pintuan_end_time = json.loads(
                            item[1])[0].get('end_time')
                    except IndexError:
                        print('获取pintuan_end_time时索引异常!出错goods_id:{0}'.format(
                            goods_id))
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            sql_cli=sql_cli,
                            update_sql_str=jp_update_str_7,
                        )
                        # NOTE(review): this `continue` (and the one below)
                        # skips `index += 1`, so the reconnect/recycle cadence
                        # stalls on repeated failures — confirm intended.
                        continue
                    pintuan_end_time = int(
                        str(
                            time.mktime(
                                time.strptime(pintuan_end_time,
                                              '%Y-%m-%d %H:%M:%S')))[0:10])
                    # print(pintuan_end_time)
                    if item[2] == 1 or pintuan_end_time < int(
                            datetime_to_timestamp(get_shanghai_time())):
                        # sold out or window over -> logical delete
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            sql_cli=sql_cli,
                            update_sql_str=jp_update_str_7,
                        )
                        print('该goods_id[{0}]已过期或者售完,逻辑删除成功!'.format(goods_id))
                    else:
                        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                              % (goods_id, index))
                        juanpi_pintuan.get_goods_data(goods_id=goods_id)
                        data = juanpi_pintuan.deal_with_data()
                        if data == {}:
                            continue
                        data['goods_id'] = goods_id
                        juanpi_pintuan.to_right_and_update_pintuan_data(
                            data=data, pipeline=sql_cli)
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
                sleep(1.2)
            print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        gc.collect()
def run_forever(self):
    '''
    这个实时更新的想法是只更新当天前天未来14小时的上架商品的信息,再未来信息价格(全为原价)暂不更新

    Real-time updater for flash-sale rows in dbo.juanpi_xianshimiaosha:
    expired rows are deleted, rows whose sale window is far in the future
    are skipped, and rows inside the update window are refreshed from the
    live tab/page listing.
    :return: None (all effects are DB writes/deletes and prints)
    '''
    #### real-time data update
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    sql_str = r'select goods_id, miaosha_time, tab_id, page from dbo.juanpi_xianshimiaosha where site_id=15'
    try:
        result = list(tmp_sql_server._select_table(sql_str=sql_str))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        # Parser recreated every 10 items below to keep memory bounded.
        juanpi_miaosha = JuanPiParse()
        for item in result:  # real-time update per row
            # item[1] (miaosha_time) is a JSON object holding the window
            miaosha_begin_time = json.loads(item[1]).get('miaosha_begin_time')
            miaosha_begin_time = int(
                str(
                    time.mktime(
                        time.strptime(miaosha_begin_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_begin_time)
            if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived connection
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                # hoisted: is_recent_time() was previously evaluated twice per row
                recent_status = self.is_recent_time(miaosha_begin_time)
                if recent_status == 0:
                    # BUGFIX: params must be a 1-tuple, not a bare string
                    # (was `params=(item[0])`; sibling updaters use `(item[0],)`)
                    tmp_sql_server._delete_table(sql_str=self.delete_sql_str,
                                                 params=(item[0],))
                    print('过期的goods_id为(%s)' % item[0],
                          ', 限时秒杀开始时间为(%s), 删除成功!' %
                          json.loads(item[1]).get('miaosha_begin_time'))
                elif recent_status == 2:
                    # pass (not break): DB rows are not ordered by begin time,
                    # so a future row must not terminate the whole scan
                    pass
                else:  # recent_status == 1 -> inside the update window
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                          % (item[0], index))
                    tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                        str(item[2]),
                        str(item[3]),
                    )
                    # print('待爬取的tab_id, page地址为: ', tmp_url)
                    data = MyRequests.get_url_body(url=tmp_url,
                                                   headers=self.headers)
                    if data == '':
                        break
                    try:
                        data = json.loads(data)
                        data = data.get('data', {})
                        # print(data)
                    except:
                        break
                    if data.get('goodslist') == []:
                        print('tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(
                            item[2], item[3]))
                        pass
                    else:
                        data = data.get('goodslist', [])
                        # print(data)
                        if data == []:
                            print('goodslist为[], 此处跳过')
                            pass
                        else:
                            miaosha_goods_list = self.get_miaoshao_goods_info_list(
                                data=data)
                            # print(miaosha_goods_list)
                            # every goods_id currently listed on this tab/page
                            miaosha_goods_all_goods_id = [
                                i.get('goods_id') for i in miaosha_goods_list
                            ]
                            # print(miaosha_goods_all_goods_id)
                            if item[0] not in miaosha_goods_all_goods_id:
                                # goods no longer listed on its tab/page -> delete
                                # BUGFIX: 1-tuple params (see above)
                                tmp_sql_server._delete_table(
                                    sql_str=self.delete_sql_str,
                                    params=(item[0],))
                                print('该商品[goods_id为(%s)]已被下架限时秒杀活动,此处将其删除' %
                                      item[0])
                                pass
                            else:  # still listed -> refresh its record
                                for item_1 in miaosha_goods_list:
                                    if item_1.get('goods_id', '') == item[0]:
                                        juanpi_miaosha.get_goods_data(
                                            goods_id=item[0])
                                        goods_data = juanpi_miaosha.deal_with_data()
                                        if goods_data == {}:  # parser returned nothing -> skip
                                            pass
                                        else:  # otherwise enrich and update
                                            goods_data['stock_info'] = item_1.get(
                                                'stock_info')
                                            goods_data['goods_id'] = item_1.get(
                                                'goods_id')
                                            # goods_data['username'] = '******'
                                            # only overwrite prices while activity stock remains
                                            if item_1.get('stock_info').get(
                                                    'activity_stock') > 0:
                                                goods_data['price'] = item_1.get(
                                                    'price')  # original special price before the flash sale
                                                goods_data['taobao_price'] = item_1.get(
                                                    'taobao_price')  # flash-sale price
                                            else:
                                                pass
                                            goods_data['sub_title'] = item_1.get(
                                                'sub_title', '')
                                            goods_data['miaosha_time'] = item_1.get(
                                                'miaosha_time')
                                            goods_data['miaosha_begin_time'], goods_data[
                                                'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                    miaosha_time=item_1.get(
                                                        'miaosha_time'))
                                            # print(goods_data)
                                            juanpi_miaosha.to_update_juanpi_xianshimiaosha_table(
                                                data=goods_data,
                                                pipeline=tmp_sql_server)
                                        sleep(.2)  # throttle per-goods update
                                    else:
                                        pass
                if index % 10 == 0:
                    # refresh the parser periodically: faster + bounded memory
                    juanpi_miaosha = JuanPiParse()
                    gc.collect()
                index += 1
                gc.collect()
            else:
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
        print('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:  # no updates after midnight
        sleep(60 * 60 * 5.5)
    else:
        # sleep(5)
        pass
    gc.collect()
def get_spike_hour_goods_info(self):
    '''
    模拟构造得到data的url,得到近期所有的限时秒杀商品信息

    Proxy-enabled variant: walks every known tab_id and up to 50 pages per
    tab through a random proxy, fetches the goodslist JSON, and inserts
    each goods_id not already present in the xianshimiaosha table.
    :return: None (results are persisted through the pipeline)
    '''
    tab_id_list = [11, 12, 13, 21, 22, 23, 31, 32, 33]  # notice
    for tab_id in tab_id_list:
        for index in range(0, 50):
            tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                str(tab_id), str(index))
            print('待抓取的限时秒杀地址为: ', tmp_url)
            # pick a random proxy ip from the pool: {'http': ['xx', 'yy', ...]}
            self.proxies = self.get_proxy_ip_from_ip_pool()
            # BUGFIX: index with the length of the proxy *list*; the old code
            # used len(self.proxies) (the dict's key count), which defeats the
            # random pick and can IndexError when the dict has >1 key.
            self.proxy = self.proxies['http'][randint(
                0, len(self.proxies['http']) - 1)]
            tmp_proxies = {
                'http': self.proxy,
            }
            # print('------>>>| 正在使用代理ip: {} 进行爬取... |<<<------'.format(self.proxy))
            try:
                response = requests.get(
                    tmp_url, headers=self.headers, proxies=tmp_proxies,
                    timeout=10)  # any extra &xxx= outside the url must be built into it beforehand
                data = response.content.decode('utf-8')
                # print(data)
            except Exception:
                print('requests.get()请求超时....')
                print('data为空!')
                break
            try:
                data = json.loads(data)
                data = data.get('data', {})
                # print(data)
            except:
                break
            if data.get('goodslist') == []:
                print('tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(
                    tab_id, index))
                break
            else:
                data = data.get('goodslist', [])
                # print(data)
                if data == []:
                    print('goodslist为[], 此处跳过')
                    pass
                else:
                    miaosha_goods_list = self.get_miaoshao_goods_info_list(
                        data=data)
                    print(miaosha_goods_list)
                    juanpi = JuanPiParse()
                    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    if my_pipeline.is_connect_success:
                        # hoisted: previously this select ran twice (probe + fetch)
                        db_goods_id_rows = my_pipeline.select_juanpi_xianshimiaosha_all_goods_id()
                        if db_goods_id_rows is None:
                            db_goods_id_list = []
                        else:
                            db_goods_id_list = [
                                row[0] for row in list(db_goods_id_rows)
                            ]
                        for item in miaosha_goods_list:
                            if item.get('goods_id', '') in db_goods_id_list:
                                print('该goods_id已经存在于数据库中, 此处跳过')
                                pass
                            else:
                                tmp_url = 'http://shop.juanpi.com/deal/' + item.get(
                                    'goods_id')
                                juanpi.get_goods_data(
                                    goods_id=item.get('goods_id'))
                                goods_data = juanpi.deal_with_data()
                                if goods_data == {}:  # parser returned nothing -> skip
                                    pass
                                else:  # otherwise enrich and insert
                                    goods_data['stock_info'] = item.get(
                                        'stock_info')
                                    goods_data['goods_id'] = item.get(
                                        'goods_id')
                                    goods_data['spider_url'] = tmp_url
                                    goods_data['username'] = '******'
                                    goods_data['price'] = item.get(
                                        'price')  # original special price before the flash sale
                                    goods_data['taobao_price'] = item.get(
                                        'taobao_price')  # flash-sale price
                                    goods_data['sub_title'] = item.get(
                                        'sub_title', '')
                                    goods_data['miaosha_time'] = item.get(
                                        'miaosha_time')
                                    goods_data['tab_id'] = tab_id
                                    goods_data['page'] = index
                                    # print(goods_data)
                                    juanpi.insert_into_juanpi_xianshimiaosha_table(
                                        data=goods_data, pipeline=my_pipeline)
                                    sleep(.3)  # brief pause to avoid tripping the site
                        sleep(.65)
                    else:
                        pass
                    try:
                        del juanpi
                    except:
                        pass
                    gc.collect()
def run_forever():
    '''
    Endless real-time updater that maintains each goods' shelf/down
    timestamps (my_shelf_and_down_time) alongside the re-crawled data.
    '''
    while True:
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_juanpi_all_goods_id())
        except TypeError as e:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:  # real-time update per row
                data = {}
                # fresh parser per item to release memory between iterations
                juanpi = JuanPiParse()
                if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived connection
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')
                if tmp_sql_server.is_connect_success:
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                          % (item[0], index))
                    juanpi.get_goods_data(goods_id=item[0])
                    data = juanpi.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]
                        ''' 设置最后刷新的商品状态上下架时间 '''
                        # item[1] is the stored is_delete flag, item[2] the
                        # stored shelf/down JSON; data['is_delete'] is the
                        # freshly crawled flag.
                        # 1. is_delete 0->1: record down_time
                        # 2. is_delete 1->0: record shelf_time
                        my_shelf_and_down_time = {
                            'shelf_time': '',
                            'down_time': '',
                        }
                        if data['is_delete'] != item[1]:
                            # NOTE(review): this branch records down_time when
                            # the NEW flag is 0 and the OLD flag is 1, which
                            # looks inverted relative to the comment above
                            # ("0->1 means shelved->down") — confirm which of
                            # data['is_delete'] / item[1] is the new value.
                            if data['is_delete'] == 0 and item[1] == 1:
                                my_shelf_and_down_time['down_time'] = str(
                                    get_shanghai_time())
                            else:
                                my_shelf_and_down_time['shelf_time'] = str(
                                    get_shanghai_time())
                        else:
                            # flag unchanged: initialize timestamps once if the
                            # stored value is missing/blank, else keep it
                            if item[2] is None or item[
                                    2] == '{"shelf_time": "", "down_time": ""}' or len(
                                        item[2]) == 35:  # 35 == length of the initial placeholder string
                                if data['is_delete'] == 0:  # currently shelved
                                    my_shelf_and_down_time['shelf_time'] = str(
                                        get_shanghai_time())
                                else:  # currently down
                                    my_shelf_and_down_time['down_time'] = str(
                                        get_shanghai_time())
                            else:
                                # keep the original stored value unchanged
                                tmp_shelf_and_down_time = item[2]
                                my_shelf_and_down_time = json.loads(
                                    tmp_shelf_and_down_time)
                        data['my_shelf_and_down_time'] = my_shelf_and_down_time
                        # print(my_shlef_and_down_time)
                        # print('------>>>| 爬取到的数据为: ', data)
                        juanpi.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                    else:  # parser returned empty data
                        pass
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
                # sleep(1)
            print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
class JPUpdater(AsyncCrawler):
    """Async real-time updater for juanpi flash-sale (秒杀) goods."""
    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/卷皮/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        # DB pipeline, created lazily in _get_db_old_data
        self.tmp_sql_server = None
        # concurrency level (tasks per batch)
        self.concurrency = 8
        # running index of the goods currently being processed
        self.goods_index = 1
        self.delete_sql_str = jp_delete_str_3

    async def _get_pc_headers(self) -> dict:
        '''Request headers for m.juanpi.com with a randomized desktop UA.'''
        return {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'm.juanpi.com',
            'User-Agent': get_random_pc_ua(),  # random user agent
        }

    async def _get_db_old_data(self) -> (None, list):
        '''
        Purge stale rows (jp_delete_str_4) then fetch the rows to update.
        :return: list of rows, or None on DB connection failure
        '''
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            self.tmp_sql_server._delete_table(sql_str=jp_delete_str_4, params=None)
            await async_sleep(5)
            result = list(self.tmp_sql_server._select_table(sql_str=jp_select_str_4))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        await _print_db_old_data(logger=self.lg, result=result)
        return result

    async def _get_miaosha_begin_time(self, miaosha_time) -> int:
        '''Parse the stored miaosha_time JSON into a begin-time unix timestamp.'''
        miaosha_begin_time = json_2_dict(miaosha_time).get('miaosha_begin_time')
        # '%Y-%m-%d %H:%M:%S' -> 10-digit unix timestamp
        miaosha_begin_time = int(str(time.mktime(time.strptime(miaosha_begin_time, '%Y-%m-%d %H:%M:%S')))[0:10])
        return miaosha_begin_time

    async def _get_new_jp_obj(self, index):
        '''Recreate the parser every 10 items (sharing one object breaks the driver).'''
        if index % 10 == 0:
            try:
                del self.juanpi_miaosha
            except:
                pass
            collect()
            self.juanpi_miaosha = JuanPiParse()

    async def _update_one_goods_info(self, item, index) -> tuple:
        '''
        Update a single flash-sale goods.
        :param item: DB row (goods_id, miaosha_time, tab_id, page)
        :param index: running index (used for parser/DB-conn recycling)
        :return: (goods_id, bool success)
        '''
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        tab_id = item[2]
        page = item[3]
        miaosha_begin_time = await self._get_miaosha_begin_time(miaosha_time)
        # self.lg.info(str(miaosha_begin_time))
        await self._get_new_jp_obj(index=index)
        # swap in a fresh DB connection every `remainder` items
        self.tmp_sql_server = await _get_new_db_conn(db_obj=self.tmp_sql_server, index=index, logger=self.lg, remainder=30)
        if self.tmp_sql_server.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_begin_time)
            if is_recent_time == 0:
                # expired -> logical delete
                res = self.tmp_sql_server._update_table(sql_str=jp_update_str_6, params=(goods_id,))
                self.lg.info('过期的goods_id为({}), 限时秒杀开始时间为({}), 逻辑删除成功!'.format(goods_id, miaosha_begin_time))
                await async_sleep(.3)
                index += 1
                self.goods_index = index
                return goods_id, res
            elif is_recent_time == 2:
                # far-future sale window -> skip for now
                self.lg.info('goods_id: {}, 未来时间跳过更新...'.format(goods_id))
                index += 1
                self.goods_index = index
                return goods_id, res
            else:  # is_recent_time == 1 -> inside the update window
                self.lg.info('------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.format(goods_id, index))
                tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                    str(tab_id),
                    str(page),
                )
                # self.lg.info('待爬取的tab_id, page地址为: {}'.format(tmp_url))
                body = Requests.get_url_body(url=tmp_url, headers=await self._get_pc_headers(), ip_pool_type=self.ip_pool_type)
                try:
                    data = json_2_dict(body, default_res={}).get('data', {})
                    assert data != {}, 'data为空dict!'
                    data = data.get('goodslist', [])
                    assert data != [], 'tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(tab_id, page)
                except AssertionError:
                    self.lg.error(msg='遇到错误:', exc_info=True)
                    index += 1
                    self.goods_index = index
                    await async_sleep(.3)
                    return goods_id, res
                miaosha_goods_list = await self._get_miaoshao_goods_info_list(data=data)
                # self.lg.info(str(miaosha_goods_list))
                # every goods_id currently listed on this tab/page
                miaosha_goods_all_goods_id = [i.get('goods_id') for i in miaosha_goods_list]
                self.lg.info(str(miaosha_goods_all_goods_id))
                if goods_id not in miaosha_goods_all_goods_id:
                    if miaosha_goods_all_goods_id != []:
                        # a non-empty listing without this id empirically means
                        # the goods is NOT delisted -> skip the update
                        self.lg.info('该商品[{}]未下架, 此处不进行更新跳过!!'.format(goods_id))
                    else:
                        # empty listing -> goods removed from this tab/page
                        res = self.tmp_sql_server._update_table(sql_str=jp_update_str_6, params=(goods_id,))
                        self.lg.info('该商品[goods_id为({})]已被下架限时秒杀活动,此处将其逻辑删除'.format(goods_id))
                    index += 1
                    self.goods_index = index
                    await async_sleep(.3)
                    return goods_id, res
                else:
                    # still listed -> refresh its record
                    res = await self._one_update(miaosha_goods_list=miaosha_goods_list, goods_id=goods_id)
        else:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
        index += 1
        self.goods_index = index
        await async_sleep(1.2)
        return goods_id, res

    async def _update_db(self) -> None:
        '''
        Main endless loop for flash-sale updates: fetch stale rows, update
        them in concurrent batches of self.concurrency, then sleep.
        :return:
        '''
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(tasks_params_list=result, step=self.concurrency)
                self.juanpi_miaosha = JuanPiParse()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:
                        # all batches consumed -> normal exit
                        break
                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(self.loop.create_task(self._update_one_goods_info(item=item, index=index)))
                        index += 1
                    await _get_async_task_result(tasks=tasks, logger=self.lg)
                self.lg.info('全部数据更新完毕'.center(100, '#'))
            if get_shanghai_time().hour == 0:
                # no updates after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(2.5 * 60)
            try:
                del self.juanpi_miaosha
            except:
                pass
            collect()

    async def _one_update(self, **kwargs) -> bool:
        '''
        Refresh a goods that is still listed.
        :param kwargs: miaosha_goods_list, goods_id
        :return: bool success
        '''
        res = False
        miaosha_goods_list = kwargs.get('miaosha_goods_list')
        goods_id = kwargs.get('goods_id')
        for item_1 in miaosha_goods_list:
            if item_1.get('goods_id', '') == goods_id:
                self.juanpi_miaosha.get_goods_data(goods_id=goods_id)
                goods_data = self.juanpi_miaosha.deal_with_data()
                if goods_data == {}:  # parser returned nothing -> stop
                    break
                else:  # otherwise enrich and update
                    goods_data['stock_info'] = item_1.get('stock_info')
                    goods_data['goods_id'] = item_1.get('goods_id')
                    # goods_data['username'] = '******'
                    # only overwrite prices while activity stock remains
                    if item_1.get('stock_info').get('activity_stock') > 0:
                        goods_data['price'] = item_1.get('price')  # original special price before the flash sale
                        goods_data['taobao_price'] = item_1.get('taobao_price')  # flash-sale price
                    else:
                        pass
                    goods_data['sub_title'] = item_1.get('sub_title', '')
                    goods_data['miaosha_time'] = item_1.get('miaosha_time')
                    goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=item_1.get('miaosha_time'))
                    res = self.juanpi_miaosha.to_update_juanpi_xianshimiaosha_table(
                        data=goods_data, pipeline=self.tmp_sql_server)
                    await async_sleep(.3)  # throttle per-goods update
                break
            else:
                pass
        return res

    async def _get_miaoshao_goods_info_list(self, data) -> list:
        '''
        Extract the useful fields from a raw goodslist payload.
        :param data: raw goodslist entries to parse
        :return: list of normalized dicts
        '''
        miaosha_goods_list = []
        for item in data:
            tmp = {}
            # sale window from the site's unix timestamps
            tmp['miaosha_time'] = {
                'miaosha_begin_time': timestamp_to_regulartime(int(item.get('start_time'))),
                'miaosha_end_time': timestamp_to_regulartime(int(item.get('end_time'))),
            }
            stock = item.get('stock', 0)
            tmp['goods_id'] = item.get('goods_id')
            # flash-sale stock info; activity_stock = stock * rate%
            tmp['stock_info'] = {
                'activity_stock': int(item.get('stock', 0)*(item.get('rate', 0)/100)),
                'stock': item.get('stock', 0),
            }
            # original price
            tmp['price'] = round(float(item.get('oprice', '0')), 2)
            # flash-sale price
            tmp['taobao_price'] = round(float(item.get('cprice', '0')), 2)
            miaosha_goods_list.append(tmp)
        return miaosha_goods_list

    async def _is_recent_time(self, timestamp) -> int:
        '''
        Classify a sale-begin timestamp against the update window.
        :param timestamp: unix timestamp of the sale begin time
        :return: 0: expired (price restored) 1: inside update window 2: future
        '''
        time_1 = int(timestamp)
        time_2 = int(time.time())  # current unix timestamp
        diff_time = time_1 - time_2
        # NOTE(review): values in [-259200, -172800] fall through to `return 2`
        # (treated as future) because the elif lower bound was not widened when
        # the expiry threshold moved from 48h to 72h — confirm this gap is
        # intentional.
        if diff_time < -259200:  # 72 hours, widened so the backend can sync delistings; update window covers the past 48h plus ~2h ahead
            # if diff_time < -172800:  # (original threshold) 48 hours; window was past 48h plus 14h ahead (20:00 to 10:00 next day)
            return 0  # expired, price restored
        elif diff_time > -172800 and diff_time < 50400:
            return 1  # yesterday/today, i.e. pending update
        else:
            return 2  # future sale window, no update needed yet

    def __del__(self):
        # best-effort cleanup of logger / event loop references
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
def run_forever():
    '''
    Endless real-time updater that also tracks shelf/delete times,
    price-change info and sku-info transition records for each goods.
    '''
    while True:
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(
                tmp_sql_server._select_table(sql_str=jp_select_str_3))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # Parser recreated periodically below to keep memory bounded.
            juanpi = JuanPiParse()
            for item in result:  # real-time update per row
                if index % 5 == 0:
                    # refresh the parser object every 5 items
                    juanpi = JuanPiParse()
                if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived connection
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')
                if tmp_sql_server.is_connect_success:
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                          % (item[0], index))
                    juanpi.get_goods_data(goods_id=item[0])
                    data = juanpi.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]
                        # shelf/delete timestamps derived from old DB values
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                shelf_time=item[4],
                                delete_time=item[5])
                        print('上架时间:', data['shelf_time'], '下架时间:',
                              data['delete_time'])
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[2],
                                old_taobao_price=item[3],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])
                        try:
                            # item[6] is the stored sku info; format it unless
                            # it was already formatted (AttributeError path)
                            old_sku_info = format_price_info_list(
                                price_info_list=json_2_dict(item[6]),
                                site_id=12)
                        except AttributeError:  # handle already-formatted values
                            old_sku_info = item[6]
                        # NOTE(review): this overwrites the _is_price_change
                        # value computed just above from _get_price_change_info
                        # — confirm the second assignment is intentional.
                        data['_is_price_change'], data[
                            'sku_info_trans_time'] = get_sku_info_trans_record(
                                old_sku_info=old_sku_info,
                                new_sku_info=format_price_info_list(
                                    data['price_info_list'], site_id=12),
                                is_price_change=item[7]
                                if item[7] is not None else 0)
                        juanpi.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                    else:  # parser returned empty data
                        pass
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
                sleep(1.2)
            print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        # del ali_1688
        gc.collect()