def run_forever(self): ''' 这个实时更新的想法是只更新当天前天未来两小时的上架商品的信息,再未来信息价格(全为原价)暂不更新 :return: ''' #### 实时更新数据 tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline() try: result = list(tmp_sql_server._select_table(sql_str=z8_select_str_4)) tmp_sql_server._delete_table(sql_str=z8_delete_str_4, params=None) except TypeError: self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)') result = None if result is None: pass else: print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------') print(str(result)) print('--------------------------------------------------------') self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#')) self._update_old_goods_info(tmp_sql_server=tmp_sql_server, result=result) if get_shanghai_time().hour == 0: # 0点以后不更新 sleep(60*60*5.5) else: sleep(10*60) return
async def _update_old_goods_info(self, tmp_sql_server, result):
    '''
    Refresh previously captured goods rows one by one.
    :param tmp_sql_server: db pipeline used for deletes/updates (re-created every 50 rows)
    :param result: db rows; per row: item[0]=goods_id, item[1]=miaosha_time json, item[2]=goods url
    :return: None
    '''
    index = 1
    for item in result:  # real-time update, row by row
        miaosha_begin_time = json.loads(item[1]).get('miaosha_begin_time')
        # '%Y-%m-%d %H:%M:%S' string -> 10-digit unix timestamp
        miaosha_begin_time = int(str(time.mktime(time.strptime(miaosha_begin_time, '%Y-%m-%d %H:%M:%S')))[0:10])
        # self.my_lg.info(str(miaosha_begin_time))
        tmall = TmallParse(logger=self.my_lg)
        if index % 50 == 0:  # reconnect every 50 rows to avoid a stale long-lived connection erroring out
            self.my_lg.info('正在重置,并与数据库建立新连接中...')
            tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
            self.my_lg.info('与数据库的新连接成功建立...')
        if tmp_sql_server.is_connect_success:
            if await self.is_recent_time(miaosha_begin_time) == 0:
                # expired -> delete the row
                # NOTE(review): params=(item[0]) is NOT a tuple (missing trailing
                # comma); this works only if _delete_table accepts a bare scalar
                # parameter -- verify against the pipeline implementation.
                tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                self.my_lg.info('过期的goods_id为(%s)' % item[0] + ', 限时秒杀开始时间为(%s), 删除成功!' % json.loads(item[1]).get('miaosha_begin_time'))
            else:  # returned 1: inside the update window
                self.my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (item[0], str(index)))
                '''NOTICE: 由于都是当天数据, 此处不更新上下架时间,就更新商品数据'''
                goods_id = tmall.get_goods_id_from_url(item[2])
                tmall.get_goods_data(goods_id=goods_id)
                goods_data = tmall.deal_with_data()
                if goods_data != {}:
                    # self.my_lg.info(str(item))
                    goods_data['goods_id'] = item[0]
                    await tmall._update_taoqianggou_xianshimiaosha_table(data=goods_data, pipeline=tmp_sql_server)
                await asyncio.sleep(TMALL_REAL_TIMES_SLEEP_TIME)
        else:
            # db connection not available: back off briefly, then try the next row
            await asyncio.sleep(5)
        index += 1
        try:
            del tmall
        except:
            pass
    gc.collect()
    return
def run_forever(self): ''' 这个实时更新的想法是只更新当天前天未来两小时的上架商品的信息,再未来信息价格(全为原价)暂不更新 :return: ''' #### 实时更新数据 tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline() sql_str = ''' select goods_id, miaosha_time, session_id from dbo.zhe_800_xianshimiaosha where site_id=14 and is_delete = 0 ''' # 删除过期2天的的 tmp_del_str = 'delete from dbo.zhe_800_xianshimiaosha where GETDATE()-miaosha_end_time>2' try: result = list(tmp_sql_server._select_table(sql_str=sql_str)) tmp_sql_server._delete_table(sql_str=tmp_del_str, params=None) except TypeError: self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)') result = None if result is None: pass else: print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------') print(str(result)) print('--------------------------------------------------------') self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#')) self._update_old_goods_info(tmp_sql_server=tmp_sql_server, result=result) if get_shanghai_time().hour == 0: # 0点以后不更新 sleep(60 * 60 * 5.5) else: sleep(10 * 60) return
def clear_db():
    """
    Purge stale rows from dbo.mia_pintuan: group-buy goods never converted by
    the backend whose sale began more than 60 days ago, plus converted rows
    that are logically deleted and whose conversion is newer than their last
    modification.
    :return: None
    """
    sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    # clean up mia_pintuan
    who_is = 'mia_pintuan'
    print('获取 {} 目标数据中...'.format(who_is))
    # BUGFIX: original SQL read "from.dbo.mia_pintuan" (missing space after
    # FROM), which is invalid T-SQL; corrected to "from dbo.mia_pintuan".
    mia_db_target_goods_id_list = sql_cli._select_table(sql_str='''
    select goods_id
    from dbo.mia_pintuan
    where (MainGoodsID is null and miaosha_begin_time < GETDATE()-60)
    -- 清掉已下架的且被后台转换的data
    or (MainGoodsID is not null and is_delete=1 and ConvertTime > modfiy_time)
    ''')
    _len = len(mia_db_target_goods_id_list)
    print('Got {} target_data len: {}'.format(who_is, _len))
    for item in mia_db_target_goods_id_list:
        goods_id = item[0]
        res = sql_cli._delete_table(
            sql_str='delete from dbo.mia_pintuan where goods_id=%s',
            params=(goods_id, ))
        # rest_num counts rows still pending, including the one just handled
        print('[{}] [{}, rest_num: {}] deleting row where goods_id: {} ...'.
              format(
                  '+' if res else '-',
                  who_is,
                  _len,
                  goods_id,
              ))
        _len -= 1
    print('clear {} over!'.format(who_is))
    sleep(2.)
    try:
        # best-effort cleanup; the names may already be gone
        del sql_cli
        del mia_db_target_goods_id_list
    except Exception:
        pass
    collect()
def run_forever():
    while True:
        #### real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=jp_delete_str_1)
            result = list(sql_cli._select_table(sql_str=jp_select_str_2))
        except TypeError:
            # _select_table returned None (db down / maintenance) -> list() raises TypeError
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result)
            index = 1
            # declared outside the loop and periodically re-created below:
            # declaring it once would hold a lot of memory, so it is deleted
            # and rebuilt to keep usage bounded
            juanpi_pintuan = JuanPiParse()
            for item in result:  # real-time update, row by row
                goods_id = item[0]
                if index % 6 == 0:
                    try:
                        del juanpi_pintuan
                    except:
                        pass
                    gc.collect()
                    juanpi_pintuan = JuanPiParse()
                # fresh db connection every 50 rows
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 remainder=50)
                if sql_cli.is_connect_success:
                    try:
                        pintuan_end_time = json.loads(
                            item[1])[0].get('end_time')
                    except IndexError:
                        # malformed pintuan_time json -> take the goods off the shelves
                        print('获取pintuan_end_time时索引异常!出错goods_id:{0}'.format(
                            goods_id))
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            sql_cli=sql_cli,
                            update_sql_str=jp_update_str_7,
                        )
                        continue
                    # '%Y-%m-%d %H:%M:%S' string -> 10-digit unix timestamp
                    pintuan_end_time = int(
                        str(
                            time.mktime(
                                time.strptime(pintuan_end_time,
                                              '%Y-%m-%d %H:%M:%S')))[0:10])
                    # print(pintuan_end_time)
                    # item[2]: sold-out / delisted flag
                    if item[2] == 1 or pintuan_end_time < int(
                            datetime_to_timestamp(get_shanghai_time())):
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            sql_cli=sql_cli,
                            update_sql_str=jp_update_str_7,
                        )
                        print('该goods_id[{0}]已过期或者售完,逻辑删除成功!'.format(goods_id))
                    else:
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))
                        juanpi_pintuan.get_goods_data(goods_id=goods_id)
                        data = juanpi_pintuan.deal_with_data()
                        # NOTE(review): this `continue` skips index += 1 /
                        # sleep below on an empty parse -- confirm intended.
                        if data == {}:
                            continue
                        data['goods_id'] = goods_id
                        juanpi_pintuan.to_right_and_update_pintuan_data(
                            data=data, pipeline=sql_cli)
                else:  # db returned no usable connection
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
                sleep(1.2)
            print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight: long back-off
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        gc.collect()
async def run_forever(self):
    '''
    One real-time update pass over jumeiyoupin group-buy rows.
    :return: None
    '''
    sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    try:
        sql_cli._delete_table(sql_str=jm_delete_str_3, )
        await async_sleep(5)
        result = sql_cli._select_table(sql_str=jm_select_str_3, logger=self.lg)
    except TypeError:
        self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        await _print_db_old_data(result=result, logger=self.lg)
        index = 1
        for item in result:
            # '%Y-%m-%d %H:%M:%S' string -> 10-digit unix timestamp
            pintuan_end_time = json.loads(item[1]).get('end_time')
            pintuan_end_time = int(
                str(
                    time.mktime(
                        time.strptime(pintuan_end_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_end_time)
            data = {}
            # fresh db connection every 50 rows
            sql_cli = await _get_new_db_conn(db_obj=sql_cli,
                                             index=index,
                                             logger=self.lg,
                                             remainder=50)
            if sql_cli.is_connect_success:
                time_number = await self.is_recent_time(pintuan_end_time)
                if time_number == 0:
                    # expired -> logically delete (stamp delete time)
                    await sql_cli._update_table_3(
                        sql_str=jm_update_str_5,
                        params=(str(get_shanghai_time()), item[0]),
                        logger=self.lg)
                    await async_sleep(.5)
                    # NOTE(review): message text mentions the END time but the
                    # value substituted is 'begin_time' -- confirm which was meant.
                    self.msg = '过期的goods_id为(%s)' % item[
                        0] + ', 拼团结束时间为(%s), 删除成功!' % str(
                            json.loads(item[1]).get('begin_time'))
                    self.lg.info(self.msg)
                elif time_number == 2:
                    # deliberately `pass`, not `break`: goods_ids coming back
                    # from the db are not in time order
                    pass
                else:  # returned 1: inside the update window
                    self.msg = '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                        item[0], str(index))
                    self.lg.info(self.msg)
                    data['goods_id'] = item[0]
                    jumeiyoupin_2 = JuMeiYouPinPinTuan(logger=self.lg)
                    # key format: 'coutuan_baby-1' (tab + '-' + index)
                    _ = item[2] + '-' + str(item[3])
                    # page listings are cached per tab/index in self.api_all_goods_id
                    item_list = self.api_all_goods_id.get(_, [])
                    if item_list == []:
                        # cache miss: scrape that tab/index page with a browser driver
                        driver = BaseDriver(
                            executable_path=PHANTOMJS_DRIVER_PATH,
                            ip_pool_type=self.ip_pool_type)
                        item_list = await jumeiyoupin_2.get_one_page_goods_list(
                            driver=driver, tab=item[2], index=item[3])
                        try:
                            del driver
                        except:
                            pass
                    if item_list == []:
                        self.lg.info('获取到的body为空str, 网络原因, 此处先跳过!')
                        pass
                    else:
                        if self.api_all_goods_id.get(_) is None:
                            self.api_all_goods_id[_] = item_list
                        pintuan_goods_all_goods_id = [
                            item_1.get('goods_id', '')
                            for item_1 in item_list
                        ]
                        jumeiyoupin_pintuan = JuMeiYouPinPinTuanParse(
                            logger=self.lg)
                        # delisted internally (testing showed the site does not
                        # delist active goods early)
                        if item[0] not in pintuan_goods_all_goods_id:
                            await self.update_data_2(
                                jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                                jumei_pintuan_url=item[4],
                                goods_id=item[0],
                                pipeline=sql_cli)
                        else:  # still listed
                            await self.update_data_1(
                                jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                                jumeiyoupin_2=jumeiyoupin_2,
                                jumei_pintuan_url=item[4],
                                goods_id=item[0],
                                item_list=item_list,
                                pipeline=sql_cli)
            else:
                self.lg.error('数据库连接失败,此处跳过!')
                pass
            index += 1
            gc.collect()
        self.lg.info('全部数据更新完毕'.center(100, '#'))
    if get_shanghai_time().hour == 0:  # no updates after midnight: long back-off
        await async_sleep(60 * 60 * 5.5)
    else:
        await async_sleep(10 * 60)
    gc.collect()
    return None
def run_forever(self):
    '''
    One real-time update pass over chuchujie flash-sale rows.
    :return: None
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        tmp_sql_server._delete_table(sql_str=cc_delete_str_2)
        result = list(
            tmp_sql_server._select_table(sql_str=cc_select_str_1))
    except TypeError:
        # _select_table returned None (db down / maintenance) -> list() raises TypeError
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        for item in result:  # real-time update, row by row
            # '%Y-%m-%d %H:%M:%S' string -> 10-digit unix timestamp
            miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
            miaosha_end_time = int(
                str(
                    time.mktime(
                        time.strptime(miaosha_end_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_end_time)
            data = {}
            if index % 50 == 0:  # reconnect every 50 rows to avoid a stale long-lived connection
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(miaosha_end_time) == 0:
                    # expired -> delete the row
                    tmp_sql_server._delete_table(
                        sql_str=self.delete_sql_str, params=(item[0]))
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 限时秒杀结束时间为(%s), 删除成功!'
                        % json.loads(item[1]).get('miaosha_end_time'))
                elif self.is_recent_time(miaosha_end_time) == 2:
                    # break
                    # deliberately `pass`, not `break`: goods_ids coming back
                    # from the db are not in time order
                    pass
                else:  # returned 1: inside the update window
                    # fresh parser per row; keeping one alive holds too much memory
                    chuchujie_miaosha = ChuChuJie_9_9_Parse()
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))
                    data['goods_id'] = item[0]
                    # item[2]/item[3]: gender, page used by the listing api
                    body = self.get_one_page_goods_info(item[2], item[3])
                    if body == '{}':  # likely a network failure; skip this row
                        pass
                    else:
                        try:
                            json_body = json.loads(body)
                            # print(json_body)
                        except:
                            print('json.loads转换body时出错!请检查')
                            json_body = {}
                            pass
                        try:
                            this_page_total_count = json_body.get(
                                'data',
                                {}).get('groupList',
                                        [])[0].get('totalCount', 0)
                        except IndexError:
                            print('获取this_page_total_count时出错, 请检查!')
                            this_page_total_count = 0
                        # goods list for the given gender/page
                        if this_page_total_count == 0:
                            item_list = []
                        else:
                            tmp_goods_list = json_body.get('data', {}).get(
                                'groupList', [])[0].get('dataList', [])
                            item_list = [{
                                'goods_id': str(item_s.get('chuchuId', '')),
                                'sub_title': item_s.get('description', ''),
                            } for item_s in tmp_goods_list]
                        if item_list == []:
                            print('#### 该gender, page对应得到的item_list为空[]!')
                            print('该商品已被下架限时秒杀活动,此处将其删除')
                            tmp_sql_server._delete_table(
                                sql_str=self.delete_sql_str,
                                params=(item[0]))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                            pass
                        else:
                            # (removed a commented-out legacy branch that deleted
                            # goods missing from item_list; per the notes below,
                            # the site never delists early, so everything still
                            # inside the sale window is simply refreshed)
                            """ 由于不会内部提前下架,所以在售卖时间内的全部进行相关更新 """
                            ''' 不更新秒杀时间和sub_title, 只更新其他相关数据 '''
                            chuchujie_miaosha.get_goods_data(
                                goods_id=item[0])
                            goods_data = chuchujie_miaosha.deal_with_data()
                            if goods_data == {}:  # empty parse result -> skip
                                pass
                            else:
                                goods_data['goods_id'] = str(item[0])
                                # goods_data['sub_title'] = item_2.get('sub_title', '')
                                # print(goods_data)
                                chuchujie_miaosha.update_chuchujie_xianshimiaosha_table(
                                    data=goods_data,
                                    pipeline=tmp_sql_server)
                            sleep(CHUCHUJIE_SLEEP_TIME)
            else:  # db returned no usable connection
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:  # no updates after midnight: long back-off
        sleep(60 * 60 * 5.5)
    else:
        sleep(5 * 60)
    gc.collect()
def run_forever(self):
    '''
    Real-time update pass: only goods whose on-shelf window falls within
    today / yesterday / the next 14 hours are refreshed; future-priced
    items (all still at original price) are not updated yet.
    :return: None
    '''
    #### real-time data update
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    sql_str = r'select goods_id, miaosha_time, tab_id, page from dbo.juanpi_xianshimiaosha where site_id=15'
    try:
        result = list(tmp_sql_server._select_table(sql_str=sql_str))
    except TypeError:
        # _select_table returned None (db down / maintenance) -> list() raises TypeError
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        # declared outside the loop and periodically re-created: declaring it
        # once would hold a lot of memory, so it is rebuilt to keep usage bounded
        juanpi_miaosha = JuanPiParse()
        for item in result:  # real-time update, row by row
            # '%Y-%m-%d %H:%M:%S' string -> 10-digit unix timestamp
            miaosha_begin_time = json.loads(
                item[1]).get('miaosha_begin_time')
            miaosha_begin_time = int(
                str(
                    time.mktime(
                        time.strptime(miaosha_begin_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_begin_time)
            if index % 50 == 0:  # reconnect every 50 rows to avoid a stale long-lived connection
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(miaosha_begin_time) == 0:
                    # expired -> delete the row
                    tmp_sql_server._delete_table(
                        sql_str=self.delete_sql_str, params=(item[0]))
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 限时秒杀开始时间为(%s), 删除成功!'
                        % json.loads(item[1]).get('miaosha_begin_time'))
                elif self.is_recent_time(miaosha_begin_time) == 2:
                    # break
                    # deliberately `pass`, not `break`: goods_ids coming back
                    # from the db are not in time order
                    pass
                else:  # returned 1: inside the update window
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))
                    tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                        str(item[2]),
                        str(item[3]),
                    )
                    # print('待爬取的tab_id, page地址为: ', tmp_url)
                    data = MyRequests.get_url_body(url=tmp_url,
                                                   headers=self.headers)
                    # NOTE(review): `break` aborts the WHOLE row loop on one
                    # empty/bad fetch; `continue` may have been intended -- confirm.
                    if data == '':
                        break
                    try:
                        data = json.loads(data)
                        data = data.get('data', {})
                        # print(data)
                    except:
                        break
                    if data.get('goodslist') == []:
                        print('tab_id={0}, page={1}的goodslist为[], 此处跳过'.
                              format(item[2], item[3]))
                        pass
                    else:
                        data = data.get('goodslist', [])
                        # print(data)
                        if data == []:
                            print('goodslist为[], 此处跳过')
                            pass
                        else:
                            miaosha_goods_list = self.get_miaoshao_goods_info_list(
                                data=data)
                            # print(miaosha_goods_list)
                            # every goods_id currently present on this tab_id/page
                            miaosha_goods_all_goods_id = [
                                i.get('goods_id')
                                for i in miaosha_goods_list
                            ]
                            # print(miaosha_goods_all_goods_id)
                            if item[0] not in miaosha_goods_all_goods_id:  # delisted internally
                                ''' 表示该tab_id,page中没有了该goods_id '''
                                tmp_sql_server._delete_table(
                                    sql_str=self.delete_sql_str,
                                    params=(item[0]))
                                print(
                                    '该商品[goods_id为(%s)]已被下架限时秒杀活动,此处将其删除'
                                    % item[0])
                                pass
                            else:  # still listed
                                for item_1 in miaosha_goods_list:
                                    if item_1.get('goods_id',
                                                  '') == item[0]:
                                        juanpi_miaosha.get_goods_data(
                                            goods_id=item[0])
                                        goods_data = juanpi_miaosha.deal_with_data(
                                        )
                                        if goods_data == {}:  # empty parse result -> skip
                                            pass
                                        else:  # parse ok -> merge listing fields and write back
                                            goods_data[
                                                'stock_info'] = item_1.get(
                                                    'stock_info')
                                            goods_data[
                                                'goods_id'] = item_1.get(
                                                    'goods_id')
                                            # goods_data['username'] = '******'
                                            if item_1.get(
                                                    'stock_info'
                                            ).get('activity_stock') > 0:
                                                # original special price before the flash sale
                                                goods_data[
                                                    'price'] = item_1.get(
                                                        'price')
                                                # flash-sale price
                                                goods_data[
                                                    'taobao_price'] = item_1.get(
                                                        'taobao_price')
                                            else:
                                                pass
                                            goods_data[
                                                'sub_title'] = item_1.get(
                                                    'sub_title', '')
                                            goods_data[
                                                'miaosha_time'] = item_1.get(
                                                    'miaosha_time')
                                            goods_data[
                                                'miaosha_begin_time'], goods_data[
                                                    'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                        miaosha_time=item_1
                                                        .get('miaosha_time'
                                                             ))
                                            # print(goods_data)
                                            juanpi_miaosha.to_update_juanpi_xianshimiaosha_table(
                                                data=goods_data,
                                                pipeline=tmp_sql_server)
                                        sleep(.2)  # throttle: avoid hammering the site
                                    else:
                                        pass
                    if index % 10 == 0:  # re-create the parser periodically: faster AND bounded memory
                        juanpi_miaosha = JuanPiParse()
                        gc.collect()
                    index += 1
                    gc.collect()
            else:  # db returned no usable connection
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
        print('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:  # no updates after midnight: long back-off
        sleep(60 * 60 * 5.5)
    else:
        # sleep(5)
        pass
    gc.collect()
class GoodsCouponSpider(AsyncCrawler):
    """Long-running spider: finds coupon-claim URLs for goods rows in the db."""

    def __init__(self):
        AsyncCrawler.__init__(
            self,
            user_agent_type=PHONE,
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            logger=None,
            log_save_path=MY_SPIDER_LOGS_PATH + '/coupon/_/',
            headless=True,
        )
        # must stay modest or the official site notices the crawl
        self.concurrency = 10
        # must stay tiny: each unit drives a browser; too many freezes the host
        self.concurrency2 = 3
        self.req_num_retries = 7
        self.proxy_type = PROXY_TYPE_HTTPS
        self.driver_load_images = DRIVER_LOAD_IMAGES
        # 0 selects the non-thread mode: running thread mode long-term raised
        # "too many open files"
        self.concurrent_type = 0
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        self.init_sql_str()

    async def _fck_run(self):
        """
        Main loop: fetch db rows, resolve coupon urls in slices, then drive a
        headless browser over each coupon url.
        :return: never returns (infinite loop)
        """
        while True:
            try:
                if get_shanghai_time().hour == 0:
                    # no crawling right after midnight: long back-off
                    await async_sleep(60 * 60 * 3.5)
                    continue
                self.db_res = await self.get_db_res()
                all_tasks_params_list_obj = await self.get_all_tasks_params_list_obj(
                )
                tasks_params_list_obj = TasksParamsListObj(
                    tasks_params_list=all_tasks_params_list_obj,
                    step=self.concurrency,
                    slice_start_index=0,
                )
                while True:
                    # TasksParamsListObj signals exhaustion via AssertionError
                    try:
                        slice_params_list = tasks_params_list_obj.__next__()
                    except AssertionError:
                        break
                    coupon_url_list = await self.get_coupon_url_list_by_goods_id_list(
                        slice_params_list=slice_params_list)
                    # pprint(coupon_url_list)
                    # (removed commented-out hard-coded test fixture for
                    # coupon_url_list / goods_id_and_coupon_url_queue)
                    if coupon_url_list == []:
                        # also reclaim memory here
                        collect()
                        self.lg.info('coupon_url_list为空list, 跳过!')
                        random_sleep_time = random_uniform(3., 6.)
                        self.lg.info('休眠{}s ...'.format(random_sleep_time))
                        await async_sleep(random_sleep_time)
                        continue
                    # slice coupon_url_list again so many browsers are never
                    # open at once (memory blow-up otherwise)
                    tasks_params_list_obj2 = TasksParamsListObj(
                        tasks_params_list=coupon_url_list,
                        step=self.concurrency2,
                        slice_start_index=0,
                    )
                    while True:
                        try:
                            slice_params_list2 = tasks_params_list_obj2.__next__(
                            )
                        except AssertionError:
                            break
                        tasks = []
                        for coupon_url in slice_params_list2:
                            self.lg.info(
                                'create task[where coupon_url: {}] ...'.format(
                                    coupon_url))
                            tasks.append(
                                self.loop.create_task(
                                    self.intercept_target_api(
                                        coupon_url=coupon_url)))
                        try:
                            one_res = await wait_for(
                                fut=async_wait_tasks_finished(tasks=tasks),
                                timeout=60 * 2,
                            )
                        except AsyncTimeoutError:
                            self.lg.error('遇到错误:', exc_info=True)
                            continue
                        # success total for this slice
                        success_count = 0
                        for item in one_res:
                            if item:
                                success_count += 1
                        self.lg.info('成功个数: {}, 成功概率: {:.3f}'.format(
                            success_count,
                            success_count / self.concurrency2))
                        collect()
                    collect()
                self.lg.info('一次大循环结束!!')
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                await async_sleep(30)
            finally:
                self.lg.info('休眠6s...')
                await async_sleep(6.)
                collect()

    async def get_all_tasks_params_list_obj(self) -> list:
        """
        Build task-param dicts from the latest db rows.
        :return: list of {'goods_id': ..., 'site_id': ...}
        """
        global unique_coupon_id_list
        all_tasks_params_list_obj = []
        for item in self.db_res:
            goods_id = item[0]
            # only single-item coupons are handled (multi-item ones are not),
            # so goods whose derived coupon id already exists can be dropped
            coupon_unique_id = str(get_uuid3(target_str=goods_id))
            if coupon_unique_id in unique_coupon_id_list:
                self.lg.info(
                    'coupon_info 表中已存在coupon_unique_id: {}, goods_id: {}, pass'
                    .format(
                        coupon_unique_id,
                        goods_id,
                    ))
                continue
            all_tasks_params_list_obj.append({
                'goods_id': goods_id,
                'site_id': item[1],
            })
        return all_tasks_params_list_obj

    async def get_coupon_url_list_by_goods_id_list(self,
                                                   slice_params_list) -> list:
        """
        Resolve a slice of goods to their coupon urls and stamp each goods'
        coupon_check_time.
        :param slice_params_list: list of {'goods_id', 'site_id'} dicts
        :return: coupon url list (empty results filtered out)
        """
        def get_create_task_msg(k) -> str:
            # log line emitted when each sub-task is created
            return 'create task[where goods_id: {}, site_id: {}] ...'.format(
                k['goods_id'],
                k['site_id'], )

        def get_now_args(k) -> list:
            # positional args handed to get_tm_coupon_url_from_lq5u
            return [
                k['goods_id'],
            ]

        all_res = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=slice_params_list,
            func_name_where_get_create_task_msg=get_create_task_msg,
            func_name=self.get_tm_coupon_url_from_lq5u,
            func_name_where_get_now_args=get_now_args,
            func_name_where_handle_one_res=None,
            func_name_where_add_one_res_2_all_res=
            default_add_one_res_2_all_res2,
            one_default_res='',
            step=self.concurrency,
            logger=self.lg,
            concurrent_type=self.concurrent_type,
            func_timeout=25,
        )
        res = []
        for item in all_res:
            if item != '':
                res.append(item)
        # stamp coupon_check_time for every goods just checked, hit or miss
        sql_str = 'update dbo.GoodsInfoAutoGet set coupon_check_time=%s where GoodsID=%s'
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        for item in slice_params_list:
            goods_id = item['goods_id']
            coupon_check_time_change_res = False
            try:
                coupon_check_time_change_res = sql_cli._update_table_2(
                    sql_str=sql_str,
                    params=(
                        get_shanghai_time(),
                        goods_id,
                    ),
                    logger=self.lg,
                )
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
            self.lg.info('[{}] update goods_id: {} coupon_check_time'.format(
                '+' if coupon_check_time_change_res else '-',
                goods_id,
            ))
        try:
            del sql_cli
        except:
            pass
        try:
            del all_res
        except:
            pass
        collect()
        return res

    async def get_db_res(self) -> list:
        """
        Purge expired coupons, then fetch the next batch of goods to check.
        :return: list of (GoodsID, SiteID) rows; assert-fails when empty
        """
        get_current_func_info_by_traceback(self=self, logger=self.lg)
        db_res = []
        try:
            self.lg.info('清除过期优惠券ing ...')
            # drop coupons that expired 3+ days ago
            self.sql_cli._delete_table(
                sql_str=
                'delete from dbo.coupon_info where GETDATE()-end_time >= 3',
                params=None,
            )
            self.lg.info('休眠15s ...')
            await async_sleep(15)
            self.lg.info('获取新待检测的goods数据ing...')
            db_res = list(self.sql_cli._select_table(sql_str=self.sql_tr0, ))
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)
            # connection looks broken: swap in a fresh one for the next round
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        assert db_res != []
        self.lg.info('db_res_len: {}'.format(len(db_res)))
        return db_res

    async def intercept_target_api(self, coupon_url: str):
        """
        Open coupon_url in headless chromium with request/response interception
        wired up, and report whether page content was obtained.
        :param coupon_url: coupon landing-page url
        :return: bool -- True when non-empty page content was fetched
        """
        chromium_puppeteer = ChromiumPuppeteer(
            load_images=self.driver_load_images,
            executable_path=PYPPETEER_CHROMIUM_DRIVER_PATH,
            ip_pool_type=self.ip_pool_type,
            headless=self.headless,
            user_agent_type=self.user_agent_type,
        )
        driver = await chromium_puppeteer.create_chromium_puppeteer_browser()
        # self.lg.info('chromium version: {}'.format(await driver.version()))
        # self.lg.info('初始user_agent: {}'.format(await driver.userAgent()))
        page = await driver.newPage()
        await bypass_chrome_spiders_detection(page=page)
        # request AND response hijacking must BOTH be enabled for interception;
        # puppeteer event api: https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md
        # (search class: Page for the available events)
        await page.setRequestInterception(True)
        network_interceptor = NetworkInterceptorTest()
        page.on(event='request', f=network_interceptor.intercept_request)
        page.on(event='response', f=network_interceptor.intercept_response)
        page.on(event='requestfailed', f=network_interceptor.request_failed)
        # page.on(event='requestfinished', f=network_interceptor.request_finished)
        res = False
        try:
            await goto_plus(
                page=page,
                url=coupon_url,
                options={
                    'timeout': 1000 * 45,  # unit: ms
                    'waitUntil': [
                        # page loaded AND no remaining network connections
                        'domcontentloaded',
                        'networkidle0',
                    ]
                },
                num_retries=2,
            )
            # (removed commented-out examples: full-page screenshot, element
            # screenshot, and iframe access via page.frames)
            body = Requests._wash_html(await page.content())
            # print('[{:8s}] {}'.format(
            #     colored('body', 'red'),
            #     body, ))
            res = True if body != '' else res
        except (WebsocketsConnectionClosed, InvalidStateError):
            pass
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)
        # teardown is best-effort: each step retried once, failures ignored
        try:
            await driver.close()
        except:
            try:
                await driver.close()
            except:
                pass
        try:
            del page
        except:
            try:
                del page
            except:
                pass
        try:
            del chromium_puppeteer
        except:
            try:
                del chromium_puppeteer
            except:
                pass
        collect()
        return res

    @catch_exceptions_with_class_logger(default_res='')
    def get_tm_coupon_url_from_lq5u(
            self,
            goods_id='',
            goods_name_or_m_url: str = '',
    ) -> str:
        """
        Look up a coupon-claim url for goods_id via quanyoubuy
        (https://www.quanyoubuy.com); '' when no coupon exists.
        Hits are also pushed onto goods_id_and_coupon_url_queue.
        :param goods_id: lookup key (recommended)
        :param goods_name_or_m_url: goods name or mobile url (unused by the
            current quanyoubuy lookup; kept for interface compatibility)
        :return: coupon claim url, or ''
        """
        global goods_id_and_coupon_url_queue
        # TODO: testing showed no keyword search is needed -- mapping goods_id
        # onto the lookup site's item url is enough to check for a coupon
        # (removed several large commented-out legacy lookups: lq5u.com keyword
        # search, lq5u item-page scrape, and the i075.com / mmfad ajax api --
        # all superseded by the quanyoubuy lookup below)
        # via quanyoubuy (https://www.quanyoubuy.com)
        headers = get_random_headers(
            user_agent_type=1,
            connection_status_keep_alive=False,
            cache_control='',
        )
        headers.update({
            'authority': 'm.quanyoubuy.com',
        })
        url = 'https://m.quanyoubuy.com/item/index/iid/{}.html'.format(
            goods_id)
        body = Requests.get_url_body(
            url=url,
            headers=headers,
            ip_pool_type=self.ip_pool_type,
            proxy_type=self.proxy_type,
            num_retries=self.req_num_retries,
        )
        assert body != ''
        # self.lg.info(body)
        # (removed commented-out pc-site selectors: qrcode img + regex decode)
        # mobile site: coupon link selector
        coupon_url_sel = {
            'method': 'css',
            'selector': 'div.goods_quan a.getGoodsLink ::attr("href")',
        }
        coupon_url = parse_field(
            parser=coupon_url_sel,
            target_obj=body,
            logger=self.lg,
            is_print_error=False,
        )
        # self.lg.info(coupon_url)
        # only links containing uland.taobao.com are genuine claim urls
        if 'uland.taobao.com' not in coupon_url:
            coupon_url = ''
        else:
            pass
        if coupon_url != '':
            self.lg.info('[+] 该goods_id: {} 含 有优惠券, coupon领取地址: {}'.format(
                goods_id,
                coupon_url,
            ))
            # hand the hit over to the consumer queue
            goods_id_and_coupon_url_queue.put({
                'goods_id': goods_id,
                'coupon_url': coupon_url,
            })
        else:
            self.lg.info('[-] 该goods_id: {} 不含 有优惠券'.format(goods_id))
        try:
            del body
        except:
            pass
        collect()
        return coupon_url

    def init_sql_str(self):
        # top-800 goods (SiteID in 1/3/4/6) with no coupon on record yet,
        # oldest-checked first
        self.sql_tr0 = '''
        select top 800 GoodsID, SiteID
        from dbo.GoodsInfoAutoGet
        where MainGoodsID is not null
        and IsDelete=0
        and (SiteID=1 or SiteID=3 or SiteID=4 or SiteID=6)
        and GoodsID not in (select goods_id from dbo.coupon_info)
        -- and MainGoodsID=143509
        -- and GoodsID='18773718545'
        order by coupon_check_time asc
        '''

    def __del__(self):
        try:
            del self.concurrency
            del self.loop
        except:
            pass
        collect()
class MiaPintuanRealTimeUpdate(object):
    """Real-time updater for dbo.mia_pintuan group-buy goods."""

    def __init__(self):
        self.ip_pool_type = IP_POOL_TYPE
        self.sql_cli = None  # created lazily in _get_db_old_data

    def run_forever(self):
        '''
        One real-time update pass over mia group-buy rows.
        :return: None
        '''
        result = self._get_db_old_data()
        if result is None:
            sleep_time = 20
            print('获取db数据失败, 休眠{}s ...'.format(sleep_time))
            sleep(sleep_time)
            return None
        index = 1
        for item in result:  # real-time update, row by row
            goods_id = item[0]
            pid = item[2]
            # e.g. 2020-04-12 00:00:00
            pintuan_end_time = json_2_dict(item[1]).get('end_time')
            pintuan_end_time = datetime_to_timestamp(
                string_to_datetime(pintuan_end_time))
            # print(pintuan_end_time)
            data = {}
            # fresh db connection every 50 rows
            self.sql_cli = _block_get_new_db_conn(db_obj=self.sql_cli,
                                                  index=index,
                                                  remainder=50)
            if self.sql_cli.is_connect_success:
                is_recent_time = self.is_recent_time(pintuan_end_time)
                if is_recent_time == 0:
                    # already back at original price -> take it off the shelves
                    _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        update_sql_str=mia_update_str_7,
                        sql_cli=self.sql_cli)
                    print('该goods拼团开始时间为({})'.format(
                        json.loads(item[1]).get('begin_time')))
                    sleep(.4)
                elif is_recent_time == 2:
                    # expired-but-waiting rows get no delete yet (handled once <=24h)
                    pass
                else:  # returned 1: inside the update window
                    print(
                        '------>>>| 正在更新的goods_id为({}) | --------->>>@ 索引值为({})'
                        .format(goods_id, index))
                    data['goods_id'] = goods_id
                    try:
                        data_list = get_mia_pintuan_one_page_api_goods_info(
                            page_num=pid)
                    except ResponseBodyIsNullStrException:
                        index += 1
                        sleep(.4)
                        continue
                    # TODO an empty data_list used to delist the goods, but
                    # that wrongly delisted on-sale goods -- so no assert here,
                    # everything is updated unconditionally
                    pintuan_goods_all_goods_id = [
                        item_1.get('goods_id', '') for item_1 in data_list
                    ]
                    # print(pintuan_goods_all_goods_id)
                    '''
                    蜜芽拼团不对内部下架的进行操作,一律都更新未过期商品
                    (根据pid来进行更新多次研究发现出现商品还在拼团,误删的情况很普遍)
                    '''
                    mia_pt = MiaPintuanParse(is_real_times_update_call=True)
                    if goods_id not in pintuan_goods_all_goods_id:  # delisted internally
                        # still updated unconditionally (see note above)
                        try:
                            goods_data = self._get_mia_pt_one_goods_info(
                                mia_pt_obj=mia_pt,
                                goods_id=goods_id,
                            )
                        except AssertionError:  # empty parse result -> skip
                            index += 1
                            continue
                        # pprint(goods_data)
                        mia_pt.update_mia_pintuan_table(data=goods_data,
                                                        pipeline=self.sql_cli)
                        sleep(MIA_SPIKE_SLEEP_TIME)  # throttle
                    else:  # still listed
                        for item_2 in data_list:
                            if item_2.get('goods_id', '') == goods_id:
                                sub_title = item_2.get('sub_title', '')
                                try:
                                    goods_data = self._get_mia_pt_one_goods_info(
                                        mia_pt_obj=mia_pt,
                                        goods_id=goods_id,
                                        sub_title=sub_title,
                                    )
                                except AssertionError:  # empty parse result -> skip
                                    continue
                                # pprint(goods_data)
                                mia_pt.update_mia_pintuan_table(
                                    data=goods_data,
                                    pipeline=self.sql_cli)
                                sleep(MIA_SPIKE_SLEEP_TIME)  # throttle
                            else:
                                pass
                    try:
                        del mia_pt
                    except:
                        pass
            else:  # db returned no usable connection
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            collect()
        print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight: long back-off
            sleep(60 * 60 * 5.5)
        else:
            sleep(10 * 60)
        collect()

    def _get_mia_pt_one_goods_info(self, mia_pt_obj, goods_id,
                                   sub_title='') -> dict:
        """
        Fetch and normalize a single mia group-buy goods record.
        :param mia_pt_obj: MiaPintuanParse instance to use
        :param goods_id: target goods id
        :param sub_title: optional subtitle carried over from the listing api
        :raises AssertionError: when parsing yields an empty dict
        :return: normalized goods_data dict
        """
        mia_pt_obj.get_goods_data(goods_id=goods_id)
        goods_data = mia_pt_obj.deal_with_data()
        assert goods_data != {}, 'goods_data不为空dict'
        goods_data['goods_id'] = str(goods_id)
        goods_data['sub_title'] = sub_title
        if goods_data['pintuan_time'] == {}:
            # no group-buy window any more means the deal ended:
            # collapse both bounds to "now"
            now_time = get_shanghai_time()
            goods_data['pintuan_begin_time'], goods_data[
                'pintuan_end_time'] = (now_time, now_time)
        else:
            goods_data['pintuan_begin_time'], goods_data[
                'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=goods_data['pintuan_time'])
        return goods_data

    def _get_db_old_data(self) -> (list, None):
        """
        Open a fresh db connection, purge stale rows, and fetch rows to update.
        :return: row list, or None when the db is unreachable
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            self.sql_cli._delete_table(sql_str=mia_delete_str_2)
            result = list(self.sql_cli._select_table(sql_str=mia_select_str_2))
        except TypeError:
            # _select_table returned None (db down) -> list() raises TypeError
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        _block_print_db_old_data(result=result)
        return result

    def is_recent_time(self, timestamp) -> int:
        '''
        Classify a deadline timestamp relative to now.
        :param timestamp: unix timestamp of the group-buy end time
        :return: 0: expired (price restored)  1: inside the update window  2: future/grace
        '''
        time_1 = int(timestamp)
        time_2 = int(datetime_to_timestamp(get_shanghai_time()))  # current timestamp
        diff_time = time_1 - time_2
        # threshold set to 24 hours so the backend can sync the delisting
        if diff_time < -86400:
            # if diff_time < 0:  # (original rule) end_time already passed
            return 0  # expired, price restored
        elif diff_time > 0:
            return 1  # yesterday's/today's items, i.e. pending update
        else:
            # expired but within the 24h grace window: no delete yet
            return 2

    def __del__(self):
        collect()
class Z8Updater(AsyncCrawler):
    """折800 (zhe800) flash-sale real-time updater (async)."""

    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/折800/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        self.sql_cli = None         # db pipeline, (re)created per pass
        self.goods_index = 1        # index of the goods currently processed
        self.concurrency = 8        # concurrency (tasks per slice)

    async def _get_db_old_data(self):
        """
        Fetch the rows pending update from the db.

        :return: list of rows, or None when the db connection failed
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            self.sql_cli._delete_table(sql_str=z8_delete_str_4, params=None)
            await async_sleep(5)
            result = list(self.sql_cli._select_table(sql_str=z8_select_str_4))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _update_one_goods_info(self, item, index) -> tuple:
        '''
        Update a single goods.

        :param item: db row: (goods_id, miaosha_time, session_id)
        :param index: running index of this goods
        :return: (goods_id, res) where res is the update/delete result flag
        '''
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        session_id = item[2]
        miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
            miaosha_time=miaosha_time,
            logger=self.lg,
        )
        await self._get_new_z8_obj(index=index)
        self.sql_cli = await _get_new_db_conn(db_obj=self.sql_cli, index=index, logger=self.lg, remainder=30)

        if self.sql_cli.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_begin_time)
            if is_recent_time == 0:
                # expired: logically delete (take off the shelves)
                res = _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=self.lg,
                    update_sql_str=z8_update_str_6,
                    sql_cli=self.sql_cli,
                )
                self.lg.info(
                    '过期的goods_id为({0}), 限时秒杀开始时间为({1}), 逻辑删除成功!'.format(
                        goods_id,
                        timestamp_to_regulartime(miaosha_begin_time)))
                index += 1
                self.goods_index = index
                await async_sleep(.3)

                return goods_id, res

            elif is_recent_time == 2:
                # future time (this bucket may also contain expired items)
                if datetime_to_timestamp(get_shanghai_time()) > miaosha_end_time:
                    # sale already ended -> logically delete
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=z8_update_str_6,
                        sql_cli=self.sql_cli,
                    )
                    self.lg.info(
                        '过期的goods_id为({0}), 限时秒杀开始时间为({1}), 逻辑删除成功!'.format(
                            goods_id,
                            timestamp_to_regulartime(miaosha_begin_time)))
                else:
                    self.lg.info(
                        '未来时间暂时不更新! miaosha_begin_time: {}, miaosha_end_time: {}'.format(
                            timestamp_to_regulartime(miaosha_begin_time),
                            timestamp_to_regulartime(miaosha_end_time),
                        ))
                index += 1
                self.goods_index = index

                return goods_id, res

            else:
                # is_recent_time == 1: inside the to-update window
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.format(goods_id, index))
                try:
                    tmp_data = self.zhe_800_spike._get_one_session_id_data(
                        base_session_id=str(session_id))
                except Exception:
                    self.lg.error(msg='遇到错误:', exc_info=True)
                    index += 1
                    self.goods_index = index

                    return goods_id, res

                try:
                    tmp_data = tmp_data.get('data', {}).get('blocks', [])
                    assert tmp_data != [], '该session_id不存在,此处跳过'
                except AssertionError:
                    # the session id has no data -> delete its flash-sale goods
                    self.lg.error(msg='遇到错误:', exc_info=True)
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=z8_update_str_6,
                        sql_cli=self.sql_cli,
                    )
                    self.lg.info(
                        msg='该sessionid没有相关key为jsons的数据! 过期的goods_id为({0}), 限时秒杀开始时间为({1}), 删除成功!'.format(
                            goods_id, miaosha_begin_time))
                    index += 1
                    self.goods_index = index
                    await async_sleep(1.2)

                    return goods_id, res

                tmp_data = [item_s.get('deal', {}) for item_s in tmp_data]
                # pprint(tmp_data)
                try:
                    miaosha_goods_list = await self._get_miaoshao_goods_info_list(data=tmp_data)
                    # pprint(miaosha_goods_list)
                except ValueError:
                    await async_sleep(2)
                    index += 1
                    self.goods_index = index

                    return goods_id, res

                # all zids currently present in this session_id
                miaosha_goods_all_goods_id = [
                    i.get('zid') for i in miaosha_goods_list
                ]
                if goods_id not in miaosha_goods_all_goods_id:
                    # taken off the sale internally -> logically delete
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=z8_update_str_6,
                        sql_cli=self.sql_cli,
                    )
                    self.lg.info(
                        '该商品已被官方下架限秒活动! 下架的goods_id为({0}), 逻辑删除成功!'.format(goods_id))
                    index += 1
                    self.goods_index = index

                    return goods_id, res

                else:
                    # still on sale -> update it
                    res = await self._one_update(
                        miaosha_goods_list=miaosha_goods_list,
                        goods_id=goods_id)
        else:
            # db connection failed (possibly down for maintenance)
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')

        index += 1
        self.goods_index = index
        collect()
        await async_sleep(1.5)

        return goods_id, res

    async def _one_update(self, **kwargs) -> bool:
        '''
        Update a goods that is still on sale.

        :param kwargs: miaosha_goods_list (parsed session goods), goods_id
        :return: update result flag
        '''
        miaosha_goods_list = kwargs.get('miaosha_goods_list')
        goods_id = kwargs.get('goods_id')

        zhe_800_miaosha = Zhe800Parse()
        res = False
        for item_1 in miaosha_goods_list:
            if item_1.get('zid', '') == goods_id:
                zhe_800_miaosha.get_goods_data(goods_id=goods_id)
                goods_data = zhe_800_miaosha.deal_with_data()
                if goods_data == {}:
                    # parser returned empty data -> skip
                    break
                else:
                    # parsed ok -> fill flash-sale fields and write through
                    goods_data['stock_info'] = item_1.get('stock_info')
                    goods_data['goods_id'] = str(item_1.get('zid'))
                    if item_1.get('stock_info').get('activity_stock') > 0:
                        # self.lg.info(item_1.get('price'))
                        # self.lg.info(item_1.get('taobao_price'))
                        goods_data['price'] = item_1.get('price')
                        goods_data['taobao_price'] = item_1.get('taobao_price')
                    else:
                        # activity stock is 0 -> take it off the shelves
                        self.lg.info('该商品参与活动的对应库存为0')
                        res = _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            logger=self.lg,
                            update_sql_str=z8_update_str_6,
                            sql_cli=self.sql_cli,
                        )
                        break

                    goods_data['sub_title'] = item_1.get('sub_title')
                    goods_data['miaosha_time'] = item_1.get('miaosha_time')
                    goods_data['miaosha_begin_time'], goods_data[
                        'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                            miaosha_time=item_1.get('miaosha_time'))
                    if goods_data.get('is_delete', 0) == 1:
                        self.lg.info('该商品[{0}]已售罄...'.format(goods_id))

                    res = zhe_800_miaosha.to_update_zhe_800_xianshimiaosha_table(
                        data=goods_data, pipeline=self.sql_cli)
                    break
            else:
                pass

        collect()

        return res

    async def _get_new_z8_obj(self, index):
        # periodically recreate the spike object: sharing one object for too
        # long makes its driver access misbehave
        if index % 10 == 0:
            try:
                del self.zhe_800_spike
            except:
                pass
            collect()
            self.zhe_800_spike = Zhe800Spike()

    async def _is_recent_time(self, timestamp) -> int:
        '''
        Classify a timestamp relative to the current Shanghai time.

        :param timestamp: unix timestamp
        :return: 0: expired (restore original price)
                 1: inside the to-update window
                 2: future time, not updated yet
        '''
        time_1 = int(timestamp)
        time_2 = datetime_to_timestamp(get_shanghai_time())  # current timestamp

        diff_time = time_1 - time_2
        if diff_time < -259200:  # 72h, so the backend can take goods off the shelves in sync
            # if diff_time < -172800:  # (original rule) 48h: update only the past 48h plus the next 2h
            return 0  # expired, restore original price
        elif diff_time > -172800 and diff_time < 7200:
            return 1  # yesterday's & today's, i.e. pending update
        else:
            return 2  # future time, no update needed yet

    async def _update_db(self):
        '''
        Real-time flash-sale update loop (runs forever).

        :return:
        '''
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(tasks_params_list=result, step=self.concurrency)
                self.zhe_800_spike = Zhe800Spike()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:
                        # everything consumed -> normal exit
                        break

                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(self.loop.create_task(
                            self._update_one_goods_info(item=item, index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)

                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                # no updates after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(2.5 * 60)

            try:
                del self.zhe_800_spike
            except:
                pass
            collect()

    async def _get_miaoshao_goods_info_list(self, data) -> list:
        '''
        Extract the useful flash-sale goods info.

        :param data: raw deal dicts to normalize
        :return: list of normalized goods dicts
        '''
        miaosha_goods_list = []
        # pprint(data)
        for item in data:
            if item == {}:
                continue

            # pprint(item)
            tmp = {}
            tmp['miaosha_time'] = {
                'miaosha_begin_time': timestamp_to_regulartime(int(str(item.get('begin_time'))[0:10])),
                'miaosha_end_time': timestamp_to_regulartime(int(str(item.get('end_time'))[0:10])),
            }
            # 折800 goods id
            tmp['zid'] = item.get('zid')
            # flash-sale stock info
            tmp['stock_info'] = {
                'activity_stock': item.get('activity_stock', 0),  # activity_stock: remaining flash-sale quantity
                'stock': item.get('stock', 0),                    # stock: total flash-sale stock
            }
            # original price, float
            tmp['price'] = float(item.get('list_price'))
            # flash-sale price, float
            tmp['taobao_price'] = float(item.get('price'))
            tmp['sub_title'] = item.get('description', '')
            miaosha_goods_list.append(tmp)
        # pprint(miaosha_goods_list)

        return miaosha_goods_list

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        try:
            del self.zhe_800_spike
        except:
            pass
        collect()
class CCUpdater(AsyncCrawler):
    """楚楚街 (chuchujie) flash-sale real-time updater (async)."""

    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/楚楚街/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        self.sql_cli = None
        self.concurrency = 8  # concurrency (tasks per slice)
        self.goods_index = 1
        self.delete_sql_str = cc_delete_str_1

    async def _get_pc_headers(self):
        """Build request headers for the chuchujie api."""
        headers = await async_get_random_headers(
            upgrade_insecure_requests=False,
        )
        headers.update({
            'accept': 'application/json,text/javascript,*/*;q=0.01',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'api.chuchujie.com',
            'referer': 'https://m.chuchujie.com/?module=99',
        })

        return headers

    async def _get_db_old_data(self) -> (list, None):
        """
        Fetch the rows pending update from the db.

        :return: list of rows, or None when the db connection failed
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            self.sql_cli._delete_table(sql_str=cc_delete_str_2)
            await async_sleep(5)
            result = list(self.sql_cli._select_table(sql_str=cc_select_str_1))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_new_cc_obj(self, index):
        # periodically recreate the parser: sharing one object for too long
        # makes its driver access misbehave
        if index % 10 == 0:
            try:
                del self.chuchujie_miaosha
            except:
                pass
            collect()
            self.chuchujie_miaosha = ChuChuJie_9_9_Parse()

        return

    async def _update_one_goods_info(self, item, index):
        '''
        Update a single goods.

        :param item: db row: (goods_id, miaosha_time, gender, page)
        :param index: running index of this goods
        :return: (goods_id, res) where res is the update/delete result flag
        '''
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        gender = item[2]
        page = item[3]
        miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
            miaosha_time=miaosha_time,
            logger=self.lg,
        )
        await self._get_new_cc_obj(index=index)
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
            remainder=25,
        )

        if self.sql_cli.is_connect_success:
            # NOTE: classification here uses the END time (other updaters use begin time)
            is_recent_time = await self._is_recent_time(miaosha_end_time)
            if is_recent_time == 0:
                # expired: logically delete (take off the shelves)
                res = _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=self.lg,
                    update_sql_str=cc_update_str_2,
                    sql_cli=self.sql_cli,
                )
                self.lg.info('过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'.format(
                    goods_id,
                    timestamp_to_regulartime(miaosha_end_time)))
                await async_sleep(.3)
                index += 1
                self.goods_index = index

                return goods_id, res

            elif is_recent_time == 2:
                if datetime_to_timestamp(get_shanghai_time()) > miaosha_end_time:
                    # already ended -> logically delete
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=cc_update_str_2,
                        sql_cli=self.sql_cli,
                    )
                    self.lg.info(
                        '过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'.format(
                            goods_id,
                            timestamp_to_regulartime(miaosha_end_time)))
                else:
                    pass
                index += 1
                self.goods_index = index

                return goods_id, res

            else:
                # is_recent_time == 1: inside the to-update window
                # memory optimization: objects are created inside the loop and
                # deleted/collected afterwards instead of living outside it
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.format(goods_id, index))
                body = await self._get_one_page_goods_info(gender, page)
                if body == '':
                    index += 1
                    self.goods_index = index
                    await async_sleep(.3)

                    return goods_id, res

                json_body = json_2_dict(body, default_res={})
                try:
                    this_page_total_count = json_body.get('data', {}).get(
                        'groupList', [])[0].get('totalCount', 0)
                except IndexError:
                    self.lg.error('获取this_page_total_count时出错, 请检查!')
                    this_page_total_count = 0

                item_list = await self._get_item_list(
                    this_page_total_count=this_page_total_count,
                    json_body=json_body)
                if item_list == []:
                    self.lg.info(
                        '#### 该gender, page对应得到的item_list为空[]!\n该商品已被下架限时秒杀活动,此处将其删除')
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=item[0],
                        logger=self.lg,
                        update_sql_str=cc_update_str_2,
                        sql_cli=self.sql_cli,
                    )
                    self.lg.info('下架的goods_id为({}), 删除成功!'.format(goods_id))
                    await async_sleep(.3)
                    index += 1
                    self.goods_index = index

                    return goods_id, res

                else:
                    res = await self._one_update(goods_id=goods_id, item_list=item_list)
        else:
            # db connection failed (possibly down for maintenance)
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
            pass

        index += 1
        self.goods_index = index
        collect()
        await async_sleep(CHUCHUJIE_SLEEP_TIME)

        return goods_id, res

    async def _update_db(self) -> None:
        '''
        Real-time flash-sale update loop (runs forever).

        :return:
        '''
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(tasks_params_list=result, step=self.concurrency)
                self.chuchujie_miaosha = ChuChuJie_9_9_Parse()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:
                        # everything consumed -> normal exit
                        break

                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(self.loop.create_task(
                            self._update_one_goods_info(item=item, index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)

                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                # no updates after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(2.5 * 60)

            try:
                del self.chuchujie_miaosha
            except:
                pass
            collect()

    async def _get_item_list(self, **kwargs) -> list:
        '''
        Get the goods list for the given gender / page.

        :param kwargs: this_page_total_count, json_body
        :return: list of {'goods_id', 'sub_title'} dicts (empty when count is 0)
        '''
        this_page_total_count = kwargs.get('this_page_total_count')
        json_body = kwargs.get('json_body')

        # NOTE(review): this indexes groupList[0] unconditionally — if
        # 'groupList' is empty this raises IndexError even though the caller
        # already set this_page_total_count to 0 for that case; confirm intended
        tmp_goods_list = json_body.get('data', {}).get('groupList', [])[0].get('dataList', [])
        item_list = [{
            'goods_id': str(item_s.get('chuchuId', '')),
            'sub_title': item_s.get('description', ''),
        } for item_s in tmp_goods_list] if this_page_total_count != 0 else []

        return item_list

    async def _one_update(self, **kwargs):
        '''
        Update a goods that is still on sale.

        :param kwargs: goods_id, item_list
        :return: update result flag
        '''
        res = False
        goods_id = kwargs.get('goods_id')
        item_list = kwargs.get('item_list')

        # miaosha_goods_all_goods_id = [item_1.get('goods_id', '') for item_1 in item_list]
        # goods are never taken off internally ahead of time, so everything
        # inside the sale window is simply updated
        # if item[0] not in miaosha_goods_all_goods_id:  # 内部已经下架的
        #     self.lg.info('该商品已被下架限时秒杀活动,此处将其删除')
        #     tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(goods_id))
        #     self.lg.info('下架的goods_id为({}), 删除成功!'.format(goods_id))
        #     pass
        # else:  # 未下架的
        # do not update miaosha_time / sub_title, only the other data
        # for item_2 in item_list:
        #     if item_2.get('goods_id', '') == goods_id:
        self.chuchujie_miaosha.get_goods_data(goods_id=goods_id)
        goods_data = self.chuchujie_miaosha.deal_with_data()
        if goods_data == {}:
            # parser returned empty data -> skip
            pass
        else:
            goods_data['goods_id'] = str(goods_id)
            # goods_data['sub_title'] = item_2.get('sub_title', '')
            # print(goods_data)
            res = self.chuchujie_miaosha.update_chuchujie_xianshimiaosha_table(
                data=goods_data, pipeline=self.sql_cli)

        return res

    async def _get_one_page_goods_info(self, *params) -> str:
        '''
        Fetch the raw body of one listing page.

        :param params: (gender, page)
        :return: '{}' or the response body str
        '''
        gender, page = params
        tmp_url = 'https://api.chuchujie.com/api/'
        client = {
            "ageGroup": "AG_0to24",
            "channel": "QD_web_webkit",
            "deviceId": "0",
            "gender": gender,  # '0' -> female | '1' -> male
            "imei": "0",
            "packageName": "com.culiu.purchase",
            "platform": "wap",
            "sessionId": "0",
            "shopToken": "0",
            "userId": "0",
            "version": "1.0",
            "xingeToken": ""
        }
        query = {"group": 4, "module": "99", "page": page, "tab": "all"}
        # NB: these are Query String Parameters — encode and send as-is;
        # post data would need the post method instead
        data = {
            'client': json.dumps(client),
            'query': json.dumps(query),
            'page': page
        }
        body = Requests.get_url_body(url=tmp_url,
                                     headers=self.headers,
                                     params=data,
                                     ip_pool_type=self.ip_pool_type)

        return body

    async def _is_recent_time(self, timestamp) -> int:
        '''
        Classify a timestamp relative to the current Shanghai time.

        :param timestamp: unix timestamp
        :return: 0: expired (restore original price)
                 1: inside the to-update window
                 2: future time (or recently expired, kept for later deletion)
        '''
        time_1 = int(timestamp)
        time_2 = datetime_to_timestamp(get_shanghai_time())  # current timestamp

        diff_time = time_1 - time_2
        # if diff_time < -86400:  # 24h, so the backend can take goods off the shelves in sync
        if diff_time < -100000:  # slightly larger so goods still on sale are not removed
            # if diff_time < 0:  # (original rule) end time already passed
            return 0  # expired, restore original price
        elif diff_time > 0:
            return 1  # yesterday's & today's, i.e. pending update
        else:
            # expired but still inside the waiting window: do not delete yet
            # (deleted once it is <= -24h)
            return 2

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        try:
            del self.chuchujie_miaosha
        except:
            pass
        collect()
def run_forever():
    """折800 pintuan real-time update loop (blocking, runs forever)."""
    while True:
        #### real-time update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=z8_delete_str_1)
            result = list(sql_cli._select_table(sql_str=z8_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result)
            index = 1
            for item in result:
                goods_id = item[0]
                db_is_delete = item[1]
                # memory optimization: create the parser inside the loop and
                # delete it after use instead of keeping one outside
                zhe_800_pintuan = Zhe800PintuanParse()
                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    remainder=50,
                )
                if index % 300 == 0:
                    # rest 3 minutes every 300 updates
                    # NOTE(review): the message below prints only AFTER the sleep ends
                    sleep_time = 3 * 60
                    sleep(sleep_time)
                    print('休眠{}s中...'.format(sleep_time))

                if sql_cli.is_connect_success:
                    tmp_tmp = zhe_800_pintuan.get_goods_data(goods_id=goods_id)  # return value no longer relied on: it affected goods in the normal state
                    try:
                        # handle the "goods page does not exist" case separately
                        # (body starting with 'ze' marks a missing page)
                        if isinstance(tmp_tmp, str) and re.compile(
                                r'^ze').findall(tmp_tmp) != []:
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=z8_update_str_4,
                                sql_cli=sql_cli,
                            )
                            sleep(ZHE_800_PINTUAN_SLEEP_TIME)
                            continue
                        else:
                            pass
                    except:
                        pass

                    data = zhe_800_pintuan.deal_with_data()
                    if data != {}:
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))
                        data['goods_id'] = goods_id
                        if db_is_delete == 1:
                            # already expired according to the db -> logically delete
                            print('该goods_id[{0}]已过期!'.format(goods_id))
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=z8_update_str_4,
                                sql_cli=sql_cli,
                            )
                        else:
                            zhe_800_pintuan.to_right_and_update_data(
                                data=data, pipeline=sql_cli)
                    else:
                        # parser returned empty data
                        pass
                else:
                    # db connection failed (possibly down for maintenance)
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass

                index += 1
                try:
                    del zhe_800_pintuan
                except:
                    pass
                collect()
                sleep(ZHE_800_PINTUAN_SLEEP_TIME)

            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(10 * 60)
        collect()
class MIUpdater(AsyncCrawler):
    """蜜芽 (mia) flash-sale real-time updater (async)."""

    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/蜜芽/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        self.delete_sql_str = mia_delete_str_3
        self.concurrency = 8  # concurrency (tasks per slice)
        self.tmp_sql_server = None
        self.goods_index = 1

    async def _get_pc_headers(self) -> dict:
        """Build request headers for m.mia.com."""
        headers = await async_get_random_headers(
            upgrade_insecure_requests=False,
        )
        headers.update({
            'Host': 'm.mia.com',
        })

        return headers

    async def _get_db_old_data(self):
        """
        Fetch the rows pending update from the db.

        :return: list of rows, or None when the db connection failed
        """
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            self.tmp_sql_server._delete_table(sql_str=mia_delete_str_4)
            await async_sleep(5)
            result = list(
                self.tmp_sql_server._select_table(sql_str=mia_select_str_3))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_miaosha_end_time(self, miaosha_time):
        """
        Extract the flash-sale end time as a unix timestamp.
        NOTE(review): not referenced anywhere inside this class — confirm callers.

        :param miaosha_time: json str containing 'miaosha_end_time'
        :return: 10-digit int unix timestamp
        """
        miaosha_end_time = json.loads(miaosha_time).get('miaosha_end_time')
        miaosha_end_time = int(
            str(
                time.mktime(
                    time.strptime(miaosha_end_time,
                                  '%Y-%m-%d %H:%M:%S')))[0:10])

        return miaosha_end_time

    async def _get_new_mia_obj(self, index):
        # periodically recreate the parser: sharing one object for too long
        # makes its driver access misbehave
        if index % 10 == 0:
            try:
                del self.mia_miaosha
            except:
                pass
            collect()
            self.mia_miaosha = MiaParse()

    async def _update_one_goods_info(self, item, index) -> tuple:
        '''
        Update a single goods.

        :param item: db row: (goods_id, miaosha_time, pid)
        :param index: running index of this goods
        :return: (goods_id, res) where res is the update/delete result flag
        '''
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        pid = item[2]
        miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
            miaosha_time=miaosha_time,
            logger=self.lg,
        )
        await self._get_new_mia_obj(index)
        self.tmp_sql_server = await _get_new_db_conn(
            db_obj=self.tmp_sql_server,
            index=index,
            logger=self.lg,
            remainder=30,
        )

        if self.tmp_sql_server.is_connect_success:
            # NOTE: classification here uses the END time
            is_recent_time = await self._is_recent_time(miaosha_end_time)
            if is_recent_time == 0:
                # expired: logically delete (take off the shelves)
                res = _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=self.lg,
                    update_sql_str=mia_update_str_6,
                    sql_cli=self.tmp_sql_server,
                )
                self.lg.info('过期的goods_id为({}), 限时秒杀开始时间为({}), 删除成功!'.format(
                    goods_id,
                    timestamp_to_regulartime(miaosha_begin_time)))
                await async_sleep(.5)
                self.goods_index = index + 1

                return goods_id, res

            elif is_recent_time == 2:
                if datetime_to_timestamp(get_shanghai_time()) > miaosha_end_time:
                    # already ended -> logically delete
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=mia_update_str_6,
                        sql_cli=self.tmp_sql_server,
                    )
                    self.lg.info(
                        '过期的goods_id为({}), 限时秒杀开始时间为({}), 删除成功!'.format(
                            goods_id,
                            timestamp_to_regulartime(miaosha_begin_time)))
                else:
                    pass
                self.goods_index = index + 1

                return goods_id, res

            else:
                # is_recent_time == 1: inside the to-update window
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.format(goods_id, index))
                tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(pid)
                body = Requests.get_url_body(url=tmp_url,
                                             headers=self.headers,
                                             had_referer=True,
                                             ip_pool_type=self.ip_pool_type)
                # print(body)
                # normalize empty responses ('[]' means no data either)
                body = '' if body == '' or body == '[]' else body
                try:
                    tmp_data = json_2_dict(
                        json_str=body,
                        default_res={},
                        logger=self.lg,
                    )
                    assert tmp_data != {}, 'tmp_data为空dict!'
                except AssertionError:
                    self.lg.error('遇到错误:', exc_info=True)
                    self.goods_index = index + 1
                    await async_sleep(.3)

                    return goods_id, res

                item_list = tmp_data.get('item_list', [])
                # all goods_ids currently present under this pid
                miaosha_goods_all_goods_id = [
                    item_1.get('item_id', '') for item_1 in item_list
                ]
                # self.lg.info(str(miaosha_goods_all_goods_id))
                if goods_id not in miaosha_goods_all_goods_id:
                    # taken off the sale internally -> logically delete
                    self.lg.info('该商品已被下架限时秒杀活动,此处将其删除')
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=mia_update_str_6,
                        sql_cli=self.tmp_sql_server,
                    )
                    self.lg.info('下架的goods_id为({}), 删除成功!'.format(goods_id))
                    self.goods_index = index + 1
                    await async_sleep(.3)

                    return goods_id, res

                else:
                    # still on sale -> update it
                    res = await self._one_update(
                        item_list=item_list,
                        goods_id=goods_id,
                        tmp_data=tmp_data,
                    )
        else:
            # db connection failed (possibly down for maintenance)
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')

        await async_sleep(MIA_SPIKE_SLEEP_TIME)  # throttle the request rate
        self.goods_index = index + 1
        collect()

        return goods_id, res

    async def _update_db(self) -> None:
        '''
        Real-time flash-sale update loop (runs forever).

        :return:
        '''
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(tasks_params_list=result, step=self.concurrency)
                self.mia_miaosha = MiaParse()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:
                        # everything consumed -> normal exit
                        break

                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(self.loop.create_task(
                            self._update_one_goods_info(item=item, index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)

                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                # no updates after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(2.5 * 60)

            try:
                del self.mia_miaosha
            except:
                pass
            collect()

    async def _one_update(self, **kwargs) -> bool:
        '''
        Update a goods that is still on sale.

        :param kwargs: item_list, goods_id, tmp_data (raw promotion payload)
        :return: update result flag
        '''
        res = False
        item_list = kwargs.get('item_list')
        goods_id = kwargs.get('goods_id')
        tmp_data = kwargs.get('tmp_data')

        begin_time, end_time = await self._get_begin_time_and_end_time(tmp_data)
        for item_2 in item_list:
            if item_2.get('item_id', '') == goods_id:
                self.mia_miaosha.get_goods_data(goods_id=goods_id)
                goods_data = self.mia_miaosha.deal_with_data()
                if goods_data == {}:
                    # parser returned empty data -> skip
                    pass
                else:
                    goods_data['goods_id'] = str(goods_id)
                    goods_data['price'] = item_2.get('active_price')
                    goods_data['taobao_price'] = item_2.get('active_price')
                    goods_data['sub_title'] = item_2.get('short_info', '')
                    goods_data['miaosha_time'] = {
                        'miaosha_begin_time': timestamp_to_regulartime(begin_time),
                        'miaosha_end_time': timestamp_to_regulartime(end_time),
                    }
                    goods_data['miaosha_begin_time'], goods_data[
                        'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                            miaosha_time=goods_data['miaosha_time'])
                    res = self.mia_miaosha.update_mia_xianshimiaosha_table(
                        data=goods_data, pipeline=self.tmp_sql_server)
                break
            else:
                pass

        return res

    async def _get_begin_time_and_end_time(self, tmp_data) -> tuple:
        """
        Extract the promotion begin/end times from the raw payload.

        :param tmp_data: promotion dict carrying 'p_info'
        :return: (begin_time, end_time) as int unix timestamps
        """
        begin_time = tmp_data.get('p_info', {}).get('start_time', '')
        end_time = tmp_data.get('p_info', {}).get('end_time', '')
        # convert the str form into unix timestamps
        begin_time = int(
            time.mktime(time.strptime(begin_time, '%Y/%m/%d %H:%M:%S')))
        end_time = int(
            time.mktime(time.strptime(end_time, '%Y/%m/%d %H:%M:%S')))

        return begin_time, end_time

    async def _is_recent_time(self, timestamp) -> int:
        '''
        Classify a timestamp relative to the current Shanghai time.

        :param timestamp: unix timestamp
        :return: 0: expired (restore original price)
                 1: inside the to-update window
                 2: future time (or expired < 24h: kept, deleted later)
        '''
        time_1 = int(timestamp)
        time_2 = datetime_to_timestamp(get_shanghai_time())  # current timestamp

        diff_time = time_1 - time_2
        if diff_time < -86400:  # 24 hours, so the backend can take goods off the shelves in sync
            # if diff_time < 0:  # (original rule) end time already passed
            return 0  # expired, restore original price
        elif diff_time > 0:
            return 1  # yesterday's & today's, i.e. pending update
        else:
            # expired but still inside the waiting window: do not delete yet
            # (deleted once it is <= -24h)
            return 2

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        try:
            del self.mia_miaosha
        except:
            pass
        collect()
    def run_forever(self):
        '''
        Real-time update of mia flash-sale goods (blocking, single pass
        body executed by the caller's loop).

        :return:
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select goods_id, miaosha_time, pid from dbo.mia_xianshimiaosha where site_id=20'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:
                # item: (goods_id, miaosha_time, pid)
                miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
                miaosha_end_time = int(
                    str(
                        time.mktime(
                            time.strptime(miaosha_end_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_end_time)
                data = {}
                # memory optimization: create the parser inside the loop and
                # delete it after use instead of keeping one outside
                mia_miaosha = MiaParse()
                if index % 50 == 0:
                    # reconnect every 50 iterations to avoid a stale long-lived
                    # connection going unresponsive
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    if self.is_recent_time(miaosha_end_time) == 0:
                        # expired -> delete the row
                        tmp_sql_server._delete_table(
                            sql_str=self.delete_sql_str, params=(item[0]))
                        print(
                            '过期的goods_id为(%s)' % item[0],
                            ', 限时秒杀开始时间为(%s), 删除成功!' %
                            json.loads(item[1]).get('miaosha_begin_time'))
                    elif self.is_recent_time(miaosha_end_time) == 2:
                        # break  # exit the loop
                        # must be `pass`, not `break`: goods_ids returned by the
                        # db are not guaranteed to be ordered by time
                        pass
                    else:
                        # returned 1: inside the to-update window
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (item[0], index))
                        data['goods_id'] = item[0]
                        # print('------>>>| 爬取到的数据为: ', data)
                        tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(
                            item[2])
                        body = MyRequests.get_url_body(url=tmp_url,
                                                       headers=self.headers,
                                                       had_referer=True)
                        # print(body)
                        if body == '' or body == '[]':
                            print('获取到的body为空值! 此处跳过')
                        else:
                            try:
                                tmp_data = json.loads(body)
                            except:
                                tmp_data = {}
                                print('json.loads转换body时出错, 此处跳过!')

                            begin_time = tmp_data.get('p_info', {}).get(
                                'start_time', '')
                            end_time = tmp_data.get('p_info', {}).get('end_time', '')
                            # convert the str form into unix timestamps
                            begin_time = int(
                                time.mktime(
                                    time.strptime(begin_time, '%Y/%m/%d %H:%M:%S')))
                            end_time = int(
                                time.mktime(
                                    time.strptime(end_time, '%Y/%m/%d %H:%M:%S')))

                            item_list = tmp_data.get('item_list', [])
                            # all goods_ids currently present under this pid
                            miaosha_goods_all_goods_id = [
                                item_1.get('item_id', '') for item_1 in item_list
                            ]
                            if item[0] not in miaosha_goods_all_goods_id:
                                # taken off the sale internally -> delete
                                print('该商品已被下架限时秒杀活动,此处将其删除')
                                tmp_sql_server._delete_table(
                                    sql_str=self.delete_sql_str, params=(item[0]))
                                print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                                pass
                            else:
                                # still on sale -> update it
                                for item_2 in item_list:
                                    if item_2.get('item_id', '') == item[0]:
                                        mia_miaosha.get_goods_data(
                                            goods_id=item[0])
                                        goods_data = mia_miaosha.deal_with_data()
                                        if goods_data == {}:
                                            # parser returned empty data -> skip
                                            pass
                                        else:
                                            goods_data['goods_id'] = str(item[0])
                                            goods_data['price'] = item_2.get('active_price')
                                            goods_data['taobao_price'] = item_2.get('active_price')
                                            goods_data['sub_title'] = item_2.get('short_info', '')
                                            goods_data['miaosha_time'] = {
                                                'miaosha_begin_time':
                                                timestamp_to_regulartime(begin_time),
                                                'miaosha_end_time':
                                                timestamp_to_regulartime(end_time),
                                            }
                                            goods_data['miaosha_begin_time'], goods_data[
                                                'miaosha_end_time'] = self.get_miaosha_begin_time_and_miaosha_end_time(
                                                    miaosha_time=goods_data['miaosha_time'])
                                            # pprint(goods_data)
                                            # print(goods_data)
                                            mia_miaosha.update_mia_xianshimiaosha_table(
                                                data=goods_data,
                                                pipeline=tmp_sql_server)
                                            sleep(MIA_SPIKE_SLEEP_TIME)  # throttle the request rate
                                    else:
                                        pass
                else:
                    # db connection failed (possibly down for maintenance)
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass

                index += 1
                gc.collect()

            print('全部数据更新完毕'.center(100, '#'))

        # sleep(60*60)
        if get_shanghai_time().hour == 0:
            # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
class JPUpdater(AsyncCrawler):
    """卷皮 (juanpi) flash-sale real-time updater (async)."""

    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/卷皮/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        self.tmp_sql_server = None
        self.concurrency = 8  # concurrency (tasks per slice)
        self.goods_index = 1
        self.delete_sql_str = jp_delete_str_3

    async def _get_pc_headers(self) -> dict:
        """Build request headers for m.juanpi.com."""
        return {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'm.juanpi.com',
            'User-Agent': get_random_pc_ua(),  # random user agent
        }

    async def _get_db_old_data(self) -> (None, list):
        """
        Fetch the rows pending update from the db.

        :return: list of rows, or None when the db connection failed
        """
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            self.tmp_sql_server._delete_table(sql_str=jp_delete_str_4, params=None)
            await async_sleep(5)
            result = list(self.tmp_sql_server._select_table(sql_str=jp_select_str_4))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_miaosha_begin_time(self, miaosha_time) -> int:
        """
        Extract the flash-sale begin time as a unix timestamp.

        :param miaosha_time: json str containing 'miaosha_begin_time'
        :return: 10-digit int unix timestamp
        """
        miaosha_begin_time = json_2_dict(miaosha_time).get('miaosha_begin_time')
        miaosha_begin_time = int(str(time.mktime(time.strptime(miaosha_begin_time, '%Y-%m-%d %H:%M:%S')))[0:10])

        return miaosha_begin_time

    async def _get_new_jp_obj(self, index):
        # periodically recreate the parser: sharing one object for too long
        # makes its driver access misbehave
        if index % 10 == 0:
            try:
                del self.juanpi_miaosha
            except:
                pass
            collect()
            self.juanpi_miaosha = JuanPiParse()

    async def _update_one_goods_info(self, item, index) -> tuple:
        '''
        Update a single goods.

        :param item: db row: (goods_id, miaosha_time, tab_id, page)
        :param index: running index of this goods
        :return: (goods_id, res) where res is the update/delete result flag
        '''
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        tab_id = item[2]
        page = item[3]
        miaosha_begin_time = await self._get_miaosha_begin_time(miaosha_time)
        # self.lg.info(str(miaosha_begin_time))
        await self._get_new_jp_obj(index=index)
        self.tmp_sql_server = await _get_new_db_conn(db_obj=self.tmp_sql_server, index=index, logger=self.lg, remainder=30)

        if self.tmp_sql_server.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_begin_time)
            if is_recent_time == 0:
                # expired: logically delete (take off the shelves)
                res = self.tmp_sql_server._update_table(sql_str=jp_update_str_6, params=(goods_id,))
                self.lg.info('过期的goods_id为({}), 限时秒杀开始时间为({}), 逻辑删除成功!'.format(goods_id, miaosha_begin_time))
                await async_sleep(.3)
                index += 1
                self.goods_index = index

                return goods_id, res

            elif is_recent_time == 2:
                # future time -> skip for now
                self.lg.info('goods_id: {}, 未来时间跳过更新...'.format(goods_id))
                index += 1
                self.goods_index = index

                return goods_id, res

            else:
                # is_recent_time == 1: inside the to-update window
                self.lg.info('------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.format(goods_id, index))
                tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                    str(tab_id),
                    str(page),
                )
                # self.lg.info('待爬取的tab_id, page地址为: {}'.format(tmp_url))
                body = Requests.get_url_body(url=tmp_url, headers=await self._get_pc_headers(), ip_pool_type=self.ip_pool_type)
                try:
                    data = json_2_dict(body, default_res={}).get('data', {})
                    assert data != {}, 'data为空dict!'
                    data = data.get('goodslist', [])
                    assert data != [], 'tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(tab_id, page)
                except AssertionError:
                    self.lg.error(msg='遇到错误:', exc_info=True)
                    index += 1
                    self.goods_index = index
                    await async_sleep(.3)

                    return goods_id, res

                miaosha_goods_list = await self._get_miaoshao_goods_info_list(data=data)
                # self.lg.info(str(miaosha_goods_list))

                # all goods_ids currently present in this (tab_id, page)
                miaosha_goods_all_goods_id = [i.get('goods_id') for i in miaosha_goods_list]
                self.lg.info(str(miaosha_goods_all_goods_id))
                if goods_id not in miaosha_goods_all_goods_id:
                    if miaosha_goods_all_goods_id != []:
                        # empirically a non-empty list means the goods is NOT
                        # off the shelves -> skip
                        self.lg.info('该商品[{}]未下架, 此处不进行更新跳过!!'.format(goods_id))
                    else:
                        # the (tab_id, page) no longer contains this goods_id
                        res = self.tmp_sql_server._update_table(sql_str=jp_update_str_6, params=(goods_id,))
                        self.lg.info('该商品[goods_id为({})]已被下架限时秒杀活动,此处将其逻辑删除'.format(goods_id))
                    index += 1
                    self.goods_index = index
                    await async_sleep(.3)

                    return goods_id, res

                else:
                    # still on sale -> update it
                    res = await self._one_update(miaosha_goods_list=miaosha_goods_list, goods_id=goods_id)
        else:
            # db connection failed (possibly down for maintenance)
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')

        index += 1
        self.goods_index = index
        await async_sleep(1.2)

        return goods_id, res

    async def _update_db(self) -> None:
        '''
        Real-time flash-sale update loop (runs forever).

        :return:
        '''
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(tasks_params_list=result, step=self.concurrency)
                self.juanpi_miaosha = JuanPiParse()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:
                        # everything consumed -> normal exit
                        break

                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(self.loop.create_task(self._update_one_goods_info(item=item, index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)

                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                # no updates after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(2.5 * 60)

            try:
                del self.juanpi_miaosha
            except:
                pass
            collect()

    async def _one_update(self, **kwargs) -> bool:
        '''
        Update a goods that is still on sale.

        :param kwargs: miaosha_goods_list, goods_id
        :return: update result flag
        '''
        res = False
        miaosha_goods_list = kwargs.get('miaosha_goods_list')
        goods_id = kwargs.get('goods_id')

        for item_1 in miaosha_goods_list:
            if item_1.get('goods_id', '') == goods_id:
                self.juanpi_miaosha.get_goods_data(goods_id=goods_id)
                goods_data = self.juanpi_miaosha.deal_with_data()
                if goods_data == {}:
                    # parser returned empty data -> skip
                    break
                else:
                    # parsed ok -> fill flash-sale fields and write through
                    goods_data['stock_info'] = item_1.get('stock_info')
                    goods_data['goods_id'] = item_1.get('goods_id')
                    # goods_data['username'] = '******'
                    if item_1.get('stock_info').get('activity_stock') > 0:
                        goods_data['price'] = item_1.get('price')                  # original special price before the flash sale
                        goods_data['taobao_price'] = item_1.get('taobao_price')    # flash-sale price
                    else:
                        pass
                    goods_data['sub_title'] = item_1.get('sub_title', '')
                    goods_data['miaosha_time'] = item_1.get('miaosha_time')
                    goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=item_1.get('miaosha_time'))
                    res = self.juanpi_miaosha.to_update_juanpi_xianshimiaosha_table(
                        data=goods_data, pipeline=self.tmp_sql_server)
                    await async_sleep(.3)  # avoid going too fast
                    break
            else:
                pass

        return res

    async def _get_miaoshao_goods_info_list(self, data) -> list:
        '''
        Extract the useful flash-sale goods info.

        :param data: raw goodslist entries to normalize
        :return: list of normalized goods dicts
        '''
        miaosha_goods_list = []
        for item in data:
            tmp = {}
            tmp['miaosha_time'] = {
                'miaosha_begin_time': timestamp_to_regulartime(int(item.get('start_time'))),
                'miaosha_end_time': timestamp_to_regulartime(int(item.get('end_time'))),
            }
            stock = item.get('stock', 0)  # NOTE(review): unused local — 'stock' is re-read below
            tmp['goods_id'] = item.get('goods_id')
            # flash-sale stock info
            tmp['stock_info'] = {
                'activity_stock': int(item.get('stock', 0)*(item.get('rate', 0)/100)),
                'stock': item.get('stock', 0),
            }
            # original price
            tmp['price'] = round(float(item.get('oprice', '0')), 2)
            tmp['taobao_price'] = round(float(item.get('cprice', '0')), 2)
            miaosha_goods_list.append(tmp)

        return miaosha_goods_list

    async def _is_recent_time(self, timestamp) -> int:
        '''
        Classify a timestamp relative to now.

        :param timestamp: unix timestamp
        :return: 0: expired (restore original price)
                 1: inside the to-update window
                 2: future time, not updated yet
        '''
        time_1 = int(timestamp)
        time_2 = int(time.time())  # current timestamp

        diff_time = time_1 - time_2
        if diff_time < -259200:  # 72h, so the backend can take goods off the shelves in sync
            # if diff_time < -172800:  # (original rule) 48h: update the past 48h plus the next 14h (20:00 -> next-day 10:00 is 14h)
            return 0  # expired, restore original price
        elif diff_time > -172800 and diff_time < 50400:
            return 1  # yesterday's & today's, i.e. pending update
        else:
            return 2  # future time, no update needed yet

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
def run_forever():
    """Daemon loop: poll juanpi group-buy (pintuan) rows from SQL Server and
    refresh each goods record, deleting expired / sold-out ones.

    Runs forever; sleeps longer after midnight (Shanghai time).
    """
    while True:
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select goods_id, schedule, is_delete from dbo.juanpi_pintuan where site_id=18'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError as e:
            # _select_table returns a non-iterable when the DB connection failed
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # Parser is recreated periodically below to release memory held by
            # long-lived parser instances.
            juanpi_pintuan = JuanPiParse()
            for item in result:  # item: (goods_id, schedule_json, is_delete)
                data = {}
                if index % 6 == 0:
                    # Recycle the parser every 6 items to cap memory growth.
                    try:
                        del juanpi_pintuan
                    except:
                        pass
                    gc.collect()
                    juanpi_pintuan = JuanPiParse()
                if index % 50 == 0:
                    # Reconnect every 50 items to avoid a stale long-lived
                    # DB connection going unresponsive.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')
                if tmp_sql_server.is_connect_success:
                    # schedule is a JSON list; take the first entry's end_time
                    # and convert 'YYYY-mm-dd HH:MM:SS' -> unix timestamp.
                    pintuan_end_time = json.loads(item[1])[0].get('end_time')
                    pintuan_end_time = int(str(time.mktime(time.strptime(pintuan_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
                    if item[2] == 1 or pintuan_end_time < int(time.time()):
                        # Logically deleted upstream (is_delete == 1) or the
                        # group-buy window ended: remove the row.
                        sql_str = 'delete from dbo.juanpi_pintuan where goods_id=%s'
                        tmp_sql_server._delete_table(sql_str=sql_str, params=(item[0],))
                        print('该goods_id[{0}]已过期或者售完,删除成功!'.format(item[0]))
                    else:
                        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                        juanpi_pintuan.get_goods_data(goods_id=item[0])
                        data = juanpi_pintuan.deal_with_data()
                        if data != {}:
                            data['goods_id'] = item[0]
                            juanpi_pintuan.to_right_and_update_pintuan_data(data=data, pipeline=tmp_sql_server)
                        else:
                            # Parse returned nothing usable; skip silently.
                            pass
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
                sleep(1.2)  # throttle between items
            print('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:
            # After midnight: back off for 5.5 hours before the next sweep.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
def run_forever(self):
    """Real-time update of mia.com group-buy (pintuan) goods.

    Pulls (goods_id, miaosha_time, pid) rows, classifies each by
    ``self.is_recent_time`` (0=expired → delete, 2=future → skip,
    1=update window), re-fetches the pintuan list for the pid and
    refreshes the goods record.
    """
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    sql_str = 'select goods_id, miaosha_time, pid from dbo.mia_pintuan where site_id=21'
    try:
        result = list(tmp_sql_server._select_table(sql_str=sql_str))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        for item in result:  # item: (goods_id, miaosha_time_json, pid)
            # 'YYYY-mm-dd HH:MM:SS' -> unix timestamp (seconds).
            pintuan_end_time = json.loads(item[1]).get('end_time')
            pintuan_end_time = int(
                str(
                    time.mktime(
                        time.strptime(pintuan_end_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            data = {}
            # Fresh parser per item to limit memory held by parser state.
            mia_pintuan = MiaPintuanParse()
            if index % 50 == 0:
                # Reconnect every 50 items to avoid a stale DB connection.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(pintuan_end_time) == 0:
                    # Expired: delete the row.
                    # NOTE(review): params=(item[0]) is a bare value, not a
                    # 1-tuple (missing trailing comma) — confirm the pipeline
                    # accepts this.
                    tmp_sql_server._delete_table(
                        sql_str=self.delete_sql_str, params=(item[0]))
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 拼团开始时间为(%s), 删除成功!' %
                        json.loads(item[1]).get('begin_time'))
                elif self.is_recent_time(pintuan_end_time) == 2:
                    # Future item: pass (not break) — goods_ids from the DB
                    # are not guaranteed to be ordered by time.
                    pass
                else:  # returned 1: inside the update window
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))
                    data['goods_id'] = item[0]
                    # Fetch the current pintuan listing for this pid.
                    tmp_url = 'https://m.mia.com/instant/groupon/common_list/' + str(
                        item[2]) + '/0/'
                    body = MyRequests.get_url_body(url=tmp_url,
                                                   headers=self.headers,
                                                   had_referer=True)
                    if body == '':
                        print('获取到的body为空值! 此处跳过')
                    else:
                        try:
                            tmp_data = json.loads(body)
                        except:
                            tmp_data = {}
                            print('json.loads转换body时出错, 此处跳过!')
                        if tmp_data.get('data_list', []) == []:
                            # Listing gone: the goods left the flash-sale; delete.
                            print('得到的data_list为[]!')
                            print('该商品已被下架限时秒杀活动,此处将其删除')
                            tmp_sql_server._delete_table(
                                sql_str=self.delete_sql_str, params=(item[0]))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                            pass
                        else:
                            data_list = [{
                                'goods_id': item_2.get('sku', ''),
                                'sub_title': item_2.get('intro', ''),
                            } for item_2 in tmp_data.get('data_list', [])]
                            pintuan_goods_all_goods_id = [
                                item_1.get('goods_id', '')
                                for item_1 in data_list
                            ]
                            '''
                            蜜芽拼团不对内部下架的进行操作,一律都更新未过期商品
                            (根据pid来进行更新多次研究发现出现商品还在拼团,误删的情况很普遍)
                            '''
                            # Deliberately do NOT delete goods missing from the
                            # listing: pid-based checks misfire often, so both
                            # branches below update rather than delete.
                            if item[0] not in pintuan_goods_all_goods_id:
                                # Not in the listing — still update it.
                                mia_pintuan.get_goods_data(
                                    goods_id=item[0])
                                goods_data = mia_pintuan.deal_with_data()
                                if goods_data == {}:
                                    # Empty parse result: skip.
                                    pass
                                else:
                                    goods_data['goods_id'] = str(item[0])
                                    if goods_data['pintuan_time'] == {}:
                                        # No pintuan window = off-shelf; stamp
                                        # begin/end with "now" so it won't be
                                        # treated as a live deal.
                                        now_time = get_shanghai_time()
                                        goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = (
                                            now_time, now_time)
                                    else:
                                        goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = self.get_pintuan_begin_time_and_pintuan_end_time(
                                            pintuan_time=goods_data['pintuan_time'])
                                    mia_pintuan.update_mia_pintuan_table(
                                        data=goods_data, pipeline=tmp_sql_server)
                                    sleep(MIA_SPIKE_SLEEP_TIME)  # throttle
                            else:  # still listed: update with listing sub_title
                                for item_2 in data_list:
                                    if item_2.get('goods_id', '') == item[0]:
                                        mia_pintuan.get_goods_data(
                                            goods_id=item[0])
                                        goods_data = mia_pintuan.deal_with_data()
                                        if goods_data == {}:
                                            pass
                                        else:
                                            goods_data['goods_id'] = str(item[0])
                                            goods_data['sub_title'] = item_2.get(
                                                'sub_title', '')
                                            if goods_data['pintuan_time'] == {}:
                                                # Off-shelf: stamp with "now".
                                                now_time = get_shanghai_time()
                                                goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = (
                                                    now_time, now_time)
                                            else:
                                                goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = self.get_pintuan_begin_time_and_pintuan_end_time(
                                                    pintuan_time=goods_data['pintuan_time'])
                                            mia_pintuan.update_mia_pintuan_table(
                                                data=goods_data,
                                                pipeline=tmp_sql_server)
                                            sleep(MIA_SPIKE_SLEEP_TIME)  # throttle
                                    else:
                                        pass
            else:
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))
    if get_shanghai_time().hour == 0:
        # After midnight: back off for 5.5 hours before the next sweep.
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
def run_forever(self):
    """Real-time update of jumei.com flash-sale (miaosha) goods.

    Bootstraps session cookies via a PhantomJS driver, then walks the DB
    rows: expired goods are deleted, in-window goods are re-scraped from
    their listing page and updated.
    """
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        tmp_sql_server._delete_table(sql_str=jm_delete_str_2)
        result = list(tmp_sql_server._select_table(sql_str=jm_select_str_1))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        # Acquire cookies once per sweep using a headless browser session,
        # then drop the driver immediately to free its resources.
        my_phantomjs = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH, ip_pool_type=self.ip_pool_type)
        cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(url='https://h5.jumei.com/')
        try:
            del my_phantomjs
        except:
            pass
        if cookies == '':
            # Without cookies every request would fail; abort this sweep.
            print('!!! 获取cookies失败 !!!')
            return False
        print('获取cookies成功!')
        self.headers.update(Cookie=cookies)
        for item in result:  # item: (goods_id, miaosha_time_json, page, url)
            # 'YYYY-mm-dd HH:MM:SS' -> unix timestamp (seconds).
            miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
            miaosha_end_time = int(str(time.mktime(time.strptime(miaosha_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            data = {}
            # Fresh parser per item to limit memory held by parser state.
            jumeiyoupin_miaosha = JuMeiYouPinParse()
            if index % 50 == 0:
                # Reconnect every 50 items to avoid a stale DB connection.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(miaosha_end_time) == 0:
                    # Expired: delete the row.
                    # NOTE(review): params=(item[0]) is a bare value, not a
                    # 1-tuple — confirm the pipeline accepts this.
                    tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                    print('过期的goods_id为(%s)' % item[0],
                          ', 限时秒杀结束时间为(%s), 删除成功!' % json.loads(item[1]).get('miaosha_end_time'))
                elif self.is_recent_time(miaosha_end_time) == 2:
                    # Future item: pass (not break) — DB rows are unordered.
                    pass
                else:  # returned 1: inside the update window
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                    data['goods_id'] = item[0]
                    this_page_all_goods_list = self.get_one_page_all_goods_list(item[2])
                    if this_page_all_goods_list == '网络错误!':
                        print('网络错误!先跳过')
                        continue
                    elif this_page_all_goods_list == []:
                        # Page no longer lists any goods: treat as off-shelf.
                        print('#### 该page对应得到的this_page_all_goods_list为空[]!')
                        print('** 该商品已被下架限时秒杀活动, 此处将其删除')
                        tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                        print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                        pass
                    else:
                        """ 由于不会内部提前下架,所以在售卖时间内的全部进行相关更新 """
                        # Jumei does not pull goods early, so everything in the
                        # sale window is simply refreshed (no membership check).
                        tmp_r = jumeiyoupin_miaosha.get_goods_id_from_url(item[3])
                        jumeiyoupin_miaosha.get_goods_data(goods_id=tmp_r)
                        goods_data = jumeiyoupin_miaosha.deal_with_data()
                        if goods_data == {}:
                            # Empty parse result: skip.
                            pass
                        else:
                            goods_data['goods_id'] = str(item[0])
                            goods_data['miaosha_time'] = {
                                'miaosha_begin_time': goods_data['schedule'].get('begin_time', ''),
                                'miaosha_end_time': goods_data['schedule'].get('end_time', ''),
                            }
                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['miaosha_time'])
                            jumeiyoupin_miaosha.update_jumeiyoupin_xianshimiaosha_table(data=goods_data, pipeline=tmp_sql_server)
                            sleep(JUMEIYOUPIN_SLEEP_TIME)  # throttle
            else:
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))
    if get_shanghai_time().hour == 0:
        # After midnight: back off for 5.5 hours before the next sweep.
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
def run_forever():
    """Daemon loop: refresh zhe800 group-buy (pintuan) goods.

    Deletes rows whose page no longer exists or whose flag (item[1]) marks
    them expired; otherwise re-parses and updates the record.
    """
    while True:
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=z8_delete_str_1)
            result = list(
                tmp_sql_server._select_table(sql_str=z8_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:  # item: (goods_id, expired_flag, ...)
                # Fresh parser per item to limit memory held by parser state.
                zhe_800_pintuan = Zhe800PintuanParse()
                if index % 50 == 0:
                    # Reconnect every 50 items to avoid a stale DB connection.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')
                if tmp_sql_server.is_connect_success:
                    tmp_tmp = zhe_800_pintuan.get_goods_data(goods_id=item[0])
                    # Special-case a missing product page: get_goods_data may
                    # return an error string starting with 'ze'; delete and skip.
                    try:
                        if isinstance(tmp_tmp, str) and re.compile(
                                r'^ze').findall(tmp_tmp) != []:
                            print('@@ 该商品的页面已经不存在!此处将其删除!')
                            tmp_sql_server._delete_table(
                                sql_str=z8_delete_str_2, params=(item[0], ))
                            sleep(ZHE_800_PINTUAN_SLEEP_TIME)
                            continue
                        else:
                            pass
                    except:
                        pass
                    data = zhe_800_pintuan.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]
                        if item[1] == 1:
                            # Flagged expired in the DB row: delete it.
                            tmp_sql_server._delete_table(
                                sql_str=z8_delete_str_2, params=(item[0], ))
                            print('该goods_id[{0}]已过期,删除成功!'.format(item[0]))
                        else:
                            print(
                                '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                                % (item[0], index))
                            zhe_800_pintuan.to_right_and_update_data(
                                data=data, pipeline=tmp_sql_server)
                    else:
                        # Empty parse result: skip.
                        pass
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                try:
                    del zhe_800_pintuan
                except:
                    pass
                gc.collect()
                sleep(ZHE_800_PINTUAN_SLEEP_TIME)  # throttle
            print('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:
            # After midnight: back off for 5.5 hours before the next sweep.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
def run_forever(self):
    """Real-time update of pinduoduo flash-sale (miaosha) goods.

    Only goods within roughly the current day / next-2-hours window are
    refreshed; purely-future goods (still at original price) are skipped.
    Goods missing from the live listing, or expired, are deleted.
    """
    #### real-time data update
    sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    try:
        result = list(sql_cli._select_table(sql_str=pd_select_str_2))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        _block_print_db_old_data(result=result)
        index = 1
        # Single parser for the whole sweep (recreated per sweep to release
        # memory between runs).
        pinduoduo_miaosha = PinduoduoParse()
        # Snapshot the live flash-sale listing once; used both for membership
        # checks and as the source of stock/price/time fields.
        all_miaosha_goods_list = self.get_all_miaosha_goods_list()
        miaosha_goods_all_goods_id = [
            i.get('goods_id') for i in all_miaosha_goods_list
        ]
        for item in result:  # item: (goods_id, miaosha_time_json, ...)
            # 'YYYY-mm-dd HH:MM:SS' -> unix timestamp (seconds).
            miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
            miaosha_end_time = int(
                str(
                    time.mktime(
                        time.strptime(miaosha_end_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            # Helper reconnects the DB every `remainder` items.
            sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                             index=index,
                                             remainder=50)
            if sql_cli.is_connect_success:
                if self.is_recent_time(miaosha_end_time) == 0:
                    # Expired: delete.
                    # NOTE(review): params=(item[0]) is a bare value, not a
                    # 1-tuple — confirm the pipeline accepts this.
                    sql_cli._delete_table(sql_str=self.delete_sql_str,
                                          params=(item[0]))
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 限时秒杀结束时间为(%s), 删除成功!' %
                        json.loads(item[1]).get('miaosha_end_time'))
                    sleep(.3)
                elif self.is_recent_time(miaosha_end_time) == 2:
                    # Future item: pass (not break) — DB rows are unordered.
                    pass
                else:  # returned 1: inside the update window
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))
                    if item[0] not in miaosha_goods_all_goods_id:
                        ''' 表示其中没有了该goods_id '''
                        # Goods no longer in the live listing: delete.
                        sql_cli._delete_table(sql_str=self.delete_sql_str,
                                              params=(item[0]))
                        print('该商品[goods_id为(%s)]已被下架限时秒杀活动,此处将其删除' % item[0])
                        sleep(.3)
                    else:  # still listed: refresh from parser + listing fields
                        for item_1 in all_miaosha_goods_list:
                            if item_1.get('goods_id', '') == item[0]:
                                pinduoduo_miaosha.get_goods_data(
                                    goods_id=item[0])
                                goods_data = pinduoduo_miaosha.deal_with_data()
                                if goods_data == {}:
                                    # Empty parse result: skip.
                                    pass
                                else:
                                    goods_data['stock_info'] = item_1.get(
                                        'stock_info')
                                    goods_data['goods_id'] = item_1.get(
                                        'goods_id')
                                    if item_1.get('stock_info').get(
                                            'activity_stock') > 0:
                                        # Still in stock: carry over listing
                                        # prices (original + sale price).
                                        goods_data['price'] = item_1.get(
                                            'price')
                                        goods_data['taobao_price'] = item_1.get(
                                            'taobao_price')
                                    else:
                                        pass
                                    goods_data['sub_title'] = item_1.get(
                                        'sub_title', '')
                                    goods_data['miaosha_time'] = item_1.get(
                                        'miaosha_time')
                                    goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                        miaosha_time=item_1.get('miaosha_time'))
                                    if item_1.get('stock_info').get(
                                            'activity_stock') <= 1:
                                        # Stock <= 1: mark sold out (logical
                                        # delete flag).
                                        print('该秒杀商品已售罄...')
                                        goods_data['is_delete'] = 1
                                    pinduoduo_miaosha.to_update_pinduoduo_xianshimiaosha_table(
                                        data=goods_data, pipeline=sql_cli)
                                sleep(PINDUODUO_SLEEP_TIME)  # throttle
                            else:
                                pass
                # NOTE(review): index only advances on the DB-success branch,
                # so reconnect cadence stalls while the DB is down — confirm
                # this is intended.
                index += 1
                gc.collect()
            else:
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
        print('全部数据更新完毕'.center(100, '#'))
    if get_shanghai_time().hour == 0:
        # After midnight: back off for 5.5 hours before the next sweep.
        sleep(60 * 60 * 5.5)
    else:
        sleep(3 * 60)
    gc.collect()
class JMYPUpdater(AsyncCrawler):
    """Async real-time updater for jumei.com flash-sale (miaosha) goods.

    Periodically reads stale rows from SQL Server, re-scrapes each goods
    page concurrently (``self.concurrency`` tasks at a time via the event
    loop), and updates / logically deletes the rows.
    """

    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/聚美优品/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        # DB pipeline; created lazily in _get_db_old_data.
        self.sql_cli = None
        self.delete_sql_str = jm_delete_str_1
        # 1-based progress index shared across update tasks.
        self.goods_index = 1
        self.concurrency = 10  # number of concurrent update tasks

    async def _get_pc_headers(self):
        """Build request headers (AJAX JSON endpoint on h5.jumei.com)."""
        headers = await async_get_random_headers(
            upgrade_insecure_requests=False,
        )
        headers.update({
            'accept': 'application/json,text/javascript,text/plain,*/*;q=0.01',
            'Host': 'h5.jumei.com',
            'referer': 'https://h5.jumei.com/',
            'X-Requested-With': 'XMLHttpRequest',
        })
        return headers

    async def _get_db_old_data(self) -> (list, None):
        """Fetch rows to update; returns None on DB connection failure."""
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            # Purge first, then give the server a moment before selecting.
            self.sql_cli._delete_table(sql_str=jm_delete_str_2)
            await async_sleep(5)
            result = list(self.sql_cli._select_table(sql_str=jm_select_str_1))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        await _print_db_old_data(logger=self.lg, result=result)
        return result

    async def _get_cookies(self) -> str:
        """Obtain session cookies via a (headless) PhantomJS driver.

        NOTE(review): on failure ('' cookies) this logs the error but still
        logs success and returns the empty string — looks like a missing
        early return; confirm intended behavior.
        """
        my_phantomjs = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH,
                                  ip_pool_type=self.ip_pool_type)
        cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(
            url='https://h5.jumei.com/')
        # Drop the driver immediately to free the browser process.
        try:
            del my_phantomjs
        except:
            pass
        if cookies == '':
            self.lg.error('!!! 获取cookies失败 !!!')
        self.lg.info('获取cookies成功!')
        return cookies

    async def _get_new_jumei_obj(self, index):
        """Recycle the shared parser every 10 items (the driver misbehaves
        if a single instance is reused for too long).

        NOTE(review): assignment assumed to be inside the modulo branch,
        matching the recycle pattern used elsewhere in this file — confirm.
        """
        if index % 10 == 0:
            try:
                del self.jumeiyoupin_miaosha
            except:
                pass
            collect()
            self.jumeiyoupin_miaosha = JuMeiYouPinParse()

    async def _get_one_page_all_goods_list(self, *params) -> (list, str):
        """Return the deduplicated goods list for one listing page.

        Returns the string '网络错误!' on fetch failure, [] when the page
        lists nothing, else a list of {'goods_id', 'type', 'page'} dicts.
        """
        page = params[0]
        all_goods_list = []
        tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=formal&page_key=1521336720'.format(
            str(page))
        json_body = json_2_dict(Requests.get_url_body(
            url=tmp_url,
            headers=self.headers,
            ip_pool_type=self.ip_pool_type),
                                default_res={},
                                logger=self.lg)
        if json_body == {}:
            return '网络错误!'
        this_page_item_list = json_body.get('item_list', [])
        if this_page_item_list == []:
            return []
        # Deduplicate by item_id while tagging each with its page.
        for item in this_page_item_list:
            if item.get('item_id', '') not in [
                    item_1.get('item_id', '') for item_1 in all_goods_list
            ]:
                item['page'] = page
                all_goods_list.append(item)
        all_goods_list = [{
            'goods_id': str(item.get('item_id', '')),
            'type': item.get('type', ''),
            'page': item.get('page')
        } for item in all_goods_list if item.get('item_id') is not None]
        return all_goods_list

    async def _update_one_goods_info(self, item, index):
        """Update a single goods row.

        :param item: DB row (goods_id, miaosha_time, page, goods_url)
        :param index: progress index (drives parser/DB recycling)
        :return: [goods_id, res] where res is the update/delete outcome
        """
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        page = item[2]
        goods_url = item[3]
        miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
            miaosha_time=miaosha_time,
            logger=self.lg,
        )
        await self._get_new_jumei_obj(index=index)
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
        )
        if self.sql_cli.is_connect_success:
            is_recent_time_res = await self._is_recent_time(miaosha_end_time)
            if is_recent_time_res == 0:
                # Expired beyond the grace window: logical delete.
                res = _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=self.lg,
                    update_sql_str=jm_update_str_4,
                    sql_cli=self.sql_cli,
                )
                self.lg.info('过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'.format(
                    goods_id, timestamp_to_regulartime(miaosha_end_time)))
                await async_sleep(.3)
            elif is_recent_time_res == 2:
                # Within the grace window: only delete once truly past end.
                if datetime_to_timestamp(
                        get_shanghai_time()) > miaosha_end_time:
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=jm_update_str_4,
                        sql_cli=self.sql_cli,
                    )
                    self.lg.info(
                        '过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'.format(
                            goods_id,
                            timestamp_to_regulartime(miaosha_end_time)))
                else:
                    pass
            else:  # returned 1: inside the update window
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                    format(goods_id, index))
                this_page_all_goods_list = await self._get_one_page_all_goods_list(
                    page)
                if isinstance(this_page_all_goods_list, str):
                    # Network failure sentinel ('网络错误!'): skip this item.
                    self.lg.error('网络错误!先跳过')
                    await async_sleep(1.5)
                    return res
                elif this_page_all_goods_list == []:
                    # Page lists nothing anymore: logical delete.
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=jm_update_str_4,
                        sql_cli=self.sql_cli,
                    )
                    self.lg.error(
                        '#### 该page对应得到的this_page_all_goods_list为空[]!')
                    self.lg.error(
                        '** 该商品已被下架限时秒杀活动, 此处将其逻辑删除, goods_id:{}'.format(
                            goods_id))
                    await async_sleep(.3)
                else:
                    """ 由于不会内部提前下架,所以在售卖时间内的全部进行相关更新 """
                    # Jumei does not pull goods early, so every in-window item
                    # is refreshed without a listing-membership check.
                    tmp_r = self.jumeiyoupin_miaosha.get_goods_id_from_url(
                        goods_url)
                    self.jumeiyoupin_miaosha.get_goods_data(goods_id=tmp_r)
                    goods_data = self.jumeiyoupin_miaosha.deal_with_data()
                    if goods_data == {}:
                        # Empty parse result: skip.
                        pass
                    else:
                        goods_data['goods_id'] = goods_id
                        goods_data['miaosha_time'] = {
                            'miaosha_begin_time':
                            goods_data['schedule'].get('begin_time', ''),
                            'miaosha_end_time':
                            goods_data['schedule'].get('end_time', ''),
                        }
                        goods_data['miaosha_begin_time'], goods_data[
                            'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                miaosha_time=goods_data['miaosha_time'])
                        res = self.jumeiyoupin_miaosha.update_jumeiyoupin_xianshimiaosha_table(
                            data=goods_data, pipeline=self.sql_cli)
        else:
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
            pass
        index += 1
        self.goods_index = index
        collect()
        await async_sleep(JUMEIYOUPIN_SLEEP_TIME)  # throttle
        return [goods_id, res]

    async def _update_db(self):
        """Main loop: fetch stale rows and fan out concurrent update tasks."""
        while True:
            # Fresh logger per sweep so log files roll per run.
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result, step=self.concurrency)
                cookies = await self._get_cookies()
                self.headers = await self._get_pc_headers()
                self.headers.update({
                    'Cookie': cookies,
                })
                self.jumeiyoupin_miaosha = JuMeiYouPinParse()
                index = 1
                while True:
                    try:
                        # TasksParamsListObj yields `step`-sized slices and
                        # raises AssertionError when exhausted.
                        slice_params_list = tasks_params_list.__next__()
                    except AssertionError:
                        break
                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(item=item,
                                                            index=index)))
                        index += 1
                    await _get_async_task_result(tasks=tasks, logger=self.lg)
                self.lg.info('全部数据更新完毕'.center(100, '#'))
            if get_shanghai_time().hour == 0:
                # After midnight: back off for 5.5 hours.
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(10)
            try:
                del self.jumeiyoupin_miaosha
            except:
                pass
            collect()

    async def _is_recent_time(self, timestamp):
        """Classify a miaosha end timestamp relative to now.

        :param timestamp: unix timestamp (seconds)
        :return: 0 = expired (restore original price / delete),
                 1 = inside the update window,
                 2 = past-but-within-grace or future (no action yet)
        """
        time_1 = int(timestamp)
        time_2 = int(datetime_to_timestamp(get_shanghai_time()))
        diff_time = time_1 - time_2
        # 24h grace so the backend can sync the take-down first.
        if diff_time < -86400:
            return 0
        elif diff_time > 0:
            return 1
        else:
            # Ended less than 24h ago: wait before deleting.
            return 2

    def __del__(self):
        # Best-effort cleanup; attributes may not exist if init failed.
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        try:
            del self.jumeiyoupin_miaosha
        except:
            pass
        collect()
def _update_old_goods_info(self, tmp_sql_server, result):
    """Refresh stale zhe800 flash-sale goods rows.

    :param tmp_sql_server: DB pipeline (recreated every 50 items)
    :param result: rows of (goods_id, miaosha_time_json, session_id)
    """
    index = 1
    for item in result:
        # 'YYYY-mm-dd HH:MM:SS' -> unix timestamp (seconds).
        miaosha_begin_time = json.loads(item[1]).get('miaosha_begin_time')
        miaosha_begin_time = int(
            str(
                time.mktime(
                    time.strptime(miaosha_begin_time,
                                  '%Y-%m-%d %H:%M:%S')))[0:10])
        data = {}
        # Fresh parser per item to limit memory held by parser state.
        zhe_800_miaosha = Zhe800Parse()
        if index % 50 == 0:
            # Reconnect every 50 items to avoid a stale DB connection.
            print('正在重置,并与数据库建立新连接中...')
            tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
            print('与数据库的新连接成功建立...')
        if tmp_sql_server.is_connect_success:
            if self.is_recent_time(miaosha_begin_time) == 0:
                # Expired: delete the row.
                # NOTE(review): params=(item[0]) is a bare value, not a
                # 1-tuple — confirm the pipeline accepts this.
                tmp_sql_server._delete_table(sql_str=self.delete_sql_str,
                                             params=(item[0]))
                print(
                    '过期的goods_id为(%s)' % item[0],
                    ', 限时秒杀开始时间为(%s), 删除成功!' %
                    json.loads(item[1]).get('miaosha_begin_time'))
            elif self.is_recent_time(miaosha_begin_time) == 2:
                # Future item: pass (not break) — DB rows are unordered.
                pass
            else:  # returned 1: inside the update window
                print(
                    '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                    % (item[0], index))
                data['goods_id'] = item[0]
                try:
                    # Re-fetch the session's deal blocks for this session_id.
                    tmp_data = self.zhe_800_spike._get_one_session_id_data(
                        base_session_id=str(item[2]))
                except Exception as e:
                    print(e)
                    continue
                if tmp_data.get('data', {}).get('blocks', []) == []:
                    # Unknown session_id: skip this item.
                    print('该session_id不存在,此处跳过')
                    pass
                else:
                    tmp_data = [
                        item_s.get('deal', {}) for item_s in tmp_data.get(
                            'data', {}).get('blocks', [])
                    ]
                    if tmp_data != []:
                        miaosha_goods_list = self.get_miaoshao_goods_info_list(
                            data=tmp_data)
                        # All zids currently in this session.
                        miaosha_goods_all_goods_id = [
                            i.get('zid') for i in miaosha_goods_list
                        ]
                        if item[0] not in miaosha_goods_all_goods_id:
                            # Pulled from the sale internally: delete.
                            print('该商品已被下架限时秒杀活动,此处将其删除')
                            tmp_sql_server._delete_table(
                                sql_str=self.delete_sql_str,
                                params=(item[0]))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                            pass
                        else:  # still on sale: refresh from parser + session data
                            for item_1 in miaosha_goods_list:
                                if item_1.get('zid', '') == item[0]:
                                    zhe_800_miaosha.get_goods_data(
                                        goods_id=item[0])
                                    goods_data = zhe_800_miaosha.deal_with_data()
                                    if goods_data == {}:
                                        # Empty parse result: skip.
                                        pass
                                    else:
                                        goods_data['stock_info'] = item_1.get(
                                            'stock_info')
                                        goods_data['goods_id'] = str(
                                            item_1.get('zid'))
                                        if item_1.get('stock_info').get(
                                                'activity_stock') > 0:
                                            # Still in stock: carry over
                                            # original + sale prices.
                                            goods_data['price'] = item_1.get(
                                                'price')
                                            goods_data['taobao_price'] = item_1.get(
                                                'taobao_price')
                                        else:
                                            pass
                                        goods_data['sub_title'] = item_1.get(
                                            'sub_title')
                                        goods_data['miaosha_time'] = item_1.get(
                                            'miaosha_time')
                                        goods_data['miaosha_begin_time'], goods_data[
                                            'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                miaosha_time=item_1.get(
                                                    'miaosha_time'))
                                        zhe_800_miaosha.to_update_zhe_800_xianshimiaosha_table(
                                            data=goods_data,
                                            pipeline=tmp_sql_server)
                                else:
                                    pass
                    else:
                        # Session has no 'jsons' deal data: treat the goods as
                        # gone and delete its row.
                        print('该sessionid没有相关key为jsons的数据')
                        tmp_sql_server._delete_table(
                            sql_str=self.delete_sql_str, params=(item[0]))
                        print(
                            '过期的goods_id为(%s)' % item[0],
                            ', 限时秒杀开始时间为(%s), 删除成功!' %
                            json.loads(item[1]).get('miaosha_begin_time'))
                        pass
        else:
            print('数据库连接失败,数据库可能关闭或者维护中')
            pass
        index += 1
        gc.collect()
    print('全部数据更新完毕'.center(100, '#'))
    gc.collect()
    return
async def run_forever():
    """One real-time update sweep over taobao tiantiantejia (daily special)
    goods: take down expired goods, re-scrape and update the rest.

    :return: None on DB failure, True when the sweep finished.
    """
    #### real-time data update
    # Logger must be created per sweep (not as a global) so each day gets
    # its own log file.
    lg = set_logger(logger_name=get_uuid1(),
                    log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/天天特价/' +
                    str(get_shanghai_time())[0:10] + '.txt',
                    console_log_level=INFO,
                    file_log_level=ERROR)
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    # Off-shelf goods are not processed, hence the select targets is_delete=0.
    try:
        # Expired rows ARE purged here (the backend no longer syncs
        # take-downs, so this process owns the cleanup).
        tmp_sql_server._delete_table(sql_str=tb_delete_str_2, params=None)
        result = list(tmp_sql_server._select_table(sql_str=tb_select_str_7))
    except TypeError:
        lg.error('TypeError错误, 导致原因: 数据库连接失败...(可能维护中)')
        return None
    await _print_db_old_data(
        result=result,
        logger=lg,
    )
    index = 1
    for item in result:  # item: (goods_id, _, tejia_end_time, ...)
        goods_id = item[0]
        tejia_end_time = item[2]
        tmp_sql_server = await _get_new_db_conn(
            db_obj=tmp_sql_server,
            index=index,
            logger=lg,
            db_conn_type=1,
        )
        if tmp_sql_server.is_connect_success:
            if tejia_end_time < get_shanghai_time():
                # Expired: take straight off-shelf (earlier design demoted
                # these to regular promo goods instead).
                lg.info('@@ 过期下架[goods_id: {}]'.format(goods_id))
                _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=lg,
                    update_sql_str=tb_update_str_5,
                )
                index += 1
            else:
                ''' ** 由于天天特价不会提前下架商品,就不对应更新特价时间段 '''
                # Tiantiantejia never pulls goods early, so the special-price
                # window is not re-checked here — just re-scrape and update.
                lg.info(
                    '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                    % (goods_id, str(index)))
                taobao = TaoBaoLoginAndParse(
                    logger=lg,
                    is_real_times_update_call=is_real_times_update_call)
                taobao.get_goods_data(goods_id)
                goods_data = taobao.deal_with_data(goods_id=goods_id)
                if goods_data != {}:
                    goods_data['goods_id'] = goods_id
                    if goods_data.get('is_delete', 0) == 1:
                        lg.info('@该商品已下架...')
                    await taobao.update_taobao_tiantiantejia_table(
                        data=goods_data, pipeline=tmp_sql_server)
                else:
                    # Empty parse result: back off briefly.
                    await async_sleep(4)
                await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # throttle
                index += 1
                collect()
        else:
            lg.error('数据库连接失败,数据库可能关闭或者维护中')
            pass
        collect()
    lg.info('全部数据更新完毕'.center(100, '#'))
    if get_shanghai_time().hour == 0:
        # After midnight: longer back-off before the next sweep.
        await async_sleep(5 * 60)
    else:
        await async_sleep(60 * 1)
    collect()
    return True
def run_forever(self):
    '''
    Real-time updater for Mogujie flash-sale (xianshimiaosha) goods.

    Flow: purge stale rows, select candidate goods, then for each goods_id
    decide (via self.is_recent_time) whether to delete it (expired), skip it
    (not yet in the update window), or re-crawl and update its record.
    :return: None
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        tmp_sql_server._delete_table(sql_str=mg_delete_str_4)
        result = list(tmp_sql_server._select_table(sql_str=mg_select_str_3))
    except TypeError:
        # A TypeError here means the select returned a non-iterable,
        # i.e. the DB connection failed (possibly under maintenance).
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        for item in result:
            # item[1] is a JSON blob holding the flash-sale time window.
            miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
            miaosha_end_time = int(str(time.mktime(time.strptime(
                miaosha_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            # Declared (and re-created) inside the loop so the memory is
            # released each iteration instead of growing.
            data = {}
            mogujie_miaosha = MoGuJieMiaoShaParse()
            if index % 50 == 0:
                # Reconnect every 50 items to avoid a stale long-lived
                # connection going unresponsive.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(miaosha_end_time) == 0:
                    # 0 => sale already over: hard-delete the row.
                    tmp_sql_server._delete_table(
                        sql_str=self.delete_sql_str, params=(item[0], ))
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 限时秒杀开始时间为(%s), 删除成功!'
                        % json.loads(item[1]).get('miaosha_begin_time'))
                elif self.is_recent_time(miaosha_end_time) == 2:
                    # 2 => sale lies in the future: must be `pass`, NOT
                    # `break`, because goods_ids come back unordered.
                    pass
                else:
                    # 1 => inside the update window: refresh the record.
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                          % (item[0], index))
                    data['goods_id'] = item[0]
                    item_list = self.get_item_list(event_time=str(item[2]))
                    if item_list == '':
                        # Empty string signals a (likely network) fetch
                        # failure — skip and retry on a later pass.
                        pass
                    elif item_list == []:
                        # The whole event no longer lists any goods:
                        # soft-delete (logical delete) this goods_id.
                        print('该商品已被下架限时秒杀活动,此处将其逻辑删除')
                        tmp_sql_server._update_table(
                            sql_str=mg_update_str_1, params=(item[0], ))
                        print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                    else:
                        # All goods_ids currently listed for this event_time.
                        miaosha_goods_all_goods_id = [
                            item_1.get('iid', '') for item_1 in item_list
                        ]
                        if item[0] not in miaosha_goods_all_goods_id:
                            # Removed from the event internally: soft-delete.
                            print('该商品已被下架限时秒杀活动,此处将其逻辑删除')
                            tmp_sql_server._update_table(
                                sql_str=mg_update_str_1, params=(item[0], ))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                        else:
                            # Still on sale: find its entry and re-crawl.
                            for item_2 in item_list:
                                if item_2.get('iid', '') == item[0]:
                                    spider_url = item[3]
                                    mogujie_miaosha.get_goods_data(goods_id=spider_url)
                                    goods_data = mogujie_miaosha.deal_with_data()
                                    if goods_data == {}:
                                        # Empty parse result: skip this item.
                                        pass
                                    else:
                                        goods_data['goods_id'] = str(item[0])
                                        # `price` is set to the ORIGINAL price:
                                        # the highest normal_price found.
                                        try:
                                            tmp_price_list = sorted([
                                                round(float(item_4.get('normal_price', '')), 2)
                                                for item_4 in goods_data['price_info_list']
                                            ])
                                            price = Decimal(tmp_price_list[-1]).__round__(2)
                                            goods_data['price'] = price
                                        # Narrowed from bare `except:` so that
                                        # KeyboardInterrupt/SystemExit can still
                                        # stop this long-running loop.
                                        except Exception:
                                            print('设置price为原价时出错!请检查')
                                            continue
                                        goods_data['miaosha_time'] = {
                                            'miaosha_begin_time': timestamp_to_regulartime(
                                                int(item_2.get('startTime', 0))),
                                            'miaosha_end_time': timestamp_to_regulartime(
                                                int(item_2.get('endTime', 0))),
                                        }
                                        goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                                            get_miaosha_begin_time_and_miaosha_end_time(
                                                miaosha_time=goods_data['miaosha_time'])
                                        print(goods_data['title'])
                                        mogujie_miaosha.update_mogujie_xianshimiaosha_table(
                                            data=goods_data, pipeline=tmp_sql_server)
                                        sleep(MOGUJIE_SLEEP_TIME)  # throttle crawl rate
                                else:
                                    pass
            else:
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))

    if get_shanghai_time().hour == 0:
        # No updates after midnight: sleep 5.5 hours.
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
def run_forever(self):
    '''
    Real-time updater for Mogujie group-buy (pintuan) goods.

    Flow: purge stale rows, select candidate goods, then for each goods_id
    decide (via self.is_recent_time) whether to delete it (expired), skip it
    (future), or re-crawl the listing page via PhantomJS and update its row.
    :return: None
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        tmp_sql_server._delete_table(sql_str=mg_delete_str_2)
        result = list(tmp_sql_server._select_table(sql_str=mg_select_str_2))
    except TypeError:
        # TypeError means the select returned a non-iterable, i.e. the DB
        # connection failed (possibly under maintenance).
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
        for item in result:
            # item[1] is a JSON blob holding the group-buy time window.
            pintuan_end_time = json.loads(item[1]).get('end_time')
            pintuan_end_time = int(str(time.mktime(time.strptime(
                pintuan_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            data = {}
            mogujie_pintuan = MoGuJieParse()
            if index % 8 == 0:
                # Recycle the PhantomJS driver every 8 items — long-lived
                # drivers leak memory and hang.
                try:
                    del self.my_phantomjs
                except AttributeError:  # narrowed from bare `except:`
                    pass
                gc.collect()
                self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)

            if index % 50 == 0:
                # Reconnect every 50 items to avoid a stale long-lived
                # connection going unresponsive.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(pintuan_end_time) == 0:
                    # 0 => group-buy already over: hard-delete the row.
                    # BUGFIX: params must be a tuple — `(item[0])` was a bare
                    # string; sibling methods pass `(item[0], )`.
                    tmp_sql_server._delete_table(
                        sql_str=self.delete_sql_str, params=(item[0], ))
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 拼团开始时间为(%s), 删除成功!'
                        % json.loads(item[1]).get('begin_time'))
                elif self.is_recent_time(pintuan_end_time) == 2:
                    # 2 => lies in the future: must be `pass`, NOT `break`,
                    # because goods_ids come back unordered.
                    pass
                else:
                    # 1 => inside the update window: refresh the record.
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                          % (item[0], index))
                    data['goods_id'] = item[0]
                    tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                        item[3], item[2])
                    # requests cannot fetch this (certificate validation
                    # issue), so PhantomJS is used directly.
                    body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
                    if body == '':
                        print('获取到的body为空值! 此处跳过')
                    else:
                        try:
                            body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
                            tmp_data = json.loads(body)
                        # Narrowed from bare `except:` — IndexError when no
                        # <pre> match, ValueError for invalid JSON.
                        except (IndexError, ValueError):
                            print('json.loads转换body时出错, 请检查')
                            tmp_data = {}

                        if tmp_data.get('result', {}).get('wall', {}).get('docs', []) == []:
                            print('得到的docs为[]!')
                            print('该商品已被下架限时秒杀活动,此处将其删除')
                            # BUGFIX: tuple params, see above.
                            tmp_sql_server._delete_table(
                                sql_str=self.delete_sql_str, params=(item[0], ))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                        else:
                            tmp_item_list = tmp_data.get('result', {}).get('wall', {}).get('docs', [])
                            begin_time_timestamp = int(time.time())  # group-buy start timestamp
                            # Renamed the comprehension variable to `doc` so it
                            # no longer visually shadows the outer `item`.
                            item_list = [{
                                'goods_id': doc.get('tradeItemId', ''),
                                'pintuan_time': {
                                    'begin_time': timestamp_to_regulartime(
                                        timestamp=begin_time_timestamp),
                                    'end_time': timestamp_to_regulartime(
                                        self.get_pintuan_end_time(
                                            begin_time_timestamp,
                                            doc.get('leftTimeOrg', ''))),
                                },
                                'all_sell_count': str(doc.get('salesVolume', 0)),
                            } for doc in tmp_item_list]
                            pintuan_goods_all_goods_id = [
                                item_1.get('goods_id', '') for item_1 in item_list
                            ]
                            # Goods delisted internally are in fact still on
                            # sale, so update goods info only (not the
                            # on/off-shelf times).
                            if item[0] not in pintuan_goods_all_goods_id:
                                mogujie_pintuan.get_goods_data(goods_id=item[0])
                                goods_data = mogujie_pintuan.deal_with_data()
                                if goods_data == {}:
                                    pass
                                else:
                                    print('+++ 内部下架,其实还在售卖的商品更新')
                                    goods_data['goods_id'] = item[0]
                                    goods_data['price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                        goods_data['price_info_list'])
                                    mogujie_pintuan.update_mogujie_pintuan_table_2(
                                        data=goods_data, pipeline=tmp_sql_server)
                                    sleep(MOGUJIE_SLEEP_TIME)  # throttle crawl rate
                            else:
                                # Still listed: find its entry and re-crawl.
                                for item_2 in item_list:
                                    if item_2.get('goods_id', '') == item[0]:
                                        mogujie_pintuan.get_goods_data(goods_id=item[0])
                                        goods_data = mogujie_pintuan.deal_with_data()
                                        if goods_data == {}:
                                            pass
                                        else:
                                            goods_data['goods_id'] = item[0]
                                            goods_data['price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                                goods_data['price_info_list'])
                                            goods_data['pintuan_time'] = item_2.get('pintuan_time', {})
                                            goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = \
                                                get_miaosha_begin_time_and_miaosha_end_time(
                                                    miaosha_time=goods_data['pintuan_time'])
                                            goods_data['all_sell_count'] = item_2.get('all_sell_count', '')
                                            mogujie_pintuan.update_mogujie_pintuan_table(
                                                data=goods_data, pipeline=tmp_sql_server)
                                            sleep(MOGUJIE_SLEEP_TIME)  # throttle crawl rate
                                    else:
                                        pass
            else:
                print('数据库连接失败,此处跳过!')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))

    if get_shanghai_time().hour == 0:
        # No updates after midnight: sleep 5.5 hours.
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()