async def deal_with_data(self):
    '''
    Crawl the current pintuan (group-buy) goods and store the new ones.

    Goods whose goods_id is already returned by the database are skipped.
    Every other item is parsed through JuMeiYouPinPinTuanParse, enriched
    with the listing metadata (pintuan time window, sort, page, tab) and
    inserted through the SQL Server pipeline.  Each processed item takes at
    least JUMEIYOUPIN_SLEEP_TIME seconds so the site is not hammered.
    :return: None
    '''
    goods_list = await self.get_pintuan_goods_info()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

    if not my_pipeline.is_connect_success:
        self.my_lg.error('数据库连接失败,此处跳过!')
        gc.collect()
        return None

    db_rows = await my_pipeline.select_jumeiyoupin_pintuan_all_goods_id(logger=self.my_lg)
    # A set keeps the per-item "already stored?" test O(1) instead of
    # scanning a list for every crawled item.
    db_goods_ids = {row[0] for row in db_rows}

    # Counts the goods that were actually processed; it drives the
    # periodic reconnect below.
    index = 1
    for item in goods_list:
        goods_id = item.get('goods_id', '')
        if goods_id in db_goods_ids:
            self.my_lg.info('该goods_id已经存在于数据库中, 此处跳过')
            continue

        # Reconnect only right before the connection is really needed, so
        # skipped items never trigger a pointless reconnect.
        if index % 20 == 0:
            my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if not my_pipeline.is_connect_success:
            # Same handling as run_forever: skip the item. index is left
            # untouched, so the next new item retries the reconnect.
            self.my_lg.error('数据库连接失败,此处跳过!')
            continue

        tmp_url = 'https://s.h5.jumei.com/yiqituan/detail?item_id={0}&type={1}'.format(
            goods_id, item.get('type', ''))
        s_time = time.time()

        jumeiyoupin = JuMeiYouPinPinTuanParse(logger=self.my_lg)
        goods_data = await jumeiyoupin.deal_with_data(jumei_pintuan_url=tmp_url)

        # An empty (or missing) result and goods flagged is_delete == 1 are
        # not stored.
        if goods_data and goods_data.get('is_delete', 0) != 1:
            # Normalise the record before inserting it.
            pintuan_time = item.get('pintuan_time', {})
            goods_data['goods_id'] = goods_id
            goods_data['pintuan_time'] = pintuan_time
            goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = \
                await self.get_pintuan_begin_time_and_pintuan_end_time(pintuan_time=pintuan_time)
            goods_data['sort'] = item.get('sort')
            goods_data['page'] = item.get('page')
            goods_data['tab'] = item.get('tab')

            await jumeiyoupin.insert_into_jumeiyoupin_pintuan_table(
                data=goods_data, pipeline=my_pipeline, logger=self.my_lg)

        # Sleep only for whatever is left of the throttle interval after
        # the time already spent on fetching and storing.
        remaining = JUMEIYOUPIN_SLEEP_TIME - (time.time() - s_time)
        if remaining >= 0:
            await asyncio.sleep(remaining)
        index += 1

    gc.collect()
    return None
async def run_forever(self):
    '''
    Refresh every pintuan goods row currently stored in the database.

    For each row returned by the database:
      * is_recent_time() == 0: the row is treated as expired and deleted;
      * is_recent_time() == 2: the row is left untouched;
      * anything else (1): the listing page the goods belongs to is looked
        up (cached in self.api_all_goods_id) and the row is refreshed
        through update_data_1 / update_data_2.
    Afterwards the coroutine sleeps (5.5 hours when the Shanghai hour is 0,
    otherwise 5 seconds) without blocking the event loop.
    :return: None
    '''
    # Imported locally so the non-blocking sleep below does not depend on
    # a module-level import.
    import asyncio

    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        result = await tmp_sql_server.select_jumeiyoupin_pintuan_all_goods_id(
            logger=self.my_lg)
    except TypeError:
        self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is not None:
        self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        self.my_lg.info(result)
        self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

        for index, item in enumerate(result, start=1):
            # item[1] holds the pintuan time window as JSON; parse it once.
            pintuan_time = json.loads(item[1])
            end_time_str = pintuan_time.get('end_time')
            # Local-time string -> integer unix timestamp (seconds).
            pintuan_end_time = int(
                time.mktime(time.strptime(end_time_str, '%Y-%m-%d %H:%M:%S')))

            if index % 50 == 0:
                # Reconnect every 50 rows so a single long-lived connection
                # does not become unresponsive.
                self.my_lg.info('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                if tmp_sql_server.is_connect_success:
                    self.my_lg.info('与数据库的新连接成功建立...')

            if not tmp_sql_server.is_connect_success:
                self.my_lg.error('数据库连接失败,此处跳过!')
            else:
                time_number = await self.is_recent_time(pintuan_end_time)
                if time_number == 0:
                    await tmp_sql_server.delete_jumeiyoupin_pintuan_expired_goods_id(
                        goods_id=item[0], logger=self.my_lg)
                    # The message is labelled "end time", so report the end
                    # time (the begin time used to be printed here).
                    self.msg = '过期的goods_id为(%s)' % item[0] \
                        + ', 拼团结束时间为(%s), 删除成功!' % str(end_time_str)
                    self.my_lg.info(self.msg)
                elif time_number == 2:
                    # Must be a no-op rather than a break: the rows returned
                    # by the database are not ordered.
                    pass
                else:
                    # 1 means the goods is inside the to-be-updated window.
                    self.msg = '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                        item[0], str(index))
                    self.my_lg.info(self.msg)

                    jumeiyoupin_2 = JuMeiYouPinPinTuan(logger=self.my_lg)
                    # Cache key format: 'coutuan_baby-1' (tab + '-' + page index).
                    page_key = item[2] + '-' + str(item[3])
                    # Reuse the listing page if this tab/index was already fetched.
                    item_list = self.api_all_goods_id.get(page_key, [])
                    if not item_list:
                        driver = BaseDriver(
                            executable_path=PHANTOMJS_DRIVER_PATH,
                            ip_pool_type=IP_POOL_TYPE)
                        try:
                            item_list = await jumeiyoupin_2.get_one_page_goods_list(
                                driver=driver, tab=item[2], index=item[3])
                        finally:
                            # Drop the driver reference even when the fetch
                            # raises (presumably BaseDriver cleans up when
                            # collected - confirm).
                            del driver

                    if not item_list:
                        self.my_lg.info('获取到的body为空str, 网络原因, 此处先跳过!')
                    else:
                        if self.api_all_goods_id.get(page_key) is None:
                            self.api_all_goods_id[page_key] = item_list

                        listed_goods_ids = {
                            listed.get('goods_id', '') for listed in item_list}
                        jumeiyoupin_pintuan = JuMeiYouPinPinTuanParse(
                            logger=self.my_lg)

                        if item[0] not in listed_goods_ids:
                            # No longer on the listing page (testing showed
                            # the site does not delist activity goods early).
                            await self.update_data_2(
                                jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                                jumei_pintuan_url=item[4],
                                goods_id=item[0],
                                pipeline=tmp_sql_server)
                        else:
                            # Still listed.
                            await self.update_data_1(
                                jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                                jumeiyoupin_2=jumeiyoupin_2,
                                jumei_pintuan_url=item[4],
                                goods_id=item[0],
                                item_list=item_list,
                                pipeline=tmp_sql_server)

            gc.collect()

        self.my_lg.info('全部数据更新完毕'.center(100, '#'))

    # Awaiting asyncio.sleep yields to the event loop; a blocking sleep here
    # would freeze every other task for the whole pause.
    if get_shanghai_time().hour == 0:
        # No updates after midnight.
        await asyncio.sleep(60 * 60 * 5.5)
    else:
        await asyncio.sleep(5)
    gc.collect()

    return None
async def run_forever(self):
    '''
    Refresh every pintuan goods row currently stored in the database.

    Runs the jm_delete_str_3 statement first, then selects the rows with
    jm_select_str_3.  For each row:
      * is_recent_time() == 0: the row is treated as expired and updated
        with jm_update_str_5;
      * is_recent_time() == 2: the row is left untouched;
      * anything else (1): the listing page the goods belongs to is looked
        up (cached in self.api_all_goods_id) and the row is refreshed
        through update_data_1 / update_data_2.
    Afterwards the coroutine sleeps (5.5 hours when the Shanghai hour is 0,
    otherwise 10 minutes).
    :return: None
    '''
    sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    try:
        sql_cli._delete_table(sql_str=jm_delete_str_3, )
        await async_sleep(5)
        result = sql_cli._select_table(sql_str=jm_select_str_3, logger=self.lg)
    except TypeError:
        self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is not None:
        await _print_db_old_data(result=result, logger=self.lg)

        for index, item in enumerate(result, start=1):
            # item[1] holds the pintuan time window as JSON; parse it once.
            pintuan_time = json.loads(item[1])
            end_time_str = pintuan_time.get('end_time')
            # Local-time string -> integer unix timestamp (seconds).
            pintuan_end_time = int(
                time.mktime(time.strptime(end_time_str, '%Y-%m-%d %H:%M:%S')))

            # Presumably swaps in a fresh connection every `remainder` rows
            # - confirm against _get_new_db_conn.
            sql_cli = await _get_new_db_conn(
                db_obj=sql_cli, index=index, logger=self.lg, remainder=50)

            if not sql_cli.is_connect_success:
                self.lg.error('数据库连接失败,此处跳过!')
            else:
                time_number = await self.is_recent_time(pintuan_end_time)
                if time_number == 0:
                    await sql_cli._update_table_3(
                        sql_str=jm_update_str_5,
                        params=(str(get_shanghai_time()), item[0]),
                        logger=self.lg)
                    await async_sleep(.5)
                    # The message is labelled "end time", so report the end
                    # time (the begin time used to be printed here).
                    self.msg = '过期的goods_id为(%s)' % item[0] \
                        + ', 拼团结束时间为(%s), 删除成功!' % str(end_time_str)
                    self.lg.info(self.msg)
                elif time_number == 2:
                    # Must be a no-op rather than a break: the rows returned
                    # by the database are not ordered.
                    pass
                else:
                    # 1 means the goods is inside the to-be-updated window.
                    self.msg = '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                        item[0], str(index))
                    self.lg.info(self.msg)

                    jumeiyoupin_2 = JuMeiYouPinPinTuan(logger=self.lg)
                    # Cache key format: 'coutuan_baby-1' (tab + '-' + page index).
                    page_key = item[2] + '-' + str(item[3])
                    # Reuse the listing page if this tab/index was already fetched.
                    item_list = self.api_all_goods_id.get(page_key, [])
                    if not item_list:
                        driver = BaseDriver(
                            executable_path=PHANTOMJS_DRIVER_PATH,
                            ip_pool_type=self.ip_pool_type)
                        try:
                            item_list = await jumeiyoupin_2.get_one_page_goods_list(
                                driver=driver, tab=item[2], index=item[3])
                        finally:
                            # Drop the driver reference even when the fetch
                            # raises (presumably BaseDriver cleans up when
                            # collected - confirm).
                            del driver

                    if not item_list:
                        self.lg.info('获取到的body为空str, 网络原因, 此处先跳过!')
                    else:
                        if self.api_all_goods_id.get(page_key) is None:
                            self.api_all_goods_id[page_key] = item_list

                        listed_goods_ids = {
                            listed.get('goods_id', '') for listed in item_list}
                        jumeiyoupin_pintuan = JuMeiYouPinPinTuanParse(
                            logger=self.lg)

                        if item[0] not in listed_goods_ids:
                            # No longer on the listing page (testing showed
                            # the site does not delist activity goods early).
                            await self.update_data_2(
                                jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                                jumei_pintuan_url=item[4],
                                goods_id=item[0],
                                pipeline=sql_cli)
                        else:
                            # Still listed.
                            await self.update_data_1(
                                jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                                jumeiyoupin_2=jumeiyoupin_2,
                                jumei_pintuan_url=item[4],
                                goods_id=item[0],
                                item_list=item_list,
                                pipeline=sql_cli)

            gc.collect()

        self.lg.info('全部数据更新完毕'.center(100, '#'))

    if get_shanghai_time().hour == 0:
        # No updates after midnight.
        await async_sleep(60 * 60 * 5.5)
    else:
        await async_sleep(10 * 60)
    gc.collect()

    return None