async def run_forever():
    """One real-time update pass over all stored tiantiantejia (daily special) goods.

    Reads every goods_id row from the DB, then per item either skips it
    (already soft-deleted), demotes it to a regular promo item (special-price
    window expired), or re-scrapes and updates its record.  Returns ``True``
    when one full pass is done; the caller is expected to invoke this in a loop.
    """
    # NOTE: the logger must be created per call (not as a module-level global),
    # otherwise output keeps appending to the same file forever and the
    # one-log-file-per-day scheme breaks.
    my_lg = set_logger(
        log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/天天特价/' + str(get_shanghai_time())[0:10] + '.txt',
        console_log_level=INFO,
        file_log_level=ERROR)
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        result = list(tmp_sql_server.select_taobao_tiantian_tejia_all_goods_id())
    except TypeError:
        # select_* returned None (DB down / under maintenance) -> list(None) raises.
        my_lg.error('TypeError错误, 导致原因: 数据库连接失败...(可能维护中)')
        result = None

    if result is not None:
        my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        my_lg.info(str(result))
        my_lg.info('--------------------------------------------------------')
        my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        # item layout (from the select above): item[0]=goods_id,
        # item[1]=is_delete flag, item[2]=tejia_end_time.
        for item in result:
            if index % 50 == 0:
                # Periodically drop and re-open the DB connection so a single
                # long-lived connection cannot go stale mid-run.
                my_lg.info('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                my_lg.info('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                tejia_end_time = item[2]
                if item[1] == 1:
                    # Already marked is_delete=1: log and skip, no hard delete.
                    my_lg.info(
                        '&&&&&& 该商品({0})原先状态为is_delete=1, 不进行实际删除操作! 索引为({1})'.format(
                            item[0], str(index)))
                    index += 1
                elif tejia_end_time < datetime.datetime.now():
                    # Special price expired: demote to a regular promo item
                    # instead of deleting.  The helper advances the index.
                    index = await update_expired_goods_to_normal_goods(
                        goods_id=item[0],
                        index=index,
                        tmp_sql_server=tmp_sql_server,
                        logger=my_lg)
                else:
                    # Still inside the special-price window: re-scrape and
                    # update the record.  Listings are observed never to be
                    # pulled early by the platform, so the price window itself
                    # is deliberately not refreshed here.
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (item[0], str(index)))
                    taobao = TaoBaoLoginAndParse(logger=my_lg)
                    taobao.get_goods_data(item[0])
                    goods_data = taobao.deal_with_data(goods_id=item[0])
                    if goods_data != {}:
                        goods_data['goods_id'] = item[0]
                        await taobao.update_taobao_tiantiantejia_table(
                            data=goods_data, pipeline=tmp_sql_server)
                    else:
                        # Scrape failed: back off before the next item.
                        await asyncio.sleep(4)
                    await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                    index += 1
                    gc.collect()
            else:
                my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
            gc.collect()

    my_lg.info('全部数据更新完毕'.center(100, '#'))
    if get_shanghai_time().hour == 0:
        # After midnight: pause for a long stretch (no updates wanted),
        # otherwise only a short breather between passes.
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
    return True
async def deal_with_all_goods_id(self):
    """Insert every scraped tiantiantejia goods item into the DB.

    Walks every category returned by ``get_all_goods_list()``.  Items whose
    goods_id is already stored are skipped, unless their stored special-price
    end time has passed (a regular item turned special again), in which case
    the stale row is deleted and the item re-inserted.

    :return: True when the pass completes (also returned on DB-connect failure).
    """
    sort_data = await self.get_all_goods_list()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    index = 1
    if my_pipeline.is_connect_success:
        self.my_lg.info('正在获取天天特价db原有goods_id, 请耐心等待...')
        db_ = list(my_pipeline.select_taobao_tiantian_tejia_all_goods_id())
        # [goods_id, tejia_end_time] pairs currently stored in the DB.
        db_goods_id_list = [[item[0], item[2]] for item in db_]
        self.my_lg.info('获取完毕!!!')
        # Build the membership set once: O(1) lookups instead of scanning a
        # list for every scraped item.
        db_all_goods_id = {i[0] for i in db_goods_id_list}

        for item in sort_data:
            tejia_goods_list = await self.get_tiantiantejia_goods_list(
                data=item.get('data', []))
            self.my_lg.info(tejia_goods_list)
            for tmp_item in tejia_goods_list:
                if tmp_item.get('goods_id', '') in db_all_goods_id:
                    # goods_id already stored: fetch its recorded end time.
                    try:
                        tmp_end_time = [
                            i[1] for i in db_goods_id_list
                            if tmp_item.get('goods_id', '') == i[0]
                        ][0]
                    except IndexError:
                        # Was a bare `except:`; only the empty-match IndexError
                        # is expected here.
                        tmp_end_time = ''
                    if tmp_end_time != '' and tmp_end_time < datetime.datetime.now():
                        # A regular item has turned special again: delete the
                        # stale row first, then re-insert it.
                        self.my_lg.info('##### 该商品由常规商品又转换为天天特价商品! #####')
                        _ = await my_pipeline.delete_taobao_tiantiantejia_expired_goods_id(
                            goods_id=tmp_item.get('goods_id', ''),
                            logger=self.my_lg)
                        if _ is False:
                            continue
                        index = await self.insert_into_table(
                            tmp_item=tmp_item,
                            category=item['category'],
                            current_page=item['current_page'],
                            my_pipeline=my_pipeline,
                            index=index,
                        )
                        await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                    else:
                        self.my_lg.info('该goods_id已经存在于数据库中, 此处跳过')
                else:
                    if index % 50 == 0:
                        # Reconnect every 50 inserts so one long-lived
                        # connection cannot hang without a response.
                        self.my_lg.info('正在重置,并与数据库建立新连接中...')
                        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                        self.my_lg.info('与数据库的新连接成功建立...')
                    if my_pipeline.is_connect_success:
                        index = await self.insert_into_table(
                            tmp_item=tmp_item,
                            category=item['category'],
                            current_page=item['current_page'],
                            my_pipeline=my_pipeline,
                            index=index,
                        )
                        await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                    else:
                        self.my_lg.error('数据库连接失败!')
    else:
        self.my_lg.error('数据库连接失败!')
    gc.collect()
    return True