async def _deal_with_all_goods_id(self):
    '''
    Fetch the goods info of every detailed category
    :return: None
    '''
    _data = await self._get_all_goods_list()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    index = 1
    if my_pipeline.is_connect_success:
        self.my_lg.info('正在获取淘抢购db原有goods_id, 请耐心等待...')
        sql_str = r'select goods_id from dbo.tao_qianggou_xianshimiaosha where site_id=28'
        db_ = list(my_pipeline._select_table(sql_str=sql_str))
        db_all_goods_id = [item[0] for item in db_]
        self.my_lg.info('获取完毕!!!')
        # self.my_lg.info(str(db_all_goods_id))
        for item in _data:
            miaosha_goods_list = await self._get_taoqianggou_goods_list(data=item.get('data', []))
            # self.my_lg.info(str(miaosha_goods_list))
            # pprint(miaosha_goods_list)
            for tmp_item in miaosha_goods_list:
                if tmp_item.get('goods_id', '') in db_all_goods_id:
                    # this goods_id already exists in the db, so skip it
                    self.my_lg.info('该goods_id[%s]已存在db中' % tmp_item.get('goods_id', ''))
                    continue

                if index % 50 == 0:
                    # reconnect every 50 iterations, so a single long-lived connection can't go stale and error out
                    self.my_lg.info('正在重置,并与数据库建立新连接中...')
                    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    # my_pipeline = SqlPools()
                    self.my_lg.info('与数据库的新连接成功建立...')

                if my_pipeline.is_connect_success:
                    tmall = TmallParse(logger=self.my_lg)
                    tmp_url = 'https://detail.tmall.com/item.htm?id={0}'.format(tmp_item.get('goods_id'))
                    goods_id = tmall.get_goods_id_from_url(tmp_url)
                    tmall.get_goods_data(goods_id=goods_id)
                    goods_data = tmall.deal_with_data()
                    if goods_data != {}:
                        # self.my_lg.info(str(tmp_item))
                        goods_data['goods_id'] = tmp_item.get('goods_id')
                        goods_data['spider_url'] = tmp_url
                        goods_data['miaosha_time'] = tmp_item.get('miaosha_time')
                        goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                            miaosha_time=tmp_item.get('miaosha_time'))
                        goods_data['page'] = tmp_item.get('page')
                        goods_data['spider_time'] = tmp_item.get('spider_time')
                        tmall.insert_into_taoqianggou_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                        await asyncio.sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                    else:
                        await asyncio.sleep(5)

                    index += 1  # fix: index was never incremented, so the periodic reconnect above could never fire
                    try:
                        del tmall
                    except:
                        pass
                    gc.collect()
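# The inline "reconnect every N iterations" block above recurs in almost every snippet below;
# later variants delegate it to helpers like _block_get_new_db_conn / _get_new_db_conn. A minimal
# sketch of what such a helper presumably does (an assumption, not the repo's actual
# implementation; it reuses the pipeline class from these snippets):

def block_get_new_db_conn_sketch(db_obj, index, logger, remainder=50):
    # every `remainder` iterations, drop the old pipeline and hand back a fresh connection
    if index % remainder == 0 and index != 0:
        logger.info('正在重置,并与数据库建立新连接中...')
        db_obj = SqlServerMyPageInfoSaveItemPipeline()
        logger.info('与数据库的新连接成功建立...')
    return db_obj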
async def _update_old_goods_info(self, tmp_sql_server, result):
    '''
    Update old goods data
    :param result:
    :return:
    '''
    index = 1
    for item in result:  # update in real time
        miaosha_begin_time = json.loads(item[1]).get('miaosha_begin_time')
        miaosha_begin_time = int(str(time.mktime(time.strptime(miaosha_begin_time, '%Y-%m-%d %H:%M:%S')))[0:10])
        # self.my_lg.info(str(miaosha_begin_time))
        tmall = TmallParse(logger=self.my_lg)
        if index % 50 == 0:
            # reconnect every 50 iterations, so a single long-lived connection can't go stale and error out
            self.my_lg.info('正在重置,并与数据库建立新连接中...')
            tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
            self.my_lg.info('与数据库的新连接成功建立...')

        if tmp_sql_server.is_connect_success:
            if await self.is_recent_time(miaosha_begin_time) == 0:
                # fix: params must be a one-element tuple; `(item[0])` was just item[0]
                tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                self.my_lg.info('过期的goods_id为(%s)' % item[0] + ', 限时秒杀开始时间为(%s), 删除成功!' % json.loads(item[1]).get('miaosha_begin_time'))
            else:  # a return of 1 means the goods lies inside the to-update window
                self.my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (item[0], str(index)))
                '''NOTICE: all rows are same-day data, so don't touch the shelf/delete times here; only refresh the goods data'''
                goods_id = tmall.get_goods_id_from_url(item[2])
                tmall.get_goods_data(goods_id=goods_id)
                goods_data = tmall.deal_with_data()
                if goods_data != {}:
                    # self.my_lg.info(str(item))
                    goods_data['goods_id'] = item[0]
                    await tmall._update_taoqianggou_xianshimiaosha_table(data=goods_data, pipeline=tmp_sql_server)
                    await asyncio.sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                else:
                    await asyncio.sleep(5)

        index += 1
        try:
            del tmall
        except:
            pass
        gc.collect()

    return
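# The `int(str(time.mktime(...))[0:10])` dance above just truncates the float returned by
# mktime to a 10-digit unix timestamp. A standalone sketch showing the simpler equivalent
# (assumes local time and any date whose epoch value has 10 digits, i.e. roughly 2001-2286):

import time

s = '2018-08-01 10:00:00'
via_str_slice = int(str(time.mktime(time.strptime(s, '%Y-%m-%d %H:%M:%S')))[0:10])
via_int_cast = int(time.mktime(time.strptime(s, '%Y-%m-%d %H:%M:%S')))
assert via_str_slice == via_int_cast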
def block_get_tm_one_goods_info_task(self, goods_id: list, index: int):
    """
    Fetch one tm goods' info, blocking
    :param goods_id:
    :param index:
    :return:
    """
    tm = TmallParse(logger=self.lg)
    site_id, _goods_id = goods_id
    before_goods_data = tm.get_goods_data(goods_id=goods_id)
    end_goods_data = tm.deal_with_data()
    # if either the before or after snapshot is flagged deleted, mark the goods deleted
    is_delete = 1 \
        if before_goods_data.get('is_delete', 0) == 1 or end_goods_data.get('is_delete', 0) == 1 \
        else 0
    _label = '+' \
        if end_goods_data != {} or is_delete == 1 \
        else '-'
    self.lg.info('[{}] goods_id: {}, site_id: {}, is_delete: {}'.format(
        _label,
        _goods_id,
        site_id,
        is_delete,
    ))
    try:
        del tm
    except:
        pass
    collect()

    return (site_id, _goods_id, index, before_goods_data, end_goods_data)
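# The `block_` prefix suggests this task is meant to be fanned out to worker threads and joined.
# A hypothetical driver sketch (the repo's real scheduler may differ; `spider` is an instance of
# the class this method belongs to, and goods_id_list holds [site_id, goods_id] pairs):

from concurrent.futures import ThreadPoolExecutor

def run_block_tasks_sketch(spider, goods_id_list):
    with ThreadPoolExecutor(max_workers=4) as pool:
        futures = [
            pool.submit(spider.block_get_tm_one_goods_info_task, goods_id, i)
            for i, goods_id in enumerate(goods_id_list)
        ]
        # each result is (site_id, goods_id, index, before_goods_data, end_goods_data)
        return [f.result() for f in futures]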
def get_one_tm_data(**kwargs):
    '''
    Crawl the data of one tm url
    :param kwargs:
    :return:
    '''
    username = kwargs.get('username', DEFAULT_USERNAME)
    wait_to_deal_with_url = kwargs.get('wait_to_deal_with_url', '')
    my_lg = kwargs.get('my_lg')

    login_tmall = TmallParse(logger=my_lg)
    # extract the goods_id; this returns a list
    goods_id = login_tmall.get_goods_id_from_url(wait_to_deal_with_url)
    if goods_id == []:
        # no goods_id could be extracted, so return an error payload
        my_lg.info('获取到的goods_id为空!')
        try:
            del login_tmall  # release the parser each time
        except:
            pass
        gc.collect()

        return {'goods_id': ''}

    # improved branching: decide from the parsed type whether this is Tmall, Tmall supermarket, or Tmall global
    if goods_id[0] == 0:    # eg: [0, '1111']
        # rebuild a clean, canonical Tmall goods url
        wait_to_deal_with_url = 'https://detail.tmall.com/item.htm?id=' + goods_id[1]
    elif goods_id[0] == 1:  # eg: [1, '1111']
        wait_to_deal_with_url = 'https://chaoshi.detail.tmall.com/item.htm?id=' + goods_id[1]
    elif goods_id[0] == 2:  # eg: [2, '1111', 'https://xxxxx']
        wait_to_deal_with_url = str(goods_id[2]) + '?id=' + goods_id[1]

    tmp_result = login_tmall.get_goods_data(goods_id=goods_id)
    data = login_tmall.deal_with_data()  # returns a data dict on success
    sleep(TMALL_SLEEP_TIME)  # on the server this can be shortened to .5s

    if data == {} or tmp_result == {}:
        my_lg.info('获取到的data为空!')
        try:
            del login_tmall
        except:
            pass
        gc.collect()

        return {'goods_id': goods_id[1], 'msg': 'data为空!'}

    wait_to_save_data = add_base_info_2_processed_data(
        data=data,
        spider_url=wait_to_deal_with_url,
        username=username,
        goods_id=goods_id[1])
    try:
        del login_tmall
    except:
        pass

    return wait_to_save_data
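# The if/elif chain above maps the parser's type code to a host. A table-driven sketch of the
# same rebuild (hypothetical refactor; type 2 carries its own url prefix in goods_id[2]):

def canonical_tm_url_sketch(goods_id: list) -> str:
    prefixes = {
        0: 'https://detail.tmall.com/item.htm',          # Tmall
        1: 'https://chaoshi.detail.tmall.com/item.htm',  # Tmall supermarket
    }
    if goods_id[0] == 2:                                 # Tmall global
        return str(goods_id[2]) + '?id=' + goods_id[1]
    return prefixes[goods_id[0]] + '?id=' + goods_id[1]

assert canonical_tm_url_sketch([0, '1111']) == 'https://detail.tmall.com/item.htm?id=1111'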
def _get_tm_one_goods_info_task(self, goods_id: list, index: int) -> tuple:
    """
    Fetch one tmall goods' info
    :param goods_id:
    :param index:
    :return:
    """
    tm = TmallParse(logger=self.lg)  # fix: was `logger=lg`, an undefined name in this scope
    site_id, _goods_id = goods_id
    before_goods_data = tm.get_goods_data(goods_id=goods_id)
    end_goods_data = tm.deal_with_data()
    try:
        del tm
    except:
        pass
    collect()

    return (site_id, _goods_id, index, before_goods_data, end_goods_data)
def test_tm_m():
    # TODO: some goods are presale and cannot currently be bought; they are skipped and get updated once their status is normal again
    goods_id = '575090086713'
    # data = get_tm_m_body_data(goods_id=goods_id)
    # pprint(data)

    pc_url = 'https://detail.tmall.com/item.htm?id={}'.format(goods_id)
    phone_url = 'https://detail.m.tmall.com/item.htm?id={}'.format(goods_id)
    print('pc_url: {}, phone_url: {}'.format(pc_url, phone_url))

    tm = TmallParse(is_real_times_update_call=True)
    goods_id = tm.get_goods_id_from_url(tmall_url=pc_url)
    ori_data = tm.get_goods_data(goods_id=goods_id)
    # pprint(ori_data)
    data = tm.deal_with_data()
    pprint(data)

    try:
        del tm
    except:
        pass
def _get_seller_id(self, type, goods_id):
    '''
    Get the seller_id
    :param type:
    :param goods_id:
    :return:
    '''
    _ = TmallParse(logger=self.lg)
    _g = [type, goods_id]
    self.g_data = _.get_goods_data(goods_id=_g)
    seller_id = str(self.g_data.get('seller', {}).get('userId', 0))
    # self.lg.info('获取到的seller_id: ' + seller_id)
    try:
        del _
    except:
        pass
    # fix: seller_id is a str, so compare against '0' (the original `!= 0` was always True)
    assert seller_id != '0', '获取到的seller_id为0!'

    return seller_id
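# Why the assert above needed fixing: in Python a str never compares equal to an int, so the
# original `seller_id != 0` was vacuously True and the guard could never fire. A minimal check:

assert '0' != 0        # always True (str vs int), which is why the old guard never tripped
assert not '0' != '0'  # the corrected str-vs-str comparison can actually evaluate False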
def run_forever():
    while True:
        # ** must not be declared as a global outside the loop, otherwise everything keeps logging to the same file
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/天猫/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR
        )

        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        # and GETDATE()-ModfiyTime>0.2
        sql_str = '''
        select SiteID, GoodsID, IsDelete, Price, TaoBaoPrice, shelf_time, delete_time
        from dbo.GoodsInfoAutoGet
        where (SiteID=3 or SiteID=4 or SiteID=6) and MainGoodsID is not null
        order by ID desc'''

        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('总计待更新个数: {0}'.format(len(result)))
            my_lg.info('--------------------------------------------------------')
            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            # memory optimization: rather than one long-lived parser, recreate and delete it periodically in the loop below
            tmall = TmallParse(logger=my_lg)
            for item in result:  # update in real time
                if index % 5 == 0:
                    try:
                        del tmall
                    except:
                        pass
                    tmall = TmallParse(logger=my_lg)
                    gc.collect()

                if index % 10 == 0:
                    # reconnect every 10 iterations, so a single long-lived connection can't go stale and error out
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (str(item[1]), str(index)))
                    tmp_item = []
                    # map the db SiteID back to the parser's type code when reading rows out
                    if item[0] == 3:
                        tmp_item.append(0)
                    elif item[0] == 4:
                        tmp_item.append(1)
                    elif item[0] == 6:
                        tmp_item.append(2)
                    tmp_item.append(item[1])

                    data = tmall.get_goods_data(goods_id=tmp_item)
                    if isinstance(data, int):  # handle a 4041 return code separately
                        index += 1
                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        continue

                    if data.get('is_delete') == 1:  # handle delisted goods separately
                        data['goods_id'] = item[1]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[2],
                            shelf_time=item[5],
                            delete_time=item[6])
                        # my_lg.info('------>>>| 爬取到的数据为: %s' % str(data))
                        tmall.to_right_and_update_data(data, pipeline=tmp_sql_server)
                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        index += 1
                        gc.collect()
                        continue

                    data = tmall.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[1]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[2],
                            shelf_time=item[5],
                            delete_time=item[6])
                        data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                            old_price=item[3],
                            old_taobao_price=item[4],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price']
                        )
                        # my_lg.info(str(data['_is_price_change']) + ' ' + str(data['_price_change_info']))
                        # my_lg.info('------>>>| 爬取到的数据为: %s' % str(data))
                        tmall.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:  # the returned data was empty
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)
                else:  # the db connection failed
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)

                index += 1
                gc.collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))

        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
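# The SiteID -> type translation above (3->0, 4->1, 6->2) is repeated in the other update loops.
# A dict-based sketch of the same conversion (hypothetical refactor, not the repo's API):

SITE_ID_2_TM_TYPE = {
    3: 0,  # Tmall
    4: 1,  # Tmall supermarket (chaoshi)
    6: 2,  # Tmall global
}

def db_row_2_tmp_item_sketch(item):
    # item[0] is the db SiteID, item[1] the GoodsID; the result matches tmp_item above
    return [SITE_ID_2_TM_TYPE[item[0]], item[1]]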
def _tmall_keywords_spider(self, **kwargs):
    '''
    Keyword-driven tmall crawl
    :param kwargs:
    :return:
    '''
    goods_id_list = kwargs.get('goods_id_list')
    keyword_id = kwargs.get('keyword_id')
    goods_url_list = ['https:' + re.compile('&skuId=.*').sub('', item) for item in goods_id_list]

    self.my_lg.info('即将开始抓取该关键字的goods, 请耐心等待...')
    for item in goods_url_list:  # item is a goods_url
        result = False  # flag recording whether this goods got inserted (or already existed)
        try:
            goods_id = re.compile(r'id=(\d+)').findall(item)[0]
        except IndexError:
            self.my_lg.error('re获取goods_id时出错, 请检查!')
            continue

        if goods_id in self.db_existed_goods_id_list:
            self.my_lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
            result = True  # already present in the db
        else:
            tmall = TmallParse(logger=self.my_lg)
            if self.add_goods_index % 20 == 0:
                # reconnect every 20 iterations, so a single long-lived connection can't go stale and error out
                self.my_lg.info('正在重置,并与数据库建立新连接中...')
                self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                self.my_lg.info('与数据库的新连接成功建立...')

            if self.my_pipeline.is_connect_success:
                goods_id = tmall.get_goods_id_from_url(item)
                if goods_id == []:
                    self.my_lg.error('@@@ 原商品的地址为: {0}'.format(item))
                    continue
                else:
                    self.my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id[1], str(self.add_goods_index)))
                    tt = tmall.get_goods_data(goods_id)
                    data = tmall.deal_with_data()
                    goods_id = goods_id[1]
                    if data != {}:
                        data['goods_id'] = goods_id
                        data['username'] = '******'
                        data['main_goods_id'] = None
                        data['goods_url'] = tmall._from_tmall_type_get_tmall_url(
                            type=data['type'],
                            goods_id=goods_id)
                        if data['goods_url'] == '':
                            self.my_lg.error('该goods_url为空值! 此处跳过!')
                            continue

                        result = tmall.old_tmall_goods_insert_into_new_table(data, pipeline=self.my_pipeline)
            else:
                self.my_lg.info('数据库连接失败,数据库可能关闭或者维护中')

            self.add_goods_index += 1
            gc.collect()
            sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

        if result:  # only link goods that were inserted or already existed in the db
            self._insert_into_goods_id_and_keyword_middle_table(
                goods_id=goods_id,
                keyword_id=keyword_id)

    self.my_lg.info('该关键字的商品已经抓取完毕!')

    return True
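# The two regexes above first strip any `&skuId=...` tail, then pull the numeric id. A
# standalone check of that pipeline on a representative (made-up) url:

import re

raw = '//detail.tmall.com/item.htm?id=123456&skuId=999'
url = 'https:' + re.compile('&skuId=.*').sub('', raw)
assert url == 'https://detail.tmall.com/item.htm?id=123456'
assert re.compile(r'id=(\d+)').findall(url)[0] == '123456'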
async def _update_one_goods_info(self, db_goods_info_obj, index):
    """
    Update a single goods item
    :param db_goods_info_obj:
    :param index:
    :return:
    """
    res = False
    tmall = TmallParse(logger=self.lg)
    self.sql_cli = await _get_new_db_conn(
        db_obj=self.sql_cli,
        index=index,
        logger=self.lg,
        remainder=50,
    )
    if self.sql_cli.is_connect_success:
        self.lg.info('------>>>| 正在更新的goods_id为({}) | --------->>>@ 索引值为({})'.format(
            db_goods_info_obj.goods_id, index))
        tmp_item = self._get_tmp_item(
            site_id=db_goods_info_obj.site_id,
            goods_id=db_goods_info_obj.goods_id)
        # self.lg.info(str(tmp_item))

        # ** run in blocking mode
        oo = tmall.get_goods_data(goods_id=tmp_item)
        # ** run in non-blocking mode
        # oo = await unblock_func(
        #     func_name=tmall.get_goods_data,
        #     func_args=[
        #         tmp_item,
        #     ],
        #     default_res={},
        #     logger=self.lg,)

        # read this before parsing, so the empty-data branch below can skip the error sleep
        before_goods_data_is_delete = oo.get('is_delete', 0)
        # blocking parse
        data = tmall.deal_with_data()
        if data != {}:
            data = get_goods_info_change_data(
                target_short_name='tm',
                logger=self.lg,
                data=data,
                db_goods_info_obj=db_goods_info_obj,
            )
            res = to_right_and_update_tm_data(
                data=data,
                pipeline=self.sql_cli,
                logger=self.lg)
        else:
            if before_goods_data_is_delete == 1:
                # the goods was found to be delisted, so still mark res True
                res = True
            else:
                self.lg.info('------>>>| 阻塞休眠7s中...')
                await async_sleep(delay=7., loop=self.loop)
                # a blocking sleep here would hang the machine
                # sleep(7.)
    else:  # the db connection failed
        self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
        await async_sleep(delay=5, loop=self.loop)

    try:
        del tmall
    except:
        pass
    collect()
    await async_sleep(TMALL_REAL_TIMES_SLEEP_TIME)

    return [
        db_goods_info_obj.goods_id,
        res,
    ]
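# The commented-out unblock_func call above offloads the blocking fetch to a thread so the event
# loop stays responsive. A minimal sketch of the same idea using only the standard library (the
# real unblock_func's signature may differ; this is an assumption):

from functools import partial

async def unblock_get_goods_data_sketch(loop, tmall, tmp_item):
    try:
        # run the blocking fetch in the default thread-pool executor
        return await loop.run_in_executor(
            None, partial(tmall.get_goods_data, goods_id=tmp_item))
    except Exception:
        return {}  # mirrors unblock_func's default_res={}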
# coding:utf-8

'''
@author = super_fazai
@File : test_tm_m.py
@connect : [email protected]
'''

"""
Test tm m (mobile page)
"""

from sys import path as sys_path
sys_path.append('..')

from multiplex_code import get_tm_m_body_data
from tmall_parse_2 import TmallParse
from pprint import pprint

goods_id = '589363967773'
# data = get_tm_m_body_data(goods_id=goods_id)
# pprint(data)

pc_url = 'https://detail.tmall.com/item.htm?id={}'.format(goods_id)
phone_url = 'https://detail.m.tmall.com/item.htm?id={}'.format(goods_id)
print('pc_url: {}, phone_url: {}'.format(pc_url, phone_url))

tm = TmallParse(is_real_times_update_call=True)
goods_id = tm.get_goods_id_from_url(tmall_url=pc_url)
tm.get_goods_data(goods_id=goods_id)
data = tm.deal_with_data()
pprint(data)
def _tmall_keywords_spider(self, **kwargs):
    """
    Keyword-driven tmall crawl
    :param kwargs:
    :return:
    """
    goods_id_list = kwargs.get('goods_id_list')
    keyword_id = kwargs.get('keyword_id')
    goods_url_list = ['https:' + re.compile('&skuId=.*').sub('', item) for item in goods_id_list]

    self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...')
    for item in goods_url_list:  # item is a goods_url
        # flag recording whether this goods got inserted (or already existed)
        result = False
        try:
            goods_id = re.compile(r'id=(\d+)').findall(item)[0]
        except IndexError:
            self.lg.error('re获取goods_id时出错, 请检查!')
            continue

        if goods_id in self.db_existed_goods_id_list:
            self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
            result = True  # already present in the db
        else:
            tmall = TmallParse(logger=self.lg, is_real_times_update_call=True)
            self.sql_cli = _block_get_new_db_conn(
                db_obj=self.sql_cli,
                index=self.add_goods_index,
                logger=self.lg,
                remainder=20,
            )
            if self.sql_cli.is_connect_success:
                goods_id = tmall.get_goods_id_from_url(item)
                if goods_id == []:
                    self.lg.error('@@@ 原商品的地址为: {0}'.format(item))
                    continue
                else:
                    self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id[1], str(self.add_goods_index)))
                    tt = tmall.get_goods_data(goods_id)
                    data = tmall.deal_with_data()
                    goods_id = goods_id[1]
                    if data != {}:
                        data['goods_id'] = goods_id
                        data['username'] = '******'
                        data['main_goods_id'] = None
                        data['goods_url'] = tmall._from_tmall_type_get_tmall_url(
                            type=data['type'],
                            goods_id=goods_id)
                        if data['goods_url'] == '':
                            self.lg.error('该goods_url为空值! 此处跳过!')
                            continue

                        if not self.check_target_data_is_legal(target_data=data):
                            return False

                        result = tmall.old_tmall_goods_insert_into_new_table(data, pipeline=self.sql_cli)
            else:
                self.lg.info('数据库连接失败,数据库可能关闭或者维护中')

            self.add_goods_index += 1
            collect()
            sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

        if result:  # only link goods that were inserted or already existed in the db
            self._insert_into_goods_id_and_keyword_middle_table(
                goods_id=goods_id,
                keyword_id=keyword_id)

    self.lg.info('该关键字的商品已经抓取完毕!')

    return True
async def _update_old_goods_info(self, tmp_sql_server, result):
    '''
    Update old goods data
    :param result:
    :return:
    '''
    index = 1
    for item in result:  # update in real time
        miaosha_begin_time = json_2_dict(
            json_str=item[1],
            logger=self.lg,
        ).get('miaosha_begin_time')
        miaosha_begin_time = int(str(time.mktime(time.strptime(miaosha_begin_time, '%Y-%m-%d %H:%M:%S')))[0:10])
        # self.lg.info(str(miaosha_begin_time))
        tmall = TmallParse(logger=self.lg)
        tmp_sql_server = await _get_new_db_conn(
            db_obj=tmp_sql_server,
            index=index,
            logger=self.lg,
            remainder=20,
        )
        if tmp_sql_server.is_connect_success:
            if await self.is_recent_time(miaosha_begin_time) == 0:
                # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                tmp_sql_server._update_table(sql_str=tb_update_str_4, params=(item[0],))
                self.lg.info('过期的goods_id为(%s)' % item[0] + ', 限时秒杀开始时间为(%s), 删除成功!' % miaosha_begin_time)
                await async_sleep(.3)
            else:  # a return of 1 means the goods lies inside the to-update window
                self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (item[0], str(index)))
                '''NOTICE: all rows are same-day data, so don't touch the shelf/delete times here; only refresh the goods data'''
                goods_id = tmall.get_goods_id_from_url(item[2])
                tmall.get_goods_data(goods_id=goods_id)
                goods_data = tmall.deal_with_data()
                if goods_data != {}:
                    # self.lg.info(str(item))
                    goods_data['goods_id'] = item[0]
                    await tmall._update_taoqianggou_xianshimiaosha_table(data=goods_data, pipeline=tmp_sql_server)
                    await async_sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                else:
                    await async_sleep(5)

        index += 1
        try:
            del tmall
        except:
            pass
        collect()

    return
async def deal_with_tmcs_goods_id_list(self):
    self.lg.info('即将开始抓取tmcs goods, 请耐心等待...')
    for item in self.db_wait_2_save_goods_id_list:  # eg: '61864164616'
        goods_id = item
        if goods_id in self.db_existed_goods_id_list:
            self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
            continue

        tmall = TmallParse(logger=self.lg, is_real_times_update_call=True)
        self.sql_cli = _block_get_new_db_conn(
            db_obj=self.sql_cli,
            index=self.add_goods_index,
            logger=self.lg,
            remainder=self.sql_cli_remainder,
        )
        if self.sql_cli.is_connect_success:
            # the spm param was only there so get_goods_id_from_url could filter out the id
            # goods_url = 'https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.65a47fb1yR1OUp&id={}'.format(goods_id)
            goods_url = 'https://detail.tmall.com/item.htm?id={}'.format(goods_id)
            # from here on, goods_id is a [type, goods_id] list
            goods_id = tmall.get_goods_id_from_url(goods_url)
            if goods_id == []:
                self.lg.error('@@@ 原商品的地址为: {0}'.format(goods_url))
                continue
            else:
                self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id[1], str(self.add_goods_index)))
                tt = tmall.get_goods_data(goods_id)
                data = tmall.deal_with_data()
                goods_id = goods_id[1]
                if data != {}:
                    data['goods_id'] = goods_id
                    data['username'] = '******'
                    data['main_goods_id'] = None
                    data['goods_url'] = tmall._from_tmall_type_get_tmall_url(
                        type=data['type'],
                        goods_id=goods_id,
                    )
                    if data['goods_url'] == '':
                        self.lg.error('该goods_url为空值! 此处跳过!')
                        continue

                    if len(data['all_img_url']) <= 1:
                        self.lg.info('[goods_id: {}]主图个数<=1, pass'.format(goods_id))
                        # fix: skip this goods rather than aborting the whole list (the log says "pass")
                        continue

                    result = tmall.old_tmall_goods_insert_into_new_table(data=data, pipeline=self.sql_cli)
                    if result:
                        # remember it, to avoid re-crawling it later
                        self.db_existed_goods_id_list.append(goods_id)
        else:
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')

        self.add_goods_index += 1
        collect()
        sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

    self.lg.info('tmcs已经抓取完毕!')

    return True
async def _update_old_goods_info(self, tmp_sql_server, result):
    '''
    Update old goods data
    :param result:
    :return:
    '''
    index = 1
    for item in result:  # update in real time
        _goods_id = item[0]
        miaosha_time = item[1]
        miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
            miaosha_time=miaosha_time,
            logger=self.lg,
        )
        tmall = TmallParse(logger=self.lg)
        tmp_sql_server = await _get_new_db_conn(
            db_obj=tmp_sql_server,
            index=index,
            logger=self.lg,
            remainder=20,
        )
        if tmp_sql_server.is_connect_success:
            if await self.is_recent_time(miaosha_begin_time) == 0:
                _handle_goods_shelves_in_auto_goods_table(
                    goods_id=_goods_id,
                    logger=self.lg,
                    update_sql_str=tb_update_str_4,
                    sql_cli=tmp_sql_server,
                )
                self.lg.info('过期的goods_id为(%s)' % _goods_id + ', 限时秒杀开始时间为(%s), 删除成功!' % miaosha_begin_time)
                await async_sleep(.3)
            else:  # a return of 1 means the goods lies inside the to-update window
                self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (_goods_id, str(index)))
                '''NOTICE: all rows are same-day data, so don't touch the shelf/delete times here; only refresh the goods data'''
                goods_id = tmall.get_goods_id_from_url(item[2])
                tmall.get_goods_data(goods_id=goods_id)
                goods_data = tmall.deal_with_data()
                if goods_data != {}:
                    # self.lg.info(str(item))
                    goods_data['goods_id'] = _goods_id
                    await tmall._update_taoqianggou_xianshimiaosha_table(data=goods_data, pipeline=tmp_sql_server)
                    await async_sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                else:
                    await async_sleep(5)

        index += 1
        try:
            del tmall
        except:
            pass
        collect()

    return
def run_forever():
    while True:
        # ** must not be declared as a global outside the loop, otherwise everything keeps logging to the same file
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/天猫/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR)

        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server._select_table(sql_str=tm_select_str_3))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('总计待更新个数: {0}'.format(len(result)))
            my_lg.info('--------------------------------------------------------')
            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            # memory optimization: rather than one long-lived parser, recreate and delete it periodically in the loop below
            tmall = TmallParse(logger=my_lg)
            for item in result:  # update in real time
                if index % 5 == 0:
                    try:
                        del tmall
                    except:
                        pass
                    tmall = TmallParse(logger=my_lg)
                    gc.collect()

                if index % 10 == 0:
                    # reconnect every 10 iterations, so a single long-lived connection can't go stale and error out
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (str(item[1]), str(index)))
                    tmp_item = []
                    # map the db SiteID back to the parser's type code when reading rows out
                    if item[0] == 3:
                        tmp_item.append(0)
                    elif item[0] == 4:
                        tmp_item.append(1)
                    elif item[0] == 6:
                        tmp_item.append(2)
                    tmp_item.append(item[1])

                    oo = tmall.get_goods_data(goods_id=tmp_item)
                    if isinstance(oo, int):  # handle a 4041 return code separately (checked first: an int has no .get)
                        index += 1
                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        continue
                    # fix: the key was misspelled 'is_detele'; read it before parsing so the
                    # empty-data branch below can skip the error sleep for deleted goods
                    oo_is_delete = oo.get('is_delete', 0)

                    data = tmall.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[1]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[2],
                            shelf_time=item[5],
                            delete_time=item[6])
                        data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                            old_price=item[3],
                            old_taobao_price=item[4],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price'])

                        site_id = tmall._from_tmall_type_get_site_id(type=data['type'])
                        try:
                            old_sku_info = format_price_info_list(
                                price_info_list=json_2_dict(item[7]),
                                site_id=site_id)
                        except AttributeError:  # handle values that were already formatted
                            old_sku_info = item[7]
                        data['_is_price_change'], data['sku_info_trans_time'] = get_sku_info_trans_record(
                            old_sku_info=old_sku_info,
                            new_sku_info=format_price_info_list(data['price_info_list'], site_id=site_id),
                            is_price_change=item[8] if item[8] is not None else 0)

                        tmall.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:  # the returned data was empty
                        if oo_is_delete == 1:
                            pass
                        else:
                            my_lg.info('------>>>| 休眠8s中...')
                            sleep(8)
                else:  # the db connection failed
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)

                index += 1
                gc.collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))

        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()