def get_pintuan_goods_info(self):
    """Crawl the zhe800 group-buy listing and store goods not yet in the database.

    Listing pages 0-99 of ``deals.json`` are requested through
    ``self.get_url_body`` until a page comes back as an empty list.  Every
    non-empty ``product.zid`` found is remembered together with the first
    page it was seen on.  Each zid that
    ``select_zhe_800_pintuan_all_goods_id`` does not already report is then
    fetched and parsed with ``Zhe800PintuanParse`` and inserted through the
    pipeline.

    :return: None
    """
    # zid -> first page on which it appeared.  Keying by zid (instead of
    # de-duplicating (zid, page) tuples) keeps a product that shows up on
    # several pages from being fetched and inserted more than once.
    zid_to_page = {}
    for page in range(0, 100):
        tmp_url = 'https://pina.m.zhe800.com/nnc/list/deals.json?page={0}&size=500'.format(page)
        print('正在抓取的页面地址为: ', tmp_url)
        tmp_data = self.get_url_body(tmp_url=tmp_url)
        if tmp_data == []:
            # An empty page is treated as the end of the listing.
            print('该tmp_url得到的object为空list, 此处跳过!')
            break

        for deal in tmp_data:
            zid = deal.get('product', {}).get('zid', '')
            # Test the zid itself: the former check compared the whole
            # (zid, page) tuple with '' and therefore never filtered anything.
            if zid == '' or zid is None:
                continue
            zid_to_page.setdefault(zid, page)

    zid_list = list(zid_to_page.items())
    print('该zid_list的总个数为: ', len(zid_list))
    print(zid_list)

    zhe_800_pintuan = Zhe800PintuanParse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        # A set gives O(1) membership tests for the per-zid lookup below.
        # NOTE(review): assumes the first column of each returned row is the
        # goods_id in the same type as the zid from the listing -- confirm.
        db_goods_ids = {
            row[0] for row in my_pipeline.select_zhe_800_pintuan_all_goods_id()
        }
        for zid, first_page in zid_list:
            if zid in db_goods_ids:
                print('该goods_id已经存在于数据库中, 此处跳过')
                continue

            tmp_url = 'https://pina.m.zhe800.com/detail/detail.html?zid=' + str(zid)
            goods_id = zhe_800_pintuan.get_goods_id_from_url(tmp_url)
            zhe_800_pintuan.get_goods_data(goods_id=goods_id)
            goods_data = zhe_800_pintuan.deal_with_data()

            # An empty dict means nothing usable was parsed; skip the insert.
            if goods_data != {}:
                goods_data['goods_id'] = str(zid)
                goods_data['spider_url'] = tmp_url
                goods_data['username'] = '******'
                goods_data['page'] = str(first_page)
                zhe_800_pintuan.insert_into_zhe_800_pintuan_table(
                    data=goods_data, pipeline=my_pipeline)
            # Pause after every detail fetch, whether or not it was inserted.
            sleep(.7)

    # Drop the parser explicitly before forcing a collection; the name is
    # always bound here, so no exception guard is needed.
    del zhe_800_pintuan
    gc.collect()
def _update_zhe_800_pintuan_goods(item, index, parser, pipeline):
    """Refresh (or delete) one goods row using an already-created parser.

    :param item: one row from ``select_zhe_800_pintuan_all_goods_id``;
        ``item[0]`` is used as the goods_id and ``item[1] == 1`` selects the
        delete branch (the log message calls this "expired").
    :param index: 1-based position of ``item`` in the current pass, used only
        for logging.
    :param parser: ``Zhe800PintuanParse`` instance used to fetch and parse.
    :param pipeline: database pipeline used for the delete / update.
    :return: None
    """
    if not pipeline.is_connect_success:
        print('数据库连接失败,数据库可能关闭或者维护中')
        return

    # NOTE: an earlier approach deleted the row whenever get_goods_data
    # returned a string starting with 'ze' (page gone); it was dropped because
    # it also affected valid goods, so the return value is ignored here.
    parser.get_goods_data(goods_id=item[0])
    data = parser.deal_with_data()
    if data == {}:
        # Nothing was parsed; leave the row untouched.
        return

    data['goods_id'] = item[0]
    if item[1] == 1:
        pipeline.delete_zhe_800_pintuan_expired_goods_id(goods_id=item[0])
        print('该goods_id[{0}]已过期,删除成功!'.format(item[0]))
    else:
        print(
            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
            % (item[0], index))
        parser.to_right_and_update_data(data=data, pipeline=pipeline)


def run_forever():
    """Endlessly re-crawl every goods_id stored in the database.

    Each pass opens a pipeline, loads all goods rows and processes them one
    by one, sleeping ``ZHE_800_PINTUAN_SLEEP_TIME`` between items.  After a
    pass the loop sleeps 5.5 hours when ``get_shanghai_time().hour`` is 0 and
    5 seconds otherwise.  The function never returns.
    """
    while True:
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_zhe_800_pintuan_all_goods_id())
        except TypeError:
            # list() could not iterate the query result; reported as a failed
            # database connection and the pass is skipped.
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            for index, item in enumerate(result, start=1):
                # A fresh parser per item, released right after use, keeps
                # memory from piling up across a long pass.
                zhe_800_pintuan = Zhe800PintuanParse()

                if index % 50 == 0:
                    # Reconnect every 50 items so a single long-lived
                    # connection does not go unresponsive.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                _update_zhe_800_pintuan_goods(
                    item, index, zhe_800_pintuan, tmp_sql_server)

                del zhe_800_pintuan
                gc.collect()
                sleep(ZHE_800_PINTUAN_SLEEP_TIME)

            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates during the hours after midnight.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()