def get_pintuan_goods_info(self):
    '''
    Crawl the limited-time group-buy ("pintuan") listing and store goods that are new.

    Phase 1 requests listing pages 0..99 in order and stops at the first page
    that yields no goods. Each goods entry is reduced to its id, begin/end time,
    joined count and the page it was found on; entries are de-duplicated by
    goods_id (first occurrence wins).

    Phase 2 runs only when the database connection succeeded. Every collected
    goods_id that is not already returned by
    select_juanpi_pintuan_all_goods_id() is fetched through
    self.get_pintuan_goods_data() and, when the result is not an empty dict,
    inserted with insert_into_juuanpi_pintuan_table().

    :return: None
    '''
    pintuan_goods_id_list = []
    seen_goods_ids = set()      # O(1) duplicate check instead of rescanning the list

    for page in range(0, 100):
        tmp_url = 'https://tuan.juanpi.com/pintuan/get_goods_list?page={0}&pageSize=20&cid=pinhaohuo_sx&show_type=wap'.format(
            str(page))
        print('正在抓取的页面地址为: ', tmp_url)

        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
        if body == '':
            body = '{}'

        try:
            tmp_data = json.loads(body).get('data', {}).get('goods', [])
        except (ValueError, TypeError, AttributeError):
            # ValueError: invalid JSON; TypeError: body is not text;
            # AttributeError: the payload or its 'data' member is not a dict.
            print('json.loads转换tmp_data时出错!')
            tmp_data = []

        sleep(.3)

        # Falsy covers both an empty list and a null 'goods' value.
        if not tmp_data:
            print('该tmp_url得到的goods为空list, 此处跳过!')
            break

        for item in tmp_data:
            goods_id = item.get('goods_id', '')
            if goods_id in seen_goods_ids:
                continue

            try:
                start_timestamp = int(item.get('start_time', ''))
                end_timestamp = int(item.get('end_time', ''))
            except (TypeError, ValueError):
                # One entry without usable timestamps must not abort the whole crawl.
                print('该商品的start_time或end_time无效, 此处跳过! goods_id: ', goods_id)
                continue

            seen_goods_ids.add(goods_id)
            pintuan_goods_id_list.append({
                'goods_id': goods_id,
                'begin_time': self.timestamp_to_regulartime(start_timestamp),
                'end_time': self.timestamp_to_regulartime(end_timestamp),
                'all_sell_count': str(item.get('join_number_int', '')),
                'page': page,
            })

    print('该pintuan_goods_id_list的总个数为: ', len(pintuan_goods_id_list))
    print(pintuan_goods_id_list)

    juanpi_pintuan = JuanPiParse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    index = 1   # counts fetch attempts; drives the periodic parser recycling below

    if my_pipeline.is_connect_success:
        # First column of every returned row is the goods_id.
        db_goods_ids = {row[0] for row in my_pipeline.select_juanpi_pintuan_all_goods_id()}

        for item in pintuan_goods_id_list:
            goods_id = item.get('goods_id', '')
            if goods_id in db_goods_ids:
                print('该goods_id已经存在于数据库中, 此处跳过')
                continue

            if index % 5 == 0:
                # Recreate the parser every 5th fetch so the script does not
                # accumulate a large amount of memory.
                del juanpi_pintuan
                juanpi_pintuan = JuanPiParse()
                gc.collect()

            # Note: Juanpi's group-buy time window is the same as the shelf
            # (on/off-sale) time the parser scrapes, so no replacement is needed.
            goods_data = self.get_pintuan_goods_data(
                juanpi_pintuan=juanpi_pintuan,
                goods_id=goods_id,
                all_sell_count=item.get('all_sell_count', ''),
                page=item.get('page', 0))

            # An empty dict means nothing usable came back; skip the insert.
            if goods_data != {}:
                juanpi_pintuan.insert_into_juuanpi_pintuan_table(
                    data=goods_data, pipeline=my_pipeline)

            sleep(.6)
            index += 1

    # Drop the parser before collecting so its memory can be reclaimed now.
    del juanpi_pintuan
    gc.collect()
def _parse_pintuan_end_time(raw_schedule):
    '''
    Turn a stored group-buy schedule into the end time as a Unix timestamp.

    :param raw_schedule: JSON text of a list whose first element is a dict with
        an 'end_time' string formatted as '%Y-%m-%d %H:%M:%S'
    :return: whole-second timestamp (int), or None when the value cannot be parsed
    '''
    try:
        end_time_text = json.loads(raw_schedule)[0].get('end_time')
        # NOTE(review): mktime interprets the value in the process's local
        # timezone -- confirm the host runs in the timezone the data is stored in.
        return int(time.mktime(time.strptime(end_time_text, '%Y-%m-%d %H:%M:%S')))
    except (ValueError, TypeError, IndexError, KeyError, AttributeError, OverflowError):
        return None


def run_forever():
    '''
    Endlessly refresh the group-buy goods already stored in the database.

    Each round opens a pipeline, loads all rows from
    select_juanpi_pintuan_all_goods_id() and walks them. From each row the code
    reads item[0] (goods_id), item[1] (JSON schedule text) and item[2] (a flag;
    presumably sold-out/removed -- verify against the query):

    * rows with item[2] == 1 or with an end time in the past are deleted;
    * rows whose end time cannot be parsed are reported and left untouched;
    * all other rows are re-scraped and updated.

    The parser is recreated every 6th row and the database connection every
    50th row. After a round the loop sleeps 5.5 hours when get_shanghai_time()
    reports hour 0, otherwise 5 seconds.

    :return: never returns
    '''
    while True:
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_juanpi_pintuan_all_goods_id())
        except TypeError:
            # list(None): the query returned nothing iterable, which this
            # script treats as a failed/unavailable database connection.
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # The parser is created here and periodically dropped and recreated
            # to keep memory usage down.
            juanpi_pintuan = JuanPiParse()

            for item in result:
                if index % 6 == 0:
                    del juanpi_pintuan
                    gc.collect()
                    juanpi_pintuan = JuanPiParse()

                if index % 50 == 0:
                    # Reconnect every 50 rows so a single long-lived connection
                    # does not stop responding.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    pintuan_end_time = _parse_pintuan_end_time(item[1])
                    is_expired = (pintuan_end_time is not None
                                  and pintuan_end_time < int(time.time()))

                    if item[2] == 1 or is_expired:
                        tmp_sql_server.delete_juanpi_pintuan_expired_goods_id(goods_id=item[0])
                        print('该goods_id[{0}]已过期或者售完,删除成功!'.format(item[0]))
                    elif pintuan_end_time is None:
                        # A single row with an invalid schedule must not stop the loop.
                        print('该goods_id[{0}]的拼团结束时间解析失败, 此处跳过!'.format(item[0]))
                    else:
                        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                        juanpi_pintuan.get_goods_data(goods_id=item[0])
                        data = juanpi_pintuan.deal_with_data()
                        # An empty dict means the scrape produced nothing to store.
                        if data != {}:
                            data['goods_id'] = item[0]
                            juanpi_pintuan.to_right_and_update_pintuan_data(
                                data=data, pipeline=tmp_sql_server)
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                index += 1
                gc.collect()
                sleep(1.2)

            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates after midnight: pause for 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()