def run_forever(self):
    '''
    Run one refresh pass over the stored flash-sale (miaosha) goods, then sleep.

    What this method does, in order:

    1. runs ``mg_delete_str_4`` and loads the candidate rows with
       ``mg_select_str_3``;
    2. for every row, asks ``self.is_recent_time`` about the sale end time and
       deletes, skips, or re-crawls the row accordingly;
    3. sleeps 5.5 hours when the Shanghai hour is 0, otherwise 5 seconds.

    Rows are read positionally: ``item[0]`` is the goods_id, ``item[1]`` is a
    JSON string holding ``miaosha_begin_time`` / ``miaosha_end_time``,
    ``item[2]`` is the event time and ``item[3]`` is handed to the parser as
    the spider url.

    A row whose ``miaosha_end_time`` is missing or not formatted as
    ``%Y-%m-%d %H:%M:%S`` makes ``time.strptime`` raise; that exception is not
    handled here and aborts the pass (unchanged from the previous behaviour).

    :return: None
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        tmp_sql_server._delete_table(sql_str=mg_delete_str_4)
        result = list(tmp_sql_server._select_table(sql_str=mg_select_str_3))
    except TypeError:
        # The message below attributes a TypeError here to a failed DB
        # connection (possibly maintenance); treat it as "nothing to update".
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is not None:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

        for index, item in enumerate(result, start=1):
            # Parse the JSON payload once; it is needed for the end time and,
            # for expired rows, for the begin time shown in the log line.
            sale_info = json.loads(item[1])
            # int() truncates the float returned by mktime to whole seconds.
            # This replaces slicing the first 10 characters of its string
            # form, which only worked for 10-digit timestamps.
            miaosha_end_time = int(
                time.mktime(
                    time.strptime(
                        sale_info.get('miaosha_end_time'),
                        '%Y-%m-%d %H:%M:%S')))

            # A fresh parser per row: the original note says declaring it
            # outside the loop uses a lot of memory.
            mogujie_miaosha = MoGuJieMiaoShaParse()

            if index % 50 == 0:
                # Reconnect every 50 rows to avoid a long-lived connection
                # that stops responding.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                self._refresh_miaosha_row(
                    item=item,
                    index=index,
                    sale_info=sale_info,
                    miaosha_end_time=miaosha_end_time,
                    mogujie_miaosha=mogujie_miaosha,
                    pipeline=tmp_sql_server)
            else:
                print('数据库连接失败,数据库可能关闭或者维护中')

            gc.collect()

        print('全部数据更新完毕'.center(100, '#'))

    if get_shanghai_time().hour == 0:
        # No updates after midnight (Shanghai time): pause for 5.5 hours.
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()


def _refresh_miaosha_row(self, item, index, sale_info, miaosha_end_time,
                         mogujie_miaosha, pipeline):
    '''
    Delete, skip, or re-crawl a single stored flash-sale row.

    :param item: DB row; ``item[0]`` goods_id, ``item[2]`` event time,
        ``item[3]`` spider url
    :param index: 1-based position of the row, used only in the log line
    :param sale_info: dict decoded from ``item[1]``
    :param miaosha_end_time: sale end time as an integer timestamp
    :param mogujie_miaosha: ``MoGuJieMiaoShaParse`` instance used for crawling
    :param pipeline: connected DB pipeline
    :return: None
    '''
    goods_id = item[0]

    # Evaluate the time window once so both branches see the same answer.
    recent_state = self.is_recent_time(miaosha_end_time)
    if recent_state == 0:
        # 0: the sale has expired -> remove the row.
        pipeline._delete_table(sql_str=self.delete_sql_str, params=(goods_id, ))
        print(
            '过期的goods_id为(%s)' % goods_id,
            ', 限时秒杀开始时间为(%s), 删除成功!'
            % sale_info.get('miaosha_begin_time'))
        return
    if recent_state == 2:
        # 2: nothing to do for this row. The caller must keep iterating
        # (not break) because the DB does not return goods_ids in order.
        return

    # Any other value (the original comment says 1): inside the update window.
    print(
        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
        % (goods_id, index))

    item_list = self.get_item_list(event_time=str(item[2]))
    if item_list == '':
        # Possibly caused by network trouble; skip this row for now.
        return

    # goods_ids currently listed for this event_time. An empty item_list
    # yields an empty list here, so it lands in the "delisted" branch too.
    miaosha_goods_all_goods_id = [entry.get('iid', '') for entry in item_list]
    if goods_id not in miaosha_goods_all_goods_id:
        print('该商品已被下架限时秒杀活动,此处将其逻辑删除')
        pipeline._update_table(sql_str=mg_update_str_1, params=(goods_id, ))
        print('下架的goods_id为(%s)' % goods_id, ', 删除成功!')
        return

    for entry in item_list:
        if entry.get('iid', '') == goods_id:
            self._update_miaosha_goods(
                item=item,
                sale_entry=entry,
                mogujie_miaosha=mogujie_miaosha,
                pipeline=pipeline)


def _update_miaosha_goods(self, item, sale_entry, mogujie_miaosha, pipeline):
    '''
    Re-crawl one goods page and write the refreshed data back to the DB.

    :param item: DB row; ``item[0]`` goods_id, ``item[3]`` spider url
    :param sale_entry: the entry of the event's item list matching this
        goods_id; supplies ``startTime`` / ``endTime``
    :param mogujie_miaosha: ``MoGuJieMiaoShaParse`` instance used for crawling
    :param pipeline: connected DB pipeline
    :return: None
    '''
    mogujie_miaosha.get_goods_data(goods_id=item[3])
    goods_data = mogujie_miaosha.deal_with_data()
    if goods_data == {}:
        # The parser returned no data: skip.
        return

    goods_data['goods_id'] = str(item[0])

    # "price" is set to the original price, i.e. the highest normal_price.
    try:
        highest_normal_price = max(
            round(float(price_info.get('normal_price', '')), 2)
            for price_info in goods_data['price_info_list'])
        goods_data['price'] = round(Decimal(highest_normal_price), 2)
    except Exception:
        # Best effort: any bad/missing price data skips this goods, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        print('设置price为原价时出错!请检查')
        return

    goods_data['miaosha_time'] = {
        'miaosha_begin_time':
        timestamp_to_regulartime(int(sale_entry.get('startTime', 0))),
        'miaosha_end_time':
        timestamp_to_regulartime(int(sale_entry.get('endTime', 0))),
    }
    goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
        get_miaosha_begin_time_and_miaosha_end_time(
            miaosha_time=goods_data['miaosha_time'])

    mogujie_miaosha.update_mogujie_xianshimiaosha_table(
        data=goods_data, pipeline=pipeline)
    sleep(MOGUJIE_SLEEP_TIME)  # slow the crawl down
def deal_with_data(self, *param):
    '''
    Parse the flash-sale goods of one event and store the new ones in the DB.

    Goods whose id is already returned by ``mg_select_str_4`` are skipped;
    every other entry is crawled through ``MoGuJieMiaoShaParse`` and inserted
    with ``insert_into_mogujie_xianshimiaosha_table``.

    :param param: positional arguments; ``param[0]`` is the event time (a
        timestamp accepted by ``timestamp_to_regulartime``) and ``param[1]``
        is the list of item dicts read via the keys ``iid``, ``link``,
        ``startTime`` and ``endTime``
    :return: None
    '''
    print(60 * '*')
    event_time = param[0]
    item_list = param[1]
    print('秒杀开始时间:', timestamp_to_regulartime(event_time), '\t',
          '对应时间戳为: ', event_time)
    print(60 * '*')

    mogujie = MoGuJieMiaoShaParse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

    if my_pipeline.is_connect_success:
        # A set gives O(1) "already stored?" checks inside the loop.
        db_goods_ids = {
            row[0]
            for row in my_pipeline._select_table(sql_str=mg_select_str_4)
        }
        # Raw string avoids the invalid "\w" escape; compiled once, not per item.
        object_id_pattern = re.compile(r'objectId=(\w+)')

        for item in item_list:
            goods_id = str(item.get('iid', ''))
            if goods_id in db_goods_ids:
                print('该goods_id已经存在于数据库中, 此处跳过')
                continue

            tmp_url = item.get('link', '')
            object_id_match = object_id_pattern.search(tmp_url)
            if object_id_match is None:
                # The link is not a flash-sale goods address.
                print('+++++++ 这个url不是秒杀的url: ', tmp_url)
                continue

            tmp_url = 'https://shop.mogujie.com/rushdetail/{0}?objectId={1}&type=rush'.format(
                goods_id, object_id_match.group(1))
            mogujie.get_goods_data(
                goods_id=mogujie.get_goods_id_from_url(tmp_url))
            goods_data = mogujie.deal_with_data()
            if goods_data == {}:
                # The parser returned no data: skip.
                continue

            goods_data['goods_url'] = tmp_url
            goods_data['goods_id'] = str(goods_id)

            # "price" is set to the original price, i.e. the highest
            # normal_price.
            try:
                highest_normal_price = max(
                    round(float(price_info.get('normal_price', '')), 2)
                    for price_info in goods_data['price_info_list'])
                goods_data['price'] = round(Decimal(highest_normal_price), 2)
            except Exception:
                # Best effort: bad/missing price data skips this goods, but
                # KeyboardInterrupt / SystemExit are no longer swallowed.
                print('设置price为原价时出错!请检查')
                sleep(MOGUJIE_SLEEP_TIME)  # slow the crawl down
                continue

            goods_data['miaosha_time'] = {
                'miaosha_begin_time':
                timestamp_to_regulartime(int(item.get('startTime', 0))),
                'miaosha_end_time':
                timestamp_to_regulartime(int(item.get('endTime', 0))),
            }
            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=goods_data['miaosha_time'])
            goods_data['event_time'] = str(event_time)

            res = mogujie.insert_into_mogujie_xianshimiaosha_table(
                data=goods_data, pipeline=my_pipeline)
            if res:
                # Remember the id so a duplicate later in item_list is skipped.
                db_goods_ids.add(goods_id)

            sleep(MOGUJIE_SLEEP_TIME)  # slow the crawl down
    else:
        print('数据库连接失败,此处跳过!')

    # Drop the parser before forcing a collection; the name is always bound
    # here, so no try/except is needed around the del.
    del mogujie
    gc.collect()