def _get_db_old_data(self):
    """
    Fetch the db rows that are pending a real-time update.

    BUGFIX: the original signature was ``-> (list, None)``, which is a
    plain two-element tuple, not a type annotation; the actual return
    contract is documented here instead.

    :return: list of selected rows on success, or None when the db
        connection failed (the select raised TypeError).
    """
    self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    result = None
    try:
        # Presumably purges stale rows before selecting the rows to
        # refresh -- exact semantics depend on mia_delete_str_2 (TODO confirm).
        self.sql_cli._delete_table(sql_str=mia_delete_str_2)
        result = list(self.sql_cli._select_table(sql_str=mia_select_str_2))
    except TypeError:
        # _select_table returns a non-iterable on connection failure.
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
    _block_print_db_old_data(result=result)
    return result
def run_forever():
    """
    Endless real-time update loop for mia (蜜芽) goods.

    Each pass: create a fresh per-day logger, select the rows to refresh,
    re-crawl every goods_id and write changes back, then sleep until the
    next pass (a long pause right after midnight).
    """
    while True:
        # ** Must NOT be a module-level global reused across passes,
        # otherwise every pass keeps appending to the same log file.
        my_lg = set_logger(
            logger_name=get_uuid1(),
            log_file_name=MY_SPIDER_LOGS_PATH + '/蜜芽/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR,
        )
        #### real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=mia_select_str_5))
        except TypeError:
            # Presumably _select_table returns a non-iterable when the db
            # connection failed (e.g. maintenance) -- TODO confirm.
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            mia = MiaParse()
            for item in result:
                goods_id = item[1]
                # Recreate the parser every 5 items + force gc to bound
                # memory over long runs.
                if index % 5 == 0:
                    try:
                        del mia
                    except:
                        pass
                    mia = MiaParse()
                    collect()
                # Periodically swap in a fresh db connection (every 10 items).
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli, index=index, logger=my_lg, remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (str(goods_id), str(index)))
                    mia.get_goods_data(goods_id=goods_id)
                    data = mia.deal_with_data()
                    db_goods_info_obj = MIADbGoodsInfoObj(item=item, logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:
                            # Goods taken off the shelves: mark it in the db
                            # and skip the normal diff/update path.
                            my_lg.info('@@@ 该商品已下架...')
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue
                        else:
                            # Merge crawled data with the stored row (price /
                            # spec / stock change tracking).
                            data = get_goods_info_change_data(
                                target_short_name='mia',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,
                            )
                        mia._to_right_and_update_data(data, pipeline=sql_cli)
                    else:
                        # Empty parse result: back off before the next item.
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)
                else:
                    # db connection unavailable for this item.
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))
        # No updating right after midnight: pause ~5.5h, else 5 min.
        if get_shanghai_time().hour == 0:
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        try:
            del my_lg
        except:
            pass
        collect()
def run_forever():
    """
    Endless real-time update loop for juanpi (卷皮) group-buy (拼团) goods.

    Each pass: purge rows via jp_delete_str_1, select the rows still to be
    refreshed, then for every goods_id either logically delete it (sold out
    or past its group-buy end time) or re-crawl and write updates back.
    Sleeps ~5.5h right after midnight, otherwise 5 min between passes.
    """
    while True:
        #### real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=jp_delete_str_1)
            result = list(sql_cli._select_table(sql_str=jp_select_str_2))
        except TypeError:
            # Presumably a non-iterable comes back when the db connection
            # failed -- TODO confirm.
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result)
            index = 1
            # Parser recreated periodically below to bound memory use.
            juanpi_pintuan = JuanPiParse()
            for item in result:
                goods_id = item[0]
                if index % 6 == 0:
                    try:
                        del juanpi_pintuan
                    except:
                        pass
                    gc.collect()
                    juanpi_pintuan = JuanPiParse()
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli, index=index, remainder=50)
                if sql_cli.is_connect_success:
                    try:
                        pintuan_end_time = json.loads(item[1])[0].get('end_time')
                    except IndexError:
                        # Empty/malformed pintuan json: logically delete the row.
                        print('获取pintuan_end_time时索引异常!出错goods_id:{0}'.format(goods_id))
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            sql_cli=sql_cli,
                            update_sql_str=jp_update_str_7,
                        )
                        continue
                    # '%Y-%m-%d %H:%M:%S' -> 10-digit unix timestamp.
                    pintuan_end_time = int(
                        str(time.mktime(time.strptime(pintuan_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
                    if item[2] == 1 or pintuan_end_time < int(
                            datetime_to_timestamp(get_shanghai_time())):
                        # Already flagged sold out (item[2] == 1) or expired.
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            sql_cli=sql_cli,
                            update_sql_str=jp_update_str_7,
                        )
                        print('该goods_id[{0}]已过期或者售完,逻辑删除成功!'.format(goods_id))
                    else:
                        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (goods_id, index))
                        juanpi_pintuan.get_goods_data(goods_id=goods_id)
                        data = juanpi_pintuan.deal_with_data()
                        # BUGFIX: the original did `if data == {}: continue`,
                        # which skipped the per-item bookkeeping below (index
                        # increment, gc, rate-limit sleep) on every empty parse
                        # result -- hammering the site and desyncing the db
                        # reconnect cadence.  Fall through instead.
                        if data != {}:
                            data['goods_id'] = goods_id
                            juanpi_pintuan.to_right_and_update_pintuan_data(data=data, pipeline=sql_cli)
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
                sleep(1.2)
            print('全部数据更新完毕'.center(100, '#'))
        # No updating right after midnight: pause ~5.5h, else 5 min.
        if get_shanghai_time().hour == 0:
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        gc.collect()
def run_forever():
    """
    Endless real-time update loop for kaola (网易考拉) goods.

    Each pass: create a fresh per-day logger, select the rows to refresh,
    re-crawl every goods_id and write changes back (with special handling
    for delisted goods), then sleep until the next pass.
    """
    while True:
        # ** Must NOT be a module-level global reused across passes,
        # otherwise every pass keeps appending to the same log file.
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/网易考拉/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)
        #### real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=kl_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            # Parser recreated every 5 items below to bound memory use.
            kaola = KaoLaParse(logger=my_lg, is_real_times_update_call=True)
            for item in result:
                goods_id = item[1]
                if index % 5 == 0:
                    try:
                        del kaola
                    except:
                        pass
                    kaola = KaoLaParse(logger=my_lg, is_real_times_update_call=True)
                    collect()
                # Periodically swap in a fresh db connection (every 10 items).
                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    logger=my_lg,
                    remainder=10,
                )
                if sql_cli.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (str(goods_id), str(index)))
                    db_goods_info_obj = KLDbGoodsInfoObj(item=item, logger=my_lg)
                    data = kaola._get_goods_data(goods_id=goods_id)
                    if data.get('is_delete', 0) == 1:
                        # Delisted goods detected at fetch time: record shelf /
                        # delete timestamps and write back directly.
                        data['goods_id'] = goods_id
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=db_goods_info_obj.is_delete,
                                shelf_time=db_goods_info_obj.shelf_time,
                                delete_time=db_goods_info_obj.delete_time,
                            )
                        try:
                            kaola.to_right_and_update_data(data, pipeline=sql_cli)
                        except Exception:
                            # BUGFIX: the original called my_lg.error(exc_info=True)
                            # without the required msg argument, so the handler
                            # itself raised TypeError and killed the loop.
                            my_lg.error('遇到错误:', exc_info=True)
                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        index += 1
                        collect()
                        continue
                    data = kaola._deal_with_data()
                    if data != {}:
                        if data.get('is_delete', 0) == 1:
                            # Delisted goods detected at parse time.
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue
                        else:
                            # Merge crawled data with the stored row (price /
                            # spec / stock change tracking).
                            data = get_goods_info_change_data(
                                target_short_name='kl',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,
                            )
                        kaola.to_right_and_update_data(data, pipeline=sql_cli)
                    else:
                        # Empty parse result: back off before the next item.
                        my_lg.info('------>>>| 休眠3s中...')
                        sleep(3.)
                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))
        # No updating right after midnight: pause ~5.5h, else 1 min.
        if get_shanghai_time().hour == 0:
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        collect()
def run_forever():
    """
    Endless real-time update loop for vip (唯品会) goods.

    Each pass: select the rows to refresh, re-crawl every goods_id and
    compute price / spec / stock change records against the stored row
    before writing back.
    """
    while True:
        #### real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=vip_select_str_1))
        except TypeError:
            # NOTE(review): unlike sibling loops this retries immediately --
            # a persistent db outage makes this a tight print loop.
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            continue
        _block_print_db_old_data(result=result)
        index = 1
        for item in result:
            # Parser recreated per item (deleted at loop end) to keep
            # memory bounded.
            vip = VipParse()
            sql_cli = _block_get_new_db_conn(db_obj=sql_cli, index=index, remainder=50)
            if sql_cli.is_connect_success:
                print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                vip.get_goods_data(goods_id=[0, item[0]])
                data = vip.deal_with_data()
                if data != {}:
                    data['goods_id'] = item[0]
                    data['shelf_time'], data[
                        'delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            shelf_time=item[4],
                            delete_time=item[5])
                    # item[6] holds the stored sku info json; both names start
                    # from the same parsed value.
                    price_info_list = old_sku_info = json_2_dict(
                        item[6], default_res=[])
                    try:
                        old_sku_info = format_price_info_list(
                            price_info_list=price_info_list, site_id=25)
                    except AttributeError:
                        # Already formatted previously: keep as-is.
                        pass
                    new_sku_info = format_price_info_list(
                        data['price_info_list'], site_id=25)
                    # Per-sku price change record (trans time + change info).
                    data['_is_price_change'], data[
                        'sku_info_trans_time'], price_change_info = _get_sku_price_trans_record(
                            old_sku_info=old_sku_info,
                            new_sku_info=new_sku_info,
                            is_price_change=item[7] if item[7] is not None else 0,
                            db_price_change_info=json_2_dict(item[9], default_res=[]),
                            old_price_trans_time=item[12],
                        )
                    # Headline price change record (may refine _is_price_change).
                    data['_is_price_change'], data[
                        '_price_change_info'] = _get_price_change_info(
                            old_price=item[2],
                            old_taobao_price=item[3],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price'],
                            is_price_change=data['_is_price_change'],
                            price_change_info=price_change_info,
                        )
                    # Pure spec change tracking.
                    data['is_spec_change'], data[
                        'spec_trans_time'] = _get_spec_trans_record(
                            old_sku_info=old_sku_info,
                            new_sku_info=new_sku_info,
                            is_spec_change=item[8] if item[8] is not None else 0,
                            old_spec_trans_time=item[13],
                        )
                    # Pure stock change tracking.
                    data['is_stock_change'], data['stock_trans_time'], data[
                        'stock_change_info'] = _get_stock_trans_record(
                            old_sku_info=old_sku_info,
                            new_sku_info=new_sku_info,
                            is_stock_change=item[10] if item[10] is not None else 0,
                            db_stock_change_info=json_2_dict(item[11], default_res=[]),
                            old_stock_trans_time=item[14],
                        )
                    vip.to_right_and_update_data(data=data, pipeline=sql_cli)
                else:
                    # Empty parse result: nothing to write for this item.
                    pass
            else:
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            try:
                del vip
            except:
                pass
            gc.collect()
            sleep(VIP_SLEEP_TIME)
        print('全部数据更新完毕'.center(100, '#'))
        # No updating right after midnight: pause ~5.5h, else 30s.
        if get_shanghai_time().hour == 0:
            sleep(60 * 60 * 5.5)
        else:
            sleep(30)
        gc.collect()
def run_forever():
    """
    Endless real-time update loop for yanxuan (网易严选) goods.

    Each pass: create a fresh per-day logger, select the rows to refresh,
    re-crawl every goods_id and write changes back (delisted goods get a
    dedicated update statement), then sleep until the next pass.
    """
    while True:
        # ** Must NOT be a module-level global reused across passes,
        # otherwise every pass keeps appending to the same log file.
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR,
        )
        #### real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=yx_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            # Parser recreated every 5 items below to bound memory use.
            yanxuan = YanXuanParse(logger=my_lg)
            for item in result:
                if index % 5 == 0:
                    try:
                        del yanxuan
                    except:
                        pass
                    yanxuan = YanXuanParse(logger=my_lg)
                    collect()
                # Periodically swap in a fresh db connection (every 10 items).
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli, index=index, logger=my_lg, remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (str(item[1]), str(index)))
                    yanxuan._get_goods_data(goods_id=item[1])
                    data = yanxuan._deal_with_data()
                    db_goods_info_obj = YXDbGoodsInfoObj(item=item, logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:
                            # Goods taken off the shelves: dedicated update
                            # statement, skip the normal diff/update path.
                            my_lg.info('@@@ 该商品已下架...')
                            sql_cli._update_table_2(
                                sql_str=yx_update_str_2,
                                params=(db_goods_info_obj.goods_id, ),
                                logger=my_lg,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue
                        else:
                            # Merge crawled data with the stored row (price /
                            # spec / stock change tracking).
                            data = get_goods_info_change_data(
                                target_short_name='yx',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,
                            )
                        yanxuan.to_right_and_update_data(data, pipeline=sql_cli)
                    else:
                        # Empty parse result: back off before the next item.
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)
                    
                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))
        # No updating right after midnight: pause ~5.5h, else 1 min.
        if get_shanghai_time().hour == 0:
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        collect()
def run_forever(self):
    """
    Single-pass real-time update for pinduoduo (拼多多) flash-sale goods.

    Original intent (translated): only refresh goods whose sale window is
    within the next ~2 hours of today; farther-future entries (all at the
    original price) are not updated yet.

    Expired or delisted goods_ids are deleted from the table; in-window
    ones are re-crawled and merged with the live flash-sale list.

    :return: None
    """
    #### real-time data update
    sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    try:
        result = list(sql_cli._select_table(sql_str=pd_select_str_2))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        _block_print_db_old_data(result=result)
        index = 1
        # Declared once outside the loop and reused (memory optimisation
        # per the original comment).
        pinduoduo_miaosha = PinduoduoParse()
        all_miaosha_goods_list = self.get_all_miaosha_goods_list()
        # All goods_ids currently in the live flash-sale list.
        miaosha_goods_all_goods_id = [
            i.get('goods_id') for i in all_miaosha_goods_list
        ]
        for item in result:
            # Stored flash-sale end time -> 10-digit unix timestamp.
            miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
            miaosha_end_time = int(
                str(time.mktime(time.strptime(miaosha_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            sql_cli = _block_get_new_db_conn(db_obj=sql_cli, index=index, remainder=50)
            if sql_cli.is_connect_success:
                if self.is_recent_time(miaosha_end_time) == 0:
                    # Expired: hard-delete the row.
                    # BUGFIX: params=(item[0]) was NOT a tuple (missing
                    # trailing comma), unlike params=(goods_id, ) used
                    # elsewhere in this file.
                    sql_cli._delete_table(sql_str=self.delete_sql_str,
                                          params=(item[0], ))
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 限时秒杀结束时间为(%s), 删除成功!' % json.loads(item[1]).get('miaosha_end_time'))
                    sleep(.3)
                elif self.is_recent_time(miaosha_end_time) == 2:
                    # Not in the update window yet.  Must be `pass`, not
                    # `break`: rows do not come back ordered by time.
                    pass
                else:
                    # is_recent_time(...) == 1: inside the update window.
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                    if item[0] not in miaosha_goods_all_goods_id:
                        # No longer in the live flash-sale list: delete.
                        # BUGFIX: same missing-comma tuple as above.
                        sql_cli._delete_table(sql_str=self.delete_sql_str,
                                              params=(item[0], ))
                        print('该商品[goods_id为(%s)]已被下架限时秒杀活动,此处将其删除' % item[0])
                        sleep(.3)
                    else:
                        # Still listed: find its live entry and merge.
                        for item_1 in all_miaosha_goods_list:
                            if item_1.get('goods_id', '') == item[0]:
                                pinduoduo_miaosha.get_goods_data(goods_id=item[0])
                                goods_data = pinduoduo_miaosha.deal_with_data()
                                if goods_data == {}:
                                    # Empty parse result: skip this entry.
                                    pass
                                else:
                                    goods_data['stock_info'] = item_1.get('stock_info')
                                    goods_data['goods_id'] = item_1.get('goods_id')
                                    if item_1.get('stock_info').get('activity_stock') > 0:
                                        goods_data['price'] = item_1.get('price')                # pre-sale original/special price
                                        goods_data['taobao_price'] = item_1.get('taobao_price')  # flash-sale price
                                    else:
                                        pass
                                    goods_data['sub_title'] = item_1.get('sub_title', '')
                                    goods_data['miaosha_time'] = item_1.get('miaosha_time')
                                    goods_data['miaosha_begin_time'], goods_data[
                                        'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                            miaosha_time=item_1.get('miaosha_time'))
                                    if item_1.get('stock_info').get('activity_stock') <= 1:
                                        # Live stock <= 1: mark as sold out.
                                        print('该秒杀商品已售罄...')
                                        goods_data['is_delete'] = 1
                                    pinduoduo_miaosha.to_update_pinduoduo_xianshimiaosha_table(
                                        data=goods_data, pipeline=sql_cli)
                                    sleep(PINDUODUO_SLEEP_TIME)
                            else:
                                pass
                # NOTE: bookkeeping only runs when the db connection was
                # usable (original structure preserved).
                index += 1
                gc.collect()
            else:
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
        print('全部数据更新完毕'.center(100, '#'))
    # No updating right after midnight: pause ~5.5h, else 3 min.
    if get_shanghai_time().hour == 0:
        sleep(60 * 60 * 5.5)
    else:
        sleep(3 * 60)
    gc.collect()
def run_forever(self):
    """
    Single-pass real-time update for mogujie (蘑菇街) group-buy goods.

    For every stored row: expired entries are logically deleted; in-window
    entries are re-checked against the live group-buy listing page (fetched
    via phantomjs) and either updated or logically deleted.

    :return: None
    """
    sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    try:
        sql_cli._delete_table(sql_str=mg_delete_str_2)
        result = list(sql_cli._select_table(sql_str=mg_select_str_2))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        _block_print_db_old_data(result=result)
        index = 1
        self.my_phantomjs = BaseDriver(
            executable_path=PHANTOMJS_DRIVER_PATH, ip_pool_type=self.ip_pool_type)
        for item in result:
            goods_id = item[0]
            # Stored group-buy end time -> 10-digit unix timestamp.
            pintuan_end_time = json.loads(item[1]).get('end_time')
            pintuan_end_time = int(
                str(time.mktime(time.strptime(pintuan_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            data = {}
            mogujie_pintuan = MoGuJieParse()
            # Recycle the phantomjs driver every 8 items to bound memory.
            if index % 8 == 0:
                try:
                    del self.my_phantomjs
                except:
                    pass
                gc.collect()
                self.my_phantomjs = BaseDriver(
                    executable_path=PHANTOMJS_DRIVER_PATH, ip_pool_type=self.ip_pool_type)
            sql_cli = _block_get_new_db_conn(db_obj=sql_cli, index=index, remainder=50)
            if sql_cli.is_connect_success:
                if self.is_recent_time(pintuan_end_time) == 0:
                    # Expired: logical delete.
                    _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        update_sql_str=mg_update_str_5,
                        sql_cli=sql_cli,
                    )
                    print(
                        '过期的goods_id为(%s)' % goods_id,
                        ', 拼团开始时间为(%s), 逻辑删除成功!' % json.loads(item[1]).get('begin_time'))
                    sleep(.3)
                elif self.is_recent_time(pintuan_end_time) == 2:
                    # Not in the update window.  Must be `pass`, not `break`:
                    # rows do not come back ordered by time.
                    pass
                else:
                    # is_recent_time(...) == 1: inside the update window.
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (goods_id, index))
                    data['goods_id'] = goods_id
                    tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                        item[3], item[2])
                    # Plain requests can't fetch this (certificate issues per
                    # the original note), so phantomjs fetches the body.
                    body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
                    if body == '':
                        print('获取到的body为空值! 此处跳过')
                    else:
                        try:
                            # The json payload is embedded in a <pre> tag.
                            body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
                            tmp_data = json.loads(body)
                        except:
                            print('json.loads转换body时出错, 请检查')
                            tmp_data = {}
                        if tmp_data.get('result', {}).get('wall', {}).get('docs', []) == []:
                            # Listing page returned no docs: logical delete.
                            print('得到的docs为[]!')
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=mg_update_str_5,
                                sql_cli=sql_cli,
                            )
                            sleep(.3)
                        else:
                            tmp_item_list = tmp_data.get('result', {}).get('wall', {}).get('docs', [])
                            begin_time_timestamp = int(time.time())  # group-buy start timestamp
                            # Normalise each live doc into the local schema.
                            item_list = [{
                                'goods_id': item.get('tradeItemId', ''),
                                'pintuan_time': {
                                    'begin_time': timestamp_to_regulartime(
                                        timestamp=begin_time_timestamp),
                                    'end_time': timestamp_to_regulartime(
                                        self.get_pintuan_end_time(
                                            begin_time_timestamp, item.get('leftTimeOrg', ''))),
                                },
                                'all_sell_count': str(item.get('salesVolume', 0)),
                            } for item in tmp_item_list]
                            pintuan_goods_all_goods_id = [
                                item_1.get('goods_id', '') for item_1 in item_list
                            ]
                            # "Internally delisted" goods are in fact still on
                            # sale, so update their data but not the shelf
                            # times (translated from the original note).
                            if goods_id not in pintuan_goods_all_goods_id:
                                mogujie_pintuan.get_goods_data(goods_id=goods_id)
                                goods_data = mogujie_pintuan.deal_with_data()
                                if goods_data == {}:
                                    pass
                                else:
                                    # Normalise before writing back.
                                    print('+++ 内部下架,其实还在售卖的商品更新')
                                    goods_data['goods_id'] = goods_id
                                    goods_data['price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                        goods_data['price_info_list'])
                                    mogujie_pintuan.update_mogujie_pintuan_table_2(
                                        data=goods_data, pipeline=sql_cli)
                                    sleep(MOGUJIE_SLEEP_TIME)  # slow down
                            else:
                                # Still listed: merge with its live entry.
                                for item_2 in item_list:
                                    if item_2.get('goods_id', '') == goods_id:
                                        mogujie_pintuan.get_goods_data(goods_id=goods_id)
                                        goods_data = mogujie_pintuan.deal_with_data()
                                        if goods_data == {}:
                                            pass
                                        else:
                                            # Normalise before writing back.
                                            goods_data['goods_id'] = goods_id
                                            goods_data['price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                                goods_data['price_info_list'])
                                            goods_data['pintuan_time'] = item_2.get('pintuan_time', {})
                                            goods_data['pintuan_begin_time'], goods_data[
                                                'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                    miaosha_time=goods_data['pintuan_time'])
                                            goods_data['all_sell_count'] = item_2.get('all_sell_count', '')
                                            mogujie_pintuan.update_mogujie_pintuan_table(
                                                data=goods_data, pipeline=sql_cli)
                                            sleep(MOGUJIE_SLEEP_TIME)  # slow down
                                    else:
                                        pass
            else:
                print('数据库连接失败,此处跳过!')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))
    # No updating right after midnight: pause ~5.5h, else 10 min.
    if get_shanghai_time().hour == 0:
        sleep(60 * 60 * 5.5)
    else:
        sleep(10 * 60)
    gc.collect()
def run_forever():
    """
    Endless real-time update loop for zhe800 (折800) group-buy goods.

    Each pass: purge rows via z8_delete_str_1, select the rows to refresh,
    then re-crawl each goods_id -- missing product pages and expired rows
    are logically deleted, the rest are updated in place.
    """
    while True:
        #### real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=z8_delete_str_1)
            result = list(sql_cli._select_table(sql_str=z8_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result)
            index = 1
            for item in result:
                goods_id = item[0]
                db_is_delete = item[1]
                # Parser recreated per item (deleted at loop end) to keep
                # memory bounded.
                zhe_800_pintuan = Zhe800PintuanParse()
                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    remainder=50,
                )
                if index % 300 == 0:
                    # Every 300 items: rest for 3 minutes.
                    sleep_time = 3 * 60
                    sleep(sleep_time)
                    print('休眠{}s中...'.format(sleep_time))
                if sql_cli.is_connect_success:
                    tmp_tmp = zhe_800_pintuan.get_goods_data(goods_id=goods_id)  # return value only used for the missing-page probe below
                    try:
                        # Special-case a missing product page: presumably the
                        # fetch returns an error string starting with 'ze'
                        # in that case -- TODO confirm against the parser.
                        if isinstance(tmp_tmp, str) and re.compile(r'^ze').findall(tmp_tmp) != []:
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=z8_update_str_4,
                                sql_cli=sql_cli,
                            )
                            sleep(ZHE_800_PINTUAN_SLEEP_TIME)
                            continue
                        else:
                            pass
                    except:
                        pass
                    data = zhe_800_pintuan.deal_with_data()
                    if data != {}:
                        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (goods_id, index))
                        data['goods_id'] = goods_id
                        if db_is_delete == 1:
                            # Row already flagged deleted/expired: logical delete.
                            print('该goods_id[{0}]已过期!'.format(goods_id))
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=z8_update_str_4,
                                sql_cli=sql_cli,
                            )
                        else:
                            zhe_800_pintuan.to_right_and_update_data(data=data, pipeline=sql_cli)
                    else:
                        # Empty parse result: nothing to write.
                        pass
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                try:
                    del zhe_800_pintuan
                except:
                    pass
                collect()
                sleep(ZHE_800_PINTUAN_SLEEP_TIME)
            print('全部数据更新完毕'.center(100, '#'))
        # No updating right after midnight: pause ~5.5h, else 10 min.
        if get_shanghai_time().hour == 0:
            sleep(60 * 60 * 5.5)
        else:
            sleep(10 * 60)
        collect()
def run_forever(self):
    """
    Single-pass real-time update for mogujie (蘑菇街) flash-sale goods.

    For every stored row: expired entries are logically deleted; in-window
    entries are checked against the live event listing and either updated
    (price recomputed as the highest normal_price) or logically deleted.

    :return: None
    """
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        tmp_sql_server._delete_table(sql_str=mg_delete_str_4)
        sleep(5)
        result = list(
            tmp_sql_server._select_table(sql_str=mg_select_str_3))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        _block_print_db_old_data(result=result)
        index = 1
        for item in result:
            goods_id = item[0]
            # Stored flash-sale end time -> 10-digit unix timestamp.
            miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
            miaosha_end_time = int(
                str(time.mktime(time.strptime(miaosha_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            data = {}
            # Parser recreated per item to keep memory bounded.
            mogujie_miaosha = MoGuJieMiaoShaParse()
            if index % 50 == 0:
                # Reconnect every 50 items to avoid a stale long-lived
                # connection erroring out.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(miaosha_end_time) == 0:
                    # Expired: logical delete via mg_update_str_1.
                    tmp_sql_server._update_table(sql_str=mg_update_str_1,
                                                 params=(goods_id, ))
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 限时秒杀开始时间为(%s), 删除成功!' % json.loads(item[1]).get('miaosha_begin_time'))
                    sleep(.5)
                elif self.is_recent_time(miaosha_end_time) == 2:
                    # Not in the update window.  Must be `pass`, not `break`:
                    # rows do not come back ordered by time.
                    pass
                else:
                    # is_recent_time(...) == 1: inside the update window.
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                    data['goods_id'] = item[0]
                    item_list = self.get_item_list(event_time=str(item[2]))
                    if item_list == '':
                        # Likely a transient network failure: skip for now.
                        pass
                    elif item_list == []:
                        # Event has no items any more: logical delete.
                        print('该商品已被下架限时秒杀活动,此处将其逻辑删除')
                        tmp_sql_server._update_table(
                            sql_str=mg_update_str_1, params=(item[0], ))
                        print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                        sleep(.4)  # avoid deadlock
                    else:
                        # All goods_ids currently in this event.
                        miaosha_goods_all_goods_id = [
                            item_1.get('iid', '') for item_1 in item_list
                        ]
                        if item[0] not in miaosha_goods_all_goods_id:
                            # Removed from the event: logical delete.
                            print('该商品已被下架限时秒杀活动,此处将其逻辑删除')
                            tmp_sql_server._update_table(
                                sql_str=mg_update_str_1, params=(item[0], ))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                            sleep(.4)
                        else:
                            # Still listed: merge with its live entry.
                            for item_2 in item_list:
                                if item_2.get('iid', '') == item[0]:
                                    spider_url = item[3]
                                    mogujie_miaosha.get_goods_data(goods_id=spider_url)
                                    goods_data = mogujie_miaosha.deal_with_data()
                                    if goods_data == {}:
                                        # Empty parse result: skip.
                                        pass
                                    else:
                                        goods_data['goods_id'] = str(item[0])
                                        # price is reset to the original price:
                                        # the highest normal_price in the sku list.
                                        try:
                                            tmp_price_list = sorted([
                                                round(float(item_4.get('normal_price', '')), 2)
                                                for item_4 in goods_data['price_info_list']
                                            ])
                                            price = Decimal(tmp_price_list[-1]).__round__(2)  # original price
                                            goods_data['price'] = price
                                        except:
                                            print('设置price为原价时出错!请检查')
                                            sleep(MOGUJIE_SLEEP_TIME)
                                            continue
                                        goods_data['miaosha_time'] = {
                                            'miaosha_begin_time': timestamp_to_regulartime(
                                                int(item_2.get('startTime', 0))),
                                            'miaosha_end_time': timestamp_to_regulartime(
                                                int(item_2.get('endTime', 0))),
                                        }
                                        goods_data['miaosha_begin_time'], goods_data[
                                            'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                miaosha_time=goods_data['miaosha_time'])
                                        mogujie_miaosha.update_mogujie_xianshimiaosha_table(
                                            data=goods_data, pipeline=tmp_sql_server)
                                        sleep(MOGUJIE_SLEEP_TIME)  # slow down
                                else:
                                    pass
            else:
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            collect()
        print('全部数据更新完毕'.center(100, '#'))
    # No updating right after midnight: pause ~5.5h, else 5s.
    if get_shanghai_time().hour == 0:
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    collect()