def get_one_mia_data(**kwargs):
    '''
    Crawl the data for one mia.com goods url.

    :param kwargs: supported keys:
        username: account the crawl is attributed to (default DEFAULT_USERNAME);
        wait_to_deal_with_url: the goods url to crawl;
        my_lg: logger instance.
    :return: dict -- the processed goods data on success;
        {'goods_id': ''} when no goods_id can be parsed from the url;
        {'goods_id': ..., 'msg': 'data为空!'} when the crawl fails.
    '''
    username = kwargs.get('username', DEFAULT_USERNAME)
    wait_to_deal_with_url = kwargs.get('wait_to_deal_with_url', '')
    my_lg = kwargs.get('my_lg')

    mi = MiaParse()
    # NOTE(review): the original comment claimed this returns a list, but the
    # code below compares it against '' -- presumably it is a str; confirm in
    # MiaParse.get_goods_id_from_url.
    goods_id = mi.get_goods_id_from_url(wait_to_deal_with_url)
    if goods_id == '':
        # Error case 1: no goods_id could be extracted from the url.
        my_lg.info('获取到的goods_id为空!')
        try:
            del mi  # reclaim the parser on every exit path
        except Exception:
            pass
        gc.collect()
        return {'goods_id': ''}

    tmp_result = mi.get_goods_data(goods_id=goods_id)
    # deal_with_data() returns the parsed data dict on success.
    data = mi.deal_with_data()
    if data == {} or tmp_result == {}:
        # Error case 2: the crawl failed (empty data).
        my_lg.error('获取到的data为空!出错地址: {0}'.format(wait_to_deal_with_url))
        # BUGFIX(consistency): the original used bare `except:` here and below,
        # which also swallows SystemExit/KeyboardInterrupt; narrowed to
        # Exception to match the first cleanup site.
        try:
            del mi
        except Exception:
            pass
        gc.collect()
        return {'goods_id': goods_id, 'msg': 'data为空!'}

    # Normalize to the canonical pc url before attaching the base info.
    wait_to_deal_with_url = 'https://www.mia.com/item-{}.html'.format(goods_id)
    wait_to_save_data = add_base_info_2_processed_data(
        data=data,
        spider_url=wait_to_deal_with_url,
        username=username,
        goods_id=goods_id)
    try:
        del mi
    except Exception:
        pass

    return wait_to_save_data
def deal_with_data(self, *param):
    '''
    Process and store the flash-sale (miaosha) goods of one promotion.

    :param param: positional parameters --
        param[0]: pid (promotion id);
        param[1]: begin time str, format '%Y/%m/%d %H:%M:%S';
        param[2]: end time str, format '%Y/%m/%d %H:%M:%S';
        param[3]: item_list, list of goods dicts of this promotion.
    :return: None
    '''
    pid = param[0]
    # Convert the time strings into unix timestamps.
    begin_time = int(time.mktime(time.strptime(param[1], '%Y/%m/%d %H:%M:%S')))
    end_time = int(time.mktime(time.strptime(param[2], '%Y/%m/%d %H:%M:%S')))
    item_list = param[3]

    mia = MiaParse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        # goods_id values already stored in the db, used to skip duplicates.
        db_goods_id_list = [item[0] for item in list(my_pipeline._select_table(sql_str=mia_select_str_4))]
        # print(db_goods_id_list)
        for item in item_list:
            if item.get('item_id', '') in db_goods_id_list:
                print('该goods_id已经存在于数据库中, 此处跳过')
            else:
                goods_id = str(item.get('item_id', ''))
                # (removed unused local `tmp_url` from the original)
                mia.get_goods_data(goods_id=str(goods_id))
                goods_data = mia.deal_with_data()
                if goods_data == {}:
                    # Empty data -> crawl failed; skip this goods.
                    pass
                else:
                    # Parse and insert.
                    goods_url = goods_data['goods_url']
                    # BUGFIX: the original tested `findall(...) != ''`, which is
                    # always True because findall returns a list (never equal to
                    # ''), so EVERY goods was rewritten to the miyabaobei.hk
                    # domain. Test the match list's truthiness instead.
                    if re.compile(r'://m.miyabaobei.hk/').findall(goods_url):
                        goods_url = 'https://www.miyabaobei.hk/item-' + str(goods_id) + '.html'
                    else:
                        goods_url = 'https://www.mia.com/item-' + str(goods_id) + '.html'
                    goods_data['goods_url'] = goods_url
                    goods_data['goods_id'] = str(goods_id)
                    goods_data['price'] = item.get('active_price')
                    goods_data['taobao_price'] = item.get('active_price')  # flash-sale lowest price
                    goods_data['sub_title'] = item.get('short_info', '')
                    goods_data['miaosha_time'] = {
                        'miaosha_begin_time': timestamp_to_regulartime(begin_time),
                        'miaosha_end_time': timestamp_to_regulartime(end_time),
                    }
                    goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                        get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['miaosha_time'])
                    goods_data['pid'] = str(pid)
                    # pprint(goods_data)
                    mia.insert_into_mia_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                    sleep(MIA_SPIKE_SLEEP_TIME)  # throttle the crawl
    else:
        print('数据库连接失败,此处跳过!')

    try:
        del mia
    except Exception:  # narrowed from bare except
        pass
    gc.collect()
def run_forever():
    '''
    Real-time update loop for mia goods already stored in the db.

    Runs forever: each pass re-reads the goods list from the db, re-crawls
    every goods, updates changed rows (or marks delisted goods), then sleeps
    (longer after midnight Shanghai time).
    :return: None (never returns)
    '''
    while True:
        # NOTE: the logger must be created inside the loop, otherwise every
        # pass would keep appending to the same log file.
        my_lg = set_logger(
            logger_name=get_uuid1(),
            log_file_name=MY_SPIDER_LOGS_PATH + '/蜜芽/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR,
        )

        # Real-time data refresh.
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=mia_select_str_5))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            mia = MiaParse()
            for item in result:
                goods_id = item[1]
                if index % 5 == 0:
                    # Recreate the parser periodically to release resources.
                    try:
                        del mia
                    except Exception:  # narrowed from bare except
                        pass
                    mia = MiaParse()
                    collect()

                sql_cli = _block_get_new_db_conn(db_obj=sql_cli, index=index, logger=my_lg, remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (str(goods_id), str(index)))
                    mia.get_goods_data(goods_id=goods_id)
                    data = mia.deal_with_data()
                    db_goods_info_obj = MIADbGoodsInfoObj(item=item, logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:
                            # Goods has been delisted -- handle it separately.
                            my_lg.info('@@@ 该商品已下架...')
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,
                            )
                            # BUGFIX: the original `continue` skipped the
                            # `index += 1` at the bottom of the loop, so the
                            # parser-recreate (%5) and db-reconnect (%10)
                            # cadence stalled on delisted goods.
                            index += 1
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue
                        else:
                            data = get_goods_info_change_data(
                                target_short_name='mia',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,
                            )
                            mia._to_right_and_update_data(data, pipeline=sql_cli)
                    else:
                        # Empty data returned -- back off a bit.
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)
                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)

                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # After midnight: no updates for ~5.5h.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        try:
            del my_lg
        except Exception:  # narrowed from bare except
            pass
        collect()
class MIUpdater(AsyncCrawler):
    '''
    Async real-time updater for mia.com flash-sale (miaosha) goods:
    reads rows from the db, re-crawls each goods concurrently, updates
    or delists rows as the promotion windows expire.
    '''
    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/蜜芽/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        self.delete_sql_str = mia_delete_str_3
        self.concurrency = 8  # number of goods updated per task batch
        self.tmp_sql_server = None
        self.goods_index = 1

    async def _get_pc_headers(self) -> dict:
        '''Return randomized request headers pinned to the m.mia.com host.'''
        headers = await async_get_random_headers(
            upgrade_insecure_requests=False,
        )
        headers.update({
            'Host': 'm.mia.com',
        })
        return headers

    async def _get_db_old_data(self):
        '''
        Fetch the current flash-sale rows from the db (after running the
        cleanup delete); returns the row list, or None when the db
        connection failed.
        '''
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            self.tmp_sql_server._delete_table(sql_str=mia_delete_str_4)
            await async_sleep(5)
            result = list(
                self.tmp_sql_server._select_table(sql_str=mia_select_str_3))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        await _print_db_old_data(logger=self.lg, result=result)
        return result

    async def _get_miaosha_end_time(self, miaosha_time):
        '''Extract miaosha_end_time from a json str and return it as a unix timestamp (int, seconds).'''
        miaosha_end_time = json.loads(miaosha_time).get('miaosha_end_time')
        miaosha_end_time = int(
            str(
                time.mktime(
                    time.strptime(miaosha_end_time,
                                  '%Y-%m-%d %H:%M:%S')))[0:10])
        return miaosha_end_time

    async def _get_new_mia_obj(self, index):
        '''Recreate the MiaParse instance every 10th goods -- the driver
        misbehaves if a single object is shared for too long.'''
        if index % 10 == 0:
            try:
                del self.mia_miaosha
            except:
                pass
            collect()
            self.mia_miaosha = MiaParse()

    async def _update_one_goods_info(self, item, index) -> tuple:
        '''
        Update a single goods.
        :param item: db row -- (goods_id, miaosha_time, pid)
        :param index: running index, drives parser/db-conn refresh cadence
        :return: (goods_id, res) where res is True when the update/delist succeeded
        '''
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        pid = item[2]
        miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
            miaosha_time=miaosha_time,
            logger=self.lg,
        )
        await self._get_new_mia_obj(index)
        self.tmp_sql_server = await _get_new_db_conn(
            db_obj=self.tmp_sql_server,
            index=index,
            logger=self.lg,
            remainder=30,
        )
        if self.tmp_sql_server.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_end_time)
            if is_recent_time == 0:
                # Expired beyond the grace window -> delist the goods.
                res = _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=self.lg,
                    update_sql_str=mia_update_str_6,
                    sql_cli=self.tmp_sql_server,
                )
                self.lg.info('过期的goods_id为({}), 限时秒杀开始时间为({}), 删除成功!'.format(
                    goods_id,
                    timestamp_to_regulartime(miaosha_begin_time)))
                await async_sleep(.5)
                self.goods_index = index + 1
                return goods_id, res

            elif is_recent_time == 2:
                # Future / within the grace window; delist only if already past end.
                if datetime_to_timestamp(
                        get_shanghai_time()) > miaosha_end_time:
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=mia_update_str_6,
                        sql_cli=self.tmp_sql_server,
                    )
                    self.lg.info(
                        '过期的goods_id为({}), 限时秒杀开始时间为({}), 删除成功!'.format(
                            goods_id,
                            timestamp_to_regulartime(miaosha_begin_time)))
                else:
                    pass
                self.goods_index = index + 1
                return goods_id, res

            else:
                # Returned 1: inside the to-update window.
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                    format(goods_id, index))
                tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(
                    pid)
                body = Requests.get_url_body(url=tmp_url,
                                             headers=self.headers,
                                             had_referer=True,
                                             ip_pool_type=self.ip_pool_type)
                # print(body)
                body = '' if body == '' or body == '[]' else body
                try:
                    tmp_data = json_2_dict(
                        json_str=body,
                        default_res={},
                        logger=self.lg,
                    )
                    assert tmp_data != {}, 'tmp_data为空dict!'
                except AssertionError:
                    self.lg.error('遇到错误:', exc_info=True)
                    self.goods_index = index + 1
                    await async_sleep(.3)
                    return goods_id, res

                # All goods_ids currently in this pid's promotion.
                item_list = tmp_data.get('item_list', [])
                miaosha_goods_all_goods_id = [
                    item_1.get('item_id', '') for item_1 in item_list
                ]
                # self.lg.info(str(miaosha_goods_all_goods_id))
                if goods_id not in miaosha_goods_all_goods_id:
                    # Goods was removed from the flash-sale internally -> delist.
                    self.lg.info('该商品已被下架限时秒杀活动,此处将其删除')
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=mia_update_str_6,
                        sql_cli=self.tmp_sql_server,
                    )
                    self.lg.info('下架的goods_id为({}), 删除成功!'.format(goods_id))
                    self.goods_index = index + 1
                    await async_sleep(.3)
                    return goods_id, res
                else:
                    # Still listed -> refresh its data.
                    res = await self._one_update(
                        item_list=item_list,
                        goods_id=goods_id,
                        tmp_data=tmp_data,
                    )
        else:
            # Db connection failed (possibly closed or under maintenance).
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
            await async_sleep(MIA_SPIKE_SLEEP_TIME)  # throttle

        self.goods_index = index + 1
        collect()
        return goods_id, res

    async def _update_db(self) -> None:
        '''
        Flash-sale real-time update main loop: batches the db rows into
        groups of self.concurrency and updates each batch concurrently.
        Runs forever.
        '''
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result, step=self.concurrency)
                self.mia_miaosha = MiaParse()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:
                        # All batches consumed -- normal exit.
                        break
                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(item=item,
                                                            index=index)))
                        index += 1
                    await _get_async_task_result(tasks=tasks, logger=self.lg)
                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                # After midnight: no updates for ~5.5h.
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(2.5 * 60)
            try:
                del self.mia_miaosha
            except:
                pass
            collect()

    async def _one_update(self, **kwargs) -> bool:
        '''
        Refresh a goods that is still listed in the flash-sale.
        :param kwargs: item_list (promotion goods list), goods_id, tmp_data (promotion json dict)
        :return: True when the db update succeeded
        '''
        res = False
        item_list = kwargs.get('item_list')
        goods_id = kwargs.get('goods_id')
        tmp_data = kwargs.get('tmp_data')

        begin_time, end_time = await self._get_begin_time_and_end_time(tmp_data)
        for item_2 in item_list:
            if item_2.get('item_id', '') == goods_id:
                self.mia_miaosha.get_goods_data(goods_id=goods_id)
                goods_data = self.mia_miaosha.deal_with_data()
                if goods_data == {}:
                    # Empty data -> crawl failed, skip.
                    pass
                else:
                    goods_data['goods_id'] = str(goods_id)
                    goods_data['price'] = item_2.get('active_price')
                    goods_data['taobao_price'] = item_2.get('active_price')
                    goods_data['sub_title'] = item_2.get('short_info', '')
                    goods_data['miaosha_time'] = {
                        'miaosha_begin_time': timestamp_to_regulartime(begin_time),
                        'miaosha_end_time': timestamp_to_regulartime(end_time),
                    }
                    goods_data['miaosha_begin_time'], goods_data[
                        'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                            miaosha_time=goods_data['miaosha_time'])
                    res = self.mia_miaosha.update_mia_xianshimiaosha_table(
                        data=goods_data, pipeline=self.tmp_sql_server)
                break
            else:
                pass
        return res

    async def _get_begin_time_and_end_time(self, tmp_data) -> tuple:
        '''Convert the promotion's start/end time strings into unix timestamps.'''
        begin_time = tmp_data.get('p_info', {}).get('start_time', '')
        end_time = tmp_data.get('p_info', {}).get('end_time', '')
        begin_time = int(
            time.mktime(time.strptime(begin_time, '%Y/%m/%d %H:%M:%S')))
        end_time = int(
            time.mktime(time.strptime(end_time, '%Y/%m/%d %H:%M:%S')))
        return begin_time, end_time

    async def _is_recent_time(self, timestamp) -> int:
        '''
        Classify a flash-sale end timestamp relative to now.
        :param timestamp: unix timestamp (seconds)
        :return: 0: expired past the grace window (restore original price)
                 1: inside the to-update window
                 2: future, or expired but still within the grace window
        '''
        time_1 = int(timestamp)
        time_2 = datetime_to_timestamp(get_shanghai_time())  # current timestamp
        diff_time = time_1 - time_2
        if diff_time < -86400:
            # 24h grace window so the backend can sync the delisting.
            return 0
        elif diff_time > 0:
            return 1
        else:
            # Expired but within the grace window: do not delete yet
            # (deleted once it is >= 24h past the end time).
            return 2

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        try:
            del self.mia_miaosha
        except:
            pass
        collect()
def run_forever(self):
    '''
    Real-time update loop (legacy, synchronous variant): reads all
    flash-sale rows for site_id=20, and for each goods either deletes
    expired rows, skips future ones, or re-crawls and updates the rest.
    :return: None
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    sql_str = r'select goods_id, miaosha_time, pid from dbo.mia_xianshimiaosha where site_id=20'
    try:
        result = list(tmp_sql_server._select_table(sql_str=sql_str))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        for item in result:
            # item = (goods_id, miaosha_time json str, pid)
            miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
            miaosha_end_time = int(
                str(
                    time.mktime(
                        time.strptime(miaosha_end_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_end_time)
            # Declared inside the loop then released, to keep memory low.
            data = {}
            mia_miaosha = MiaParse()
            if index % 50 == 0:
                # Reconnect every 50 goods to avoid a stale long-lived connection.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(miaosha_end_time) == 0:
                    # Expired past the grace window -> delete the row.
                    tmp_sql_server._delete_table(
                        sql_str=self.delete_sql_str, params=(item[0]))
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 限时秒杀开始时间为(%s), 删除成功!' %
                        json.loads(item[1]).get('miaosha_begin_time'))
                elif self.is_recent_time(miaosha_end_time) == 2:
                    # Must be `pass`, not `break`: the goods_ids from the db
                    # are not guaranteed to be in order.
                    pass
                else:
                    # Returned 1: inside the to-update window.
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))
                    data['goods_id'] = item[0]
                    # print('------>>>| 爬取到的数据为: ', data)
                    tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(
                        item[2])
                    body = MyRequests.get_url_body(url=tmp_url,
                                                   headers=self.headers,
                                                   had_referer=True)
                    # print(body)
                    if body == '' or body == '[]':
                        print('获取到的body为空值! 此处跳过')
                    else:
                        try:
                            tmp_data = json.loads(body)
                        except:
                            tmp_data = {}
                            print('json.loads转换body时出错, 此处跳过!')
                        begin_time = tmp_data.get('p_info', {}).get(
                            'start_time', '')
                        end_time = tmp_data.get('p_info', {}).get('end_time', '')
                        # Convert the time strings into unix timestamps.
                        # NOTE(review): when tmp_data == {} these are '' and
                        # strptime raises ValueError -- presumably tolerated
                        # by an outer caller; confirm.
                        begin_time = int(
                            time.mktime(
                                time.strptime(begin_time, '%Y/%m/%d %H:%M:%S')))
                        end_time = int(
                            time.mktime(
                                time.strptime(end_time, '%Y/%m/%d %H:%M:%S')))
                        # All goods_ids currently in this pid's promotion.
                        item_list = tmp_data.get('item_list', [])
                        miaosha_goods_all_goods_id = [
                            item_1.get('item_id', '') for item_1 in item_list
                        ]
                        if item[0] not in miaosha_goods_all_goods_id:
                            # Removed from the flash-sale internally -> delete.
                            print('该商品已被下架限时秒杀活动,此处将其删除')
                            tmp_sql_server._delete_table(
                                sql_str=self.delete_sql_str, params=(item[0]))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                            pass
                        else:
                            # Still listed -> refresh its data.
                            for item_2 in item_list:
                                if item_2.get('item_id', '') == item[0]:
                                    mia_miaosha.get_goods_data(
                                        goods_id=item[0])
                                    goods_data = mia_miaosha.deal_with_data()
                                    if goods_data == {}:
                                        # Empty data -> crawl failed, skip.
                                        pass
                                    else:
                                        goods_data['goods_id'] = str(item[0])
                                        goods_data['price'] = item_2.get(
                                            'active_price')
                                        goods_data['taobao_price'] = item_2.get(
                                            'active_price')
                                        goods_data['sub_title'] = item_2.get(
                                            'short_info', '')
                                        goods_data['miaosha_time'] = {
                                            'miaosha_begin_time':
                                            timestamp_to_regulartime(begin_time),
                                            'miaosha_end_time':
                                            timestamp_to_regulartime(end_time),
                                        }
                                        goods_data['miaosha_begin_time'], goods_data[
                                            'miaosha_end_time'] = self.get_miaosha_begin_time_and_miaosha_end_time(
                                                miaosha_time=goods_data['miaosha_time'])
                                        # pprint(goods_data)
                                        mia_miaosha.update_mia_xianshimiaosha_table(
                                            data=goods_data,
                                            pipeline=tmp_sql_server)
                                    sleep(MIA_SPIKE_SLEEP_TIME)  # throttle
                                else:
                                    pass
            else:
                # Db connection failed (possibly closed or under maintenance).
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:
        # After midnight: no updates for ~5.5h.
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()