def _just_run(self):
    """Main crawl loop: for every keyword row in the db, collect goods ids
    from each enabled e-commerce api and hand them to the processing step.

    Runs forever; each pass re-reads the keyword table and the goods ids
    already present in the db.
    NOTE(review): indentation reconstructed from a collapsed paste —
    confirm the block structure against the original file.
    """
    while True:
        # Fetch all goods_id rows already present in goods_db.
        try:
            result = list(
                self.my_pipeline._select_table(sql_str=kw_select_str_1))
            self.lg.info('正在获取db中已存在的goods_id...')
            result_2 = list(
                self.my_pipeline._select_table(sql_str=kw_select_str_2))
            self.lg.info('db中已存在的goods_id获取成功!')
        except TypeError:
            # _select_table returned a non-iterable: db down / maintenance.
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
            result_2 = None
        if result is not None and result_2 is not None:
            self.lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            self.lg.info(str(result))
            self.lg.info(
                '--------------------------------------------------------')
            self.lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            self.add_goods_index = 0  # counts added goods; drives db reconnects
            self.db_existed_goods_id_list = [item[0] for item in result_2]
            # Release result_2 promptly — it can be large.
            try:
                del result_2
            except:
                pass
            collect()
            for item in result:
                # item: (keyword_id, keyword). Finish every enabled api for
                # one keyword before moving to the next keyword.
                self.lg.info('正在处理id为{0}, 关键字为 {1} ...'.format(
                    item[0], item[1]))
                for type, type_value in self.debugging_api.items(
                ):  # iterate over the e-commerce apis to crawl
                    if type_value is False:
                        self.lg.info('api为False, 跳过!')
                        continue
                    # Periodically refresh the db connection.
                    self.my_pipeline = _block_get_new_db_conn(
                        db_obj=self.my_pipeline,
                        index=self.add_goods_index,
                        logger=self.lg,
                        remainder=20,
                    )
                    goods_id_list = self._get_keywords_goods_id_list(
                        type=type, keyword=item)
                    self.lg.info(
                        '关键字为{0}, 获取到的goods_id_list 如下: {1}'.format(
                            item[1], str(goods_id_list)))
                    '''处理goods_id_list'''
                    self._deal_with_goods_id_list(
                        type=type,
                        goods_id_list=goods_id_list,
                        keyword_id=item[0])
                    sleep(3)
def run_forever():
    """Realtime updater for mia (蜜芽) goods: loops forever, re-reading the
    goods table and refreshing each row via MiaParse; off-shelf goods are
    logically deleted.
    NOTE(review): indentation reconstructed from a collapsed paste —
    confirm the block structure against the original file.
    """
    while True:
        # A fresh logger per pass — a module-level logger would keep
        # appending to the same dated file across days.
        my_lg = set_logger(
            logger_name=get_uuid1(),
            log_file_name=MY_SPIDER_LOGS_PATH + '/蜜芽/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR,
        )
        #### Realtime update pass.
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=mia_select_str_5))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            mia = MiaParse()
            for item in result:
                goods_id = item[1]
                # Recreate the parser every 5 goods to release memory.
                if index % 5 == 0:
                    try:
                        del mia
                    except:
                        pass
                    mia = MiaParse()
                    collect()
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 logger=my_lg,
                                                 remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(goods_id), str(index)))
                    mia.get_goods_data(goods_id=goods_id)
                    data = mia.deal_with_data()
                    db_goods_info_obj = MIADbGoodsInfoObj(item=item, logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:
                            # Goods went off-shelf: mark it and skip the rest.
                            my_lg.info('@@@ 该商品已下架...')
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue
                        else:
                            data = get_goods_info_change_data(
                                target_short_name='mia',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,
                            )
                            mia._to_right_and_update_data(data, pipeline=sql_cli)
                    else:
                        # Parser returned empty data — back off briefly.
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)
                else:
                    # db connection failed (possibly down for maintenance).
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:
            # After midnight: long sleep, no updates.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        try:
            del my_lg
        except:
            pass
        collect()
def run_forever(self):
    '''
    实时更新数据 (realtime update of mia group-buy goods).

    One pass: read all pintuan rows, logically delete expired ones, refresh
    the rest via the pintuan page api, then sleep a long interval.
    NOTE(review): indentation reconstructed from a collapsed paste —
    confirm the block structure against the original file.
    :return:
    '''
    result = self._get_db_old_data()
    if result is None:
        sleep_time = 20
        print('获取db数据失败, 休眠{}s ...'.format(sleep_time))
        sleep(sleep_time)
        return None
    index = 1
    for item in result:
        # item: (goods_id, pintuan_time_json, pid/page_num) — presumed from
        # usage below; verify against the SELECT statement.
        goods_id = item[0]
        pid = item[2]
        # e.g. '2020-04-12 00:00:00'
        pintuan_end_time = json_2_dict(item[1]).get('end_time')
        pintuan_end_time = datetime_to_timestamp(
            string_to_datetime(pintuan_end_time))
        # print(pintuan_end_time)
        data = {}
        self.sql_cli = _block_get_new_db_conn(db_obj=self.sql_cli,
                                              index=index,
                                              remainder=50)
        if self.sql_cli.is_connect_success:
            is_recent_time = self.is_recent_time(pintuan_end_time)
            if is_recent_time == 0:
                # Back at normal price: logically delete the row.
                _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    update_sql_str=mia_update_str_7,
                    sql_cli=self.sql_cli)
                print('该goods拼团开始时间为({})'.format(
                    json.loads(item[1]).get('begin_time')))
                sleep(.4)
            elif is_recent_time == 2:
                # Expired but still inside the waiting window: no delete yet
                # (deleted only once it is <= 24h old).
                pass
            else:
                # is_recent_time == 1: inside the update window.
                print(
                    '------>>>| 正在更新的goods_id为({}) | --------->>>@ 索引值为({})'
                    .format(goods_id, index))
                data['goods_id'] = goods_id
                try:
                    data_list = get_mia_pintuan_one_page_api_goods_info(
                        page_num=pid)
                except ResponseBodyIsNullStrException:
                    index += 1
                    sleep(.4)
                    continue
                # TODO an empty data_list used to trigger an off-shelf update,
                # which wrongly delisted goods still on sale — check disabled.
                # try:
                #     assert data_list != [], 'data_list不为空list!'
                # except AssertionError as e:
                #     print(e)
                #     _handle_goods_shelves_in_auto_goods_table(
                #         goods_id=goods_id,
                #         update_sql_str=mia_update_str_7,
                #         sql_cli=self.sql_cli)
                #     sleep(.4)
                #     index += 1
                #     continue
                pintuan_goods_all_goods_id = [
                    item_1.get('goods_id', '')
                    for item_1 in data_list
                ]
                # print(pintuan_goods_all_goods_id)
                '''
                蜜芽拼团不对内部下架的进行操作,一律都更新未过期商品
                (根据pid来进行更新多次研究发现出现商品还在拼团,误删的情况很普遍)
                '''
                mia_pt = MiaPintuanParse(is_real_times_update_call=True)
                if goods_id not in pintuan_goods_all_goods_id:
                    # Delisted inside the api page — still refresh it anyway.
                    try:
                        goods_data = self._get_mia_pt_one_goods_info(
                            mia_pt_obj=mia_pt,
                            goods_id=goods_id,
                        )
                    except AssertionError:
                        # Empty data returned: skip this goods.
                        index += 1
                        continue
                    # pprint(goods_data)
                    mia_pt.update_mia_pintuan_table(data=goods_data,
                                                    pipeline=self.sql_cli)
                    sleep(MIA_SPIKE_SLEEP_TIME)  # throttle
                else:
                    # Still listed: locate its api entry for the sub_title.
                    for item_2 in data_list:
                        if item_2.get('goods_id', '') == goods_id:
                            sub_title = item_2.get('sub_title', '')
                            try:
                                goods_data = self._get_mia_pt_one_goods_info(
                                    mia_pt_obj=mia_pt,
                                    goods_id=goods_id,
                                    sub_title=sub_title,
                                )
                            except AssertionError:
                                # Empty data returned: skip.
                                continue
                            # pprint(goods_data)
                            mia_pt.update_mia_pintuan_table(
                                data=goods_data, pipeline=self.sql_cli)
                            sleep(MIA_SPIKE_SLEEP_TIME)  # throttle
                        else:
                            pass
                try:
                    del mia_pt
                except:
                    pass
        else:
            # db connection failed (possibly down for maintenance).
            print('数据库连接失败,数据库可能关闭或者维护中')
            pass
        index += 1
        collect()
    print('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:
        # After midnight: long sleep, no updates.
        sleep(60 * 60 * 5.5)
    else:
        sleep(10 * 60)
    collect()
def run_forever():
    """Realtime updater for juanpi (卷皮) group-buy goods: loops forever,
    logically deleting expired/sold-out rows and refreshing the rest.
    NOTE(review): indentation reconstructed from a collapsed paste —
    confirm the block structure against the original file.
    """
    while True:
        #### Realtime update pass.
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=jp_delete_str_1)
            result = list(sql_cli._select_table(sql_str=jp_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result)
            index = 1
            # Declared here (and recreated every 6 goods) to limit memory use.
            juanpi_pintuan = JuanPiParse()
            for item in result:
                goods_id = item[0]
                if index % 6 == 0:
                    try:
                        del juanpi_pintuan
                    except:
                        pass
                    gc.collect()
                    juanpi_pintuan = JuanPiParse()
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 remainder=50)
                if sql_cli.is_connect_success:
                    try:
                        pintuan_end_time = json.loads(
                            item[1])[0].get('end_time')
                    except IndexError:
                        # No pintuan time entry: logically delete the row.
                        print('获取pintuan_end_time时索引异常!出错goods_id:{0}'.format(
                            goods_id))
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            sql_cli=sql_cli,
                            update_sql_str=jp_update_str_7,
                        )
                        continue
                    pintuan_end_time = int(
                        str(
                            time.mktime(
                                time.strptime(pintuan_end_time,
                                              '%Y-%m-%d %H:%M:%S')))[0:10])
                    # print(pintuan_end_time)
                    if item[2] == 1 or pintuan_end_time < int(
                            datetime_to_timestamp(get_shanghai_time())):
                        # Flagged sold-out/off-shelf, or sale already over.
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            sql_cli=sql_cli,
                            update_sql_str=jp_update_str_7,
                        )
                        print('该goods_id[{0}]已过期或者售完,逻辑删除成功!'.format(goods_id))
                    else:
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))
                        juanpi_pintuan.get_goods_data(goods_id=goods_id)
                        data = juanpi_pintuan.deal_with_data()
                        if data == {}:
                            # NOTE(review): this `continue` skips the shared
                            # index += 1 / sleep below — confirm intentional.
                            continue
                        data['goods_id'] = goods_id
                        juanpi_pintuan.to_right_and_update_pintuan_data(
                            data=data, pipeline=sql_cli)
                else:
                    # db connection failed (possibly down for maintenance).
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
                sleep(1.2)
            print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:
            # After midnight: long sleep, no updates.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        gc.collect()
def run_forever():
    """Realtime updater for vip (唯品会) goods: refreshes each goods row and
    records price / spec / stock change history against the stored values.
    NOTE(review): indentation reconstructed from a collapsed paste —
    confirm the block structure against the original file.
    """
    while True:
        #### Realtime update pass.
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=vip_select_str_1))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            continue
        _block_print_db_old_data(result=result)
        index = 1
        for item in result:
            # A fresh parser per goods to limit memory use.
            vip = VipParse()
            sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                             index=index,
                                             remainder=50)
            if sql_cli.is_connect_success:
                print(
                    '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                    % (item[0], index))
                vip.get_goods_data(goods_id=[0, item[0]])
                data = vip.deal_with_data()
                if data != {}:
                    data['goods_id'] = item[0]
                    data['shelf_time'], data[
                        'delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            shelf_time=item[4],
                            delete_time=item[5])
                    price_info_list = old_sku_info = json_2_dict(
                        item[6], default_res=[])
                    try:
                        old_sku_info = format_price_info_list(
                            price_info_list=price_info_list, site_id=25)
                    except AttributeError:
                        # Stored value was already formatted — keep as-is.
                        pass
                    new_sku_info = format_price_info_list(
                        data['price_info_list'], site_id=25)
                    # Per-sku price change record.
                    data['_is_price_change'], data[
                        'sku_info_trans_time'], price_change_info = _get_sku_price_trans_record(
                            old_sku_info=old_sku_info,
                            new_sku_info=new_sku_info,
                            is_price_change=item[7]
                            if item[7] is not None else 0,
                            db_price_change_info=json_2_dict(item[9],
                                                             default_res=[]),
                            old_price_trans_time=item[12],
                        )
                    # Overall price change record.
                    data['_is_price_change'], data[
                        '_price_change_info'] = _get_price_change_info(
                            old_price=item[2],
                            old_taobao_price=item[3],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price'],
                            is_price_change=data['_is_price_change'],
                            price_change_info=price_change_info,
                        )
                    # Spec-only changes.
                    data['is_spec_change'], data[
                        'spec_trans_time'] = _get_spec_trans_record(
                            old_sku_info=old_sku_info,
                            new_sku_info=new_sku_info,
                            is_spec_change=item[8]
                            if item[8] is not None else 0,
                            old_spec_trans_time=item[13],
                        )
                    # Stock-only changes.
                    data['is_stock_change'], data['stock_trans_time'], data[
                        'stock_change_info'] = _get_stock_trans_record(
                            old_sku_info=old_sku_info,
                            new_sku_info=new_sku_info,
                            is_stock_change=item[10]
                            if item[10] is not None else 0,
                            db_stock_change_info=json_2_dict(item[11],
                                                             default_res=[]),
                            old_stock_trans_time=item[14],
                        )
                    vip.to_right_and_update_data(data=data, pipeline=sql_cli)
                else:
                    # Parser returned empty data — nothing to write.
                    pass
            else:
                # db connection failed (possibly down for maintenance).
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            try:
                del vip
            except:
                pass
            gc.collect()
            sleep(VIP_SLEEP_TIME)
        print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:
            # After midnight: long sleep, no updates.
            sleep(60 * 60 * 5.5)
        else:
            sleep(30)
        gc.collect()
def run_forever():
    """Realtime updater for kaola (网易考拉) goods.

    Loops forever: re-reads the goods table, refreshes each row via
    KaoLaParse, and logically deletes off-shelf goods. A fresh dated log
    file is created per pass.
    NOTE(review): indentation reconstructed from a collapsed paste —
    confirm the block structure against the original file.
    """
    while True:
        # A fresh logger per pass — a module-level logger would keep
        # appending to the same dated file across days.
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/网易考拉/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)
        #### Realtime update pass.
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=kl_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            # Declared here (and recreated every 5 goods) to limit memory use.
            kaola = KaoLaParse(logger=my_lg, is_real_times_update_call=True)
            for item in result:
                goods_id = item[1]
                if index % 5 == 0:
                    try:
                        del kaola
                    except:
                        pass
                    kaola = KaoLaParse(logger=my_lg,
                                       is_real_times_update_call=True)
                    collect()
                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    logger=my_lg,
                    remainder=10,
                )
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(goods_id), str(index)))
                    db_goods_info_obj = KLDbGoodsInfoObj(item=item, logger=my_lg)
                    data = kaola._get_goods_data(goods_id=goods_id)
                    if data.get('is_delete', 0) == 1:
                        # Off-shelf goods take a separate, minimal update path.
                        data['goods_id'] = goods_id
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=db_goods_info_obj.is_delete,
                                shelf_time=db_goods_info_obj.shelf_time,
                                delete_time=db_goods_info_obj.delete_time,
                            )
                        try:
                            kaola.to_right_and_update_data(data, pipeline=sql_cli)
                        except Exception:
                            # BUGFIX: Logger.error() requires a msg positional
                            # argument — the original `my_lg.error(exc_info=True)`
                            # would itself raise TypeError inside this handler.
                            my_lg.error('更新下架商品时出错:', exc_info=True)
                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        index += 1
                        collect()
                        continue
                    data = kaola._deal_with_data()
                    if data != {}:
                        if data.get('is_delete', 0) == 1:
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue
                        else:
                            data = get_goods_info_change_data(
                                target_short_name='kl',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,
                            )
                            kaola.to_right_and_update_data(data, pipeline=sql_cli)
                    else:
                        # Parser returned empty data — back off briefly.
                        my_lg.info('------>>>| 休眠3s中...')
                        sleep(3.)
                else:
                    # db connection failed (possibly down for maintenance).
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:
            # After midnight: long sleep, no updates.
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        collect()
def run_forever():
    """Realtime updater for yanxuan (网易严选) goods: loops forever, refreshing
    each goods row and marking off-shelf goods via yx_update_str_2.
    NOTE(review): indentation reconstructed from a collapsed paste —
    confirm the block structure against the original file.
    """
    while True:
        # A fresh logger per pass — a module-level logger would keep
        # appending to the same dated file across days.
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR,
        )
        #### Realtime update pass.
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=yx_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            # Declared here (and recreated every 5 goods) to limit memory use.
            yanxuan = YanXuanParse(logger=my_lg)
            for item in result:
                if index % 5 == 0:
                    try:
                        del yanxuan
                    except:
                        pass
                    yanxuan = YanXuanParse(logger=my_lg)
                    collect()
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 logger=my_lg,
                                                 remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(item[1]), str(index)))
                    yanxuan._get_goods_data(goods_id=item[1])
                    data = yanxuan._deal_with_data()
                    db_goods_info_obj = YXDbGoodsInfoObj(item=item, logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:
                            # Goods went off-shelf: mark it and skip the rest.
                            my_lg.info('@@@ 该商品已下架...')
                            sql_cli._update_table_2(
                                sql_str=yx_update_str_2,
                                params=(db_goods_info_obj.goods_id, ),
                                logger=my_lg,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue
                        else:
                            data = get_goods_info_change_data(
                                target_short_name='yx',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,
                            )
                            yanxuan.to_right_and_update_data(data, pipeline=sql_cli)
                    else:
                        # Parser returned empty data — back off briefly.
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)
                else:
                    # db connection failed (possibly down for maintenance).
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:
            # After midnight: long sleep, no updates.
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        collect()
def run_forever(self):
    '''
    Realtime update of pinduoduo flash-sale (秒杀) goods.

    Only goods whose sale window ends in the near future are updated;
    listings entirely in the future (still at full price) are left alone.
    Expired or delisted goods are removed from the table.
    NOTE(review): indentation reconstructed from a collapsed paste — in
    particular index/gc placement inside the connect branch mirrors the
    original text; confirm against the original file.
    :return:
    '''
    #### Realtime update pass.
    sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    try:
        result = list(sql_cli._select_table(sql_str=pd_select_str_2))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        _block_print_db_old_data(result=result)
        index = 1
        # Declared once out here to limit memory use.
        pinduoduo_miaosha = PinduoduoParse()
        all_miaosha_goods_list = self.get_all_miaosha_goods_list()
        # All goods ids currently listed by the flash-sale api.
        miaosha_goods_all_goods_id = [
            i.get('goods_id') for i in all_miaosha_goods_list
        ]
        # print(miaosha_goods_all_goods_id)
        for item in result:
            # Derive this goods' flash-sale end timestamp first.
            miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
            miaosha_end_time = int(
                str(
                    time.mktime(
                        time.strptime(miaosha_end_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_end_time)
            sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                             index=index,
                                             remainder=50)
            if sql_cli.is_connect_success:
                # Hoisted: the original evaluated is_recent_time() twice.
                recent_time_flag = self.is_recent_time(miaosha_end_time)
                if recent_time_flag == 0:
                    # Sale over: delete the row.
                    # BUGFIX: params must be a tuple — the original passed
                    # (item[0]) which is just the bare value.
                    sql_cli._delete_table(sql_str=self.delete_sql_str,
                                          params=(item[0],))
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 限时秒杀结束时间为(%s), 删除成功!'
                        % json.loads(item[1]).get('miaosha_end_time'))
                    sleep(.3)
                elif recent_time_flag == 2:
                    # Not in the update window yet: pass (NOT break — db rows
                    # are not ordered by goods_id).
                    pass
                else:
                    # recent_time_flag == 1: inside the update window.
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))
                    if item[0] not in miaosha_goods_all_goods_id:
                        # Delisted from the flash-sale api: remove the row.
                        # BUGFIX: tuple params (see above).
                        sql_cli._delete_table(sql_str=self.delete_sql_str,
                                              params=(item[0],))
                        print('该商品[goods_id为(%s)]已被下架限时秒杀活动,此处将其删除' % item[0])
                        sleep(.3)
                    else:
                        # Still listed: refresh from its api entry.
                        for item_1 in all_miaosha_goods_list:
                            if item_1.get('goods_id', '') == item[0]:
                                pinduoduo_miaosha.get_goods_data(
                                    goods_id=item[0])
                                goods_data = pinduoduo_miaosha.deal_with_data()
                                if goods_data == {}:
                                    # Empty data: skip this goods.
                                    pass
                                else:
                                    # Merge api-listing fields into the parsed data.
                                    goods_data['stock_info'] = item_1.get(
                                        'stock_info')
                                    goods_data['goods_id'] = item_1.get(
                                        'goods_id')
                                    if item_1.get('stock_info').get(
                                            'activity_stock') > 0:
                                        # Original (pre-sale) price.
                                        goods_data['price'] = item_1.get('price')
                                        # Flash-sale price.
                                        goods_data['taobao_price'] = item_1.get(
                                            'taobao_price')
                                    else:
                                        pass
                                    goods_data['sub_title'] = item_1.get(
                                        'sub_title', '')
                                    goods_data['miaosha_time'] = item_1.get(
                                        'miaosha_time')
                                    goods_data['miaosha_begin_time'], goods_data[
                                        'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                            miaosha_time=item_1.get(
                                                'miaosha_time'))
                                    if item_1.get('stock_info').get(
                                            'activity_stock') <= 1:
                                        # Live stock <= 1: mark sold out.
                                        print('该秒杀商品已售罄...')
                                        goods_data['is_delete'] = 1
                                    # print(goods_data)
                                    pinduoduo_miaosha.to_update_pinduoduo_xianshimiaosha_table(
                                        data=goods_data, pipeline=sql_cli)
                                    sleep(PINDUODUO_SLEEP_TIME)
                            else:
                                pass
                index += 1
                gc.collect()
            else:
                # db connection failed (possibly down for maintenance).
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
        print('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:
        # After midnight: long sleep, no updates.
        sleep(60 * 60 * 5.5)
    else:
        sleep(3 * 60)
    # del ali_1688
    gc.collect()
def run_forever(self):
    '''
    实时更新数据 (realtime update of mogujie group-buy goods).

    One pass: read all pintuan rows, logically delete expired ones, and for
    rows inside the update window re-fetch the mogujie search page via
    phantomjs to refresh pintuan times and sales counts.
    NOTE(review): indentation reconstructed from a collapsed paste —
    confirm the block structure against the original file.
    :return:
    '''
    sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    try:
        sql_cli._delete_table(sql_str=mg_delete_str_2)
        result = list(sql_cli._select_table(sql_str=mg_select_str_2))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        _block_print_db_old_data(result=result)
        index = 1
        self.my_phantomjs = BaseDriver(
            executable_path=PHANTOMJS_DRIVER_PATH,
            ip_pool_type=self.ip_pool_type)
        for item in result:
            goods_id = item[0]
            pintuan_end_time = json.loads(item[1]).get('end_time')
            pintuan_end_time = int(
                str(
                    time.mktime(
                        time.strptime(pintuan_end_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_end_time)
            data = {}
            mogujie_pintuan = MoGuJieParse()
            # Recreate the phantomjs driver every 8 goods to release memory.
            if index % 8 == 0:
                try:
                    del self.my_phantomjs
                except:
                    pass
                gc.collect()
                self.my_phantomjs = BaseDriver(
                    executable_path=PHANTOMJS_DRIVER_PATH,
                    ip_pool_type=self.ip_pool_type)
            sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                             index=index,
                                             remainder=50)
            if sql_cli.is_connect_success:
                if self.is_recent_time(pintuan_end_time) == 0:
                    # Expired: logical delete.
                    _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        update_sql_str=mg_update_str_5,
                        sql_cli=sql_cli,
                    )
                    print(
                        '过期的goods_id为(%s)' % goods_id,
                        ', 拼团开始时间为(%s), 逻辑删除成功!'
                        % json.loads(item[1]).get('begin_time'))
                    sleep(.3)
                elif self.is_recent_time(pintuan_end_time) == 2:
                    # break
                    # pass, NOT break: db rows are not ordered by goods_id.
                    pass
                else:
                    # Returned 1: inside the update window.
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (goods_id, index))
                    data['goods_id'] = goods_id
                    tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                        item[3], item[2])
                    # print(tmp_url)
                    # requests cannot fetch this (certificate checks) — use
                    # phantomjs directly.
                    # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
                    body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                        url=tmp_url)
                    # print(body)
                    if body == '':
                        print('获取到的body为空值! 此处跳过')
                    else:
                        try:
                            body = re.compile(
                                r'<pre.*?>(.*?)</pre>').findall(body)[0]
                            tmp_data = json.loads(body)
                            # pprint(tmp_data)
                        except:
                            print('json.loads转换body时出错, 请检查')
                            tmp_data = {}
                        if tmp_data.get('result', {}).get('wall',
                                                          {}).get('docs',
                                                                  []) == []:
                            # No docs returned: treat as off-shelf.
                            print('得到的docs为[]!')
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=mg_update_str_5,
                                sql_cli=sql_cli,
                            )
                            sleep(.3)
                        else:
                            tmp_item_list = tmp_data.get('result', {}).get(
                                'wall', {}).get('docs', [])
                            # pprint(tmp_item_list)
                            begin_time_timestamp = int(
                                time.time())  # group-buy start timestamp
                            item_list = [{
                                'goods_id':
                                item.get('tradeItemId', ''),
                                'pintuan_time': {
                                    'begin_time':
                                    timestamp_to_regulartime(
                                        timestamp=begin_time_timestamp),
                                    'end_time':
                                    timestamp_to_regulartime(
                                        self.get_pintuan_end_time(
                                            begin_time_timestamp,
                                            item.get('leftTimeOrg', ''))),
                                },
                                'all_sell_count':
                                str(item.get('salesVolume', 0)),
                            } for item in tmp_item_list]
                            # pprint(item_list)
                            pintuan_goods_all_goods_id = [
                                item_1.get('goods_id', '')
                                for item_1 in item_list
                            ]
                            # print(pintuan_goods_all_goods_id)
                            '''
                            内部已经下架的(内部下架的其实并未真实下架,还在卖的,所以我就更新其商品信息数据,不更新上下架时间)
                            '''
                            if goods_id not in pintuan_goods_all_goods_id:
                                # "Internally delisted" but actually still on
                                # sale: refresh its data only.
                                mogujie_pintuan.get_goods_data(
                                    goods_id=goods_id)
                                goods_data = mogujie_pintuan.deal_with_data()
                                if goods_data == {}:
                                    pass
                                else:
                                    # Normalize before writing.
                                    print('+++ 内部下架,其实还在售卖的商品更新')
                                    goods_data['goods_id'] = goods_id
                                    goods_data[
                                        'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                            goods_data['price_info_list'])
                                    # pprint(goods_data)
                                    mogujie_pintuan.update_mogujie_pintuan_table_2(
                                        data=goods_data, pipeline=sql_cli)
                                    sleep(MOGUJIE_SLEEP_TIME)  # throttle
                            else:
                                # Still listed: take times/counts from the page.
                                for item_2 in item_list:
                                    if item_2.get('goods_id',
                                                  '') == goods_id:
                                        mogujie_pintuan.get_goods_data(
                                            goods_id=goods_id)
                                        goods_data = mogujie_pintuan.deal_with_data()
                                        if goods_data == {}:
                                            pass
                                        else:
                                            # Normalize before writing.
                                            goods_data['goods_id'] = goods_id
                                            goods_data[
                                                'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                                    goods_data['price_info_list'])
                                            goods_data['pintuan_time'] = item_2.get(
                                                'pintuan_time', {})
                                            goods_data['pintuan_begin_time'], goods_data[
                                                'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                    miaosha_time=goods_data['pintuan_time'])
                                            goods_data['all_sell_count'] = item_2.get(
                                                'all_sell_count', '')
                                            # pprint(goods_data)
                                            mogujie_pintuan.update_mogujie_pintuan_table(
                                                data=goods_data,
                                                pipeline=sql_cli)
                                            sleep(MOGUJIE_SLEEP_TIME)  # throttle
                                    else:
                                        pass
            else:
                # db connection failed: skip this row.
                print('数据库连接失败,此处跳过!')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:
        # After midnight: long sleep, no updates.
        sleep(60 * 60 * 5.5)
    else:
        sleep(10 * 60)
    gc.collect()
def run_forever():
    """Realtime updater for zhe800 (折800) group-buy goods: loops forever,
    logically deleting expired rows and refreshing the rest.
    NOTE(review): indentation reconstructed from a collapsed paste —
    confirm the block structure against the original file.
    """
    while True:
        #### Realtime update pass.
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=z8_delete_str_1)
            result = list(sql_cli._select_table(sql_str=z8_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result)
            index = 1
            for item in result:
                goods_id = item[0]
                db_is_delete = item[1]
                # A fresh parser per goods to limit memory use.
                zhe_800_pintuan = Zhe800PintuanParse()
                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    remainder=50,
                )
                if index % 300 == 0:
                    # Rest 3 minutes every 300 goods.
                    # NOTE(review): sleeps BEFORE printing the message —
                    # probably intended the other way round.
                    sleep_time = 3 * 60
                    sleep(sleep_time)
                    print('休眠{}s中...'.format(sleep_time))
                if sql_cli.is_connect_success:
                    tmp_tmp = zhe_800_pintuan.get_goods_data(goods_id=goods_id)
                    # Disabled path kept: it affected normally-listed goods.
                    try:
                        # Page-not-found responses start with 'ze' — treat
                        # those as off-shelf.
                        if isinstance(tmp_tmp, str) and re.compile(
                                r'^ze').findall(tmp_tmp) != []:
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=z8_update_str_4,
                                sql_cli=sql_cli,
                            )
                            sleep(ZHE_800_PINTUAN_SLEEP_TIME)
                            continue
                        else:
                            pass
                    except:
                        pass
                    data = zhe_800_pintuan.deal_with_data()
                    if data != {}:
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))
                        data['goods_id'] = goods_id
                        if db_is_delete == 1:
                            # Already flagged expired in the db.
                            print('该goods_id[{0}]已过期!'.format(goods_id))
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=z8_update_str_4,
                                sql_cli=sql_cli,
                            )
                        else:
                            zhe_800_pintuan.to_right_and_update_data(
                                data=data, pipeline=sql_cli)
                    else:
                        # Parser returned empty data — nothing to write.
                        pass
                else:
                    # db connection failed (possibly down for maintenance).
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                try:
                    del zhe_800_pintuan
                except:
                    pass
                collect()
                sleep(ZHE_800_PINTUAN_SLEEP_TIME)
            print('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:
            # After midnight: long sleep, no updates.
            sleep(60 * 60 * 5.5)
        else:
            sleep(10 * 60)
        collect()
def _tmall_keywords_spider(self, **kwargs):
    """
    Crawl tmall goods for one keyword.
    :param kwargs: goods_id_list (list of tmall item urls, scheme-less),
        keyword_id (db id of the keyword)
    :return: True when the keyword finished normally; False when a goods
        failed the legality check (caller stops this keyword).
    """
    goods_id_list = kwargs.get('goods_id_list')
    keyword_id = kwargs.get('keyword_id')
    # Strip the sku suffix and restore the scheme.
    goods_url_list = ['https:' + re.compile('&skuId=.*').sub('', item) for item in goods_id_list]

    self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...')

    for item in goods_url_list:     # item is a goods_url
        # Whether this goods was inserted (or already present in db).
        result = False
        try:
            goods_id = re.compile(r'id=(\d+)').findall(item)[0]
        except IndexError:
            self.lg.error('re获取goods_id时出错, 请检查!')
            continue

        if goods_id in self.db_existed_goods_id_list:
            self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
            result = True  # already in db — still link it to the keyword below
        else:
            tmall = TmallParse(logger=self.lg, is_real_times_update_call=True)
            self.sql_cli = _block_get_new_db_conn(
                db_obj=self.sql_cli,
                index=self.add_goods_index,
                logger=self.lg,
                remainder=20,
            )
            if self.sql_cli.is_connect_success:
                # goods_id becomes a [type, goods_id] list here.
                goods_id = tmall.get_goods_id_from_url(item)
                if goods_id == []:
                    self.lg.error('@@@ 原商品的地址为: {0}'.format(item))
                    continue
                else:
                    self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id[1], str(self.add_goods_index)))
                    # The original bound this to an unused local `tt` — dropped.
                    tmall.get_goods_data(goods_id)
                    data = tmall.deal_with_data()
                    goods_id = goods_id[1]
                    if data != {}:
                        data['goods_id'] = goods_id
                        data['username'] = '******'
                        data['main_goods_id'] = None
                        data['goods_url'] = tmall._from_tmall_type_get_tmall_url(type=data['type'], goods_id=goods_id)
                        if data['goods_url'] == '':
                            self.lg.error('该goods_url为空值! 此处跳过!')
                            continue

                        # An illegal target aborts the whole keyword.
                        if not self.check_target_data_is_legal(target_data=data):
                            return False

                        result = tmall.old_tmall_goods_insert_into_new_table(data, pipeline=self.sql_cli)
                    else:
                        pass
            else:
                self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
                pass
            self.add_goods_index += 1
            collect()
            sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

        if result:
            # Only goods inserted now or already present get linked.
            self._insert_into_goods_id_and_keyword_middle_table(
                goods_id=goods_id,
                keyword_id=keyword_id)
        else:
            pass

    self.lg.info('该关键字的商品已经抓取完毕!')

    return True
def _just_run(self):
    """Main crawl loop (filtered variant): processes only keyword ids >= 43
    plus ids 25 and 26; otherwise identical flow to the unfiltered loop —
    read keywords, collect goods ids per enabled api, process them.
    NOTE(review): indentation reconstructed from a collapsed paste —
    confirm the block structure against the original file.
    """
    while True:
        result = None
        result_2 = None
        # Fetch all goods_id rows already present in goods_db.
        try:
            result = list(self.sql_cli._select_table(sql_str=kw_select_str_1))
            self.lg.info('正在获取db中已存在的goods_id...')
            result_2 = list(self.sql_cli._select_table(sql_str=kw_select_str_2))
            self.lg.info('db中已存在的goods_id获取成功!')
        except TypeError:
            # _select_table returned a non-iterable: db down / maintenance.
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        if result is None or result_2 is None:
            sleep(15)
            continue

        self.lg.info('db 已存在的goods_id_num: {}'.format(len(result_2)))
        # Counts added goods; drives db reconnects.
        self.add_goods_index = 0
        self.db_existed_goods_id_list = [item[0] for item in result_2]
        # Release result_2 promptly — it can be large.
        try:
            del result_2
        except:
            pass
        collect()

        for item in result:
            keyword_id = item[0]
            keyword = item[1]
            # Finish every enabled api for one keyword, then the next.
            self.lg.info('正在处理id为{0}, 关键字为 {1} ...'.format(keyword_id, keyword))
            # Filter: below id 43 only ids 25 and 26 are processed.
            if int(keyword_id) < 43:
                if int(keyword_id) not in (25, 26):
                    self.lg.info('不在处理的keyword_id范围内, keyword_id: {}, keyword: {}'.format(
                        keyword_id, keyword))
                    continue
                else:
                    pass
            else:
                pass

            for type, type_value in self.debugging_api.items():  # iterate apis
                if type_value is False:
                    self.lg.info('api为False, 跳过!')
                    continue

                # Periodically refresh the db connection.
                self.sql_cli = _block_get_new_db_conn(
                    db_obj=self.sql_cli,
                    index=self.add_goods_index,
                    logger=self.lg,
                    remainder=20,)
                goods_id_list = self._get_keywords_goods_id_list(
                    type=type, keyword=item)
                # pprint(goods_id_list)
                self.lg.info('关键字为{0}, 获取到的goods_id_list_num: {1}'.format(
                    keyword, len(goods_id_list)))
                '''处理goods_id_list'''
                self._deal_with_goods_id_list(
                    type=type,
                    goods_id_list=goods_id_list,
                    keyword_id=keyword_id)
                sleep(3)
async def deal_with_tmcs_goods_id_list(self):
    """Insert every pending tmcs goods id (self.db_wait_2_save_goods_id_list)
    that is not yet in the db, fetching each via TmallParse.

    :return: True when the whole list was processed; False when a goods had
        <= 1 main image (aborts the remainder of the list — see NOTE below).
    NOTE(review): indentation reconstructed from a collapsed paste —
    confirm the block structure against the original file.
    """
    self.lg.info('即将开始抓取tmcs goods, 请耐心等待...')
    for item in self.db_wait_2_save_goods_id_list:
        # eg: '61864164616'
        goods_id = item
        if goods_id in self.db_existed_goods_id_list:
            self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
            continue

        tmall = TmallParse(logger=self.lg, is_real_times_update_call=True)
        self.sql_cli = _block_get_new_db_conn(
            db_obj=self.sql_cli,
            index=self.add_goods_index,
            logger=self.lg,
            remainder=self.sql_cli_remainder,
        )
        if self.sql_cli.is_connect_success:
            # spm was once appended so get_goods_id_from_url could filter by id.
            # goods_url = 'https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.65a47fb1yR1OUp&id={}'.format(goods_id)
            goods_url = 'https://detail.tmall.com/item.htm?id={}'.format(
                goods_id)
            # goods_id becomes a [type, goods_id] list here.
            goods_id = tmall.get_goods_id_from_url(goods_url)
            if goods_id == []:
                self.lg.error('@@@ 原商品的地址为: {0}'.format(goods_url))
                continue
            else:
                self.lg.info(
                    '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                    % (goods_id[1], str(self.add_goods_index)))
                tt = tmall.get_goods_data(goods_id)
                data = tmall.deal_with_data()
                goods_id = goods_id[1]
                if data != {}:
                    data['goods_id'] = goods_id
                    data['username'] = '******'
                    data['main_goods_id'] = None
                    data[
                        'goods_url'] = tmall._from_tmall_type_get_tmall_url(
                            type=data['type'],
                            goods_id=goods_id,
                        )
                    if data['goods_url'] == '':
                        self.lg.error('该goods_url为空值! 此处跳过!')
                        continue

                    if len(data['all_img_url']) <= 1:
                        self.lg.info(
                            '[goods_id: {}]主图个数<=1, pass'.format(goods_id))
                        # NOTE(review): `return False` aborts the WHOLE list;
                        # the sibling keyword spider `continue`s on similar
                        # per-goods conditions — confirm this is intended.
                        return False

                    result = tmall.old_tmall_goods_insert_into_new_table(
                        data=data, pipeline=self.sql_cli)
                    if result:
                        # Remember it to avoid re-collecting it later.
                        self.db_existed_goods_id_list.append(goods_id)
                    else:
                        pass
                else:
                    pass
        else:
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
            pass
        self.add_goods_index += 1
        collect()
        sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

    self.lg.info('tmcs已经抓取完毕!')
    return True