async def _get_new_ali_obj(self, index) -> None:
    """Periodically rebuild self.ali_1688.

    The parser/driver object cannot be shared indefinitely, otherwise
    driver access raises; dropping and recreating it every 10 items
    avoids that.

    :param index: running item counter; rebuild happens when index % 10 == 0.
    :return: None
    """
    if index % 10 == 0:
        try:
            del self.ali_1688
        except AttributeError:
            # self.ali_1688 not created yet (or already deleted) -- nothing to drop.
            pass
        collect()
        self.ali_1688 = ALi1688LoginAndParse(logger=self.lg)
async def _update_db(self):
    """
    Routine real-time data refresh loop (runs forever).

    Each sweep: build a fresh logger, pull stale rows from the db,
    fan each row out as an asyncio task in slices of ``self.concurrency``,
    then sleep before the next sweep (a long sleep after midnight, when
    updates are paused).
    :return: never returns normally
    """
    while True:
        # Fresh uuid-named logger per sweep so log files don't collide.
        self.lg = await self._get_new_logger(logger_name=get_uuid1())
        result = await self._get_db_old_data()
        if result is None:
            # db unreachable / nothing to update -- just sleep and retry.
            pass
        else:
            self.goods_index = 1
            tasks_params_list = TasksParamsListObj(
                tasks_params_list=result,
                step=self.concurrency)
            self.ali_1688 = ALi1688LoginAndParse(logger=self.lg)
            index = 1
            while True:
                try:
                    slice_params_list = tasks_params_list.__next__()
                    # self.lg.info(str(slice_params_list))
                except AssertionError:
                    # All slices consumed -- normal exit from the slice loop.
                    break
                tasks = []
                for item in slice_params_list:
                    goods_id = item[1]
                    db_goods_info_obj = ALDbGoodsInfoObj(item=item, logger=self.lg)
                    self.lg.info('创建 task goods_id: {}'.format(goods_id))
                    tasks.append(
                        self.loop.create_task(
                            self._update_one_goods_info(
                                db_goods_info_obj=db_goods_info_obj,
                                index=index)))
                    index += 1
                if tasks != []:
                    await _get_async_task_result(tasks=tasks, logger=self.lg)
                else:
                    pass
            self.lg.info('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:
            # No updates after midnight: sleep 5.5 hours.
            await async_sleep(60 * 60 * 5.5)
        else:
            await async_sleep(5.5)
        try:
            del self.ali_1688
        except:
            pass
        collect()
def get_one_1688_data(**kwargs):
    '''
    Crawl the data for one 1688 url.

    :param kwargs:
        username: owner username, defaults to '18698570079'
        wait_to_deal_with_url: the 1688 offer url, defaults to ''
        my_lg: logger instance
    :return: processed data dict on success;
        {'goods_id': ''} when no goods_id could be parsed from the url (error 1);
        {'goods_id': goods_id, 'msg': 'data为空!'} when the crawl/parse failed (error 2).
    '''
    username = kwargs.get('username', '18698570079')
    wait_to_deal_with_url = kwargs.get('wait_to_deal_with_url', '')
    my_lg = kwargs.get('my_lg')

    login_ali = ALi1688LoginAndParse(logger=my_lg)
    # Derive the goods_id from the url.
    goods_id = login_ali.get_goods_id_from_url(wait_to_deal_with_url)
    if goods_id == '':
        # Error 1: no goods_id -> bail out early.
        my_lg.info('获取到的goods_id为空!')
        # login_ali is always bound here, so a plain del is safe
        # (the original bare try/except around it was dead code).
        del login_ali
        gc.collect()
        return {'goods_id': ''}

    tmp_result = login_ali.get_ali_1688_data(goods_id=goods_id)
    data = login_ali.deal_with_data()  # dict of parsed data on success
    if data == {} or tmp_result == {}:
        # Error 2: crawl or parse failed.
        my_lg.info('获取到的data为空!')
        del login_ali
        gc.collect()
        return {'goods_id': goods_id, 'msg': 'data为空!'}

    wait_to_save_data = add_base_info_2_processed_data(
        data=data,
        spider_url=wait_to_deal_with_url,
        username=username,
        goods_id=goods_id)
    del login_ali
    gc.collect()  # match the cleanup done on the error paths
    return wait_to_save_data
def _1688_keywords_spider(self, **kwargs):
    '''
    Crawl and store the goods info matching one 1688 keyword.

    :param kwargs:
        goods_id_list: goods ids hit by the keyword
        keyword_id: id of the keyword being processed
    :return: True on completion
    '''
    goods_id_list = kwargs.get('goods_id_list')
    keyword_id = kwargs.get('keyword_id')
    goods_url_list = [
        'https://detail.1688.com/offer/{0}.html'.format(item)
        for item in goods_id_list
    ]

    self.my_lg.info('即将开始抓取该关键字的goods, 请耐心等待...')

    for item in goods_url_list:
        result = False      # reset per url
        try:
            goods_id = re.compile('offer/(.*?).html').findall(item)[0]
        except IndexError:
            self.my_lg.error('re获取goods_id时出错, 请检查!')
            continue

        if goods_id in self.db_existed_goods_id_list:
            self.my_lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
            result = True   # already present in db counts as success
            pass
        else:
            ali_1688 = ALi1688LoginAndParse(logger=self.my_lg)
            # Periodically rebuild the db pipeline so a single long-lived
            # connection cannot go stale and hang.
            # NOTE(review): original comment said "every 50" but the code
            # checks % 20 -- confirm the intended cadence.
            if self.add_goods_index % 20 == 0:
                self.my_lg.info('正在重置,并与数据库建立新连接中...')
                self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                self.my_lg.info('与数据库的新连接成功建立...')

            if self.my_pipeline.is_connect_success:
                goods_id = ali_1688.get_goods_id_from_url(item)
                if goods_id == '':
                    self.my_lg.error('@@@ 原商品的地址为: {0}'.format(item))
                    continue
                else:
                    self.my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (goods_id, str(self.add_goods_index)))
                    tt = ali_1688.get_ali_1688_data(goods_id)
                    if tt.get('is_delete') == 1 and tt.get(
                            'before') is False:
                        # Already off-shelf goods: skip it.
                        continue

                    data = ali_1688.deal_with_data()
                    if data != {}:
                        data['goods_id'] = goods_id
                        data[
                            'goods_url'] = 'https://detail.1688.com/offer/' + goods_id + '.html'
                        data['username'] = '******'
                        data['main_goods_id'] = None
                        result = ali_1688.old_ali_1688_goods_insert_into_new_table(
                            data=data, pipeline=self.my_pipeline)
                    else:
                        pass
            else:
                # db connection not established (db down or maintenance)
                self.my_lg.info('数据库连接失败,数据库可能关闭或者维护中')
                pass
            self.add_goods_index += 1
            try:
                del ali_1688
            except:
                pass
            gc.collect()
            sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
        if result:      # link only goods that were inserted now or already existed
            self._insert_into_goods_id_and_keyword_middle_table(
                goods_id=goods_id, keyword_id=keyword_id)
        else:
            pass

    self.my_lg.info('该关键字的商品已经抓取完毕!')

    return True
def run_forever():
    """
    Endless sweep migrating 1688 goods rows from the old goodsinfo table
    into the new GoodsInfoAutoGet table, skipping ids already present.
    """
    while True:
        #### real-time data refresh
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        # Rows already in the new table (SiteID=2 is 1688).
        sql_str = 'select GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice from dbo.GoodsInfoAutoGet where SiteID=2 order by ID desc'
        # Old-table rows not yet migrated (no matching maingoodsid).
        sql_str_2 = 'select GoodsOutUrl, goods_id from db_k85u.dbo.goodsinfo where OutGoodsType<=13 and onoffshelf=1 and not exists (select maingoodsid from gather.dbo.GoodsInfoAutoGet c where c.maingoodsid=goodsinfo.goods_id)'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
            result_2 = list(tmp_sql_server._select_table(sql_str=sql_str_2))
            # print(result_2)
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result_2)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # Declared inside the loop and periodically re-created/deleted to
            # keep memory bounded (a long-lived instance grows large).
            ali_1688 = ALi1688LoginAndParse()
            # New table GoodsInfoAutoGet: goods_id values already migrated.
            new_table_ali_1688_all_goods_id_list = list(
                set([item[0] for item in result]))
            print(new_table_ali_1688_all_goods_id_list)
            sleep(2)
            # Old table: collect [url, main_goods_id, goods_id] for each
            # row whose goods_id is not in the new table yet.
            old_table_ali_1688_all_goods_list = []
            for item in result_2:
                tmp_goods_id = ali_1688.get_goods_id_from_url(item[0])
                if tmp_goods_id != '' and tmp_goods_id not in new_table_ali_1688_all_goods_id_list:
                    old_table_ali_1688_all_goods_list.append([
                        'https://detail.1688.com/offer/' + tmp_goods_id + '.html',
                        item[1],
                        tmp_goods_id,
                    ])
                else:
                    print('@@@ 原地址为: ', item[0])
            # print(old_table_ali_1688_all_goods_list)
            print('老表待转数据个数为: ', len(old_table_ali_1688_all_goods_list))
            sleep(2)

            for item in old_table_ali_1688_all_goods_list:
                # Real-time update per pending row.
                if index % 10 == 0:
                    ali_1688 = ALi1688LoginAndParse()

                if index % 50 == 0:
                    # Reconnect every 50 items so one long connection
                    # cannot hang without response.
                    print('正在重置,并与数据库建立新连接中...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    goods_id = str(item[2])
                    # print(goods_id)
                    if goods_id in new_table_ali_1688_all_goods_id_list:
                        print('该goods_id已经存在于数据库中, 此处跳过!')
                        index += 1
                        gc.collect()
                        continue        # skip the sleep too
                    else:
                        sql_str = 'select GoodsID from dbo.GoodsInfoAutoGet where SiteID=2 and GoodsID=%s'
                        try:
                            # Duplicates keep showing up, so check each id individually.
                            # NOTE(review): params=(goods_id) is NOT a tuple --
                            # likely should be (goods_id,); confirm against
                            # _select_table's parameter contract.
                            is_in_db = list(
                                tmp_sql_server._select_table(
                                    sql_str=sql_str, params=(goods_id)))
                        except:
                            is_in_db = []
                            pass
                        if is_in_db != []:
                            print('该goods_id已经存在于数据库中, 此处跳过!')
                            index += 1
                            gc.collect()
                            continue

                    print(
                        '------>>>| 正在插入的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (goods_id, index))
                    tt = ali_1688.get_ali_1688_data(goods_id)
                    if tt.get('is_delete') == 1 and tt.get(
                            'before') is False:
                        # Off-shelf goods that must still be inserted.
                        tt['goods_id'] = goods_id
                        tt['goods_url'] = 'https://detail.1688.com/offer/' + goods_id + '.html'
                        tt['username'] = '******'
                        tt['main_goods_id'] = item[1]
                        ali_1688.old_ali_1688_goods_insert_into_new_table(
                            data=tt, pipeline=tmp_sql_server)
                        index += 1
                        gc.collect()
                        sleep(1.2)
                        continue
                    else:
                        pass

                    data = ali_1688.deal_with_data()
                    if data != {}:
                        data['goods_id'] = goods_id
                        data[
                            'goods_url'] = 'https://detail.1688.com/offer/' + goods_id + '.html'
                        data['username'] = '******'
                        data['main_goods_id'] = item[1]
                        ali_1688.old_ali_1688_goods_insert_into_new_table(
                            data=data, pipeline=tmp_sql_server)
                    else:
                        # Returned data is empty -- nothing to insert.
                        pass
                else:
                    # db connection not established (db down or maintenance)
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                # try:
                #     del ali_1688
                # except:
                #     pass
                gc.collect()
                sleep(2)
            print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:
            # No updates after midnight: sleep 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        # del ali_1688
        gc.collect()
def run_forever():
    """
    Endless sweep refreshing the existing 1688 rows (SiteID=2) in
    GoodsInfoAutoGet: re-crawls each goods_id and writes the refreshed
    data, shelf/down times, and price-change info back to the db.
    """
    while True:
        #### real-time data refresh
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice from dbo.GoodsInfoAutoGet where SiteID=2 order by ID desc'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError as e:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # Declared inside the loop and periodically re-created to keep
            # memory bounded (a long-lived instance grows large).
            ali_1688 = ALi1688LoginAndParse()
            for item in result:
                # Real-time update per row.
                if index % 5 == 0:
                    ali_1688 = ALi1688LoginAndParse()

                if index % 50 == 0:
                    # Reconnect every 50 items so one long connection
                    # cannot hang without response.
                    print('正在重置,并与数据库建立新连接中...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))
                    data = ali_1688.get_ali_1688_data(item[0])
                    if isinstance(data, int) is True:
                        # An int return means error code 4041 -- skip this row.
                        continue
                    else:
                        pass

                    if data.get('is_delete') == 1:
                        # Row that was already off-shelf when first inserted.
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data[
                            'delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                MyShelfAndDownTime=item[2])
                        # print('------>>>| 爬取到的数据为: ', data)
                        ali_1688.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                        sleep(1.5)      # don't hit the server too often
                        index += 1
                        gc.collect()
                        continue

                    data = ali_1688.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data[
                            'delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                MyShelfAndDownTime=item[2])
                        '''为了实现这个就必须保证price, taobao_price在第一次抓下来后一直不变,变得记录到_price_change_info字段中'''
                        # Business rule: price/taobao_price must stay as first
                        # crawled; any change is recorded in _price_change_info.
                        # Backend flags is_price_change=1 when modify_time is
                        # newer, then compares against the recorded change so
                        # staff are not prompted to re-price an unchanged item.
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[3],
                                old_taobao_price=item[4],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])
                        # print('------>>>| 爬取到的数据为: ', data)
                        ali_1688.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                        sleep(.3)       # don't hit the server too often
                    else:
                        # Returned data is empty -- nothing to write back.
                        pass
                else:
                    # db connection not established (db down or maintenance)
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                # try:
                #     del ali_1688
                # except:
                #     pass
                gc.collect()
                sleep(2.2)
            print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:
            # No updates after midnight: sleep 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        # del ali_1688
        gc.collect()
def run_forever():
    """
    Endless sweep (logging variant): refreshes the rows returned by
    al_select_str_6, recomputing shelf/delete times, price-change info
    and sku-change records before writing back to the db.
    """
    while True:
        # One dated log file per day under the 1688 real-time-update dir.
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/1688/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)

        #### real-time data refresh
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(
                tmp_sql_server._select_table(sql_str=al_select_str_6))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info(
                '--------------------------------------------------------')
            my_lg.info('待更新个数: {0}'.format(len(result)))
            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # Declared inside the loop and periodically re-created to keep
            # memory bounded (a long-lived instance grows large).
            ali_1688 = ALi1688LoginAndParse(logger=my_lg)
            for item in result:
                # Real-time update per row.
                if index % 5 == 0:
                    ali_1688 = ALi1688LoginAndParse(logger=my_lg)

                if index % 50 == 0:
                    # Reconnect periodically so one long connection cannot hang.
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'
                        .format(item[0], index))
                    data = ali_1688.get_ali_1688_data(item[0])
                    if isinstance(data, int) is True:
                        # An int return means error code 4041 -- skip this row.
                        continue
                    else:
                        pass

                    if data.get('is_delete') == 1:
                        # Row that was already off-shelf when first inserted.
                        data['goods_id'] = item[0]
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                shelf_time=item[4],
                                delete_time=item[5])
                        # my_lg.info('上架时间:{0}, 下架时间:{1}'.format(data['shelf_time'], data['delete_time']))
                        ali_1688.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                        sleep(1.5)      # don't hit the server too often
                        index += 1
                        gc.collect()
                        continue

                    data = ali_1688.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                shelf_time=item[4],
                                delete_time=item[5])
                        # my_lg.info('上架时间:{0}, 下架时间:{1}'.format(data['shelf_time'], data['delete_time']))
                        '''为了实现这个就必须保证price, taobao_price在第一次抓下来后一直不变,变得记录到_price_change_info字段中'''
                        # Business rule: price/taobao_price must stay as first
                        # crawled; any change is recorded in _price_change_info.
                        # Backend flags is_price_change=1 when modify_time is
                        # newer, then compares against the recorded change so
                        # staff are not prompted to re-price an unchanged item.
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[2],
                                old_taobao_price=item[3],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])

                        try:
                            old_sku_info = format_price_info_list(
                                price_info_list=json_2_dict(item[6]),
                                site_id=2)
                        except AttributeError:
                            # item[6] was already formatted -- use it as-is.
                            old_sku_info = item[6]
                        data['_is_price_change'], data[
                            'sku_info_trans_time'] = get_sku_info_trans_record(
                                old_sku_info=old_sku_info,
                                new_sku_info=format_price_info_list(
                                    data['sku_map'], site_id=2),
                                is_price_change=item[7]
                                if item[7] is not None else 0)

                        ali_1688.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                        sleep(.3)       # don't hit the server too often
                    else:
                        # Returned data is empty -- nothing to write back.
                        pass
                else:
                    # db connection not established (db down or maintenance)
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
                sleep(2.2)
            my_lg.info('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:
            # No updates after midnight: sleep 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
def run_forever():
    """
    Endless sweep refreshing all 1688 goods rows returned by
    select_ali_1688_all_goods_id: re-crawls each id and writes the
    refreshed data plus shelf/down/delete times back to the db.
    """
    while True:
        #### real-time data refresh
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_ali_1688_all_goods_id())
        except TypeError as e:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # Declared inside the loop and periodically re-created to keep
            # memory bounded (a long-lived instance grows large).
            ali_1688 = ALi1688LoginAndParse()
            for item in result:
                # Real-time update per row.
                data = {}
                if index % 5 == 0:
                    ali_1688 = ALi1688LoginAndParse()

                if index % 50 == 0:
                    # Reconnect every 50 items so one long connection
                    # cannot hang without response.
                    print('正在重置,并与数据库建立新连接中...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                    data = ali_1688.get_ali_1688_data(item[0])
                    if isinstance(data, int) is True:
                        # An int return means error code 4041 -- skip this row.
                        continue
                    else:
                        pass

                    if data.get('is_delete') == 1:
                        # Row that was already off-shelf when first inserted.
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data['delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            MyShelfAndDownTime=item[2]
                        )
                        # print('------>>>| 爬取到的数据为: ', data)
                        ali_1688.to_right_and_update_data(data, pipeline=tmp_sql_server)
                        sleep(1.5)      # don't hit the server too often
                        index += 1
                        gc.collect()
                        continue

                    data = ali_1688.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data['delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            MyShelfAndDownTime=item[2]
                        )
                        # print('------>>>| 爬取到的数据为: ', data)
                        ali_1688.to_right_and_update_data(data, pipeline=tmp_sql_server)
                        sleep(.3)       # don't hit the server too often
                    else:
                        # Returned data is empty -- nothing to write back.
                        pass
                else:
                    # db connection not established (db down or maintenance)
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                # try:
                #     del ali_1688
                # except:
                #     pass
                gc.collect()
                sleep(2.2)
            print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:
            # No updates after midnight: sleep 5.5 hours.
            sleep(60*60*5.5)
        else:
            sleep(5)
        # del ali_1688
        gc.collect()