def get_one_tm_data(**kwargs):
    """Fetch and parse the goods data for a single Tmall url.

    :param kwargs:
        username: owner tag attached to the saved record (defaults to DEFAULT_USERNAME)
        wait_to_deal_with_url: the raw tmall goods url to crawl
        my_lg: logger instance
    :return: dict -- the processed goods data on success; on failure either
        {'goods_id': ''} (goods_id could not be extracted from the url) or
        {'goods_id': ..., 'msg': 'data为空!'} (page fetched but parse failed).
    """
    username = kwargs.get('username', DEFAULT_USERNAME)
    wait_to_deal_with_url = kwargs.get('wait_to_deal_with_url', '')
    my_lg = kwargs.get('my_lg')

    def _release(parser, run_gc=True):
        # Best-effort teardown of the parser between crawls; the original
        # duplicated this del/collect sequence at every exit point.
        try:
            del parser
        except Exception:
            pass
        if run_gc:
            gc.collect()

    login_tmall = TmallParse(logger=my_lg)
    # get_goods_id_from_url returns a list: [type_flag, goods_id(, base_url)]
    goods_id = login_tmall.get_goods_id_from_url(wait_to_deal_with_url)
    if not goods_id:
        # no goods_id could be extracted -> return the error sentinel
        my_lg.info('获取到的goods_id为空!')
        _release(login_tmall)
        return {'goods_id': ''}

    # Rebuild a clean, canonical goods url according to the site type flag:
    # 0 -> plain tmall, 1 -> tmall supermarket (chaoshi), 2 -> tmall global
    if goods_id[0] == 0:        # [0, '1111']
        wait_to_deal_with_url = 'https://detail.tmall.com/item.htm?id=' + goods_id[1]
    elif goods_id[0] == 1:      # [1, '1111']
        wait_to_deal_with_url = 'https://chaoshi.detail.tmall.com/item.htm?id=' + goods_id[1]
    elif goods_id[0] == 2:      # [2, '1111', 'https://xxxxx']
        wait_to_deal_with_url = str(goods_id[2]) + '?id=' + goods_id[1]

    tmp_result = login_tmall.get_goods_data(goods_id=goods_id)
    data = login_tmall.deal_with_data()     # dict on success, {} on failure
    sleep(TMALL_SLEEP_TIME)                 # throttle; may be shortened to .5s on servers

    if data == {} or tmp_result == {}:
        my_lg.info('获取到的data为空!')
        _release(login_tmall)
        return {'goods_id': goods_id[1], 'msg': 'data为空!'}

    wait_to_save_data = add_base_info_2_processed_data(
        data=data,
        spider_url=wait_to_deal_with_url,
        username=username,
        goods_id=goods_id[1])
    # NOTE: the success path historically only dropped the reference without
    # forcing a gc pass -- preserved here.
    _release(login_tmall, run_gc=False)

    return wait_to_save_data
async def _deal_with_all_goods_id(self):
    """Fetch detailed goods info for every sub-category of the taoqianggou list.

    Iterates the category list, skips goods already stored in the db, crawls
    each new goods page with TmallParse and inserts the result.
    :return: None
    """
    _data = await self._get_all_goods_list()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    index = 1
    if my_pipeline.is_connect_success:
        self.my_lg.info('正在获取淘抢购db原有goods_id, 请耐心等待...')
        sql_str = r'select goods_id from dbo.tao_qianggou_xianshimiaosha where site_id=28'
        db_ = list(my_pipeline._select_table(sql_str=sql_str))
        db_all_goods_id = [item[0] for item in db_]
        self.my_lg.info('获取完毕!!!')
        # self.my_lg.info(str(db_all_goods_id))
        for item in _data:
            miaosha_goods_list = await self._get_taoqianggou_goods_list(data=item.get('data', []))
            # self.my_lg.info(str(miaosha_goods_list))
            # pprint(miaosha_goods_list)
            for tmp_item in miaosha_goods_list:
                if tmp_item.get('goods_id', '') in db_all_goods_id:
                    # goods_id already stored in the db -> skip
                    self.my_lg.info('该goods_id[%s]已存在db中' % tmp_item.get('goods_id', ''))
                    continue
                if index % 50 == 0:
                    # re-connect every 50 goods to avoid a stale long-lived connection
                    self.my_lg.info('正在重置,并与数据库建立新连接中...')
                    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    # my_pipeline = SqlPools()
                    self.my_lg.info('与数据库的新连接成功建立...')
                if my_pipeline.is_connect_success:
                    tmall = TmallParse(logger=self.my_lg)
                    tmp_url = 'https://detail.tmall.com/item.htm?id={0}'.format(tmp_item.get('goods_id'))
                    goods_id = tmall.get_goods_id_from_url(tmp_url)
                    tmall.get_goods_data(goods_id=goods_id)
                    goods_data = tmall.deal_with_data()
                    if goods_data != {}:
                        # self.my_lg.info(str(tmp_item))
                        goods_data['goods_id'] = tmp_item.get('goods_id')
                        goods_data['spider_url'] = tmp_url
                        goods_data['miaosha_time'] = tmp_item.get('miaosha_time')
                        goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                            miaosha_time=tmp_item.get('miaosha_time'))
                        goods_data['page'] = tmp_item.get('page')
                        goods_data['spider_time'] = tmp_item.get('spider_time')
                        tmall.insert_into_taoqianggou_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                        await asyncio.sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                    else:
                        await asyncio.sleep(5)
                    # FIX: the counter was never incremented, so the
                    # `index % 50 == 0` reconnect above could never fire
                    # (the sibling _update_old_goods_info does increment it).
                    index += 1
                    try:
                        del tmall       # release the parser each round
                    except Exception:
                        pass
                    gc.collect()
async def _update_old_goods_info(self, tmp_sql_server, result):
    """Update old taoqianggou goods rows in real time.

    :param tmp_sql_server: db pipeline (re-created every 50 rows)
    :param result: iterable of rows; per row item[0]=goods_id,
        item[1]=json with 'miaosha_begin_time', item[2]=spider url
        -- assumed from the accesses below, TODO confirm against the caller.
    :return: None
    """
    index = 1
    for item in result:
        # real-time update per row
        miaosha_begin_time = json.loads(item[1]).get('miaosha_begin_time')
        # convert 'YYYY-mm-dd HH:MM:SS' to a 10-digit unix timestamp
        miaosha_begin_time = int(str(time.mktime(time.strptime(
            miaosha_begin_time, '%Y-%m-%d %H:%M:%S')))[0:10])
        # self.my_lg.info(str(miaosha_begin_time))
        tmall = TmallParse(logger=self.my_lg)
        if index % 50 == 0:
            # re-connect every 50 rows to avoid a stale long-lived connection
            self.my_lg.info('正在重置,并与数据库建立新连接中...')
            tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
            self.my_lg.info('与数据库的新连接成功建立...')
        if tmp_sql_server.is_connect_success:
            if await self.is_recent_time(miaosha_begin_time) == 0:
                # expired flash-sale -> delete the row.
                # FIX: params must be a 1-tuple; '(item[0])' was a bare
                # scalar (no trailing comma), as the later revision of this
                # method ('params=(item[0],)') confirms.
                tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                self.my_lg.info('过期的goods_id为(%s)' % item[0] + ', 限时秒杀开始时间为(%s), 删除成功!' % json.loads(item[1]).get('miaosha_begin_time'))
            else:
                # returned 1: inside the to-be-updated window
                self.my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (item[0], str(index)))
                '''NOTICE: 由于都是当天数据, 此处不更新上下架时间,就更新商品数据'''
                goods_id = tmall.get_goods_id_from_url(item[2])
                tmall.get_goods_data(goods_id=goods_id)
                goods_data = tmall.deal_with_data()
                if goods_data != {}:
                    # self.my_lg.info(str(item))
                    goods_data['goods_id'] = item[0]
                    await tmall._update_taoqianggou_xianshimiaosha_table(data=goods_data, pipeline=tmp_sql_server)
                    await asyncio.sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                else:
                    await asyncio.sleep(5)
        index += 1
        try:
            del tmall       # release the parser each round
        except Exception:
            pass
        gc.collect()
    return
def test_tm_m():
    # TODO: some goods are in presale and cannot currently be purchased;
    #   such goods are not updated until their status returns to normal.
    goods_id = '575090086713'
    # data = get_tm_m_body_data(goods_id=goods_id)
    # pprint(data)
    pc_url = 'https://detail.tmall.com/item.htm?id={}'.format(goods_id)
    phone_url = 'https://detail.m.tmall.com/item.htm?id={}'.format(goods_id)
    print('pc_url: {}, phone_url: {}'.format(pc_url, phone_url))

    # drive the parser end-to-end: url -> goods_id -> raw data -> parsed data
    parser = TmallParse(is_real_times_update_call=True)
    goods_id = parser.get_goods_id_from_url(tmall_url=pc_url)
    ori_data = parser.get_goods_data(goods_id=goods_id)
    # pprint(ori_data)
    data = parser.deal_with_data()
    pprint(data)

    # best-effort cleanup of the parser instance
    try:
        del parser
    except:
        pass
def _tmall_keywords_spider(self, **kwargs):
    '''
    Crawl the tmall goods belonging to one keyword.
    :param kwargs:
        goods_id_list: list of scheme-less goods urls (may carry '&skuId=...')
        keyword_id: id of the keyword row the goods get linked to
    :return: True when the whole list has been processed
    '''
    goods_id_list = kwargs.get('goods_id_list')
    keyword_id = kwargs.get('keyword_id')
    # strip the '&skuId=...' tail and re-add the scheme to get clean urls
    goods_url_list = [
        'https:' + re.compile('&skuId=.*').sub('', item)
        for item in goods_id_list
    ]
    self.my_lg.info('即将开始抓取该关键字的goods, 请耐心等待...')

    for item in goods_url_list:  # item is a goods_url
        result = False  # flags whether this goods was inserted (or already in db)
        try:
            goods_id = re.compile(r'id=(\d+)').findall(item)[0]
        except IndexError:
            self.my_lg.error('re获取goods_id时出错, 请检查!')
            continue
        if goods_id in self.db_existed_goods_id_list:
            self.my_lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
            result = True  # already stored -> still link it to the keyword below
            pass
        else:
            tmall = TmallParse(logger=self.my_lg)
            if self.add_goods_index % 20 == 0:
                # re-connect every 20 goods to avoid a stale long-lived connection
                self.my_lg.info('正在重置,并与数据库建立新连接中...')
                self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                self.my_lg.info('与数据库的新连接成功建立...')
            if self.my_pipeline.is_connect_success:
                # returns [type_flag, goods_id] on success, [] on failure
                goods_id = tmall.get_goods_id_from_url(item)
                if goods_id == []:
                    self.my_lg.error('@@@ 原商品的地址为: {0}'.format(item))
                    continue
                else:
                    self.my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (goods_id[1], str(self.add_goods_index)))
                    # NOTE(review): the return value 'tt' is never used
                    tt = tmall.get_goods_data(goods_id)
                    data = tmall.deal_with_data()
                    goods_id = goods_id[1]
                    if data != {}:
                        data['goods_id'] = goods_id
                        data['username'] = '******'
                        data['main_goods_id'] = None
                        data[
                            'goods_url'] = tmall._from_tmall_type_get_tmall_url(
                                type=data['type'], goods_id=goods_id)
                        if data['goods_url'] == '':
                            self.my_lg.error('该goods_url为空值! 此处跳过!')
                            continue

                        result = tmall.old_tmall_goods_insert_into_new_table(
                            data, pipeline=self.my_pipeline)
                    else:
                        pass
            else:
                self.my_lg.info('数据库连接失败,数据库可能关闭或者维护中')
                pass
            self.add_goods_index += 1
            gc.collect()
            sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
        if result:  # only link goods that were inserted or already existed in db
            self._insert_into_goods_id_and_keyword_middle_table(
                goods_id=goods_id, keyword_id=keyword_id)
        else:
            pass

    self.my_lg.info('该关键字的商品已经抓取完毕!')

    return True
# coding:utf-8 ''' @author = super_fazai @File : test_tm_m.py @connect : [email protected] ''' """ 测试tm m """ from sys import path as sys_path sys_path.append('..') from multiplex_code import get_tm_m_body_data from tmall_parse_2 import TmallParse from pprint import pprint goods_id = '589363967773' # data = get_tm_m_body_data(goods_id=goods_id) # pprint(data) pc_url = 'https://detail.tmall.com/item.htm?id={}'.format(goods_id) phone_url = 'https://detail.m.tmall.com/item.htm?id={}'.format(goods_id) print('pc_url: {}, phone_url: {}'.format(pc_url, phone_url)) tm = TmallParse(is_real_times_update_call=True) goods_id = tm.get_goods_id_from_url(tmall_url=pc_url) tm.get_goods_data(goods_id=goods_id) data = tm.deal_with_data() pprint(data)
def _tmall_keywords_spider(self, **kwargs):
    """
    Crawl the tmall goods belonging to one keyword (newer revision using
    sql_cli + _block_get_new_db_conn).
    :param kwargs:
        goods_id_list: list of scheme-less goods urls (may carry '&skuId=...')
        keyword_id: id of the keyword row the goods get linked to
    :return: True when the whole list has been processed; False when
        check_target_data_is_legal rejects a goods' data
    """
    goods_id_list = kwargs.get('goods_id_list')
    keyword_id = kwargs.get('keyword_id')
    # strip the '&skuId=...' tail and re-add the scheme to get clean urls
    goods_url_list = ['https:' + re.compile('&skuId=.*').sub('', item) for item in goods_id_list]
    self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...')

    for item in goods_url_list:  # item is a goods_url
        # flags whether this goods was inserted (or already in db)
        result = False
        try:
            goods_id = re.compile(r'id=(\d+)').findall(item)[0]
        except IndexError:
            self.lg.error('re获取goods_id时出错, 请检查!')
            continue
        if goods_id in self.db_existed_goods_id_list:
            self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
            result = True  # already stored -> still link it to the keyword below
            pass
        else:
            tmall = TmallParse(logger=self.lg, is_real_times_update_call=True)
            # periodically swaps in a fresh db connection (every `remainder` goods)
            self.sql_cli = _block_get_new_db_conn(
                db_obj=self.sql_cli,
                index=self.add_goods_index,
                logger=self.lg,
                remainder=20,
            )
            if self.sql_cli.is_connect_success:
                # returns [type_flag, goods_id] on success, [] on failure
                goods_id = tmall.get_goods_id_from_url(item)
                if goods_id == []:
                    self.lg.error('@@@ 原商品的地址为: {0}'.format(item))
                    continue
                else:
                    self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id[1], str(self.add_goods_index)))
                    # NOTE(review): the return value 'tt' is never used
                    tt = tmall.get_goods_data(goods_id)
                    data = tmall.deal_with_data()
                    goods_id = goods_id[1]
                    if data != {}:
                        data['goods_id'] = goods_id
                        data['username'] = '******'
                        data['main_goods_id'] = None
                        data['goods_url'] = tmall._from_tmall_type_get_tmall_url(type=data['type'], goods_id=goods_id)
                        if data['goods_url'] == '':
                            self.lg.error('该goods_url为空值! 此处跳过!')
                            continue

                        # NOTE(review): returning False here aborts the whole
                        # keyword run, unlike the other failure paths which
                        # `continue` -- confirm this early-exit is intended.
                        if not self.check_target_data_is_legal(target_data=data):
                            return False

                        result = tmall.old_tmall_goods_insert_into_new_table(data, pipeline=self.sql_cli)
                    else:
                        pass
            else:
                self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
                pass
            self.add_goods_index += 1
            collect()
            sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
        if result:  # only link goods that were inserted or already existed in db
            self._insert_into_goods_id_and_keyword_middle_table(
                goods_id=goods_id, keyword_id=keyword_id)
        else:
            pass

    self.lg.info('该关键字的商品已经抓取完毕!')

    return True
async def _update_old_goods_info(self, tmp_sql_server, result):
    '''
    Update old taoqianggou goods rows in real time (revision using
    json_2_dict / _get_new_db_conn / tb_update_str_4).
    :param tmp_sql_server: db pipeline (refreshed every 20 rows)
    :param result: iterable of rows; per row item[0]=goods_id,
        item[1]=json with 'miaosha_begin_time', item[2]=spider url
        -- assumed from the accesses below, TODO confirm against the caller.
    :return:
    '''
    index = 1
    for item in result:
        # real-time update per row
        miaosha_begin_time = json_2_dict(
            json_str=item[1],
            logger=self.lg,
        ).get('miaosha_begin_time')
        # convert 'YYYY-mm-dd HH:MM:SS' to a 10-digit unix timestamp
        miaosha_begin_time = int(
            str(
                time.mktime(
                    time.strptime(miaosha_begin_time, '%Y-%m-%d %H:%M:%S')))[0:10])
        # self.lg.info(str(miaosha_begin_time))
        tmall = TmallParse(logger=self.lg)
        # periodically swaps in a fresh db connection (every `remainder` rows)
        tmp_sql_server = await _get_new_db_conn(
            db_obj=tmp_sql_server,
            index=index,
            logger=self.lg,
            remainder=20,
        )
        if tmp_sql_server.is_connect_success:
            if await self.is_recent_time(miaosha_begin_time) == 0:
                # expired flash-sale: now soft-updated via tb_update_str_4
                # instead of the old hard delete
                # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                tmp_sql_server._update_table(sql_str=tb_update_str_4, params=(item[0], ))
                # NOTE(review): miaosha_begin_time was reassigned above, so this
                # logs the int timestamp, not the readable time string.
                self.lg.info('过期的goods_id为(%s)' % item[0] + ', 限时秒杀开始时间为(%s), 删除成功!' % miaosha_begin_time)
                await async_sleep(.3)
            else:
                # returned 1: inside the to-be-updated window
                self.lg.info(
                    '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                    % (item[0], str(index)))
                '''NOTICE: 由于都是当天数据, 此处不更新上下架时间,就更新商品数据'''
                goods_id = tmall.get_goods_id_from_url(item[2])
                tmall.get_goods_data(goods_id=goods_id)
                goods_data = tmall.deal_with_data()
                if goods_data != {}:
                    # self.lg.info(str(item))
                    goods_data['goods_id'] = item[0]
                    await tmall._update_taoqianggou_xianshimiaosha_table(
                        data=goods_data, pipeline=tmp_sql_server)
                    await async_sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                else:
                    await async_sleep(5)
        index += 1
        try:
            del tmall       # release the parser each round
        except:
            pass
        collect()
    return
async def deal_with_tmcs_goods_id_list(self):
    # Crawl every goods id in self.db_wait_2_save_goods_id_list that is not
    # yet present in self.db_existed_goods_id_list, insert the parsed data,
    # and record newly inserted ids to avoid re-collection.
    # Returns True when the list has been processed; False when a goods has
    # <= 1 main image (see review note below).
    self.lg.info('即将开始抓取tmcs goods, 请耐心等待...')
    for item in self.db_wait_2_save_goods_id_list:
        # eg: '61864164616'
        goods_id = item
        if goods_id in self.db_existed_goods_id_list:
            self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
            continue

        tmall = TmallParse(logger=self.lg, is_real_times_update_call=True)
        # periodically swaps in a fresh db connection
        self.sql_cli = _block_get_new_db_conn(
            db_obj=self.sql_cli,
            index=self.add_goods_index,
            logger=self.lg,
            remainder=self.sql_cli_remainder,
        )
        if self.sql_cli.is_connect_success:
            # a 'spm' query param would let get_goods_id_from_url filter the id
            # goods_url = 'https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.65a47fb1yR1OUp&id={}'.format(goods_id)
            goods_url = 'https://detail.tmall.com/item.htm?id={}'.format(
                goods_id)
            # from here goods_id becomes a [type_flag, goods_id] list
            goods_id = tmall.get_goods_id_from_url(goods_url)
            if goods_id == []:
                self.lg.error('@@@ 原商品的地址为: {0}'.format(goods_url))
                continue
            else:
                self.lg.info(
                    '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                    % (goods_id[1], str(self.add_goods_index)))
                # NOTE(review): the return value 'tt' is never used
                tt = tmall.get_goods_data(goods_id)
                data = tmall.deal_with_data()
                goods_id = goods_id[1]
                if data != {}:
                    data['goods_id'] = goods_id
                    data['username'] = '******'
                    data['main_goods_id'] = None
                    data[
                        'goods_url'] = tmall._from_tmall_type_get_tmall_url(
                            type=data['type'],
                            goods_id=goods_id,
                        )
                    if data['goods_url'] == '':
                        self.lg.error('该goods_url为空值! 此处跳过!')
                        continue

                    # NOTE(review): this returns False and aborts the WHOLE
                    # run although the log text says 'pass' (i.e. skip) --
                    # confirm whether `continue` was intended instead.
                    if len(data['all_img_url']) <= 1:
                        self.lg.info(
                            '[goods_id: {}]主图个数<=1, pass'.format(goods_id))
                        return False

                    result = tmall.old_tmall_goods_insert_into_new_table(
                        data=data, pipeline=self.sql_cli)
                    if result:
                        # remember the id to avoid re-collecting it later
                        self.db_existed_goods_id_list.append(goods_id)
                    else:
                        pass
                else:
                    pass
        else:
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
            pass
        self.add_goods_index += 1
        collect()
        sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

    self.lg.info('tmcs已经抓取完毕!')

    return True
async def _update_old_goods_info(self, tmp_sql_server, result):
    '''
    Update old taoqianggou goods rows in real time (latest revision:
    miaosha times come pre-split via the async helper, and expired goods are
    taken off the shelves through _handle_goods_shelves_in_auto_goods_table).
    :param tmp_sql_server: db pipeline (refreshed every 20 rows)
    :param result: iterable of rows; per row item[0]=goods_id,
        item[1]=miaosha_time, item[2]=spider url
        -- assumed from the accesses below, TODO confirm against the caller.
    :return:
    '''
    index = 1
    for item in result:
        # real-time update per row
        _goods_id = item[0]
        miaosha_time = item[1]
        miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
            miaosha_time=miaosha_time,
            logger=self.lg,
        )
        tmall = TmallParse(logger=self.lg)
        # periodically swaps in a fresh db connection (every `remainder` rows)
        tmp_sql_server = await _get_new_db_conn(
            db_obj=tmp_sql_server,
            index=index,
            logger=self.lg,
            remainder=20,
        )
        if tmp_sql_server.is_connect_success:
            if await self.is_recent_time(miaosha_begin_time) == 0:
                # expired flash-sale -> take the goods off the shelves
                _handle_goods_shelves_in_auto_goods_table(
                    goods_id=_goods_id,
                    logger=self.lg,
                    update_sql_str=tb_update_str_4,
                    sql_cli=tmp_sql_server,
                )
                self.lg.info('过期的goods_id为(%s)' % _goods_id + ', 限时秒杀开始时间为(%s), 删除成功!' % miaosha_begin_time)
                await async_sleep(.3)
            else:
                # returned 1: inside the to-be-updated window
                self.lg.info(
                    '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                    % (_goods_id, str(index)))
                '''NOTICE: 由于都是当天数据, 此处不更新上下架时间,就更新商品数据'''
                goods_id = tmall.get_goods_id_from_url(item[2])
                tmall.get_goods_data(goods_id=goods_id)
                goods_data = tmall.deal_with_data()
                if goods_data != {}:
                    # self.lg.info(str(item))
                    goods_data['goods_id'] = _goods_id
                    await tmall._update_taoqianggou_xianshimiaosha_table(
                        data=goods_data, pipeline=tmp_sql_server)
                    await async_sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                else:
                    await async_sleep(5)
        index += 1
        try:
            del tmall       # release the parser each round
        except:
            pass
        collect()
    return