def run_forever():
    """Continuously refresh every zhe800 group-buy (pintuan) goods row.

    One pass: run ``z8_delete_str_1``, select the rows to refresh with
    ``z8_select_str_2``, then for each row re-crawl the goods with a fresh
    ``Zhe800PintuanParse`` and write the result back through ``sql_cli``.
    Goods whose crawl result is a string starting with ``'ze'`` (page no
    longer exists) or whose DB flag ``item[1] == 1`` are handed to
    ``_handle_goods_shelves_in_auto_goods_table``.

    After a pass the function sleeps 5.5 hours when the Shanghai hour is 0,
    otherwise 10 minutes, and starts again. It never returns.
    """
    while True:
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=z8_delete_str_1)
            result = list(sql_cli._select_table(sql_str=z8_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result)
            index = 1
            for item in result:
                goods_id = item[0]
                db_is_delete = item[1]
                # The parser is created per goods and dropped at the end of
                # the iteration so it does not accumulate memory.
                zhe_800_pintuan = Zhe800PintuanParse()
                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    remainder=50,
                )
                if index % 300 == 0:
                    # Pause 3 minutes every 300 goods; announce it *before*
                    # sleeping so the log reflects what is happening.
                    sleep_time = 3 * 60
                    print('休眠{}s中...'.format(sleep_time))
                    sleep(sleep_time)

                if sql_cli.is_connect_success:
                    tmp_tmp = zhe_800_pintuan.get_goods_data(goods_id=goods_id)
                    # A string result starting with 'ze' marks a goods page
                    # that no longer exists: take it off the shelves.
                    if isinstance(tmp_tmp, str) and tmp_tmp.startswith('ze'):
                        try:
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=z8_update_str_4,
                                sql_cli=sql_cli,
                            )
                        except Exception as e:
                            # Best effort, as before: report and fall through
                            # to the normal update path.
                            print('遇到错误:', e)
                        else:
                            sleep(ZHE_800_PINTUAN_SLEEP_TIME)
                            # Advance the counter before skipping, otherwise
                            # the 300-goods pause and the reconnect cadence
                            # would re-trigger on the next item.
                            index += 1
                            continue

                    data = zhe_800_pintuan.deal_with_data()
                    if data != {}:
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))
                        data['goods_id'] = goods_id
                        if db_is_delete == 1:
                            print('该goods_id[{0}]已过期!'.format(goods_id))
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=z8_update_str_4,
                                sql_cli=sql_cli,
                            )
                        else:
                            zhe_800_pintuan.to_right_and_update_data(
                                data=data, pipeline=sql_cli)
                    # An empty dict means nothing usable was crawled: skip.
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                index += 1
                del zhe_800_pintuan
                collect()
                sleep(ZHE_800_PINTUAN_SLEEP_TIME)

            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates right after midnight (Shanghai time).
            sleep(60 * 60 * 5.5)
        else:
            sleep(10 * 60)
        collect()
async def _update_one_goods_info(self, item, index) -> tuple:
    """Update one zhe800 flash-sale (miaosha) goods row.

    :param item: DB row; ``item[0]`` is the goods id, ``item[1]`` the
        miaosha time payload and ``item[2]`` the session id.
    :param index: running counter; ``index + 1`` is stored in
        ``self.goods_index`` before returning.
    :return: ``(goods_id, res)`` where ``res`` is the result of the
        shelving / update call, or ``False`` when nothing was done.
    """
    res = False
    goods_id = item[0]
    miaosha_time = item[1]
    session_id = item[2]
    miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
        miaosha_time=miaosha_time,
        logger=self.lg,
    )
    await self._get_new_z8_obj(index=index)
    self.sql_cli = await _get_new_db_conn(
        db_obj=self.sql_cli,
        index=index,
        logger=self.lg,
        remainder=30)

    def _shelve():
        # Logically delete this goods row (shared by every off-shelf path).
        return _handle_goods_shelves_in_auto_goods_table(
            goods_id=goods_id,
            logger=self.lg,
            update_sql_str=z8_update_str_6,
            sql_cli=self.sql_cli,
        )

    def _advance():
        # Publish the next index for the caller.
        self.goods_index = index + 1

    if not self.sql_cli.is_connect_success:
        self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
        _advance()
        collect()
        await async_sleep(1.5)
        return goods_id, res

    is_recent_time = await self._is_recent_time(miaosha_begin_time)
    if is_recent_time == 0:
        res = _shelve()
        self.lg.info(
            '过期的goods_id为({0}), 限时秒杀开始时间为({1}), 逻辑删除成功!'.format(
                goods_id, timestamp_to_regulartime(miaosha_begin_time)))
        _advance()
        await async_sleep(.3)
        return goods_id, res

    if is_recent_time == 2:
        # This bucket may still contain already-expired goods.
        # NOTE(review): nesting rebuilt from a whitespace-collapsed source;
        # the bookkeeping + return is assumed to follow both branches.
        if datetime_to_timestamp(get_shanghai_time()) > miaosha_end_time:
            res = _shelve()
            self.lg.info(
                '过期的goods_id为({0}), 限时秒杀开始时间为({1}), 逻辑删除成功!'.format(
                    goods_id, timestamp_to_regulartime(miaosha_begin_time)))
        else:
            self.lg.info(
                '未来时间暂时不更新! miaosha_begin_time: {}, miaosha_end_time: {}'
                .format(
                    timestamp_to_regulartime(miaosha_begin_time),
                    timestamp_to_regulartime(miaosha_end_time),
                ))
        _advance()
        return goods_id, res

    # Any other value: the goods is inside the to-be-updated window.
    self.lg.info(
        '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
        format(goods_id, index))
    try:
        tmp_data = self.zhe_800_spike._get_one_session_id_data(
            base_session_id=str(session_id))
    except Exception:
        self.lg.error(msg='遇到错误:', exc_info=True)
        _advance()
        return goods_id, res

    try:
        tmp_data = tmp_data.get('data', {}).get('blocks', [])
        if tmp_data == []:
            # Raised explicitly (not via ``assert``) so the check still runs
            # under ``python -O``.
            raise AssertionError('该session_id不存在,此处跳过')
    except AssertionError:
        # The session has no data: shelve the goods tied to it.
        self.lg.error(msg='遇到错误:', exc_info=True)
        res = _shelve()
        self.lg.info(
            msg=
            '该sessionid没有相关key为jsons的数据! 过期的goods_id为({0}), 限时秒杀开始时间为({1}), 删除成功!'
            .format(goods_id, miaosha_begin_time))
        _advance()
        await async_sleep(1.2)
        return goods_id, res

    tmp_data = [item_s.get('deal', {}) for item_s in tmp_data]
    try:
        miaosha_goods_list = await self._get_miaoshao_goods_info_list(
            data=tmp_data)
    except ValueError:
        await async_sleep(2)
        _advance()
        return goods_id, res

    # Every zid currently listed in this session.
    miaosha_goods_all_goods_id = [i.get('zid') for i in miaosha_goods_list]
    if goods_id not in miaosha_goods_all_goods_id:
        # Removed from the flash sale by the platform.
        res = _shelve()
        self.lg.info(
            '该商品已被官方下架限秒活动! 下架的goods_id为({0}), 逻辑删除成功!'.format(
                goods_id))
        _advance()
        return goods_id, res

    # Still on sale: refresh it.
    res = await self._one_update(
        miaosha_goods_list=miaosha_goods_list,
        goods_id=goods_id)
    _advance()
    collect()
    await async_sleep(1.5)
    return goods_id, res
def _get_goods_data(self, goods_id):
    """Crawl one kaola goods page and assemble the fields we store.

    :param goods_id: kaola goods id; an empty string is rejected.
    :return: the assembled ``data`` dict (also stored in
        ``self.result_data``), or ``self._get_data_error_init()`` when the
        goods is off the shelves or any step fails.
    """
    data = {}
    url = 'https://goods.kaola.com/product/{0}.html'.format(goods_id)
    self.lg.info('------>>>| 正在抓取考拉地址为: {0}'.format(url))
    try:
        # Explicit raises instead of ``assert`` so the validation survives
        # ``python -O``; AssertionError keeps the logged exception type.
        if goods_id == '':
            raise AssertionError('获取到的goods_id为空值!此处跳过!')

        body = self.get_kl_pc_body(goods_id=goods_id)
        pc_goods_body = body
        raw = self._get_pc_right_body(body)  # PC-side payload
        if raw == {}:
            raise AssertionError('获取body时索引异常!')

        raw['sku_info'] = self.get_kl_pc_sku_info(goods_id=goods_id)
        raw = self._wash_data(raw)

        data['title'] = self._get_title(data=raw)
        data['sub_title'] = ''
        data['shop_name'] = raw.get('goodsInfoBase', {}).get('brandName', '')
        data['all_img_url'] = self._get_all_img_url(data=raw)
        data['p_info'] = self._get_p_info(data=raw)
        data['div_desc'] = self._get_div_desc(data=raw)
        data['sell_time'] = self._get_sell_time(data=raw.get('sku_info', {}))
        data['detail_name_list'] = self._get_detail_name_list(
            data=raw.get('sku_info', {}).get('skuDetailList', []))
        # TODO: kaola keeps selling specs whose stock is 0 (probably ordered
        # after purchase); zero stock is treated as off the shelves here.
        # PC-side prices are used; they already include tax.
        data['price_info_list'] = self._get_pc_sku_info(
            data=raw.get('sku_info', {}).get('skuDetailList', []))
        data['price'], data['taobao_price'] = self._get_price_and_taobao_price(
            data=raw.get('sku_info', {}).get('skuPrice', {}),
            price_info_list=data['price_info_list'])
        data['is_delete'] = self._get_is_delete(
            price_info_list=data['price_info_list'],
            data=data,
            other=raw)
        data['parent_dir'] = self._get_parent_dir(body=pc_goods_body)
        self.lg.info('parent_dir: {}'.format(data['parent_dir']))
    except GoodsShelvesException:
        _handle_goods_shelves_in_auto_goods_table(
            goods_id=goods_id,
            logger=self.lg)
        return self._get_data_error_init()
    except Exception:
        self.lg.error('遇到错误:', exc_info=True)
        self.lg.error('出错goods_id: {0}, 地址: {1}'.format(goods_id, url))
        return self._get_data_error_init()

    self.result_data = data
    return data
def get_goods_data(self, goods_id):
    """Fetch the raw data of one juanpi group-buy goods.

    Loads the mobile page through ``self.driver``, extracts the
    ``__PRELOADED_STATE__`` JSON, fetches the sku data from the
    ``getMemberAboutInfo`` endpoint and merges both.

    :param goods_id: juanpi goods id; an empty string is rejected.
    :return: the washed ``detail`` dict extended with ``skudata``,
        ``goods_id`` and ``parent_dir`` (also stored in
        ``self.result_data``), or ``self._data_error_init()`` on any failure
        or when the goods is off the shelves.
    """
    if goods_id == '':
        return self._data_error_init()

    tmp_url = 'https://web.juanpi.com/pintuan/shop/{}'.format(goods_id)
    print('------>>>| 得到的商品手机版的地址为: ', tmp_url)
    try:
        # Rendered through the driver (phantomjs); do not use a VPN/proxy
        # bypass while running it.
        body = self.driver.get_url_body(
            url=tmp_url,
            timeout=28,)
        # Explicit raises replace ``assert`` so the checks still run under
        # ``python -O``; the exception type printed below is unchanged.
        if body == '':
            raise AssertionError
        if '<span id="t-index">页面丢失ing</span>' in body:
            # "Page lost" placeholder: the goods no longer exists.
            raise GoodsShelvesException

        # Greedy match: capture the whole preloaded state.
        data = re.compile(r'__PRELOADED_STATE__ = (.*);</script> <style ').findall(body)
        if data == []:
            raise AssertionError('data为空list!')

        # Current sku endpoint (the older getOtherInfo one was dropped by
        # juanpi).
        skudata_url = 'https://webservice.juanpi.com/api/getMemberAboutInfo?goods_id=' + str(goods_id)
        headers = get_random_headers(upgrade_insecure_requests=False)
        headers.update({
            'Host': 'webservice.juanpi.com'
        })
        skudata_body = Requests.get_url_body(
            url=skudata_url,
            headers=headers,
            ip_pool_type=self.ip_pool_type,
            proxy_type=self.proxy_type,
            num_retries=self.req_num_retries,)
        if skudata_body == '':
            raise AssertionError('获取到的skudata_body为空str!请检查!')

        skudata = re.compile(r'(.*)').findall(skudata_body)
        if skudata == []:
            raise AssertionError('skudata为空!')

        skudata = json_2_dict(json_str=skudata[0]).get('skudata', {})
        if skudata == {}:
            raise AssertionError
        if skudata.get('info') is None:
            raise AssertionError('skudata中info的key为None, 返回空dict')

        main_data = json_2_dict(json_str=data[0])
        if main_data == {}:
            raise AssertionError

        goods_status = int(main_data.get('detail', {}).get('baseInfo', {}).get('status', '1'))
        if goods_status == 0:
            # Status 0: off the shelves, cannot be bought.
            raise GoodsShelvesException

        if main_data.get('detail') is None:
            raise AssertionError('data中detail的key为None, 返回空dict')

        main_data = self._wash_main_data(main_data.get('detail', {}))
        main_data['skudata'] = skudata
        main_data['goods_id'] = goods_id
        main_data['parent_dir'] = _jp_get_parent_dir(
            phantomjs=self.driver,
            goods_id=goods_id)
        self.result_data = main_data
        return main_data

    except GoodsShelvesException:
        _handle_goods_shelves_in_auto_goods_table(
            goods_id=goods_id,
        )
        return self._data_error_init()
    except Exception as e:
        print(e)
        return self._data_error_init()
def run_forever():
    """Continuously refresh every kaola goods row.

    One pass: open a per-day log file, select the rows with
    ``kl_select_str_1`` and, for each, re-crawl the goods with ``KaoLaParse``
    and write the change set back through ``sql_cli``. Off-shelf goods are
    handled separately. After a pass the function sleeps 5.5 hours when the
    Shanghai hour is 0, otherwise 60 seconds, and starts again. It never
    returns.
    """
    while True:
        # The logger is built inside the loop on purpose: a module-level
        # logger would keep writing to the same dated file forever.
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/网易考拉/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR)

        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=kl_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            kaola = KaoLaParse(logger=my_lg, is_real_times_update_call=True)
            for item in result:
                goods_id = item[1]
                if index % 5 == 0:
                    # Recreate the parser every 5 goods to release memory.
                    del kaola
                    kaola = KaoLaParse(logger=my_lg, is_real_times_update_call=True)
                    collect()

                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    logger=my_lg,
                    remainder=10,
                )
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(goods_id), str(index)))
                    db_goods_info_obj = KLDbGoodsInfoObj(item=item, logger=my_lg)
                    data = kaola._get_goods_data(goods_id=goods_id)
                    if data.get('is_delete', 0) == 1:
                        # Off-shelf goods reported by the crawl itself.
                        data['goods_id'] = goods_id
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=db_goods_info_obj.is_delete,
                            shelf_time=db_goods_info_obj.shelf_time,
                            delete_time=db_goods_info_obj.delete_time,
                        )
                        try:
                            kaola.to_right_and_update_data(data, pipeline=sql_cli)
                        except Exception:
                            # ``Logger.error`` requires a message; the old
                            # ``error(exc_info=True)`` raised TypeError here.
                            my_lg.error('遇到错误:', exc_info=True)
                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        index += 1
                        collect()
                        continue

                    data = kaola._deal_with_data()
                    if data != {}:
                        if data.get('is_delete', 0) == 1:
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            # Advance the counter like the off-shelf branch
                            # above; skipping it stalled the parser /
                            # connection refresh cadence.
                            index += 1
                            collect()
                            continue

                        data = get_goods_info_change_data(
                            target_short_name='kl',
                            logger=my_lg,
                            data=data,
                            db_goods_info_obj=db_goods_info_obj,
                        )
                        kaola.to_right_and_update_data(data, pipeline=sql_cli)
                    else:
                        # Empty dict: nothing usable was crawled.
                        my_lg.info('------>>>| 休眠3s中...')
                        sleep(3.)
                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)

                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates right after midnight (Shanghai time).
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        collect()
async def run_forever():
    """Run one refresh pass over the taobao daily-special (tiantiantejia) rows.

    Runs ``tb_delete_str_2``, selects the rows with ``tb_select_str_7`` and
    walks them: rows whose special-price end time has passed are shelved,
    the others are re-crawled with ``TaoBaoLoginAndParse`` and written back.

    :return: ``None`` when the initial DB access raises ``TypeError``,
        otherwise ``True`` after the pass and the trailing sleep.
    """
    # Built per call so that each day gets its own log file.
    log_path = MY_SPIDER_LOGS_PATH + '/淘宝/天天特价/' + str(get_shanghai_time())[0:10] + '.txt'
    lg = set_logger(
        logger_name=get_uuid1(),
        log_file_name=log_path,
        console_log_level=INFO,
        file_log_level=ERROR)
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        # Expired rows are cleaned here; this is the only place that does it.
        tmp_sql_server._delete_table(sql_str=tb_delete_str_2, params=None)
        result = list(tmp_sql_server._select_table(sql_str=tb_select_str_7))
    except TypeError:
        lg.error('TypeError错误, 导致原因: 数据库连接失败...(可能维护中)')
        return None

    await _print_db_old_data(
        result=result,
        logger=lg,
    )
    index = 1
    for item in result:
        goods_id, tejia_end_time = item[0], item[2]
        tmp_sql_server = await _get_new_db_conn(
            db_obj=tmp_sql_server,
            index=index,
            logger=lg,
            db_conn_type=1,
        )
        if not tmp_sql_server.is_connect_success:
            lg.error('数据库连接失败,数据库可能关闭或者维护中')
            collect()
            continue

        if tejia_end_time < get_shanghai_time():
            # Expired specials are shelved directly (not demoted to normal
            # promotion goods).
            lg.info('@@ 过期下架[goods_id: {}]'.format(goods_id))
            _handle_goods_shelves_in_auto_goods_table(
                goods_id=goods_id,
                logger=lg,
                update_sql_str=tb_update_str_5,
            )
            index += 1
            collect()
            continue

        # Still running: refresh the goods info only. Daily specials are not
        # pulled early by the platform, so the special-price window itself
        # is left untouched.
        lg.info(
            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
            % (goods_id, str(index)))
        taobao = TaoBaoLoginAndParse(
            logger=lg,
            is_real_times_update_call=is_real_times_update_call)
        taobao.get_goods_data(goods_id)
        goods_data = taobao.deal_with_data(goods_id=goods_id)
        # NOTE(review): nesting rebuilt from a whitespace-collapsed source;
        # the table update is assumed to run for every non-empty result and
        # the 4s sleep only for an empty one.
        if goods_data == {}:
            await async_sleep(4)
        else:
            goods_data['goods_id'] = goods_id
            if goods_data.get('is_delete', 0) == 1:
                lg.info('@该商品已下架...')
            await taobao.update_taobao_tiantiantejia_table(
                data=goods_data,
                pipeline=tmp_sql_server)

        await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
        index += 1
        collect()
        collect()

    lg.info('全部数据更新完毕'.center(100, '#'))
    # Longer pause right after midnight (Shanghai time).
    pause = 5 * 60 if get_shanghai_time().hour == 0 else 60 * 1
    await async_sleep(pause)
    collect()

    return True
async def _update_one_goods_info(self, item, index):
    """Update one jumeiyoupin flash-sale (miaosha) goods row.

    :param item: DB row; ``item[0]`` goods id, ``item[1]`` miaosha time
        payload, ``item[2]`` listing page, ``item[3]`` goods url.
    :param index: running counter; ``index + 1`` is stored in
        ``self.goods_index`` before returning.
    :return: ``[goods_id, res]`` on every path, where ``res`` is the result
        of the shelving / update call or ``False`` when nothing was done.
    """
    res = False
    goods_id = item[0]
    miaosha_time = item[1]
    page = item[2]
    goods_url = item[3]
    miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
        miaosha_time=miaosha_time,
        logger=self.lg,
    )
    await self._get_new_jumei_obj(index=index)
    self.sql_cli = await _get_new_db_conn(
        db_obj=self.sql_cli,
        index=index,
        logger=self.lg,
    )

    def _shelve():
        # Logically delete this goods row (shared by every off-shelf path).
        return _handle_goods_shelves_in_auto_goods_table(
            goods_id=goods_id,
            logger=self.lg,
            update_sql_str=jm_update_str_4,
            sql_cli=self.sql_cli,
        )

    if self.sql_cli.is_connect_success:
        is_recent_time_res = await self._is_recent_time(miaosha_end_time)
        if is_recent_time_res == 0:
            res = _shelve()
            self.lg.info('过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'.format(
                goods_id, timestamp_to_regulartime(miaosha_end_time)))
            await async_sleep(.3)

        elif is_recent_time_res == 2:
            if datetime_to_timestamp(get_shanghai_time()) > miaosha_end_time:
                res = _shelve()
                self.lg.info(
                    '过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'.format(
                        goods_id, timestamp_to_regulartime(miaosha_end_time)))

        else:
            # Any other value: inside the to-be-updated window.
            self.lg.info(
                '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                format(goods_id, index))
            this_page_all_goods_list = await self._get_one_page_all_goods_list(page)
            if isinstance(this_page_all_goods_list, str):
                self.lg.error('网络错误!先跳过')
                await async_sleep(1.5)
                # Same return shape and index bookkeeping as every other
                # path; this branch used to return the bare bool.
                self.goods_index = index + 1
                return [goods_id, res]

            elif this_page_all_goods_list == []:
                res = _shelve()
                self.lg.error(
                    '#### 该page对应得到的this_page_all_goods_list为空[]!')
                self.lg.error(
                    '** 该商品已被下架限时秒杀活动, 此处将其逻辑删除, goods_id:{}'.format(
                        goods_id))
                await async_sleep(.3)

            else:
                # Goods are not pulled early inside the sale window, so every
                # goods still within it is refreshed.
                tmp_r = self.jumeiyoupin_miaosha.get_goods_id_from_url(goods_url)
                self.jumeiyoupin_miaosha.get_goods_data(goods_id=tmp_r)
                goods_data = self.jumeiyoupin_miaosha.deal_with_data()
                if goods_data != {}:
                    goods_data['goods_id'] = goods_id
                    goods_data['miaosha_time'] = {
                        'miaosha_begin_time': goods_data['schedule'].get('begin_time', ''),
                        'miaosha_end_time': goods_data['schedule'].get('end_time', ''),
                    }
                    goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=goods_data['miaosha_time'])
                    res = self.jumeiyoupin_miaosha.update_jumeiyoupin_xianshimiaosha_table(
                        data=goods_data,
                        pipeline=self.sql_cli)
    else:
        self.lg.info('数据库连接失败,数据库可能关闭或者维护中')

    index += 1
    self.goods_index = index
    collect()
    await async_sleep(JUMEIYOUPIN_SLEEP_TIME)

    return [goods_id, res]
def run_forever(self):
    '''
    Run one refresh pass over the mogujie group-buy (pintuan) rows.

    Runs ``mg_delete_str_2``, selects the rows with ``mg_select_str_2`` and,
    for each row, either shelves it (expired, or its listing page returned
    no docs) or re-crawls it with ``MoGuJieParse`` and writes it back.
    Ends with a sleep of 5.5 hours when the Shanghai hour is 0, otherwise
    10 minutes. Returns nothing.

    NOTE(review): the nesting below was rebuilt from a whitespace-collapsed
    source; the placement of the two ``sleep(MOGUJIE_SLEEP_TIME)`` calls
    (inside the non-empty ``goods_data`` branches) should be confirmed.
    '''
    sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    try:
        sql_cli._delete_table(sql_str=mg_delete_str_2)
        result = list(sql_cli._select_table(sql_str=mg_select_str_2))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        _block_print_db_old_data(result=result)
        index = 1
        self.my_phantomjs = BaseDriver(
            executable_path=PHANTOMJS_DRIVER_PATH,
            ip_pool_type=self.ip_pool_type)
        for item in result:  # refresh row by row
            goods_id = item[0]
            # item[1] is a JSON string holding 'begin_time' / 'end_time'
            pintuan_end_time = json.loads(item[1]).get('end_time')
            # convert 'YYYY-mm-dd HH:MM:SS' to a 10-digit unix timestamp
            pintuan_end_time = int(
                str(
                    time.mktime(
                        time.strptime(pintuan_end_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            data = {}
            mogujie_pintuan = MoGuJieParse()
            if index % 8 == 0:
                # recreate the driver every 8 rows
                try:
                    del self.my_phantomjs
                except:
                    pass
                gc.collect()
                self.my_phantomjs = BaseDriver(
                    executable_path=PHANTOMJS_DRIVER_PATH,
                    ip_pool_type=self.ip_pool_type)
            sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                             index=index,
                                             remainder=50)
            if sql_cli.is_connect_success:
                if self.is_recent_time(pintuan_end_time) == 0:
                    # result 0: shelve the row
                    _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        update_sql_str=mg_update_str_5,
                        sql_cli=sql_cli,
                    )
                    print(
                        '过期的goods_id为(%s)' % goods_id,
                        ', 拼团开始时间为(%s), 逻辑删除成功!' %
                        json.loads(item[1]).get('begin_time'))
                    sleep(.3)
                elif self.is_recent_time(pintuan_end_time) == 2:
                    # deliberately `pass`, not `break`: rows returned by the
                    # DB are not guaranteed to be ordered
                    pass
                else:  # any other result: inside the to-be-updated window
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (goods_id, index))
                    data['goods_id'] = goods_id
                    # item[3] fills the `page` parameter, item[2] the `fcid`
                    tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                        item[3], item[2])
                    # plain requests cannot fetch this (certificate
                    # verification), so the driver is used instead
                    body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                        url=tmp_url)
                    if body == '':
                        print('获取到的body为空值! 此处跳过')
                    else:
                        try:
                            # the JSON payload is wrapped in a <pre> element
                            body = re.compile(
                                r'<pre.*?>(.*?)</pre>').findall(body)[0]
                            tmp_data = json.loads(body)
                        except:
                            print('json.loads转换body时出错, 请检查')
                            tmp_data = {}
                        if tmp_data.get('result',
                                        {}).get('wall',
                                                {}).get('docs', []) == []:
                            # no docs on the listing page (also the case
                            # after a parse failure above): shelve the row
                            print('得到的docs为[]!')
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=mg_update_str_5,
                                sql_cli=sql_cli,
                            )
                            sleep(.3)
                        else:
                            tmp_item_list = tmp_data.get('result', {}).get(
                                'wall', {}).get('docs', [])
                            # "now" is used as the group-buy begin timestamp
                            begin_time_timestamp = int(
                                time.time())
                            # the comprehension's `item` is scoped to the
                            # comprehension (Python 3) and does not clobber
                            # the loop variable
                            item_list = [{
                                'goods_id':
                                item.get('tradeItemId', ''),
                                'pintuan_time': {
                                    'begin_time':
                                    timestamp_to_regulartime(
                                        timestamp=begin_time_timestamp),
                                    'end_time':
                                    timestamp_to_regulartime(
                                        self.get_pintuan_end_time(
                                            begin_time_timestamp,
                                            item.get('leftTimeOrg', ''))),
                                },
                                'all_sell_count':
                                str(item.get('salesVolume', 0)),
                            } for item in tmp_item_list]
                            pintuan_goods_all_goods_id = [
                                item_1.get('goods_id', '')
                                for item_1 in item_list
                            ]
                            ''' 内部已经下架的(内部下架的其实并未真实下架,还在卖的,所以我就更新其商品信息数据,不更新上下架时间) '''
                            # (above) goods missing from the listing are
                            # still being sold, so only their goods info is
                            # refreshed, not the on/off-shelf times
                            if goods_id not in pintuan_goods_all_goods_id:
                                mogujie_pintuan.get_goods_data(
                                    goods_id=goods_id)
                                goods_data = mogujie_pintuan.deal_with_data(
                                )
                                if goods_data == {}:
                                    pass
                                else:
                                    # normalise before writing
                                    print('+++ 内部下架,其实还在售卖的商品更新')
                                    goods_data['goods_id'] = goods_id
                                    goods_data[
                                        'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                            goods_data['price_info_list'])
                                    mogujie_pintuan.update_mogujie_pintuan_table_2(
                                        data=goods_data, pipeline=sql_cli)
                                    sleep(MOGUJIE_SLEEP_TIME)  # throttle
                            else:  # still listed
                                for item_2 in item_list:
                                    if item_2.get('goods_id',
                                                  '') == goods_id:
                                        mogujie_pintuan.get_goods_data(
                                            goods_id=goods_id)
                                        goods_data = mogujie_pintuan.deal_with_data(
                                        )
                                        if goods_data == {}:
                                            pass
                                        else:
                                            # normalise before writing
                                            goods_data[
                                                'goods_id'] = goods_id
                                            goods_data[
                                                'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                                    goods_data[
                                                        'price_info_list'])
                                            goods_data[
                                                'pintuan_time'] = item_2.get(
                                                    'pintuan_time',
                                                    {})
                                            goods_data[
                                                'pintuan_begin_time'], goods_data[
                                                    'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                        miaosha_time=
                                                        goods_data[
                                                            'pintuan_time']
                                                    )
                                            goods_data[
                                                'all_sell_count'] = item_2.get(
                                                    'all_sell_count', '')
                                            mogujie_pintuan.update_mogujie_pintuan_table(
                                                data=goods_data,
                                                pipeline=sql_cli)
                                            sleep(
                                                MOGUJIE_SLEEP_TIME)  # throttle
                                    else:
                                        pass
            else:
                print('数据库连接失败,此处跳过!')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))
    if get_shanghai_time().hour == 0:  # no updates right after midnight
        sleep(60 * 60 * 5.5)
    else:
        sleep(10 * 60)
    gc.collect()
def deal_with_data(self):
    """Turn ``self.result_data`` into the flat dict that gets stored.

    :return: a dict with the keys ``shop_name``, ``account``, ``title``,
        ``sub_title``, ``price``, ``taobao_price``, ``detail_name_list``,
        ``price_info_list``, ``all_img_url``, ``p_info``, ``div_desc``,
        ``schedule``, ``is_delete``, ``parent_dir`` and ``all_sell_count``;
        ``{}`` when ``self.result_data`` is empty; ``self._data_error()``
        when processing fails or the goods is off the shelves.
    """
    raw = self.result_data
    goods_id = raw.get('goods_id', '')
    if raw == {}:
        print('待处理的data为空的dict, 该商品可能已经转移或者下架')
        self.result_data = {}
        return {}

    try:
        base_info = raw.get('/app/detail/product/base', {})
        shop_name = raw.get('shop_name', '')
        account = ''
        title = base_info.get('title', '')
        sub_title = ''

        # Per-spec price and stock entries.
        sku_items = raw.get('/app/detail/product/sku', {}).get('items')
        cache = self._get_detail_name_list_and_price_info_list_and_price_and_taobao_price(
            data=raw,
            tmp_price_info_list=sku_items)
        all_img_url = self._get_all_img_url(
            tmp_all_img_url=raw.get('/app/detail/product/base', {}).get('images', []))
        detail_name_list = cache[0]
        price_info_list = cache[1]
        price = cache[2]          # highest price
        taobao_price = cache[3]   # lowest price

        p_info = self._get_p_info(data=raw)
        div_desc = raw.get('/app/detail/graph/detail', '')
        is_delete = self._get_is_delete(price_info_list=price_info_list)
        schedule, is_delete = self._get_schedule(data=raw, is_delete=is_delete)
        parent_dir = str(raw.get('parent_dir', ''))
        all_sell_count = ''

        # Titles matching the contraband keyword tuple are forced off-shelf.
        is_contraband = target_str_contain_some_char_check(
            target_str=title,
            check_char_obj=CONTRABAND_GOODS_KEY_TUPLE)
        if is_contraband:
            print('违禁物品下架...')
            is_delete = 1

    except GoodsShelvesException:
        _handle_goods_shelves_in_auto_goods_table(goods_id=goods_id, )
        return self._data_error()

    except Exception as e:
        print('遇到错误:', e)
        return self._data_error()

    result = dict([
        ('shop_name', shop_name),                # shop name
        ('account', account),                    # shop keeper
        ('title', title),                        # goods title
        ('sub_title', sub_title),                # sub title
        ('price', price),                        # goods price
        ('taobao_price', taobao_price),          # lowest price
        ('detail_name_list', detail_name_list),  # spec attribute names
        ('price_info_list', price_info_list),    # per-spec price and stock
        ('all_img_url', all_img_url),            # all sample image urls
        ('p_info', p_info),                      # detail attribute pairs
        ('div_desc', div_desc),                  # description html
        ('schedule', schedule),                  # sale begin / end time
        ('is_delete', is_delete),                # off-shelf flag
        ('parent_dir', parent_dir),
        ('all_sell_count', all_sell_count),
    ])
    return result
def get_goods_data(self, goods_id):
    """Fetch and merge the raw zhe800 payloads for one goods.

    Three gateway endpoints are queried in turn (product, graph, status);
    their JSON-encoded sub-documents are decoded and merged into one dict.

    :param goods_id: zhe800 goods id; an empty string is rejected.
    :return: the merged dict (also stored in ``self.result_data``), or
        ``self._data_error()`` as soon as any step yields nothing usable.
    """
    if goods_id == '':
        return self._data_error()

    base_key = '/app/detail/product/base'
    profiles_key = '/app/detail/product/profiles'
    score_key = '/app/detail/product/score'
    sku_key = '/app/detail/product/sku'
    graph_key = '/app/detail/graph/detail'

    # ---- product endpoint -------------------------------------------------
    product_url = 'https://th5.m.zhe800.com/gateway/app/detail/product?productId=' + str(goods_id)
    body = Requests.get_url_body(
        url=product_url,
        headers=self.headers,
        ip_pool_type=self.ip_pool_type)
    data = json_2_dict(
        json_str=body,
        default_res={},
    )
    if body == '' or data == {}:
        return self._data_error()

    base = json_2_dict(json_str=data.get(base_key, ''), default_res={})

    profiles = json_2_dict(json_str=data.get(profiles_key, ''))
    if profiles == {}:
        print("json.loads转换出错,得到profiles值可能为空,此处跳过")
        profiles = ''

    score = json_2_dict(json_str=data.get(score_key, ''), default_res={})
    try:
        score.pop('contents')
    except:
        pass

    sku = json_2_dict(json_str=data.get(sku_key, ''), default_res={})

    data[base_key] = base
    data[profiles_key] = profiles
    data[score_key] = score
    data[sku_key] = sku

    try:
        # mobile page address
        phone_url = 'http://th5.m.zhe800.com/h5/shopdeal?id=' + str(base.get('dealId', ''))
    except AttributeError:
        # canJoinCart: None = lookup failed, False = off the shelves,
        # True = normal.
        can_join_cart = sku.get('canJoinCart')
        off_shelf = (
            can_join_cart is not None
            and isinstance(can_join_cart, bool)
            and not can_join_cart)
        if off_shelf:
            _handle_goods_shelves_in_auto_goods_table(
                goods_id=goods_id,
            )
            return self._data_error()

        print('获取手机版地址失败,此处跳过')
        return self._data_error()

    print('------>>>| 得到商品手机版地址为: ', phone_url)

    # ---- graph endpoint (rich-text description) ---------------------------
    graph_url = 'https://th5.m.zhe800.com/gateway/app/detail/graph?productId=' + str(goods_id)
    detail_data_body = Requests.get_url_body(
        url=graph_url,
        headers=self.headers,
        ip_pool_type=self.ip_pool_type)
    if detail_data_body == '':
        print('detail_data为[]!')
        return self._data_error()

    detail_data = json_2_dict(json_str=detail_data_body, default_res={})
    if detail_data == {}:
        print('json.loads(detail_data)时报错, 此处跳过')
        return self._data_error()

    detail = json_2_dict(
        json_str=detail_data.get(graph_key, ''),
        default_res={})
    try:
        detail.pop('small')
    except:
        pass

    tmp_div_desc = self._get_div_desc(detail=detail, goods_id=goods_id)
    if tmp_div_desc == '':
        return self._data_error()
    data[graph_key] = tmp_div_desc

    # ---- shop name --------------------------------------------------------
    shop_name = self._get_shop_name(data=data)
    if isinstance(shop_name, dict) and shop_name == {}:
        return self._data_error()
    data['shop_name'] = shop_name

    # ---- status endpoint (sale schedule and stock) ------------------------
    status_url = 'https://th5.m.zhe800.com/gateway/app/detail/status?productId=' + str(goods_id)
    status_body = Requests.get_url_body(
        url=status_url,
        headers=self.headers,
        high_conceal=True,
        ip_pool_type=self.ip_pool_type)
    if status_body == '':
        print('schedule_and_stock_info为空!')
        return self._data_error()

    status_info = json_2_dict(json_str=status_body)
    if status_info == {}:
        print('得到秒杀开始时间和结束时间时错误, 此处跳过')
        return self._data_error()

    schedule = json_2_dict(
        json_str=status_info.get('/app/detail/status/schedule', None),
        default_res={})
    stock = json_2_dict(
        json_str=status_info.get('/app/detail/status/stock', None),
        default_res={})

    data['schedule'] = schedule
    data['stock'] = stock
    data['parent_dir'] = _z8_get_parent_dir(goods_id)
    data['goods_id'] = goods_id
    self.result_data = data

    return data
def get_goods_data(self, goods_id):
    """Fetch and pre-process the raw tmall payload for one goods.

    :param goods_id: a two-item sequence ``[type, goods_id]``; an empty list
        is rejected.
    :return: the cleaned ``data['data']`` dict extended with ``trade``,
        ``type`` and ``goods_id`` (also stored in ``self.result_data``);
        the result of ``self.init_pull_off_shelves_goods(type)`` when the
        goods has been taken off the shelves; ``self._data_error_init()``
        on any other failure.
    """
    if goods_id == []:
        return self._data_error_init()

    # ``goods_type`` rather than ``type`` so the builtin is not shadowed.
    goods_type = goods_id[0]    # tmall type
    goods_id = goods_id[1]      # tmall goods id
    tmp_url = 'https://detail.m.tmall.com/item.htm?id=' + str(goods_id)
    self.headers.update({'Referer': tmp_url})
    last_url = self._get_last_url(goods_id=goods_id)
    body = Requests.get_url_body(
        url=last_url,
        headers=self.headers,
        timeout=14,
        ip_pool_type=self.ip_pool_type,
        proxy_type=self.proxy_type,
        num_retries=self.req_num_retries,
    )
    try:
        # Explicit raises instead of ``assert`` so the checks still run
        # under ``python -O``.
        if body == '':
            raise AssertionError(
                '获取到的body为空值, 此处跳过! 出错type %s: , goods_id: %s' % (
                    str(goods_type), goods_id))
        # Raw string: the old non-raw pattern relied on invalid ``\(``
        # escape sequences.
        data = json_2_dict(
            json_str=re.compile(r'mtopjsonp3\((.*)\)').findall(body)[0],
            default_res={},
            logger=self.lg)
        if data == {}:
            raise AssertionError(
                'data为空dict, 出错type: {}, goods_id: {}'.format(
                    str(goods_type), str(goods_id)))
        if data.get('data', {}).get('trade', {}).get('redirectUrl', '') != '' \
                and data.get('data', {}).get('seller', {}).get('evaluates') is None:
            raise GoodsShelvesException

    except GoodsShelvesException:
        # Off the shelves: the original address is redirected elsewhere.
        self.lg.info('@@@@@@ 该商品已经下架...')
        _handle_goods_shelves_in_auto_goods_table(
            goods_id=goods_id,
            logger=self.lg)
        tmp_data_s = self.init_pull_off_shelves_goods(goods_type)
        self.result_data = {}
        return tmp_data_s

    except (AssertionError, IndexError):
        self.lg.error('遇到错误:', exc_info=True)
        return self._data_error_init()

    # Goods moved or removed, so the page no longer exists.
    if data.get('data', {}).get('seller', {}).get('evaluates') is None:
        self.lg.error(
            'data为空, 地址被重定向, 该商品可能已经被转移或下架, 出错type: {}, goods_id: {}'.
            format(goods_type, goods_id))
        return self._data_error_init()

    # Blank out bulky sections that are not used downstream.
    data['data']['rate'] = ''                   # item reviews
    data['data']['resource'] = ''               # buyer questions
    data['data']['vertical'] = ''               # questions and answers
    data['data']['seller']['evaluates'] = ''    # description/service/logistics scores

    result_data = data['data']

    api_stack = result_data.get('apiStack', [])
    if not api_stack:
        # Previously an empty apiStack raised an uncaught IndexError.
        self.lg.error(
            'apiStack为空, 出错type: {}, goods_id: {}'.format(
                str(goods_type), goods_id))
        return self._data_error_init()

    # Wash apiStack[0]['value'] and store the washed value back in place.
    result_data_apiStack_value = api_stack[0].get('value', {})
    result_data['apiStack'][0]['value'] = self._wash_result_data_apiStack_value(
        goods_id=goods_id,
        result_data_apiStack_value=result_data_apiStack_value)

    # mockData arrives JSON-encoded.
    mock_data = result_data['mockData']
    mock_data = json_2_dict(json_str=mock_data, logger=self.lg)
    if mock_data == {}:
        self.lg.error('出错type: {0}, goods_id: {1}'.format(goods_type, goods_id))
        return self._data_error_init()
    mock_data['feature'] = ''
    result_data['mockData'] = mock_data

    # The stack entry may be {'name': 'esi', 'value': ''}.
    if result_data.get('apiStack', [])[0].get('value', '') == '':
        self.lg.error(
            "result_data.get('apiStack', [])[0].get('value', '')的值为空....出错type: {}, goods_id: {}"
            .format(str(goods_type), goods_id))
        result_data['trade'] = {}
        return self._data_error_init()

    # ``trade`` is what later decides whether the goods is off the shelves.
    result_data['trade'] = result_data.get('apiStack', [])[0].get(
        'value', {}).get('trade', {})

    result_data['type'] = goods_type
    result_data['goods_id'] = goods_id
    self.result_data = result_data

    return result_data
async def _update_one_goods_info(self, item, index) -> tuple:
    '''
    Refresh one flash-sale goods record.

    :param item: row whose first four fields are read as
        (goods_id, miaosha_time, tab_id, page)
    :param index: running index of this row; ``self.goods_index`` is set to
        ``index + 1`` on every exit path
    :return: ``(goods_id, res)`` where ``res`` is the result of the
        delete/update helper, or ``False`` when nothing was done
    '''
    res = False
    goods_id, miaosha_time, tab_id, page = item[0], item[1], item[2], item[3]
    next_index = index + 1
    expired_tpl = '过期的goods_id为({}), 限时秒杀开始时间为({}), 逻辑删除成功!'

    miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
        miaosha_time=miaosha_time,
        logger=self.lg,
    )
    await self._get_new_jp_obj(index=index)
    self.tmp_sql_server = await _get_new_db_conn(
        db_obj=self.tmp_sql_server,
        index=index,
        logger=self.lg,
        remainder=30)

    if not self.tmp_sql_server.is_connect_success:
        # No usable DB connection: report and fall out with res=False.
        self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
        self.goods_index = next_index
        await async_sleep(1.2)
        return goods_id, res

    recent_flag = await self._is_recent_time(miaosha_begin_time)

    if recent_flag == 0:
        res = _handle_goods_shelves_in_auto_goods_table(
            goods_id=goods_id,
            logger=self.lg,
            update_sql_str=jp_update_str_6,
            sql_cli=self.tmp_sql_server,
        )
        self.lg.info(expired_tpl.format(
            goods_id, timestamp_to_regulartime(miaosha_begin_time)))
        await async_sleep(.3)
        self.goods_index = next_index
        return goods_id, res

    if recent_flag == 2:
        now_ts = datetime_to_timestamp(get_shanghai_time())
        if now_ts > miaosha_end_time:
            res = _handle_goods_shelves_in_auto_goods_table(
                goods_id=goods_id,
                logger=self.lg,
                update_sql_str=jp_update_str_6,
                sql_cli=self.tmp_sql_server,
            )
            self.lg.info(expired_tpl.format(
                goods_id, timestamp_to_regulartime(miaosha_begin_time)))
        else:
            self.lg.info('goods_id: {}, 未来时间跳过更新...'.format(goods_id))
        self.goods_index = next_index
        return goods_id, res

    # Any other flag: the item is inside the window that should be updated.
    self.lg.info(
        '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
        format(goods_id, index))
    tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
        str(tab_id),
        str(page),
    )
    body = Requests.get_url_body(url=tmp_url,
                                 headers=await self._get_pc_headers(),
                                 ip_pool_type=self.ip_pool_type)
    try:
        data = json_2_dict(body, default_res={}).get('data', {})
        assert data != {}, 'data为空dict!'
        data = data.get('goodslist', [])
        assert data != [], 'tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(
            tab_id, page)
    except AssertionError:
        self.lg.error(msg='遇到错误:', exc_info=True)
        self.goods_index = next_index
        await async_sleep(.3)
        return goods_id, res

    miaosha_goods_list = await self._get_miaoshao_goods_info_list(data=data)
    # Every goods_id currently listed under this tab_id/page.
    listed_ids = [entry.get('goods_id') for entry in miaosha_goods_list]
    self.lg.info(str(listed_ids))

    if goods_id in listed_ids:
        # Still listed: run the regular update, then the common tail.
        res = await self._one_update(
            miaosha_goods_list=miaosha_goods_list,
            goods_id=goods_id)
        self.goods_index = next_index
        await async_sleep(1.2)
        return goods_id, res

    if listed_ids != []:
        # A non-empty listing without this id is treated as "not removed".
        self.lg.info('该商品[{}]未下架, 此处不进行更新跳过!!'.format(goods_id))
    else:
        res = _handle_goods_shelves_in_auto_goods_table(
            goods_id=goods_id,
            logger=self.lg,
            update_sql_str=jp_update_str_6,
            sql_cli=self.tmp_sql_server,
        )
        self.lg.info(
            '该商品[goods_id为({})]已被下架限时秒杀活动,此处将其逻辑删除'.format(
                goods_id))
    self.goods_index = next_index
    await async_sleep(.3)
    return goods_id, res
async def _update_old_goods_info(self, tmp_sql_server, result): ''' 更新old goods 数据 :param result: :return: ''' index = 1 for item in result: # 实时更新数据 _goods_id = item[0] miaosha_time = item[1] miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time( miaosha_time=miaosha_time, logger=self.lg, ) tmall = TmallParse(logger=self.lg) tmp_sql_server = await _get_new_db_conn( db_obj=tmp_sql_server, index=index, logger=self.lg, remainder=20, ) if tmp_sql_server.is_connect_success: if await self.is_recent_time(miaosha_begin_time) == 0: _handle_goods_shelves_in_auto_goods_table( goods_id=_goods_id, logger=self.lg, update_sql_str=tb_update_str_4, sql_cli=tmp_sql_server, ) self.lg.info('过期的goods_id为(%s)' % _goods_id + ', 限时秒杀开始时间为(%s), 删除成功!' % miaosha_begin_time) await async_sleep(.3) else: # 返回1, 表示在待更新的区间内 self.lg.info( '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (_goods_id, str(index))) '''NOTICE: 由于都是当天数据, 此处不更新上下架时间,就更新商品数据''' goods_id = tmall.get_goods_id_from_url(item[2]) tmall.get_goods_data(goods_id=goods_id) goods_data = tmall.deal_with_data() if goods_data != {}: # self.lg.info(str(item)) goods_data['goods_id'] = _goods_id await tmall._update_taoqianggou_xianshimiaosha_table( data=goods_data, pipeline=tmp_sql_server) await async_sleep(TMALL_REAL_TIMES_SLEEP_TIME) else: await async_sleep(5) index += 1 try: del tmall except: pass collect() return
def run_forever(self):
    '''
    Run one full pass over the stored group-buy rows, then sleep.

    Each row is read as (goods_id, pintuan_time_json, pid). Depending on
    what ``self.is_recent_time`` returns for the group-buy end time, the row
    is logically deleted (0), left untouched (2) or refreshed (anything
    else).

    :return: None
    '''
    result = self._get_db_old_data()
    index = 1
    for item in result:  # refresh every stored row
        goods_id = item[0]
        pid = item[2]
        # end_time is a string such as '2020-04-12 00:00:00'
        pintuan_end_time = json_2_dict(item[1]).get('end_time')
        pintuan_end_time = datetime_to_timestamp(
            string_to_datetime(pintuan_end_time))
        data = {}
        self.sql_cli = _block_get_new_db_conn(db_obj=self.sql_cli,
                                              index=index,
                                              remainder=50)
        if self.sql_cli.is_connect_success:
            is_recent_time = self.is_recent_time(pintuan_end_time)
            if is_recent_time == 0:
                # Price has been restored: retire the row.
                _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    update_sql_str=mia_update_str_7,
                    sql_cli=self.sql_cli)
                print('该goods拼团开始时间为({})'.format(
                    json.loads(item[1]).get('begin_time')))
                sleep(.4)

            elif is_recent_time == 2:
                # Expired but still in the waiting period: do not delete yet
                # (original note: delete once it is within <= 24 hours).
                pass

            else:
                # Inside the window that should be refreshed.
                print(
                    '------>>>| 正在更新的goods_id为({}) | --------->>>@ 索引值为({})'
                    .format(goods_id, index))
                data['goods_id'] = goods_id
                try:
                    data_list = get_mia_pintuan_one_page_api_goods_info(
                        page_num=pid)
                except ResponseBodyIsNullStrException:
                    # This path advances index itself because `continue`
                    # skips the increment at the bottom of the loop.
                    index += 1
                    sleep(.4)
                    continue

                # TODO (from the original author): an "empty data_list means
                # off the shelves" check used to live here; it was disabled
                # because it wrongly retired goods that were still on sale,
                # so every row is refreshed unconditionally.

                pintuan_goods_all_goods_id = [
                    item_1.get('goods_id', '') for item_1 in data_list
                ]
                ''' 蜜芽拼团不对内部下架的进行操作,一律都更新未过期商品 (根据pid来进行更新多次研究发现出现商品还在拼团,误删的情况很普遍) '''
                # The bare string above is the author's note: goods missing
                # from the page listing are still refreshed, never deleted,
                # because deleting by pid produced many false removals.
                mia_pt = MiaPintuanParse()
                if goods_id not in pintuan_goods_all_goods_id:
                    # Not in the page listing: refresh anyway.
                    try:
                        goods_data = self._get_mia_pt_one_goods_info(
                            mia_pt_obj=mia_pt,
                            goods_id=goods_id,
                        )
                    except AssertionError:
                        # Empty data returned: skip this row.
                        index += 1
                        continue

                    mia_pt.update_mia_pintuan_table(data=goods_data,
                                                    pipeline=self.sql_cli)
                    sleep(MIA_SPIKE_SLEEP_TIME)  # throttle

                else:
                    # Still listed: pick up the sub_title from the listing.
                    for item_2 in data_list:
                        if item_2.get('goods_id', '') == goods_id:
                            sub_title = item_2.get('sub_title', '')
                            try:
                                goods_data = self._get_mia_pt_one_goods_info(
                                    mia_pt_obj=mia_pt,
                                    goods_id=goods_id,
                                    sub_title=sub_title,
                                )
                            except AssertionError:
                                # Empty data returned: try the next entry.
                                continue

                            mia_pt.update_mia_pintuan_table(
                                data=goods_data,
                                pipeline=self.sql_cli)
                            sleep(MIA_SPIKE_SLEEP_TIME)  # throttle
                        else:
                            pass

                try:
                    del mia_pt
                except:
                    pass

        else:
            # DB connection could not be established.
            print('数据库连接失败,数据库可能关闭或者维护中')
            pass

        index += 1
        collect()

    print('全部数据更新完毕'.center(100, '#'))
    if get_shanghai_time().hour == 0:
        # No updates right after midnight: sleep 5.5 hours.
        sleep(60 * 60 * 5.5)
    else:
        sleep(10 * 60)
    collect()
def get_goods_data(self, goods_id):
    '''
    Fetch the Tmall detail payload for one goods item and normalise it.

    The primary API is tried first. If it answers with a login redirect or
    an empty dict, ``get_tm_m_body_data`` is used as a fallback; its result
    is used without the apiStack/mockData decoding steps.

    :param goods_id: two-item sequence ``[tm_type, goods_id]``; an empty
        list is rejected immediately
    :return: the cleaned ``data['data']`` dict (also stored on
        ``self.result_data``); the value of ``init_pull_off_shelves_goods``
        when the item was pulled off the shelves; ``_data_error_init()``
        on any failure
    '''
    if goods_id == []:
        return self._data_error_init()

    tm_type = goods_id[0]
    goods_id = goods_id[1]
    phone_url = 'https://detail.m.tmall.com/item.htm?id=' + str(goods_id)

    # True once the fallback fetch has been used.
    used_fallback = False
    headers = get_random_headers(
        upgrade_insecure_requests=False,
        cache_control='',
    )
    headers.update({
        'Referer': phone_url,
    })
    last_url = self._get_last_url(goods_id=goods_id)
    body = Requests.get_url_body(
        url=last_url,
        headers=headers,
        timeout=self.req_timeout,
        ip_pool_type=self.ip_pool_type,
        proxy_type=self.proxy_type,
        num_retries=self.req_num_retries,
    )

    try:
        assert body != '', '获取到的body为空值, 此处跳过!'
        # Raw string: '\(' is an invalid escape in a normal string literal.
        data = json_2_dict(
            json_str=re.compile(r'\((.*)\)').findall(body)[0],
            default_res={},
            logger=self.lg)
        try:
            if 'login.m.taobao.com' in data.get('data', {}).get('url', ''):
                # Primary API demanded a login.
                raise AssertionError('被重定向到login_url...')
            assert data != {}, 'data为空dict!'
        except AssertionError:
            self.lg.info(
                'trying second method to get data[where goods_id: {}] ...'.
                format(goods_id))
            used_fallback = True
            data = get_tm_m_body_data(goods_id=goods_id,
                                      proxy_type=self.proxy_type,
                                      num_retries=self.req_num_retries,
                                      logger=self.lg)

        if data.get('data', {}).get('trade', {}).get('redirectUrl', '') != '' \
                and data.get('data', {}).get('seller', {}).get('evaluates') is None:
            raise GoodsShelvesException
    except GoodsShelvesException:
        # The item is off the shelves: the original address was redirected.
        _handle_goods_shelves_in_auto_goods_table(goods_id=goods_id,
                                                  logger=self.lg)
        tmp_data_s = self.init_pull_off_shelves_goods(tm_type=tm_type)
        self.result_data = {}
        return tmp_data_s
    except (AssertionError, IndexError):
        self.lg.error(msg='遇到错误[出错tm_type: {}, goods_id: {}]:'.format(
            tm_type,
            goods_id,
        ), exc_info=True)
        return self._data_error_init()

    # Page no longer exists (item moved or removed).
    if data.get('data', {}).get('seller', {}).get('evaluates') is None:
        self.lg.error(
            'data为空, 地址被重定向, 该商品可能已经被转移或下架, 出错tm_type: {}, goods_id: {}'.
            format(tm_type, goods_id))
        return self._data_error_init()

    # Blank out the review / Q&A / seller-rating sections.
    data['data']['rate'] = ''
    data['data']['resource'] = ''
    data['data']['vertical'] = ''
    data['data']['seller']['evaluates'] = ''

    result_data = data['data']

    # Guard the [0] lookups below: an empty/missing apiStack used to raise
    # an uncaught IndexError here.
    api_stack = result_data.get('apiStack') or []
    if not api_stack:
        self.lg.error('apiStack为空, 出错tm_type: {}, goods_id: {}'.format(
            tm_type, goods_id))
        return self._data_error_init()
    first_stack = api_stack[0]
    raw_stack_value = first_stack.get('value', {})

    mock_data = result_data['mockData']
    if not used_fallback:
        # Primary payload only: clean apiStack[0]['value'] in place and
        # decode the mockData JSON string.
        first_stack['value'] = self._wash_result_data_apiStack_value(
            goods_id=goods_id,
            result_data_apiStack_value=raw_stack_value)
        mock_data = json_2_dict(json_str=mock_data, logger=self.lg)

    if mock_data == {}:
        self.lg.error('出错tm_type: {0}, goods_id: {1}'.format(
            tm_type, goods_id))
        return self._data_error_init()
    mock_data['feature'] = ''
    result_data['mockData'] = mock_data

    # The entry may look like {'name': 'esi', 'value': ''}.
    if first_stack.get('value', '') == '':
        self.lg.error(
            "result_data.get('apiStack', [])[0].get('value', '')的值为空....出错tm_type: {}, goods_id: {}"
            .format(tm_type, goods_id))
        result_data['trade'] = {}
        return self._data_error_init()

    # 'trade' is what later code uses to decide whether the item is off sale.
    result_data['trade'] = first_stack.get('value', {}).get('trade', {})
    result_data['type'] = tm_type
    result_data['goods_id'] = goods_id
    self.result_data = result_data
    return result_data
def get_goods_data(self, goods_id):
    '''
    Fetch the Taobao detail payload for one goods item and normalise it.

    :param goods_id: goods id used to build the request url
    :return: the cleaned ``data['data']`` dict (also stored on
        ``self.result_data``); the value of ``init_pull_off_shelves_goods``
        when the item was pulled off the shelves; ``_data_error_init()``
        on any failure
    '''
    phone_url = 'https://h5.m.taobao.com/awp/core/detail.htm?id={}'.format(goods_id)
    # The template previously kept its '{}' unfilled and phone_url unused.
    self.msg = '------>>>| phone_url: {}'.format(phone_url)

    # Body of the main detail API.
    last_url = self._get_last_url(goods_id=goods_id)
    body = Requests.get_url_body(
        url=last_url,
        headers=self.headers,
        params=None,
        timeout=14,
        ip_pool_type=self.ip_pool_type,
        proxy_type=self.proxy_type,
        num_retries=self.req_num_retries,)

    try:
        data = json_2_dict(
            json_str=re.compile(r'\((.*)\)').findall(body)[0],
            default_res={},
            logger=self.lg)
        assert data != {}, '获取到的data为空dict!'
        if data.get('data', {}).get('trade', {}).get('redirectUrl', '') != '' \
                and data.get('data', {}).get('seller', {}).get('evaluates') is None:
            raise GoodsShelvesException
    except GoodsShelvesException:
        # The item is off the shelves: the original address was redirected.
        _handle_goods_shelves_in_auto_goods_table(goods_id=goods_id,
                                                  logger=self.lg)
        tmp_data_s = self.init_pull_off_shelves_goods()
        self.result_data = {}
        return tmp_data_s
    except (IndexError, AssertionError):
        self.lg.error('data为空! 出错goods_id: {0}'.format(goods_id),
                      exc_info=True)
        return self._data_error_init()

    # Page no longer exists (item moved or removed). The {} default keeps a
    # payload without a 'data' key from raising AttributeError on None.
    if data.get('data', {}).get('seller', {}).get('evaluates') is None:
        self.lg.info('data为空, 地址被重定向, 该商品可能已经被转移或下架')
        return self._data_error_init()

    data = self._wash_tb_origin_data(data=data)
    result_data = data['data']

    # Guard the [0] lookups below: an empty/missing apiStack used to raise
    # an uncaught IndexError here.
    api_stack = result_data.get('apiStack') or []
    if not api_stack:
        self.lg.error('apiStack为空, 出错goods_id: {0}'.format(goods_id))
        return self._data_error_init()
    first_stack = api_stack[0]

    # Clean apiStack[0]['value'] and store it back in place.
    first_stack['value'] = self._wash_result_data_apiStack_value(
        goods_id=goods_id,
        result_data_apiStack_value=first_stack.get('value', {}))

    # mockData arrives as a JSON string; decode it.
    mock_data = json_2_dict(
        json_str=result_data['mockData'],
        logger=self.lg,)
    if mock_data == {}:
        self.lg.error('出错goods_id: {0}'.format(goods_id))
        return self._data_error_init()
    mock_data['feature'] = ''
    result_data['mockData'] = mock_data

    # The entry may look like {'name': 'esi', 'value': ''}.
    if first_stack.get('value', '') == '':
        self.lg.info("result_data.get('apiStack', [])[0].get('value', '')的值为空....")
        result_data['trade'] = {}
        return self._data_error_init()

    # 'trade' is what later code uses to decide whether the item is off sale.
    result_data['trade'] = first_stack.get('value', {}).get('trade', {})
    self.result_data = result_data
    return result_data
async def _update_one_goods_info(self, item, index) -> tuple:
    '''
    Refresh one seckill goods record.

    :param item: row whose first three fields are read as
        (goods_id, miaosha_time, pid)
    :param index: running index of this row; ``self.goods_index`` is set to
        ``index + 1`` on every exit path
    :return: ``(goods_id, res)`` where ``res`` is the result of the
        delete/update helper, or ``False`` when nothing was done
    '''
    res = False
    goods_id, miaosha_time, pid = item[0], item[1], item[2]
    next_index = index + 1
    expired_tpl = '过期的goods_id为({}), 限时秒杀开始时间为({}), 删除成功!'

    miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
        miaosha_time=miaosha_time,
        logger=self.lg,
    )
    await self._get_new_mia_obj(index)
    self.tmp_sql_server = await _get_new_db_conn(
        db_obj=self.tmp_sql_server,
        index=index,
        logger=self.lg,
        remainder=30,
    )

    if not self.tmp_sql_server.is_connect_success:
        # No usable DB connection: report, then run the common tail.
        self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
        await async_sleep(MIA_SPIKE_SLEEP_TIME)
        self.goods_index = next_index
        collect()
        return goods_id, res

    recent_flag = await self._is_recent_time(miaosha_end_time)

    if recent_flag == 0:
        res = _handle_goods_shelves_in_auto_goods_table(
            goods_id=goods_id,
            logger=self.lg,
            update_sql_str=mia_update_str_6,
            sql_cli=self.tmp_sql_server,
        )
        self.lg.info(expired_tpl.format(
            goods_id, timestamp_to_regulartime(miaosha_begin_time)))
        await async_sleep(.5)
        self.goods_index = next_index
        return goods_id, res

    if recent_flag == 2:
        now_ts = datetime_to_timestamp(get_shanghai_time())
        if now_ts > miaosha_end_time:
            res = _handle_goods_shelves_in_auto_goods_table(
                goods_id=goods_id,
                logger=self.lg,
                update_sql_str=mia_update_str_6,
                sql_cli=self.tmp_sql_server,
            )
            self.lg.info(expired_tpl.format(
                goods_id, timestamp_to_regulartime(miaosha_begin_time)))
        self.goods_index = next_index
        return goods_id, res

    # Any other flag: the item is inside the window that should be updated.
    self.lg.info(
        '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
        format(goods_id, index))
    tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(
        pid)
    body = Requests.get_url_body(url=tmp_url,
                                 headers=self.headers,
                                 had_referer=True,
                                 ip_pool_type=self.ip_pool_type)
    # A literal '[]' body is treated the same as an empty one.
    if body == '' or body == '[]':
        body = ''
    try:
        tmp_data = json_2_dict(
            json_str=body,
            default_res={},
            logger=self.lg,
        )
        assert tmp_data != {}, 'tmp_data为空dict!'
    except AssertionError:
        self.lg.error('遇到错误:', exc_info=True)
        self.goods_index = next_index
        await async_sleep(.3)
        return goods_id, res

    item_list = tmp_data.get('item_list', [])
    # Every item id currently listed under this pid.
    listed_ids = [entry.get('item_id', '') for entry in item_list]

    if goods_id not in listed_ids:
        # No longer part of the seckill listing: retire it.
        self.lg.info('该商品已被下架限时秒杀活动,此处将其删除')
        res = _handle_goods_shelves_in_auto_goods_table(
            goods_id=goods_id,
            logger=self.lg,
            update_sql_str=mia_update_str_6,
            sql_cli=self.tmp_sql_server,
        )
        self.lg.info('下架的goods_id为({}), 删除成功!'.format(goods_id))
        self.goods_index = next_index
        await async_sleep(.3)
        return goods_id, res

    # Still listed: run the regular update.
    res = await self._one_update(
        item_list=item_list,
        goods_id=goods_id,
        tmp_data=tmp_data,
    )
    await async_sleep(MIA_SPIKE_SLEEP_TIME)  # throttle
    self.goods_index = next_index
    collect()
    return goods_id, res
def run_forever():
    '''
    Endlessly refresh the stored Youpin goods rows.

    Each outer iteration creates a dated logger and a DB pipeline, reads the
    rows selected by ``yp_select_str_1``, refreshes them one by one, then
    sleeps before the next pass.

    :return: never returns
    '''
    while True:
        # The logger must be created inside the loop (not as a global), or
        # every day would keep being written to the same dated file.
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/小米有品/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR)

        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=yp_select_str_1))
        except TypeError:
            # list(None): the select returned nothing usable.
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            yp = YouPinParse(logger=my_lg)
            for item in result:
                goods_id = item[1]
                if index % 5 == 0:
                    # Recreate the parser every 5 rows to release memory.
                    del yp
                    yp = YouPinParse(logger=my_lg)
                    collect()

                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 logger=my_lg,
                                                 remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(goods_id), str(index)))
                    yp._get_target_data(goods_id=goods_id)
                    data = yp._handle_target_data()
                    db_goods_info_obj = YPDbGoodsInfoObj(item=item,
                                                         logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:
                            # Off-shelf goods are handled separately.
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            # Advance the counter before skipping the loop
                            # tail; otherwise the logged index and the
                            # parser/connection refresh cadence stall.
                            index += 1
                            continue
                        else:
                            data = get_goods_info_change_data(
                                target_short_name='yp',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,
                            )
                            yp._to_right_and_update_data(data,
                                                         pipeline=sql_cli)
                    else:
                        # Parser returned an empty dict.
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)
                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)

                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates right after midnight: sleep 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        collect()
async def _update_one_goods_info(self, item, index):
    '''
    Refresh one flash-sale goods record.

    :param item: row whose first four fields are read as
        (goods_id, miaosha_time, gender, page)
    :param index: running index of this row; ``self.goods_index`` is set to
        ``index + 1`` on every exit path
    :return: ``(goods_id, res)`` where ``res`` is the result of the
        delete/update helper, or ``False`` when nothing was done
    '''
    res = False
    goods_id, miaosha_time, gender, page = item[0], item[1], item[2], item[3]
    next_index = index + 1
    expired_tpl = '过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'

    miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
        miaosha_time=miaosha_time,
        logger=self.lg,
    )
    await self._get_new_cc_obj(index=index)
    self.sql_cli = await _get_new_db_conn(
        db_obj=self.sql_cli,
        index=index,
        logger=self.lg,
        remainder=25,
    )

    if not self.sql_cli.is_connect_success:
        # No usable DB connection: report, then run the common tail.
        self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
        self.goods_index = next_index
        collect()
        await async_sleep(CHUCHUJIE_SLEEP_TIME)
        return goods_id, res

    recent_flag = await self._is_recent_time(miaosha_end_time)

    if recent_flag == 0:
        res = _handle_goods_shelves_in_auto_goods_table(
            goods_id=goods_id,
            logger=self.lg,
            update_sql_str=cc_update_str_2,
            sql_cli=self.sql_cli,
        )
        self.lg.info(expired_tpl.format(
            goods_id, timestamp_to_regulartime(miaosha_end_time)))
        await async_sleep(.3)
        self.goods_index = next_index
        return goods_id, res

    if recent_flag == 2:
        now_ts = datetime_to_timestamp(get_shanghai_time())
        if now_ts > miaosha_end_time:
            res = _handle_goods_shelves_in_auto_goods_table(
                goods_id=goods_id,
                logger=self.lg,
                update_sql_str=cc_update_str_2,
                sql_cli=self.sql_cli,
            )
            self.lg.info(expired_tpl.format(
                goods_id, timestamp_to_regulartime(miaosha_end_time)))
        self.goods_index = next_index
        return goods_id, res

    # Any other flag: the item is inside the window that should be updated.
    self.lg.info(
        '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
        format(goods_id, index))
    body = await self._get_one_page_goods_info(gender, page)
    if body == '':
        self.goods_index = next_index
        await async_sleep(.3)
        return goods_id, res

    json_body = json_2_dict(body, default_res={})
    try:
        group_list = json_body.get('data', {}).get('groupList', [])
        this_page_total_count = group_list[0].get('totalCount', 0)
    except IndexError:
        self.lg.error('获取this_page_total_count时出错, 请检查!')
        this_page_total_count = 0

    item_list = await self._get_item_list(
        this_page_total_count=this_page_total_count,
        json_body=json_body)

    if item_list == []:
        # Nothing listed for this gender/page: retire the row.
        self.lg.info(
            '#### 该gender, page对应得到的item_list为空[]!\n该商品已被下架限时秒杀活动,此处将其删除'
        )
        res = _handle_goods_shelves_in_auto_goods_table(
            goods_id=item[0],
            logger=self.lg,
            update_sql_str=cc_update_str_2,
            sql_cli=self.sql_cli,
        )
        self.lg.info('下架的goods_id为({}), 删除成功!'.format(goods_id))
        await async_sleep(.3)
        self.goods_index = next_index
        return goods_id, res

    res = await self._one_update(goods_id=goods_id, item_list=item_list)
    self.goods_index = next_index
    collect()
    await async_sleep(CHUCHUJIE_SLEEP_TIME)
    return goods_id, res
def get_goods_data(self, goods_id: str) -> dict:
    '''
    Scrape the Mia mobile goods page and assemble the goods data dict.

    :param goods_id: Mia goods id; an empty string is rejected immediately
    :return: the assembled data dict (also stored on ``self.result_data``),
        or ``_data_error_init()`` on any failure or when the item is off the
        shelves / out of stock
    '''
    if goods_id == '':
        # The original called _data_error_init() without returning, so an
        # empty id still went on to request 'item-.html'.
        return self._data_error_init()

    data = {}
    # Regular goods page on the mobile site.
    goods_url = 'https://m.mia.com/item-{}.html'.format(goods_id)
    print('------>>>| 待抓取的地址为: ', goods_url)

    body = Requests.get_url_body(
        url=goods_url,
        headers=self._get_phone_headers(),
        ip_pool_type=self.ip_pool_type,
        proxy_type=self.proxy_type,
        num_retries=self.req_num_retries,
    )
    if body == '':
        print('获取到的body为空值!跳过!')
        return self._data_error_init()

    # Text of the home-page banner element; non-empty when we were
    # redirected to the mobile home page.
    is_mia_mian_page = Selector(
        text=body).css('div.item-center ::text').extract_first() or ''
    # Whether the item is currently waiting for restock.
    is_replenishment_status = self._get_replenishment_status(
        goods_id=goods_id,
        body=body)
    if (isinstance(is_mia_mian_page, str) and is_mia_mian_page == '进口母婴正品特卖') \
            or is_replenishment_status:
        # Group-buy item was removed (redirected to home) or is out of stock.
        print('++++++ 该拼团商品已下架,被定向到蜜芽主页 or 处在缺货状态中!')
        _handle_goods_shelves_in_auto_goods_table(
            goods_id=goods_id,
            update_sql_str=mia_update_str_7)
        collect()
        return self._data_error_init()

    # Follow a possible jump: returns the final body, the jump url and the
    # is_hk flag.
    body, sign_direct_url, is_hk = self.get_jump_to_url_and_is_hk(
        body=body)
    try:
        self.main_info_dict = self._get_goods_main_info_dict(
            goods_id=goods_id)
        data['title'], data['sub_title'] = self.get_title_and_sub_title(
            body=body)
        all_img_url = self.get_all_img_url()
        p_info = self._get_p_info(body=body)
        data['p_info'] = p_info

        div_desc = self.get_goods_div_desc()
        assert div_desc != '', '获取到的div_desc为空值! 请检查'
        data['div_desc'] = div_desc

        # Per-spec goods_id, spec name and img_url, used by the next step.
        sku_info = self.get_tmp_sku_info(body, goods_id, sign_direct_url,
                                         is_hk)
        assert sku_info != {}, 'sku_info为空dict'

        # Per-spec price, spec and stock.
        true_sku_info, i_s, pintuan_time, all_sell_count = self.get_true_sku_info(
            sku_info=sku_info,
            goods_id=goods_id)
        data['price_info_list'] = true_sku_info
        data['pintuan_time'] = pintuan_time
        data['all_sell_count'] = all_sell_count
        data['detail_name_list'] = self.get_detail_name_list(
            true_sku_info=true_sku_info)

        # Fall back to the first spec image when no gallery was found.
        if all_img_url == []:
            all_img_url = [{
                'img_url': true_sku_info[0].get('img_url', '')
            }]
        data['all_img_url'] = all_img_url

        # Prefer the jump url as the canonical goods url.
        if sign_direct_url != '':
            goods_url = sign_direct_url
        data['goods_url'] = goods_url
        data['parent_dir'] = _mia_get_parent_dir(p_info=p_info)
    except MiaSkusIsNullListException:
        print('该商品已不参与拼团!! 无拼团属性')
        _handle_goods_shelves_in_auto_goods_table(
            goods_id=goods_id,
            update_sql_str=mia_update_str_7)
        collect()
        return self._data_error_init()
    except Exception as e:
        # Best-effort scrape: any other failure yields the error result.
        print('遇到错误如下: ', e)
        return self._data_error_init()

    self.result_data = data
    return data
def deal_with_data(self):
    '''
    Turn ``self.result_data`` into the flat dict that gets stored.

    :return: dict of goods fields; ``{}`` when ``self.result_data`` is
        empty; ``_data_error_init()`` when the detail name list is ``{}``
    '''
    data = self.result_data
    if data == {}:
        print('待处理的data为空的dict')
        return {}

    shop_name = self._get_shop_name(data=data)
    title = self._get_title(data=data)
    detail_name_list = self._get_detail_name_list(data=data)

    # The helper signals an off-shelf item with the marker string below.
    if isinstance(detail_name_list, str) and detail_name_list == 'is_delete=1':
        _handle_goods_shelves_in_auto_goods_table(
            goods_id=self.result_data.get('goods_id', ''),
        )
    if detail_name_list == {}:
        return self._data_error_init()

    price_info_list, price, taobao_price = self._get_price_info_list_and_price_and_taobao_price(data=data)
    all_img_url = self._get_all_img_url(data=data)
    p_info = self._get_p_info(data=data)
    div_desc = self._get_div_desc(data=data)
    # Sale period of the goods.
    schedule = self._get_goods_schedule(data=data)
    is_delete = self._get_is_delete(data=data, schedule=schedule)
    if price == 0 or taobao_price == 0:
        # No price could be read: treat the item as off the shelves.
        is_delete = 1
    parent_dir = data.get('parent_dir', '')

    has_contraband = target_str_contain_some_char_check(
        target_str=title,
        check_char_obj=CONTRABAND_GOODS_KEY_TUPLE)
    if has_contraband:
        print('违禁物品下架...')
        is_delete = 1

    result = {
        'shop_name': shop_name,                  # shop name
        'account': '',                           # shop keeper (not collected)
        'title': title,                          # goods title
        'sub_title': '',                         # sub title (not collected)
        'price': price,                          # goods price
        'taobao_price': taobao_price,            # taobao price
        'detail_name_list': detail_name_list,    # spec attribute names
        'price_info_list': price_info_list,      # per-spec price and stock
        'all_img_url': all_img_url,              # sample image urls
        'p_info': p_info,                        # detail attribute pairs
        'div_desc': div_desc,                    # description html
        'is_delete': is_delete,                  # off-shelf flag
        'schedule': schedule,                    # sale period
        'parent_dir': parent_dir,
        'all_sell_count': '',
    }
    collect()
    return result
def _get_goods_data(self, goods_id): ''' 得到需求数据 :param goods_id: :return: ''' if goods_id == '': self.lg.error('获取到的goods_id为空值!此处跳过!') return self._get_data_error_init() # 网易考拉pc站抓取, m站p_info信息不全(不采用) # phone_body(requests设置代理一直302无限重定向, 于是phantomjs) # body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=url) url = 'https://goods.kaola.com/product/{0}.html'.format(goods_id) self.lg.info('------>>>| 正在抓取考拉地址为: {0}'.format(url)) body = self._get_pc_goods_body(goods_id=goods_id) # self.lg.info(body) pc_goods_body = body if body == '': return self._get_data_error_init() if '你很神,找到了不存在的页面' in body: _handle_goods_shelves_in_auto_goods_table( goods_id=goods_id, logger=self.lg, ) return self._get_data_error_init() # _ = self._get_right_body(body) # phone端 _ = self._get_pc_right_body(body) # pc端 # pprint(_) if _ == {}: self.lg.error('获取body时索引异常!出错goods_id为:{0}, 出错地址: {1}'.format( goods_id, url)) return self._get_data_error_init() else: # TODO 获取m站的sku_info(但是没有税费) # sku_info_url = 'https://m-goods.kaola.com/product/getWapGoodsDetailDynamic.json' # params = self._get_params(goods_id=goods_id) # body = Requests.get_url_body(url=sku_info_url, headers=self.headers, params=params) # 获取pc站的sku_info sku_info_url = 'https://goods.kaola.com/product/getPcGoodsDetailDynamic.json' params = self._get_pc_sku_info_params(goods_id=goods_id) body = Requests.get_url_body(url=sku_info_url, headers=self.headers, params=params, ip_pool_type=self.ip_pool_type) sku_info = json_2_dict(json_str=body, logger=self.lg).get('data') if sku_info is None: self.lg.error( '获取到we的sku_info为None!出错goods_id: {0}, 出错地址: {1}'.format( goods_id, url)) _['sku_info'] = sku_info # pprint(_) _ = self._wash_data(_) # pprint(_) data = {} try: # title, sub_title data['title'] = self._get_title(data=_) data['sub_title'] = '' data['shop_name'] = _.get('goodsInfoBase', {}).get('brandName', '') data['all_img_url'] = self._get_all_img_url(data=_) data['p_info'] = self._get_p_info(data=_) data['div_desc'] = 
self._get_div_desc(data=_) data['sell_time'] = self._get_sell_time(data=_.get('sku_info', {})) data['detail_name_list'] = self._get_detail_name_list( data=_.get('sku_info', {}).get('skuDetailList', [])) # TODO 网易考拉官方有bug, 实际规格没货的商品, 前端还在卖, 估计是下单后再去订货, 库存0: 我这边就处理为下架 # data['price_info_list'] = self._get_sku_info(data=_.get('sku_info', {}).get('skuDetailList', [])) '''获取pc端的, 价格为算上税费的''' data['price_info_list'] = self._get_pc_sku_info( data=_.get('sku_info', {}).get('skuDetailList', [])) data['price'], data[ 'taobao_price'] = self._get_price_and_taobao_price( data=_.get('sku_info', {}).get('skuPrice', {}), price_info_list=data['price_info_list']) data['is_delete'] = self._get_is_delete( price_info_list=data['price_info_list'], data=data, other=_) data['parent_dir'] = self._get_parent_dir(body=pc_goods_body) self.lg.info('parent_dir: {}'.format(data['parent_dir'])) except GoodsShelvesException: _handle_goods_shelves_in_auto_goods_table(goods_id=goods_id, logger=self.lg) return self._get_data_error_init() except Exception: self.lg.error('遇到错误:', exc_info=True) self.lg.error('出错goods_id: {0}, 地址: {1}'.format(goods_id, url)) return self._get_data_error_init() self.result_data = data return data
def get_goods_data(self, goods_id):
    """
    Fetch the Tmall detail payload for one goods item and normalise it.

    ``get_tm_base_data`` is tried first; when ``tb_api_redirect_detect``
    rejects its result, ``get_tm_m_body_data`` is used instead and the
    apiStack/mockData decoding steps are skipped.

    :param goods_id: two-item sequence ``[tm_type, goods_id]``; an empty
        list is rejected immediately
    :return: the cleaned ``data['data']`` dict (also stored on
        ``self.result_data``); the value of ``init_pull_off_shelves_goods``
        when the item was pulled off the shelves; ``_data_error_init()``
        on any failure
    """
    if goods_id == []:
        return self._data_error_init()

    tm_type, goods_id = goods_id[0], goods_id[1]

    # 0 = primary API payload, 1 = fallback payload.
    get_base_data_method = 0
    try:
        data = self.get_tm_base_data(goods_id=goods_id)
        try:
            tb_api_redirect_detect(data=data)
        except AssertionError:
            self.lg.info(
                'trying second method to get data[where goods_id: {}] ...'.
                format(goods_id))
            get_base_data_method = 1
            data = get_tm_m_body_data(goods_id=goods_id,
                                      proxy_type=self.proxy_type,
                                      num_retries=self.req_num_retries,
                                      logger=self.lg)

        redirect_url = data.get('data', {}).get('trade', {}).get('redirectUrl', '')
        seller_evaluates = data.get('data', {}).get('seller', {}).get('evaluates')
        if redirect_url != '' and seller_evaluates is None:
            raise GoodsShelvesException

        if seller_evaluates is None:
            # Page no longer exists (item moved or removed).
            self.lg.error(
                'data为空, 地址被重定向, 该商品可能已经被转移或下架, 出错tm_type: {}, goods_id: {}'
                .format(tm_type, goods_id))
            return self._data_error_init()

        data = self._wash_tm_ori_data(data=data)
        result_data = data['data']
        # An empty apiStack raises IndexError here and is handled below.
        result_data_apiStack_value = result_data.get('apiStack', [])[0].get(
            'value', {})
    except GoodsShelvesException:
        # The item is off the shelves: the original address was redirected.
        _handle_goods_shelves_in_auto_goods_table(goods_id=goods_id,
                                                  logger=self.lg)
        tmp_data_s = self.init_pull_off_shelves_goods(tm_type=tm_type)
        self.result_data = {}
        return tmp_data_s
    except (AssertionError, IndexError):
        self.lg.error(msg='遇到错误[出错tm_type: {}, goods_id: {}]:'.format(
            tm_type,
            goods_id,
        ), exc_info=True)
        return self._data_error_init()

    if get_base_data_method == 0:
        # Clean apiStack[0]['value'] and store it back in place.
        washed_value = self._wash_result_data_apiStack_value(
            goods_id=goods_id,
            result_data_apiStack_value=result_data_apiStack_value)
        result_data['apiStack'][0]['value'] = washed_value

    mock_data = result_data['mockData']
    if get_base_data_method == 0:
        # Primary payload carries mockData as a JSON string.
        mock_data = json_2_dict(json_str=mock_data, logger=self.lg)
    elif get_base_data_method != 1:
        raise ValueError('get_base_data_method value异常!')

    if mock_data == {}:
        self.lg.error('出错tm_type: {0}, goods_id: {1}'.format(
            tm_type, goods_id))
        return self._data_error_init()
    mock_data['feature'] = ''
    result_data['mockData'] = mock_data

    # The entry may look like {'name': 'esi', 'value': ''}.
    if result_data.get('apiStack', [])[0].get('value', '') == '':
        self.lg.error(
            "result_data.get('apiStack', [])[0].get('value', '')的值为空....出错tm_type: {}, goods_id: {}"
            .format(tm_type, goods_id))
        result_data['trade'] = {}
        return self._data_error_init()

    # 'trade' is what later code uses to decide whether the item is off sale.
    first_stack = result_data.get('apiStack', [])[0]
    result_data['trade'] = first_stack.get('value', {}).get('trade', {})

    # Coupons are collected by a separate spider, not here.
    result_data['type'] = tm_type
    result_data['goods_id'] = goods_id
    self.result_data = result_data
    return result_data