def _get_pc_right_body(self, body):
    """
    Extract the required goods data from the PC page body.

    :param body: raw html body str
    :return: dict with parsed fields, {} on extraction failure
    """
    def _first_match(pattern, text):
        # re.findall(...)[0] raises IndexError when nothing matches,
        # which the except clause below relies on.
        return re.compile(pattern).findall(text)[0]

    try:
        head_data = _first_match(r'window.__kaolaHeadData = (.*?);</script>', body)
        goods_info_base = _first_match(r'goodsInfoBase: (.*?), //基本', head_data)
        goods_detail_content = _first_match(r'goodsDetailContent: (.*?), //图文详情', head_data)
        kaola_super_market = _first_match(r'kaolaSuperMarket: (.*?), //needSelfTag', head_data)
        # self.lg.info(str(head_data))
    except IndexError:
        self.lg.error('遇到错误:', exc_info=True)
        return {}

    return {
        'goodsInfoBase': json_2_dict(json_str=goods_info_base, logger=self.lg),
        'goodsDetailContent': json_2_dict(json_str=goods_detail_content, logger=self.lg),
        'kaolaSuperMarket': kaola_super_market,
    }
def turn_one_time() -> dict:
    """
    Perform one lottery spin and return the decoded json response.

    :return: response dict ({} presumably on decode failure — depends on json_2_dict)
    """
    now_ts = str(datetime_to_timestamp(get_shanghai_time()))
    cookies = {
        'Hm_lpvt_fa0ddec29ac177a2d127cebe209832e3': now_ts,
        # fixed values captured from a real session
        'Hm_lvt_fa0ddec29ac177a2d127cebe209832e3': '1537161510,1537228200,1537353114,1537411854',
        'wk_': '9umq63s8g6leobk2p285frmp583nhm9t',
    }
    headers = {
        'Host': 'm.riyiwk.com',
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'origin': 'https://m.riyiwk.com',
        'referer': 'https://m.riyiwk.com/lottery.html?check_login=1',
        'accept-language': 'zh-cn',
        'x-requested-with': 'XMLHttpRequest',
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Mobile/15A5341f/RIYIWK 2.6.0/USER_ID 203793/TOKEN 3a3988e07be98db064a70fc635c0b590',
    }
    url = 'https://m.riyiwk.com/lottery/start.html'
    body = Requests.get_url_body(
        method='post',
        use_proxy=False,
        url=url,
        headers=headers,
        cookies=cookies)
    res = json_2_dict(body)
    # pprint(res)
    return res
def share_2_wx() -> bool:
    """
    Fire the share-to-WeChat callback.

    :return: True when the callback message is '成功', else False
    """
    cookies = {
        'wk_': '8llgqrevckd0bmllcdgrtqjv88elq3fl',
    }
    headers = {
        'Host': 'ios.riyiwk.com',
        'accept': '*/*',
        'content-type': 'application/x-www-form-urlencoded',
        'user-agent': 'ExtraIncome/2.6.0 (iPhone; iOS 11.0; Scale/3.00)',
        'accept-language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
    }
    # opaque pre-encrypted payload captured from the app
    data = 'data=6FutSNjTIN512XBvPZXgztwPxRaLLFygqXFrzxnaSHhKJ0RMskgPCJ1veAFe71DmE/Weqi3qbl9Jp%2BWfhSSCtlPnKIheoydBjmxWvUtEh9qV4RXkSil0AWr5P5f8V4jL/OnQQxXgTeOBhhsJK7140Iuc/kdtw0qP'
    url = 'https://ios.riyiwk.com//user/shareCallback'
    body = Requests.get_url_body(
        method='post',
        use_proxy=False,
        url=url,
        headers=headers,
        cookies=cookies,
        data=data)
    message = json_2_dict(body).get('message', '')
    success = message == '成功'
    print('[{}] 分享微信成功!'.format('+' if success else '-'))
    return success
def _get_shop_name(self, **kwargs):
    """
    Get the shop name for the seller referenced in the detail data.

    Defect fixed: the original wrapped the response body into a
    one-element list, re-joined it into a string, then wrapped it in a
    list again — so the later `if seller_info != []` check was always
    true and the `shop_name = ''` branch was unreachable. The roundtrip
    is removed; observable behavior is unchanged.

    :param kwargs: expects 'data' (dict) holding the product detail payload
    :return: shop name str on success, {} when the request or json decode fails
        (NOTE(review): mixed return types kept for caller compatibility)
    """
    data = kwargs.get('data', {})
    seller_id = data.get('/app/detail/product/base', {}).get('sellerId', 0)
    tmp_seller_id_url = 'https://th5.m.zhe800.com/api/getsellerandswitch?sellerId=' + str(seller_id)
    seller_info_body = MyRequests.get_url_body(
        url=tmp_seller_id_url, headers=self.headers, high_conceal=True)
    if seller_info_body == '':
        print('seller_info为空!')
        return {}

    seller_info = json_2_dict(json_str=seller_info_body)
    if seller_info == {}:
        print('卖家信息在转换时出现错误, 此处跳过')
        return {}
    # pprint(seller_info)
    shop_name = seller_info.get('sellerInfo', {}).get('nickName', '')
    # print(shop_name)
    return shop_name
def get_p_info_list(self, goods_id):
    """
    Fetch the property (detail-intro) list for a goods id.

    Defect fixed: the original tested `tmp_p_info == []` and then
    `tmp_p_info != []` as two separate ifs; collapsed into a single
    guard + comprehension with identical behavior.

    :param goods_id: goods id
    :return: list of {'p_name': ..., 'p_value': ...} ([] when nothing found)
    """
    p_info_url = 'https://pina.m.zhe800.com/cns/products/get_product_properties_list.json?productId=' + str(goods_id)
    p_info_body = MyRequests.get_url_body(
        url=p_info_url, headers=self.headers, high_conceal=True)
    if p_info_body == '':
        print('获取到的p_info_body为空值, 此处跳过!')
        p_info_body = '{}'

    tmp_p_info = json_2_dict(json_str=p_info_body).get('perportieslist', [])
    if tmp_p_info == []:
        # reset so a previous goods' result cannot leak into the next crawl
        self.result_data = {}
        return []

    return [{
        'p_name': item.get('name', ''),
        'p_value': item.get('value'),
    } for item in tmp_p_info]
async def json_2_dict(self, json_str):
    """
    Async wrapper around the module-level json_2_dict helper.

    NOTE: this method shadows the module-level ``json_2_dict`` by name;
    the call below resolves to the module-level function, not recursion.

    :param json_str: json string to parse
    :return: {} | {...}
    """
    return json_2_dict(json_str=json_str, logger=self.my_lg)
def _get_comment_data(self, goods_id):
    """
    Grab up to 3 pages of PC-side comments for a taobao goods id.

    Defect fixed: the jsonp-unwrapping pattern '\\((.*)\\)' was written as
    a plain (non-raw) string; '\\(' is an invalid escape sequence that
    raises a DeprecationWarning on modern CPython. It is now a raw
    string with identical pattern bytes.

    :param goods_id: goods id str; '' short-circuits to {}
    :return: populated CommentItem on success, {} on failure
        (self.result_data mirrors the return value)
    """
    if goods_id == '':
        self.result_data = {}
        return {}

    _tmp_comment_list = []
    self.my_lg.info('------>>>| 待抓取的goods_id: %s' % goods_id)
    # the data below comes from the pc-side endpoint
    for current_page_num in range(1, 4):
        self.my_lg.info('------>>>| 正在抓取第%s页评论...' % str(current_page_num))
        tmp_url = 'https://rate.taobao.com/feedRateList.htm'
        _params = self._set_params(current_page_num=current_page_num, goods_id=goods_id)
        self.headers.update({'referer': 'https://item.taobao.com/item.htm?id=' + goods_id})
        body = MyRequests.get_url_body(
            url=tmp_url, headers=self.headers, params=_params, encoding='gbk')
        # self.my_lg.info(str(body))
        try:
            # strip the jsonp wrapper: callback( ... )
            body = re.compile(r'\((.*)\)').findall(body)[0]
        except IndexError:
            self.my_lg.error('re得到需求body时出错! 出错goods_id: ' + goods_id)
            sleep(.5)
            self.result_data = {}
            return {}

        data = json_2_dict(json_str=body, logger=self.my_lg).get('comments')
        # pprint(data)
        if data is None:
            self.my_lg.error('出错goods_id: ' + goods_id)
            self.result_data = {}
            return {}
        if data == []:
            # this page's "comments" is empty — try the next page
            continue
        _tmp_comment_list += data
        sleep(self.comment_page_switch_sleep_time)
    # self.my_lg.info(str(len(_tmp_comment_list)))

    try:
        _comment_list = self._get_comment_list(_tmp_comment_list=_tmp_comment_list)
    except Exception as e:
        self.my_lg.error('出错goods_id: ' + goods_id)
        self.my_lg.exception(e)
        self.result_data = {}
        return {}

    _t = datetime.datetime.now()
    _r = CommentItem()
    _r['goods_id'] = str(goods_id)
    _r['create_time'] = _t
    _r['modify_time'] = _t
    _r['_comment_list'] = _comment_list
    self.result_data = _r
    # pprint(self.result_data)
    return self.result_data
def _wash_target_data(self, data):
    """
    Clean the raw target data in place.

    Defects fixed: the bare ``except:`` (which would also swallow
    SystemExit/KeyboardInterrupt) is narrowed to ``except Exception:``
    — the deliberate swallow for non-dict input is kept; ``raise e``
    is replaced with a bare ``raise`` to preserve the traceback.

    :param data: raw data dict
    :return: the cleaned data dict (mutated in place)
    :raises Exception: re-raised if decoding any activity value fails
    """
    try:
        data['comment'] = {}
        data['service'] = []  # shipping / after-sales info is cleared
    except Exception:
        pass

    tmp_activitys = data.get('good', {}).get('activitys', {})
    activitys = {}
    try:
        for key, value in tmp_activitys.items():
            # each activity value is itself a json string — decode it
            value = json_2_dict(value, logger=self.my_lg)
            activitys.update({
                key: value,
            })
    except Exception:
        raise

    data['good']['activitys'] = activitys
    return data
def get_div_desc_body(self, div_desc_url):
    """
    Fetch the html page holding the div_desc content.

    :param div_desc_url: url of the description endpoint
    :return: washed html wrapped in a <div>, or the washed value as-is
        when it is '' (presumably the empty/failure case — depends on
        _wash_div_desc)
    """
    # plain requests fetch (a phantomjs variant existed historically)
    raw_body = MyRequests.get_url_body(url=div_desc_url, headers=self.headers)
    if raw_body == '':
        raw_body = '{}'

    tmp_body = json_2_dict(json_str=raw_body).get('data', '')
    if tmp_body == '':
        # reset so a previous goods' result cannot leak into the next crawl
        self.result_data = {}

    tmp_body = self._wash_div_desc(tmp_body=tmp_body)
    if tmp_body != '':
        tmp_body = '<div>' + tmp_body + '</div>'
    return tmp_body
def get_p_info_list(self, p_info_url):
    """
    Fetch the property (detail-intro) list from the given url.

    Defect fixed: duplicated opposite conditions (`== []` then `!= []`)
    collapsed into a single guard + comprehension; behavior unchanged.
    (Kept consistent with the goods_id variant of this method.)

    :param p_info_url: properties endpoint url
    :return: list of {'p_name': ..., 'p_value': ...} ([] when nothing found)
    """
    p_info_body = MyRequests.get_url_body(url=p_info_url, headers=self.headers)
    if p_info_body == '':
        print('获取到的p_info_body为空值, 此处跳过!')
        p_info_body = '{}'

    tmp_p_info = json_2_dict(json_str=p_info_body).get('perportieslist', [])
    if tmp_p_info == []:
        # reset so a previous goods' result cannot leak into the next crawl
        self.result_data = {}
        return []

    return [{
        'p_name': item.get('name', ''),
        'p_value': item.get('value'),
    } for item in tmp_p_info]
def orc_captcha(captcha_url):
    """
    Recognize a captcha image via the baidu OCR service.

    :param captcha_url: url of the captcha image to download
    :return: recognized text, '' when OCR yields no result
    """
    baidu_orc_info_path = '/Users/afa/baidu_orc.json'
    with open(baidu_orc_info_path, 'r') as f:
        baidu_orc_info = json_2_dict(f.read())

    img_path = './images/captcha.jpg'
    save_img_through_url(img_url=captcha_url, save_path=img_path)
    orc_res = baidu_ocr_captcha(
        app_id=str(baidu_orc_info['app_id']),
        api_key=baidu_orc_info['api_key'],
        secret_key=baidu_orc_info['secret_key'],
        img_path=img_path,
        orc_type=2)
    # print(orc_res)
    try:
        return orc_res.get('words_result', [])[0].get('words', '')
    except IndexError:
        # no recognition result at all
        return ''
def _get_right_body(self, body):
    """
    Extract the required goods data from the mobile (phone) page body.

    Defect fixed: ``sizeChartImgs`` was extracted with
    ``.findall(body[0])`` — i.e. the regex ran against the FIRST
    CHARACTER of the body, so it could never match and the except
    clause always fired. Changed to ``.findall(body)[0]``, the pattern
    every other extraction here uses.
    NOTE(review): the other fields search body_1; sizeChartImgs keeps
    the original's target string ``body`` — confirm against a live page.

    :param body: raw html body str
    :return: dict with parsed fields, {} on extraction failure
    """
    try:
        body_1 = re.compile(r'window.__Goods__ = (.*?),}</script>').findall(body)[0]
        body_1 += '}'
        # size chart
        sizeChartImgs = re.compile(r'sizeChartImgs: (.*?),//').findall(body)[0]
        basicInfo = re.compile(r'basicInfo: (.*?),goodsNotice').findall(body_1)[0]
        skuPropertyList = re.compile(r'skuPropertyList: (.*?),specialGoodsDesc').findall(body_1)[0]
        kaolaSuperMarket = re.compile(r'kaolaSuperMarket: (.*?),colorSliderImgs').findall(body_1)[0]
        brandGoodsAmount = re.compile(r'brandGoodsAmount: (.*?),brandLogo').findall(body_1)[0]
        goodsDetailContent = re.compile(r'goodsDetailContent: (.*?),vipGoods').findall(body_1)[0]
        vipGoods = re.compile(r'vipGoods: (.*?),vipGoodsLogo').findall(body_1)[0]
        # self.lg.info(str(sizeChartImgs))
        # self.lg.info(str(body_1))
    except IndexError:
        self.lg.error('遇到错误:', exc_info=True)
        return {}

    _ = {}
    _['basicInfo'] = json_2_dict(json_str=basicInfo, logger=self.lg)
    _['skuPropertyList'] = json_2_dict(json_str=skuPropertyList, logger=self.lg)
    _['kaolaSuperMarket'] = kaolaSuperMarket
    _['brandGoodsAmount'] = brandGoodsAmount
    _['goodsDetailContent'] = json_2_dict(json_str=goodsDetailContent, logger=self.lg)
    _['vipGoods'] = vipGoods
    _['sizeChartImgs'] = sizeChartImgs
    # pprint(_)
    return _
def _get_tmall_goods_keywords_goods_id_list(self, keyword):
    """
    Get top-selling tmall item urls for a keyword via the tmall m-site search.

    Defect fixed: the deeply nested if/else pyramid is flattened into
    guard clauses; returned values and log messages are unchanged.

    :param keyword: sequence whose index 1 is the query string
    :return: list eg: ['//detail.tmall.com/item.htm?id=566978017832&skuId=3606684772412', ...]
        — item URLS, not goods_id; [] on any failure
    """
    # the m-site search: occasionally unstable but usable
    headers = {
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': get_random_pc_ua(),
        'accept': '*/*',
        'authority': 'list.tmall.com',
    }
    params = {
        'page_size': '20',
        'page_no': '1',
        'q': str(keyword[1]),
        'type': 'p',
        'spm': 'a220m.6910245.a2227oh.d100',
        'from': 'mallfp..m_1_suggest',
        'sort': 'd',
    }
    s_url = 'https://list.tmall.com/m/search_items.htm'
    body = MyRequests.get_url_body(url=s_url, headers=headers, params=params)
    # self.my_lg.info(str(body))
    if body == '':
        return []

    data = json_2_dict(json_str=body, logger=self.my_lg)
    if data == {}:
        self.my_lg.error('获取到的天猫搜索data为空dict! 出错关键字为{0}'.format(keyword[1]))
        return []

    _ = data.get('item', [])
    if _ is None or _ == []:
        self.my_lg.error('获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
        return []

    try:
        goods_id_list = [str(item.get('url', '')) for item in _]
    except Exception as e:
        self.my_lg.exception(e)
        self.my_lg.error('获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
        return []
    return goods_id_list
def get_ak() -> str:
    """
    Read the baidu map 'ak' credential from the local pwd file.

    :return: the ak string (asserted non-empty)
    """
    raw_json = ''
    with open('/Users/afa/myFiles/pwd/baidu_map_pwd.json', 'r') as f:
        for line in f:
            # strip newlines and spaces so the pieces concatenate cleanly
            raw_json += line.replace('\n', '').replace(' ', '')
    # print(raw_json)
    ak = json_2_dict(json_str=raw_json).get('fz_map_info', {}).get('ak', '')
    assert ak != '', 'ak不为空str!'
    return ak
def _get_comment_data(self, goods_id):
    """
    Grab 2 pages of comments from the jd mobile endpoint.

    :param goods_id: goods id str; '' short-circuits to {}
    :return: populated CommentItem on success, {} on failure
        (self.result_data mirrors the return value)
    """
    if goods_id == '':
        self.result_data = {}
        return {}

    self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))
    self.goods_id = goods_id
    self.headers.update({
        'referer': 'https://item.m.jd.com/ware/view.action?wareId=' + str(goods_id),
    })

    # collect from the jd mobile goods-comment api
    _tmp_comment_list = []
    for current_page in range(1, 3):
        _url = 'https://item.m.jd.com/newComments/newCommentsDetail.json'
        params = self._set_params(goods_id=goods_id, current_page=current_page)
        body = MyRequests.get_url_body(url=_url, headers=self.headers, params=params)
        # self.my_lg.info(str(body))
        _data = json_2_dict(json_str=body, logger=self.my_lg) \
            .get('wareDetailComment', {}) \
            .get('commentInfoList', [])
        if _data == []:
            self.my_lg.error('出错goods_id:{0}'.format(self.goods_id))
        _tmp_comment_list += _data
        sleep(self.comment_page_switch_sleep_time)
    # pprint(_tmp_comment_list)

    try:
        _comment_list = self._get_comment_list(_tmp_comment_list=_tmp_comment_list)
    except Exception as e:
        self.my_lg.error('出错goods_id:{0}'.format(goods_id))
        self.my_lg.exception(e)
        self.result_data = {}
        return {}

    _t = datetime.datetime.now()
    _r = CommentItem()
    _r['goods_id'] = str(goods_id)
    _r['create_time'] = _t
    _r['modify_time'] = _t
    _r['_comment_list'] = _comment_list
    self.result_data = _r
    # pprint(self.result_data)
    return self.result_data
def _test(self):
    """
    Smoke-test the phantomjs driver against httpbin.

    Defect fixed: on the IndexError path the original returned {}
    WITHOUT deleting the driver, leaving the browser process alive;
    cleanup now runs in a finally block on every path.
    (NOTE(review): ``del driver`` only drops the local reference —
    actual teardown depends on BaseDriver's finalizer; confirm.)

    :return: parsed httpbin response dict, {} on parse failure
    """
    driver = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH)
    try:
        url = 'https://httpbin.org/get'
        body = driver.get_url_body(url=url)
        # lg.info(str(body))
        try:
            data = json_2_dict(
                re.compile('<pre.*?>(.*)</pre>').findall(body)[0],
                default_res={})
        except IndexError:
            return {}
    finally:
        del driver
    return data
def _get_one_page_articles(self, page_num) -> list:
    """
    Fetch one page of 36kr articles.

    :param page_num: page index
    :return: list of article items ([] when empty), with each item's
        'user_info' json string decoded in place
    """
    headers = {
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': get_random_pc_ua(),
        'Accept': '*/*',
        'Referer': 'https://36kr.com/',
        'Connection': 'keep-alive',
    }
    # timestamp + random suffix acts as a cache-buster
    _anti_cache = str(datetime_to_timestamp(get_shanghai_time())) \
                  + str(get_random_int_number(100, 999))
    params = (
        ('per_page', '20'),
        ('page', str(page_num)),
        ('_', _anti_cache),
    )
    url = 'https://36kr.com/api/search-column/mainsite'
    body = Requests.get_url_body(url=url, headers=headers, params=params, cookies=None)
    data = json_2_dict(body).get('data', {}).get('items', [])
    # pprint(data)
    if data == []:
        return []

    # 'user_info' arrives as a nested json string — decode it in place
    for item in data:
        item.update({'user_info': json_2_dict(item.get('user_info', ''))})
    # pprint(data)
    return data
def _get_pintuan_goods_info(self):
    """
    Build the data url page by page and collect all recent
    limited-time pintuan (group-buy) goods.

    Defect fixed: deduplication rebuilt the full list of seen goods_ids
    for EVERY candidate (O(n^2)); a seen-set makes it O(n) while
    preserving insertion order and results exactly.

    :return: deduplicated list of goods dicts
    """
    pintuan_goods_id_list = []
    seen_goods_ids = set()
    for page in range(0, 100):
        tmp_url = 'https://tuan.juanpi.com/pintuan/get_goods_list?page={0}&pageSize=20&cid=pinhaohuo_sx&show_type=wap'.format(str(page))
        print('正在抓取的页面地址为: ', tmp_url)
        try:
            body = Requests.get_url_body(
                url=tmp_url,
                headers=self.headers,
                high_conceal=True,
                ip_pool_type=self.ip_pool_type)
            assert body != '', 'body为空值!'
            tmp_data = json_2_dict(json_str=body, default_res={}).get('data', {}).get('goods', [])
            # print(tmp_data)
            assert tmp_data != [], '该tmp_url得到的goods为空list, 此处跳过!'
            sleep(.5)
        except AssertionError as e:
            # empty body / empty goods means we ran past the last page
            print(e)
            sleep(.5)
            break

        tmp_pintuan_goods_id_list = [{
            'goods_id': item.get('goods_id', ''),
            'begin_time': timestamp_to_regulartime(int(item.get('start_time', ''))),
            'end_time': timestamp_to_regulartime(int(item.get('end_time', ''))),
            'all_sell_count': str(item.get('join_number_int', '')),
            'page': page,
        } for item in tmp_data]
        # print(tmp_pintuan_goods_id_list)
        for item in tmp_pintuan_goods_id_list:
            _gid = item.get('goods_id', '')
            if _gid not in seen_goods_ids:
                seen_goods_ids.add(_gid)
                pintuan_goods_id_list.append(item)

    print('该pintuan_goods_id_list的总个数为: ', len(pintuan_goods_id_list))
    print(pintuan_goods_id_list)
    return pintuan_goods_id_list
def _get_origin_comment_list(self, **kwargs) -> list:
    """
    Pull the raw comment data from the wing-navigate api page by page.

    :param kwargs: csrf, goods_id, cookies
    :return: accumulated list of raw comment models
    """
    csrf = kwargs.get('csrf', '')
    goods_id = kwargs.get('goods_id', '')
    cookies = kwargs.get('cookies', '')
    url = 'https://m.1688.com/page/offerRemark.htm'
    headers = {
        'cookie': cookies,
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': get_random_pc_ua(),
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'referer': 'https://m.1688.com/page/offerRemark.htm?offerId={}'.format(goods_id),
        'authority': 'm.1688.com',
        'x-requested-with': 'XMLHttpRequest',
    }

    origin_comment_list = []
    for page_index in range(1, self.max_page):
        nav_options = {
            'data': {
                'bizType': 'trade',
                'itemId': int(goods_id),
                'offerId': str(goods_id),
                'page': page_index,
                'pageSize': 5,
                # 'receiveUserId': 989036456,
                'starLevel': 7,
            }
        }
        # timestamp + random suffix acts as a cache-buster
        _anti_cache = str(datetime_to_timestamp(get_shanghai_time())) \
                      + str(get_random_int_number(start_num=100, end_num=999))
        params = (
            ('_csrf', csrf),
            ('__wing_navigate_type', 'view'),
            ('__wing_navigate_url', 'detail:modules/offerRemarkList/view'),
            ('__wing_navigate_options', dumps(nav_options)),
            ('_', _anti_cache),
        )
        body = Requests.get_url_body(
            url=url, headers=headers, params=params, ip_pool_type=self.ip_pool_type)
        data = json_2_dict(body, encoding='ascii').get('data', {})
        # pprint(data)
        one_page = data.get('model', [])
        pprint(one_page)
        origin_comment_list += one_page
        sleep(.25)
    return origin_comment_list
def _get_one_page_comment_info(self, page_num, goods_id) -> tuple:
    """
    Fetch a single page of comment info.

    :param page_num: 1-based page index
    :param goods_id: goods id
    :return: (comments list, has_next_page bool)
    :raises AssertionError: when 'comments'/'hasNext' are missing from the response
    """
    tmp_url = 'https://th5.m.zhe800.com/app/detail/comment/list'
    headers = get_random_headers(
        connection_status_keep_alive=False,
        upgrade_insecure_requests=False,
        cache_control='',
    )
    headers.update({
        'referer': 'https://th5.m.zhe800.com/h5/comment/list?zid={0}&dealId=39890410&tagId='.format(str(goods_id))
    })
    params = (
        ('productId', str(goods_id)),
        ('tagId', ''),
        ('page', str(page_num)),
        ('perPage', self.page_size),
    )
    body = Requests.get_url_body(
        url=tmp_url,
        headers=headers,
        params=params,
        ip_pool_type=self.ip_pool_type)
    # self.lg.info(str(body))
    data = json_2_dict(
        json_str=body,
        logger=self.lg,
        default_res={})
    # pprint(data)
    assert data.get('comments') is not None \
        and data.get('hasNext') is not None, '获取到的data为None, 出错goods_id: {}'.format(goods_id)

    # whether there is another page of comments  # <class 'bool'>
    has_next_page = data.get('hasNext', False)
    comments = data.get('comments', [])
    self.lg.info('[{}] page_num: {}'.format('+' if comments != [] else '-', page_num,))
    return comments, has_next_page
def get_true_sku_info(self, sku_info):
    """
    Get price, spec value and stock for every sku spec.

    Defect fixed: the inverted ``if rest_number == 0: pass else: ...``
    branch is replaced with a positive guard (skip sold-out specs).
    Behavior — including returning the LAST matched item's ``i_s`` —
    is unchanged.

    :param sku_info: list of dicts with 'goods_id', 'color_name', 'img_url'
    :return: {} on failure | (true_sku_info, i_s)
    """
    goods_id_str = '-'.join([item.get('goods_id') for item in sku_info])
    # print(goods_id_str)
    tmp_url = 'https://p.mia.com/item/list/' + goods_id_str
    # print(tmp_url)
    tmp_body = MyRequests.get_url_body(
        url=tmp_url, headers=self.headers, had_referer=True)
    # print(tmp_body)
    tmp_data = json_2_dict(json_str=tmp_body).get('data', [])
    if tmp_data == []:
        self.result_data = {}
        return {}

    true_sku_info = []
    i_s = {}
    for item_1 in sku_info:
        for item_2 in tmp_data:
            if item_1.get('goods_id') != str(item_2.get('id', '')):
                continue
            i_s = item_2.get('i_s', {})
            # print(i_s)
            for spec_key in i_s.keys():
                rest_number = i_s.get(spec_key)
                if rest_number == 0:
                    # sold out — skip this spec
                    continue
                if spec_key == 'SINGLE':
                    spec_value = item_1.get('color_name')
                else:
                    spec_value = item_1.get('color_name') + '|' + spec_key
                true_sku_info.append({
                    'spec_value': spec_value,
                    'normal_price': str(item_2.get('mp')),
                    'detail_price': str(item_2.get('sp')),
                    'img_url': item_1.get('img_url'),
                    'rest_number': rest_number,
                })
    return (true_sku_info, i_s)
def get_spike_hour_goods_info(self):
    """
    Build the data url page by page and collect all recent
    flash-sale (限时秒杀) goods, then hand them to deal_with_data.

    :return: None
    """
    all_goods_list = []
    for gender in ['0', '1']:  # male, female
        for page in range(0, 100):  # which page of data to request
            print('正在抓取的page为: ', page)
            body = self.get_one_page_goods_info(gender, page)
            json_body = json_2_dict(body, default_res={})
            try:
                this_page_total_count = json_body.get('data', {}) \
                    .get('groupList', [])[0].get('totalCount', 0)
            except IndexError:
                print('获取this_page_total_count时出错, 请检查!')
                this_page_total_count = 0
            # print(this_page_total_count)
            if this_page_total_count == 0:
                print('### 该性别的全部限时商品信息获取完毕 ###')
                break

            tmp_goods_list = json_body.get('data', {}) \
                .get('groupList', [])[0].get('dataList', [])
            # tag each item with the request context
            for item in tmp_goods_list:
                item['gender'] = gender
                item['page'] = page
            # dedupe by 'id' while preserving order
            for item in tmp_goods_list:
                if item.get('id', 0) not in [known.get('id', 0) for known in all_goods_list]:
                    all_goods_list.append(item)
            sleep(.4)

    all_goods_list = [{
        'goods_id': str(item.get('chuchuId', '')),
        'sub_title': item.get('description', ''),
        'gender': item.get('gender', '0'),
        'page': item.get('page')
    } for item in all_goods_list]
    print(all_goods_list)
    print('本次抓取共有限时商品个数为: ', len(all_goods_list))
    self.deal_with_data(all_goods_list)
    return None
def check_proxy_status(self, proxy, timeout=CHECK_PROXY_TIMEOUT) -> bool:
    """
    Check a proxy's anonymity by echoing our request headers through it.
    Free proxy sites lie about anonymity, so it is verified here.

    :param proxy: proxy to test ('ip:port')
    :return: True only for a high-anonymity proxy
    """
    # lg.info(str(self.request))
    res = False
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': get_random_pc_ua(),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    proxies = {
        'http': 'http://' + proxy,
        # 'https': 'https://' + proxy,
    }
    try:
        response = requests.get(
            url=TEST_HTTP_HEADER,
            headers=headers,
            proxies=proxies,
            timeout=timeout)
        lg.info(str(response.text))
        if response.ok:
            content = json_2_dict(json_str=response.text)
            proxy_connection = content.get('headers', {}).get('Proxy-Connection', None)
            lg.info('Proxy-Connection: {}'.format(proxy_connection))
            ip = content.get('origin', '')
            # two ips in 'origin' => transparent proxy; a Proxy-Connection
            # header => detectable proxy; only high-anonymity passes
            if ',' not in ip and not proxy_connection:
                lg.info(str('成功捕获一只高匿ip: {}'.format(proxy)))
                return True
    except Exception:
        pass
    return res
def check_proxy_status(self, proxy, local_ip, timeout=CHECK_PROXY_TIMEOUT) -> bool:
    """
    Check a proxy's anonymity by echoing our request headers through it.
    Free proxy sites lie about anonymity, so it is verified here.

    :param proxy: proxy to test ('ip:port')
    :param local_ip: our real outgoing ip, used for leak detection
    :return: True only for a high-anonymity proxy
    """
    # lg.info(str(self.request))
    res = False
    headers = _get_base_headers()
    proxies = {
        'http': 'http://' + proxy,
        # 'https': 'https://' + proxy,
    }
    try:
        with session() as s:
            response = s.get(
                url=TEST_HTTP_HEADER,
                headers=headers,
                proxies=proxies,
                timeout=timeout)
            lg.info(str(response.text))
            if not response.ok:
                return res
            content = json_2_dict(json_str=response.text)
            proxy_connection = content.get('headers', {}).get('Proxy-Connection', None)
            lg.info('Proxy-Connection: {}'.format(proxy_connection))
            ip = content.get('origin', '')
            assert ip != '', 'ip为空!'
            # new-style check: without a proxy, httpbin reports the origin
            # as '原ip, 原ip' — a high-anonymity proxy must not leak
            # local_ip anywhere in the reported origin
            local_ip_str = '{}, {}'.format(local_ip, local_ip)
            if local_ip_str != ip and local_ip not in ip:
                lg.info(str('成功捕获一只高匿ip: {}'.format(proxy)))
                return True
    except Exception:
        pass
    return res
def get_goods_div_desc(self, tmp_p_info_body):
    """
    Build the div_desc html from the detail-image lists.

    Defect fixed: the <img>-tag concatenation block was duplicated
    verbatim in both branches; extracted into the `_build_div` helper.
    Behavior — including falling back from detailImage[0] to
    detailImage[1] when the first list is empty — is unchanged.
    (NOTE(review): an empty 'detailImage' still raises IndexError,
    as in the original.)

    :param tmp_p_info_body: raw json body str
    :return: '' on failure, or '<div>...</div>' html str
    """
    def _get_div_images_list(target):
        # normalize protocol-relative urls to http
        div_images_list = []
        for item in target:
            if re.compile('http').findall(item) == []:
                item = 'http:' + item
            div_images_list.append(item)
        return div_images_list

    def _build_div(div_images_list):
        # wrap every image into a full-width <img> tag
        tmp_div_desc = ''
        for item in div_images_list:
            tmp_div_desc += r'<img src="{}" style="height:auto;width:100%;"/>'.format(item)
        return '<div>' + tmp_div_desc + '</div>'

    tmp_p_info_data = json_2_dict(json_str=tmp_p_info_body)
    if tmp_p_info_data == {}:
        return ''

    detail_image = tmp_p_info_data.get('data', {}).get('detailInfos', {}).get('detailImage', [])
    div_images_list = _get_div_images_list(target=detail_image[0].get('list', []))
    if div_images_list == []:
        # print('div_images_list为空list, 出错请检查!')
        # the images may live in detailImage[1] instead
        div_images_list = _get_div_images_list(target=detail_image[1].get('list', []))
        if div_images_list == []:
            print('div_images_list为空list, 出错请检查!')
            return ''
    return _build_div(div_images_list)
def get_pintuan_goods_info(self):
    """
    Build the data url page by page and collect all recent pintuan
    (group-buy) goods, then hand them to deal_with_data.

    :return: None
    """
    goods_list = []
    for index in range(1, 1000):  # pages 0 and 1 return the same data, so start at 1
        tmp_url = 'https://m.mia.com/instant/groupon/common_list/' + str(index) + '/0/'
        print('正在抓取: ', tmp_url)
        body = Requests.get_url_body(
            url=tmp_url,
            headers=self.headers,
            had_referer=True,
            high_conceal=True,
            ip_pool_type=self.ip_pool_type)
        # print(body)
        if body == '':
            print('获取到的body为空值! 此处跳过')
        else:
            tmp_data = json_2_dict(json_str=body)
            if tmp_data == {}:
                print('json.loads转换body时出错, 此处跳过!')
            if tmp_data.get('data_list', []) == []:
                # an empty data_list marks the end of the listing
                print('得到的data_list为[], 此处跳过!')
                break
            else:
                # print(tmp_data)
                data_list = [{
                    'goods_id': item.get('sku', ''),
                    'sub_title': item.get('intro', ''),
                    'pid': index,
                } for item in tmp_data.get('data_list', [])]
                # pprint(data_list)
                goods_list.extend(data_list)
        sleep(.5)
    pprint(goods_list)
    self.deal_with_data(goods_list=goods_list)
    sleep(8)
    return None
def ocr_mt_captcha():
    """
    Recognize the mt captcha via the yundama OCR service.

    :return: the recognition result from yundama_ocr_captcha
    :raises KeyError: when the credentials file lacks a required key
    """
    with open('/Users/afa/myFiles/pwd/yundama_pwd.json', 'r') as f:
        yundama_info = json_2_dict(f.read())

    res = yundama_ocr_captcha(
        username=yundama_info['username'],
        pwd=yundama_info['pwd'],
        app_key=yundama_info['app_key'],
        code_type=1004,  # 4 alphanumeric chars
        img_path='./mt_captcha.png')
    print('识别结果:{}'.format(res))
    return res
def get_gd_key() -> str:
    """
    Read the gaode map key (second entry of 'fz_map_info') from the
    local pwd file.

    :return: the key string (asserted non-empty)
    """
    raw_json = ''
    gd_map_pwd_file_path = '/Users/afa/myFiles/pwd/gaode_map_pwd.json'
    with open(gd_map_pwd_file_path, 'r') as f:
        for line in f:
            # strip newlines and spaces so the pieces concatenate cleanly
            raw_json += line.replace('\n', '').replace(' ', '')
    # self.lg.info(raw_json)
    gd_key_list = json_2_dict(json_str=raw_json).get('fz_map_info', [])
    pprint(gd_key_list)
    assert gd_key_list != [], 'gd_key_list不为空list!'
    # the second configured key is the one in use
    gd_key = [item.get('key', '') for item in gd_key_list][1]
    assert gd_key != '', 'gd_key不为空str!'
    return gd_key
def get_stock_info_dict(self, goods_id):
    """
    Fetch real-time stock info for a goods id.

    :param goods_id: goods id
    :return: stock info dict ({} on failure)
    """
    stock_info_url = 'https://pina.m.zhe800.com/cns/products/' + str(goods_id) + '/realtime_info.json'
    raw_body = MyRequests.get_url_body(
        url=stock_info_url, headers=self.headers, high_conceal=True)
    if raw_body == '':
        print('获取到的stock_info_body为空值!')
        raw_body = '{}'

    tmp_stock_info = json_2_dict(json_str=raw_body).get('data', {})
    if tmp_stock_info == {}:
        # reset so a previous goods' result cannot leak into the next crawl
        self.result_data = {}
    return tmp_stock_info
def get_stock_info_dict(self, stock_info_url):
    """
    Fetch real-time stock info from the given url.

    :param stock_info_url: realtime-info endpoint url
    :return: stock info dict ({} on failure)
    """
    raw_body = MyRequests.get_url_body(url=stock_info_url, headers=self.headers)
    if raw_body == '':
        print('获取到的stock_info_body为空值!')
        raw_body = '{}'

    tmp_stock_info = json_2_dict(json_str=raw_body).get('data', {})
    if tmp_stock_info == {}:
        # reset so a previous goods' result cannot leak into the next crawl
        self.result_data = {}
    return tmp_stock_info