def get_one_page_all_goods_list(self, *params):
    """
    Fetch one page of the Jumei deal-list API and return its goods.

    :param params: positional arguments; ``params[0]`` is the page number
    :return: the str ``'网络错误!'`` when the response body cannot be parsed
        as JSON; otherwise a list of
        ``{'goods_id': str, 'type': ..., 'page': page}`` dicts, de-duplicated
        by ``item_id`` (empty list when the page has no ``item_list``)
    """
    page = params[0]
    tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=formal&page_key=1521336720'.format(
        str(page))
    body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)

    try:
        json_body = json.loads(body)
    except (ValueError, TypeError):
        # json.JSONDecodeError subclasses ValueError; TypeError covers a
        # body that is not str/bytes.
        print('json.loads转换body时出错!请检查')
        return '网络错误!'

    this_page_item_list = json_body.get('item_list', [])
    if this_page_item_list == []:
        return []

    # Track the ids already seen in a set so de-duplication stays linear
    # instead of rebuilding a list of ids for every item.
    seen_item_ids = set()
    all_goods_list = []
    for item in this_page_item_list:
        dedupe_key = item.get('item_id', '')
        if dedupe_key in seen_item_ids:
            continue
        seen_item_ids.add(dedupe_key)

        # Items without a usable item_id still occupy their dedupe slot
        # (as before) but are not returned.
        if item.get('item_id') is None:
            continue
        all_goods_list.append({
            'goods_id': str(item.get('item_id', '')),
            'type': item.get('type', ''),
            'page': page,
        })

    return all_goods_list
def get_one_page_goods_info(self, *params):
    """
    Request one page of the chuchujie goods API and return the raw body.

    :param params: exactly two positional values, ``(gender, page)``;
        gender ``'0'`` -> female, ``'1'`` -> male
    :return: the response body (str); ``'{}'`` when the body is empty
    """
    gender, page = params

    client_info = {
        "ageGroup": "AG_0to24",
        "channel": "QD_web_webkit",
        "deviceId": "0",
        "gender": gender,
        "imei": "0",
        "packageName": "com.culiu.purchase",
        "platform": "wap",
        "sessionId": "0",
        "shopToken": "0",
        "userId": "0",
        "version": "1.0",
        "xingeToken": "",
    }
    query_info = {"group": 4, "module": "99", "page": page, "tab": "all"}

    # These are sent as query-string parameters; data that must be POSTed
    # would have to go through the post method instead.
    request_params = {
        'client': json.dumps(client_info),
        'query': json.dumps(query_info),
        'page': page,
    }

    response_body = MyRequests.get_url_body(
        url='https://api.chuchujie.com/api/',
        headers=self.headers,
        params=request_params)

    return '{}' if response_body == '' else response_body
def _get_pintuan_goods_info(self):
    """
    Walk the zhe800 pintuan deal-list API and collect the product ids.

    Pages 0..99 are requested in order; paging stops at the first page
    whose ``objects`` list is empty or cannot be parsed.

    :return: list of unique ``(zid, page)`` tuples (order is not defined
        because duplicates are removed through a set)
    """
    zid_list = []
    for page in range(0, 100):
        tmp_url = 'https://pina.m.zhe800.com/nnc/list/deals.json?page={0}&size=500'.format(
            str(page))
        print('正在抓取的页面地址为: ', tmp_url)

        tmp_body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, high_conceal=True)
        if tmp_body == '':
            tmp_body = '{}'

        try:
            tmp_data = json.loads(tmp_body).get('objects', [])
        except (ValueError, TypeError, AttributeError):
            # ValueError: invalid JSON; TypeError: non-str body;
            # AttributeError: top-level JSON value is not an object.
            print('json.loads转换tmp_data时出错!')
            tmp_data = []

        if tmp_data == []:
            print('该tmp_url得到的object为空list, 此处跳过!')
            break

        for item in tmp_data:
            zid = item.get('product', {}).get('zid', '')
            # Compare the zid itself: the previous check compared the whole
            # (zid, page) tuple with '' and therefore never filtered anything.
            if zid != '':
                zid_list.append((zid, page))

    zid_list = list(set(zid_list))
    print('该zid_list的总个数为: ', len(zid_list))
    print(zid_list)

    return zid_list
def _get_wm_page_info(self):
    """
    Collect the take-away coupon recommendations (JSON) from quanmama.

    Pages 1..4 are posted to the list endpoint; the ``data.rows`` entries of
    every non-empty page are accumulated, then handed to
    ``self._parse_wm_page`` and ``self._deal_with_wm_info``.

    :return: None
    """
    list_api = 'https://app.quanmama.com/apios/v5/appZdmList.ashx'
    collected_rows = []

    print('开始采集券妈妈外卖券!')
    for page_index in range(1, 5):
        print('正在抓取第{0}页...'.format(page_index))
        post_data = self._set_data(page_index=page_index)
        body = MyRequests.get_url_body(
            method='post',
            url=list_api,
            headers=self.headers,
            cookies=None,
            data=post_data)

        if body == '':
            print('获取到的body为空值!此处跳过!')
        else:
            rows = json_2_dict(json_str=body).get('data', {}).get('rows', [])
            if rows == []:
                print('得到的rows为空值!此处跳过!')
            else:
                collected_rows.extend(rows)
                # Only pages that yielded rows are followed by a pause.
                sleep(self.page_sleep_time)

    print('\n@@@@@@ 抓取完毕!')

    self._deal_with_wm_info(self._parse_wm_page(collected_rows))
def getAllExternalLinks(siteUrl, _out=None):
    """
    Recursively crawl ``siteUrl`` and record every new link in result.txt.

    External links not yet in the module-level ``allExtLinks`` set are
    written to the file; internal links not yet in ``allIntLinks`` are
    crawled recursively and then written as well.

    :param siteUrl: URL of the page to crawl
    :param _out: internal - the already-open result file shared by the
        recursive calls; callers should leave it as None
    :return: None
    """
    if _out is None:
        # Open (and truncate) the result file exactly once for the whole
        # crawl and close it reliably. Previously every recursive call
        # re-opened the file with 'w', wiping what its callers had written,
        # and no handle was ever closed.
        with open('result.txt', 'w', encoding='utf-8') as result_file:
            getAllExternalLinks(siteUrl, result_file)
        return

    parsed = urlparse(siteUrl)
    domain = parsed.scheme + "://" + parsed.netloc
    html = MyRequests.get_url_body(url=siteUrl, headers=headers)
    bsObj = BeautifulSoup(html, 'lxml')
    internalLinks = getInternalLinks(bsObj, domain)
    externalLinks = getExternalLinks(bsObj, domain)

    # Collect external links.
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            _out.write(link + '\n')
            print("即将获取的外部链接的URL是:" + link)

    # Collect internal links, descending into each new one.
    # NOTE(review): recursion depth is bounded only by the number of
    # distinct internal links; a large site can hit the recursion limit.
    for link in internalLinks:
        if link not in allIntLinks:
            print("即将获取内部链接的URL是:" + link)
            allIntLinks.add(link)
            getAllExternalLinks(link, _out)
            _out.write(link + '\n')
def run_forever():
    """
    Probe douyin share pages for consecutive video ids, resuming from disk.

    The start index is read from the first line of ``./setting.txt``. Every
    50th index is written back to that file, followed by a 2 s pause. Each
    fetched body is passed to ``deal_with_data``; a 0.2 s pause follows
    unless that call returned ``False``.
    """
    checkpoint_path = './setting.txt'
    with open(checkpoint_path, 'r') as checkpoint:
        start = int(checkpoint.readline())

    # Video ids are offsets from 65 followed by seventeen zeros.
    first_video_id = int('65' + 17 * '0')

    for index in range(start, 99999999999999999):
        if index % 50 == 0:
            with open(checkpoint_path, 'w') as checkpoint:
                checkpoint.write(str(index))
            print('*** 短暂休眠...')
            sleep(2)

        video_id = str(first_video_id + index)
        url = 'https://www.iesdouyin.com/share/video/' + video_id + '/'
        body = MyRequests.get_url_body(url=url, headers=headers, params=params)

        handled = deal_with_data(video_id=video_id, body=body)
        if handled is not False:
            sleep(.2)
def _get_pc_goods_body(self, url, goods_id):
    """
    Fetch the body of a kaola goods page using desktop-browser headers.

    :param url: address of the goods page to request
    :param goods_id: not used by this method
    :return: whatever ``MyRequests.get_url_body`` returns for the request
    """
    # No cookie header is sent with this request.
    request_headers = {
        'authority': 'goods.kaola.com',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': get_random_pc_ua(),
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
    }

    # Query parameters sent with the request; the optional srId and
    # anti-cheat token/validate pairs are not sent.
    query_params = (
        ('ri', 'navigation'),
        ('from', 'page1'),
        ('zn', 'result'),
        ('zp', 'page1-0'),
        ('position', '0'),
        ('istext', '0'),
        ('isMarketPriceShow', 'true'),
        ('hcAntiCheatSwitch', '0'),
        ('anstipamActiCheatSwitch', '1'),
    )

    return MyRequests.get_url_body(url=url, headers=request_headers, params=query_params)
def _get_shop_name(self, **kwargs):
    """
    Look up the shop (seller nick) name for a zhe800 product.

    :param kwargs: ``data`` - the product dict; its
        ``'/app/detail/product/base'`` entry supplies ``sellerId``
    :return: the seller nick name (str, ``''`` when absent), or ``{}`` when
        the seller request returns an empty body or cannot be decoded
    """
    data = kwargs.get('data', {})
    seller_id = data.get('/app/detail/product/base', {}).get('sellerId', 0)
    tmp_seller_id_url = 'https://th5.m.zhe800.com/api/getsellerandswitch?sellerId=' + str(
        seller_id)

    seller_info_body = MyRequests.get_url_body(url=tmp_seller_id_url, headers=self.headers, high_conceal=True)
    if seller_info_body == '':
        print('seller_info为空!')
        return {}

    # The body is decoded directly: the former single-element list wrapper
    # and string re-concatenation were no-ops, and the branch that handled
    # an "empty list" could never run.
    seller_info = json_2_dict(json_str=seller_info_body)
    if seller_info == {}:
        print('卖家信息在转换时出现错误, 此处跳过')
        return {}

    return seller_info.get('sellerInfo', {}).get('nickName', '')
def get_div_desc_body(self, goods_id):
    """
    Fetch and clean the description HTML of a zhe800 pintuan product.

    :param goods_id: product zid
    :return: the washed description wrapped in ``<div>...</div>``; when the
        washed description equals ``''`` it is returned unwrapped
    """
    div_desc_url = 'https://pina.m.zhe800.com/nnc/product/detail_content.json?zid=' + str(goods_id)

    # Plain requests are used here (not phantomjs).
    raw_body = MyRequests.get_url_body(url=div_desc_url, headers=self.headers, high_conceal=True)
    json_text = '{}' if raw_body == '' else raw_body

    description = json_2_dict(json_str=json_text).get('data', '')
    if description == '':
        # Reset so a stale result cannot leak into the next crawl's save.
        self.result_data = {}

    washed = self._wash_div_desc(tmp_body=description)
    if washed == '':
        return washed

    return '<div>' + washed + '</div>'
def get_goods_data(self, goods_id:str) -> '重载获取数据的方法':
    """
    Scrape a mogujie flash-sale (rushdetail) page into a goods dict.

    :param goods_id: the full pc address of the goods; it must contain
        ``/rushdetail/`` and a query string
    :return: dict with title, sub_title, shop_name, all_img_url, p_info,
        div_desc, detail_name_list, price_info_list, price and taobao_price;
        ``{}`` on any failure or when the goods is sold out.
        ``self.result_data`` is always set to the returned value.
    """
    if goods_id == '':
        self.result_data = {}  # reset so stale data is not saved later
        return {}

    if re.compile(r'/rushdetail/').findall(goods_id) == []:
        print('获取到的蘑菇街买哦啥地址错误!请检查')
        self.result_data = {}
        return {}

    tmp_url = goods_id
    print('------>>>| 原pc地址为: ', tmp_url)
    try:
        # Raw string: '\?' in a normal literal is an invalid escape sequence.
        goods_id = re.compile(r'https://shop.mogujie.com/rushdetail/(.*?)\?.*?').findall(goods_id)[0]
    except IndexError:
        # Address without a query string (or on another host): previously
        # this escaped as an uncaught IndexError.
        print('获取到的蘑菇街买哦啥地址错误!请检查')
        self.result_data = {}
        return {}
    print('------>>>| 得到的蘑菇街商品id为:', goods_id)

    data = {}

    body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
    if body == '':
        print('获取到的body为空str!')
        self.result_data = {}
        return {}

    try:
        goods_info = re.compile(r'var detailInfo = (.*?);</script>').findall(body)[0]
        item_info = re.compile(r'itemInfo:(.*?) ,priceRuleImg').findall(goods_info)[0]
        sku_info = re.compile(r'skuInfo:(.*?),pinTuanInfo').findall(goods_info)[0]
        shop_info = re.compile(r'shopInfo:(.*?),skuInfo').findall(goods_info)[0]

        item_info = json_2_dict(json_str=item_info)
        sku_info = json_2_dict(json_str=sku_info)
        shop_info = json_2_dict(json_str=shop_info)

        data['title'] = item_info.get('title', '')
        if data['title'] == '':
            print('title为空!')
            raise Exception

        data['sub_title'] = ''
        data['shop_name'] = shop_info.get('name', '')

        # All sample images.
        data['all_img_url'] = [{'img_url': item} for item in item_info.get('topImages', [])]

        # p_info comes from a separate ajax endpoint.
        p_info_api_url = 'https://shop.mogujie.com/ajax/mgj.pc.detailinfo/v1?_ajax=1&itemId=' + str(goods_id)
        tmp_p_info_body = MyRequests.get_url_body(url=p_info_api_url, headers=self.headers, had_referer=True)
        if tmp_p_info_body == '':
            print('获取到的tmp_p_info_body为空值, 请检查!')
            raise Exception

        # An empty p_info is accepted: some goods simply have none.
        data['p_info'] = self.get_goods_p_info(tmp_p_info_body=tmp_p_info_body)

        # div_desc of the goods.
        div_desc = self.get_goods_div_desc(tmp_p_info_body=tmp_p_info_body)
        if div_desc == '':
            print('获取到的div_desc为空str, 请检查!')
            self.result_data = {}
            return {}
        data['div_desc'] = div_desc

        # Names of the spec dimensions.
        detail_name_list = self.get_goods_detail_name_list(sku_info=sku_info)
        if detail_name_list == '':
            print('获取detail_name_list出错, 请检查!')
            self.result_data = {}
            return {}
        data['detail_name_list'] = detail_name_list

        # Price, spec and stock for every sku.
        price_info_list = self.get_price_info_list(sku_info=sku_info)
        if price_info_list == '':
            raise Exception
        data['price_info_list'] = price_info_list

        if price_info_list == []:
            print('该商品已售完,此处将商品状态改为1')
            my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            try:
                sql_str = r'update dbo.mogujie_xianshimiaosha set is_delete=1 where goods_id = %s'
                # (goods_id,) - a real one-element tuple; "(goods_id)" is
                # just the bare string.
                my_pipeline._update_table(sql_str=sql_str, params=(goods_id,))
            except Exception:
                print('将该商品逻辑删除时出错!')
            else:
                # Only report success when the update did not raise.
                print('| +++ 该商品状态已被逻辑is_delete = 1 +++ |')
            self.result_data = {}
            return {}

        # Goods price (highest sku price) and taobao price (lowest).
        try:
            tmp_price_list = sorted(
                round(float(item.get('detail_price', '')), 2)
                for item in data['price_info_list'])
            price = round(Decimal(tmp_price_list[-1]), 2)
            taobao_price = round(Decimal(tmp_price_list[0]), 2)
        except IndexError:
            print('获取price和taobao_price时出错! 请检查')
            raise Exception

        data['price'] = price
        data['taobao_price'] = taobao_price

    except Exception as e:
        print('遇到错误: ', e)
        self.result_data = {}
        return {}

    # data always holds at least the title here, so it is never empty.
    self.result_data = data
    return data
def _get_div_desc(self, **kwargs):
    """
    Turn zhe800 detail data into HTML for the goods description.

    :param kwargs: ``detail`` - the graph-detail dict (any other type yields
        an empty ``<div></div>``); ``goods_id`` - product id used to fetch
        the size charts
    :return: size-chart tables followed by ``<div>`` + detail/notice images
        + ``</div>``; ``''`` when the size request returns an empty body or
        cannot be decoded
    """
    detail = kwargs.get('detail')
    goods_id = kwargs.get('goods_id')

    if not isinstance(detail, dict):
        # Without a detail dict no images and no size charts are produced.
        return '<div>' + '' + '</div>'

    # Images are rendered at full width with automatic height; the
    # height/width values supplied by the API are not used.
    img_tpl = r'<img src="{}" style="height:auto;width:100%;"/>'
    img_urls = []

    if detail.get('detailImages') is not None:
        for item in detail.get('detailImages', []):
            img_urls.append(item.get('big', ''))

    notice_image = detail.get('noticeImage')
    if isinstance(notice_image, dict):
        img_urls.append(notice_image.get('image', ''))
    elif isinstance(notice_image, list):
        for item in notice_image:
            img_urls.append(item.get('image', ''))

    images_html = ''.join(img_tpl.format(img_url) for img_url in img_urls)

    # Size charts (when present) are prepended to the description.
    tmp_size_url = 'https://th5.m.zhe800.com/app/detail/product/size?productId=' + str(
        goods_id)
    size_data_body = MyRequests.get_url_body(url=tmp_size_url, headers=self.headers, high_conceal=True)
    if size_data_body == '':
        print('size_data为空!')
        return ''

    size_data = json_2_dict(json_str=size_data_body)
    if size_data == {}:
        print('json.loads(size_data)出错, 此处跳过')
        return ''

    td_tpl = '<td style="vertical-align:inherit;display:table-cell;font-size:12px;color:#666;border:#666 1px solid;">{}</td>'
    charts_html = ''
    charts = size_data.get('charts', []) if size_data is not None else []
    for chart in charts:
        chart_rows = chart.get('data', [])  # one table per chart
        title = chart.get('title', '')

        rows_html = ''
        for row in chart_rows:  # each row is a list of cell dicts
            cells_html = ''.join(td_tpl.format(cell.get('value', '')) for cell in row)
            rows_html += '<tr style="border:#666 1px solid;">' + cells_html + '</tr>'

        charts_html += '<div>' + '<strong style="color:#666;">' + title + '</strong>' + '<table style="border-color:grey;border-collapse:collapse;text-align:center;line-height:25px;background:#fff;border-spacing:0;border:#666 1px solid;"><tbody style="border:#666 1px solid;">' + rows_html + '</tbody></table></div><br>'

    return charts_html + '<div>' + images_html + '</div>'
def get_goods_data(self, goods_id):
    """
    Collect all data of one zhe800 product from its gateway endpoints.

    Four requests are involved: product detail, graph detail (rendered to
    HTML through ``self._get_div_desc``), seller info (through
    ``self._get_shop_name``) and sale status (schedule and stock).

    :param goods_id: product id
    :return: the product dict; ``{}`` when any step fails.
        ``self.result_data`` is always set to the returned value.
    """
    if goods_id == '':
        self.result_data = {}  # reset so stale data is not saved later
        return {}

    tmp_url = 'https://th5.m.zhe800.com/gateway/app/detail/product?productId=' + str(
        goods_id)
    body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, high_conceal=True)
    if body == '':
        self.result_data = {}
        return {}

    data = json_2_dict(json_str=body)
    if data == {}:
        self.result_data = {}
        return {}

    # base / profiles / score / sku are JSON strings nested in the payload.
    # Decode all of them first, then write the decoded values back.
    sections = {}
    for name in ('base', 'profiles', 'score', 'sku'):
        key = '/app/detail/product/' + name
        section = json_2_dict(json_str=data.get(key, ''))
        if name == 'score':
            try:
                section.pop('contents')
            except (KeyError, AttributeError, TypeError):
                pass  # nothing to strip
        if section == {}:
            print("json.loads转换出错,得到{0}值可能为空,此处跳过".format(name))
            section = ''
        sections[key] = section
    data.update(sections)
    base = sections['/app/detail/product/base']

    # Mobile address of the goods.
    try:
        phone_url = 'http://th5.m.zhe800.com/h5/shopdeal?id=' + str(
            base.get('dealId', ''))
    except AttributeError:
        # base is '' when it could not be decoded.
        print('获取手机版地址失败,此处跳过')
        self.result_data = {}
        return {}
    print('------>>>| 得到商品手机版地址为: ', phone_url)

    # Graph detail (the illustrated description).
    tmp_detail_url = 'https://th5.m.zhe800.com/gateway/app/detail/graph?productId=' + str(
        goods_id)
    detail_data_body = MyRequests.get_url_body(
        url=tmp_detail_url, headers=self.headers, high_conceal=True)
    if detail_data_body == '':
        print('detail_data为[]!')
        self.result_data = {}
        return {}

    detail_data = json_2_dict(json_str=detail_data_body)
    if detail_data == {}:
        print('json.loads(detail_data)时报错, 此处跳过')
        self.result_data = {}
        return {}

    detail = json_2_dict(json_str=detail_data.get('/app/detail/graph/detail', ''))
    try:
        detail.pop('small')
    except (KeyError, AttributeError, TypeError):
        pass  # nothing to strip
    if detail == {}:
        print("json.loads转换出错,得到detail值可能为空,此处跳过")
        detail = ''

    # div_desc
    tmp_div_desc = self._get_div_desc(detail=detail, goods_id=goods_id)
    if tmp_div_desc == '':
        self.result_data = {}
        return {}
    data['/app/detail/graph/detail'] = tmp_div_desc

    # shop_name: an empty dict signals a failed seller lookup.
    shop_name = self._get_shop_name(data=data)
    if isinstance(shop_name, dict) and shop_name == {}:
        self.result_data = {}
        return {}
    data['shop_name'] = shop_name

    # Flash-sale start/end time and stock.
    schedule_and_stock_url = 'https://th5.m.zhe800.com/gateway/app/detail/status?productId=' + str(
        goods_id)
    schedule_and_stock_info_body = MyRequests.get_url_body(
        url=schedule_and_stock_url, headers=self.headers, high_conceal=True)
    if schedule_and_stock_info_body == '':
        print('schedule_and_stock_info为空!')
        self.result_data = {}
        return {}

    schedule_and_stock_info = json_2_dict(json_str=schedule_and_stock_info_body)
    if schedule_and_stock_info == {}:
        print('得到秒杀开始时间和结束时间时错误, 此处跳过')
        self.result_data = {}
        return {}

    schedule = schedule_and_stock_info.get('/app/detail/status/schedule')
    schedule = {} if schedule is None else json_2_dict(json_str=schedule)

    stock = schedule_and_stock_info.get('/app/detail/status/stock')
    stock = {} if stock is None else json_2_dict(json_str=stock)

    data['schedule'] = schedule
    data['stock'] = stock

    self.result_data = data
    return data
def get_div_from_pc_div_url(self, url, goods_id):
    """
    Request the taobao ``getdesc`` mtop API and return the description div.

    :param url: kept for interface compatibility; the request address is
        always rebuilt from ``goods_id`` and this value is not used
    :param goods_id: goods id sent as the ``id`` field of the request
    :return: str - result of ``self.deal_with_div`` on the ``pcDescContent``
        field; ``''`` when the body is empty or is not the expected jsonp
    """
    # Epoch seconds followed by three random digits.
    t = str(int(time.time())) + str(randint(100, 999))

    params_data_1 = {
        'id': goods_id,
        'type': '1',
    }

    tmp_url = 'https://api.m.taobao.com/h5/mtop.taobao.detail.getdesc/6.0/'
    _params = (
        ('appKey', '12574478'),
        ('t', t),
        ('api', 'mtop.taobao.detail.getdesc'),
        ('v', '6.0'),
        ('type', 'jsonp'),
        ('dataType', 'jsonp'),
        ('timeout', '20000'),
        ('callback', 'mtopjsonp1'),
        ('data', json.dumps(params_data_1)),
    )

    # urlencode turns the spaces of the JSON text into '+'; they are
    # stripped to obtain the request address the API expects.
    last_url = (tmp_url + '?' + urlencode(_params)).replace('+', '')

    data = MyRequests.get_url_body(url=last_url, headers=self.headers, params=None, timeout=14, num_retries=3)
    if data == '':
        self.my_lg.error(
            '获取到的div_desc为空值!请检查! 出错goods_id: {0}'.format(goods_id))
        return ''

    try:
        # Greedy match: everything between the outermost parentheses.
        # Raw string - '\(' in a plain literal is an invalid escape sequence.
        data = re.compile(r'mtopjsonp1\((.*)\)').findall(data)[0]
    except IndexError as e:
        self.my_lg.error(
            '获取data时, IndexError出错! 出错goods_id: {0}'.format(goods_id))
        self.my_lg.exception(e)
        return ''

    try:
        data = json.loads(data)
    except JSONDecodeError:
        self.my_lg.error('json转换data时出错, 请检查!')
        data = {}

    div = data.get('data', {}).get('pcDescContent', '')
    div = self.deal_with_div(div)

    return div
def get_goods_data(self, goods_id):
    """
    Fetch and pre-process the taobao mtop detail payload of one goods.

    :param goods_id: taobao goods id
    :return: the cleaned ``data`` part of the payload (dict); the value of
        ``self.init_pull_off_shelves_goods()`` when the goods was taken off
        the shelves and redirected; ``{}`` on any failure.
        ``self.result_data`` is reset to ``{}`` on every non-success path.
    """
    self.msg = '------>>>| 对应的手机端地址为: ' + 'https://h5.m.taobao.com/awp/core/detail.htm?id=' + str(
        goods_id)
    self.my_lg.info(self.msg)

    # Body of the main API.
    last_url = self._get_last_url(goods_id=goods_id)
    data = MyRequests.get_url_body(url=last_url, headers=self.headers, params=None, timeout=14)
    if data == '':
        self.my_lg.error('出错goods_id: {0}'.format((goods_id)))
        self.result_data = {}
        return {}

    try:
        # Greedy match: everything between the outermost parentheses.
        data = re.compile(r'mtopjsonp1\((.*)\)').findall(data)[0]
    except IndexError:
        self.my_lg.error('data为空! 出错goods_id: {0}'.format(goods_id))
        self.result_data = {}
        return {}

    try:
        data = json.loads(data)
    except json.JSONDecodeError:
        self.my_lg.error('json.loads转换data时出错, 请检查! 出错goods_id: ' + str(goods_id))
        self.result_data = {}
        return {}

    # A missing or null 'data' / 'seller' / 'trade' is treated as empty so
    # both shelf checks below behave the same way instead of raising.
    result_data = data.get('data') or {}
    seller = result_data.get('seller') or {}
    trade = result_data.get('trade') or {}
    evaluates_missing = seller.get('evaluates') is None

    if trade.get('redirectUrl', '') != '' and evaluates_missing:
        # The goods was taken off the shelves; the address is redirected.
        self.my_lg.info('@@@@@@ 该商品已经下架...')
        tmp_data_s = self.init_pull_off_shelves_goods()
        self.result_data = {}
        return tmp_data_s

    # Goods that were moved or removed so the page no longer exists.
    if evaluates_missing:
        self.my_lg.info('data为空, 地址被重定向, 该商品可能已经被转移或下架')
        self.result_data = {}
        return {}

    # Blank out bulky sections that are not needed downstream.
    result_data['rate'] = ''        # goods reviews
    result_data['resource'] = ''    # buyers' questions
    result_data['vertical'] = ''    # more questions and answers
    seller['evaluates'] = ''        # description / service / logistics scores

    api_stack = result_data.get('apiStack', [])
    if not api_stack:
        # Previously an IndexError escaped from here.
        self.my_lg.error('apiStack为空! 出错goods_id: ' + str(goods_id))
        self.result_data = {}
        return {}

    # Wash apiStack[0]['value'] and store the washed value back.
    api_stack[0]['value'] = self._wash_result_data_apiStack_value(
        goods_id=goods_id,
        result_data_apiStack_value=api_stack[0].get('value', {}))

    # mockData is a JSON string; a missing key now fails the same way as
    # undecodable text (json.loads(None) raises and is handled below).
    try:
        mock_data = json.loads(result_data.get('mockData'))
    except Exception:
        self.my_lg.error('json.loads转化mock_data时出错, 跳出' + ' 出错goods_id: ' + str(goods_id))
        self.result_data = {}
        return {}
    mock_data['feature'] = ''
    result_data['mockData'] = mock_data

    # {'name': 'esi', 'value': ''} is a possible apiStack entry.
    if api_stack[0].get('value', '') == '':
        self.my_lg.info(
            "result_data.get('apiStack', [])[0].get('value', '')的值为空....")
        result_data['trade'] = {}
        self.result_data = {}
        return {}

    # 'trade' is what later tells whether the goods is off the shelves.
    result_data['trade'] = api_stack[0].get('value', {}).get('trade', {})

    self.result_data = result_data
    return result_data
def get_spike_hour_goods_info(self):
    """
    Crawl every juanpi flash-sale tab and store goods not yet in the db.

    For each tab id, pages 0..49 are requested until a page has an empty
    body, cannot be decoded, or reports ``goodslist == []``. Every goods
    whose id is not already in the database is parsed with ``JuanPiParse``
    and inserted into the flash-sale table.

    :return: None
    """
    tab_id_list = [11, 12, 13, 21, 22, 23, 31, 32, 33]  # known tab ids

    for tab_id in tab_id_list:
        for index in range(0, 50):
            tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                str(tab_id), str(index))
            print('待抓取的限时秒杀地址为: ', tmp_url)

            body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
            if body == '':
                break

            try:
                data = json.loads(body).get('data', {})
            except (ValueError, TypeError, AttributeError):
                break
            if not isinstance(data, dict):
                # e.g. {"data": null}: nothing usable on this tab.
                break

            if data.get('goodslist') == []:
                print('tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(
                    tab_id, index))
                break

            goods_list = data.get('goodslist', [])
            if goods_list == []:
                # 'goodslist' key missing: skip this page only.
                print('goodslist为[], 此处跳过')
                continue

            miaosha_goods_list = self.get_miaoshao_goods_info_list(
                data=goods_list)
            print(miaosha_goods_list)

            juanpi = JuanPiParse()
            my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            if my_pipeline.is_connect_success:
                # Query the known goods ids once (it used to run twice) and
                # keep them in a set for constant-time membership tests.
                db_rows = my_pipeline._select_table(sql_str=jp_select_str_5)
                db_goods_ids = set() if db_rows is None else {row[0] for row in db_rows}

                for item in miaosha_goods_list:
                    if item.get('goods_id', '') in db_goods_ids:
                        print('该goods_id已经存在于数据库中, 此处跳过')
                        continue

                    tmp_url = 'http://shop.juanpi.com/deal/' + item.get(
                        'goods_id')
                    juanpi.get_goods_data(goods_id=item.get('goods_id'))
                    goods_data = juanpi.deal_with_data()
                    if goods_data == {}:
                        continue  # nothing parsed, skip

                    goods_data['stock_info'] = item.get('stock_info')
                    goods_data['goods_id'] = item.get('goods_id')
                    goods_data['spider_url'] = tmp_url
                    goods_data['username'] = '******'
                    goods_data['price'] = item.get('price')  # price before the flash sale
                    goods_data['taobao_price'] = item.get('taobao_price')  # flash-sale price
                    goods_data['sub_title'] = item.get('sub_title', '')
                    goods_data['miaosha_time'] = item.get('miaosha_time')
                    goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                        get_miaosha_begin_time_and_miaosha_end_time(
                            miaosha_time=item.get('miaosha_time'))
                    goods_data['tab_id'] = tab_id
                    goods_data['page'] = index

                    juanpi.insert_into_juanpi_xianshimiaosha_table(
                        data=goods_data, pipeline=my_pipeline)
                    sleep(.4)  # short pause between inserts

                sleep(.65)

            # Drop the parser before the next page and reclaim memory.
            del juanpi
            gc.collect()
def _get_comment_data(self, goods_id): if goods_id == '': self.result_data = {} return {} self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id)) # # 原先采用phantomjs, 改用手机端抓html(speed slow, give up) # tmp_url = 'https://m.1688.com/page/offerRemark.htm?offerId=' + str(goods_id) # body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url, exec_code=self._exec_code) # # self.my_lg.info(str(body)) # # if body == '': # self.result_data = {} # self.my_lg.error('该地址的body为空值, 出错地址: ' + tmp_url) # return {} # # _html_comment_list = list(Selector(text=body).css('div.remark-item').extract()) # if _html_comment_list != []: # _comment_list = [] # for index, item in enumerate(_html_comment_list): # if index > 25: # 就取前25条评论信息 # break # # buyer_name = str(Selector(text=item).css('span.member::text').extract_first()) # quantify = str(Selector(text=item).css('span.amount::text').extract_first()) # try: # quantify = int(re.compile(r'\d+').findall(quantify)[0]) # except IndexError: # self.my_lg.error('获取quantify时索引异常! 
出错地址: ' + tmp_url) # self.result_data = {} # return {} # # comment_date = str(Selector(text=item).css('div.date span::text').extract_first()) # comment_date = self._get_comment_date(comment_date) # str '2017-01-25 17:06:00' # tmp_sku_info = str(Selector(text=item).css('div.date::text').extract_first()) # # _comment_content = self._wash_comment(str(Selector(text=item).css('div.bd::text').extract_first())) # if not filter_invalid_comment_content(_comment_content): # continue # # comment = [{ # 'comment': _comment_content, # 'comment_date': comment_date, # 评论创建日期 # 'sku_info': re.compile(r'<span.*?</span>').sub('', tmp_sku_info), # 购买的商品规格 # 'img_url_list': [], # 'star_level': randint(3, 5), # 几星好评 # 'video': '', # }] # # _ = { # 'buyer_name': buyer_name, # 买家昵称 # 'comment': comment, # 评论内容 # 'quantify': quantify, # 购买数量 # 'head_img': '', # 用户头像 # 'append_comment': {}, # 追评 # } # _comment_list.append(_) # # _t = datetime.datetime.now() # # _r = CommentItem() # _r['goods_id'] = str(goods_id) # _r['create_time'] = _t # _r['modify_time'] = _t # _r['_comment_list'] = _comment_list # self.result_data = _r # # pprint(self.result_data) # return self.result_data # else: # self.my_lg.error('该商品的comment为空list! 
出错地址: ' + tmp_url) # self.result_data = {} # return {} '''下面是模拟pc端好评接口''' member_id = self._get_this_goods_member_id(goods_id=goods_id) self.my_lg.info('------>>>| 获取到的member_id: {0}'.format(member_id)) if member_id == '': self.my_lg.error('获取到的member_id为空值!请检查!') self.result_data = {} return {} # 这里从db获取该商品原先的规格值 sku_info = self._get_sku_info_from_db(goods_id) # self.my_lg.info('sku_info: {0}'.format(sku_info)) if sku_info == []: self.result_data = {} return {} _comment_list = [] for page_num in range(1, 4): self.my_lg.info('------>>>| 正在抓取第{0}页...'.format(page_num)) params = self._set_params(goods_id=goods_id, member_id=member_id, page_num=page_num) url = 'https://rate.1688.com/remark/offerDetail/rates.json' tmp_headers = self.headers tmp_headers.update({ 'referer': 'https://detail.1688.com/offer/{0}.html'.format(str(goods_id)) }) # 原先用MyRequests老是404,改用phantomjsy也还是老是404 body = MyRequests.get_url_body(url=url, headers=tmp_headers, params=params) # self.my_lg.info(str(body)) # 用phantomjs # url = self._set_url(url=url, params=params) # self.my_lg.info(url) # body = self.my_phantomjs.use_phantomjs_to_get_url_body(url) # try: # body = re.compile('<pre.*?>(.*)</pre>').findall(body)[0] # except IndexError: # self.my_lg.error('获取body时索引异常!') # self.result_data = {} # return {} if body == '': self.result_data = {} self.my_lg.error('该地址的body为空值, 出错goods_id: {0}'.format(goods_id)) return {} data = self.json_str_2_dict(json_str=body) if data.get('url') is not None: self.my_lg.info('------>>>| 被重定向到404页面, 休眠{0}s中...'.format(self._page_sleep_time)) sleep(self._page_sleep_time) break # self.my_lg.info(str(body)) data = data.get('data', {}).get('rates', []) # pprint(data) if data == []: # sleep(self._page_sleep_time) break try: for item in data: buyer_name = item.get('member', '') comment = [] for i in item.get('rateItem', []): _comment_content = self._wash_comment(i.get('remarkContent', '')) if not filter_invalid_comment_content(_comment_content): continue comment.append({ 
'comment': _comment_content, 'comment_date': str(i.get('remarkTime', '')), # 评论日期 'sku_info': choice(sku_info), # 购买的商品规格(pc端1688商品没有规格) 'star_level': i.get('starLevel', 5), 'img_url_list': [], 'video': '', }) quantify = item.get('quantity', 1) # 购买数量 if comment == []: # 为空不录入 continue _ = { 'buyer_name': buyer_name, # 买家昵称 'comment': comment, # 评论内容 'quantify': quantify, # 购买数量 'head_img': '', # 用户头像 'append_comment': {}, # 追评 } _comment_list.append(_) except Exception: self.result_data = {} self.my_lg.error('出错商品goods_id: {0}'.format(goods_id), exc_info=True) return {} sleep(self._page_sleep_time) if _comment_list != []: # pprint(_comment_list) _t = datetime.datetime.now() _r = CommentItem() _r['goods_id'] = str(goods_id) _r['create_time'] = _t _r['modify_time'] = _t _r['_comment_list'] = _comment_list self.result_data = _r return self.result_data else: self.my_lg.error('出错goods_id: {0}'.format(goods_id)) self.result_data = {} return {}
def get_goods_data(self, goods_id):
    """Fetch and assemble the detail data of one Jumei product.

    Two h5.jumei.com endpoints are queried (``ajaxStaticDetail`` and
    ``ajaxDynamicDetail``). Both payloads are passed through the instance's
    wash helpers, merged, and the fields needed downstream are extracted.

    :param goods_id: ``[item_id, type]``; an empty list is rejected.
    :return: dict of extracted fields, or ``{}`` on any failure.
        ``self.result_data`` is always set to the returned value so that a
        stale result from a previous crawl is never stored by mistake.
    """
    def _fail(message=None):
        # Single exit point for every failure: report, reset, return {}.
        if message is not None:
            print(message)
        self.result_data = {}
        return {}

    if goods_id == []:
        return _fail()

    # Both parts are stringified once; the original concatenated goods_id[0]
    # raw in one of the three URLs, which raised TypeError for int ids.
    item_id = str(goods_id[0])
    goods_type = str(goods_id[1])

    goods_url = 'https://h5.jumei.com/product/detail?item_id=' + item_id + '&type=' + goods_type
    print('------>>>| 对应的手机端地址为: ', goods_url)

    # --- static detail request ---
    tmp_url = 'https://h5.jumei.com/product/ajaxStaticDetail?item_id=' + item_id + '&type=' + goods_type
    self.headers['Referer'] = goods_url
    body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
    if body == '':
        return _fail('获取到的body为空str!')
    try:
        tmp_data = json.loads(body)
    except Exception:
        return _fail('json.loads转换body时出错!请检查!')
    tmp_data = self.wash_data(data=tmp_data)

    # --- dynamic detail request ---
    tmp_url_2 = 'https://h5.jumei.com/product/ajaxDynamicDetail?item_id=' + item_id + '&type=' + goods_type
    body_2 = MyRequests.get_url_body(url=tmp_url_2, headers=self.headers)
    if body_2 == '':
        return _fail('获取到的body为空str!')
    try:
        tmp_data_2 = json.loads(body_2)
    except Exception:
        return _fail('json.loads转换body_2时出错!请检查!')
    tmp_data_2 = self.wash_data_2(data=tmp_data_2)

    tmp_data['data_2'] = tmp_data_2.get('data', {}).get('result', {})
    if tmp_data['data_2'] == {}:
        return _fail('获取到的ajaxDynamicDetail中的数据为空值!请检查!')

    data = {}
    try:
        static_part = tmp_data.get('data', {})
        dynamic_part = tmp_data['data_2']

        data['title'] = static_part.get('name', '')
        data['sub_title'] = ''
        if data['title'] == '':
            raise ValueError('获取到的title为空值, 请检查!')

        # shop_info is sometimes [] (and may be null); only a dict carries
        # a store title.
        shop_info = dynamic_part.get('shop_info')
        if isinstance(shop_info, dict):
            data['shop_name'] = shop_info.get('store_title', '')
        else:
            data['shop_name'] = ''

        # Sample pictures: keep the "800" rendition of every entry.
        img_set = static_part.get('image_url_set', {}).get('single_many', [])
        if img_set == []:
            raise ValueError('获取到的all_img_url为空[], 请检查!')
        data['all_img_url'] = [{'img_url': item.get('800', '')} for item in img_set]

        data['p_info'] = self.get_p_info(tmp_data=tmp_data)

        div_desc = self.get_goods_div_desc(tmp_data=tmp_data)
        if div_desc == '':
            raise ValueError('获取到的div_desc为空值! 请检查')
        data['div_desc'] = div_desc

        # On/off-shelf time window.
        data['sell_time'] = self.get_sell_time(
            begin_time=dynamic_part.get('start_time'),
            end_time=dynamic_part.get('end_time'),
        )

        data['detail_name_list'] = self.get_detail_name_list(
            size_attr=dynamic_part.get('size_attr', []))

        # Price / spec / stock of every sku.
        true_sku_info = self.get_true_sku_info(size=dynamic_part.get('size', []))
        if true_sku_info == []:
            raise ValueError('获取到的sku_info为空值, 请检查!')
        data['price_info_list'] = true_sku_info

        # The product counts as deleted once the sale has ended or no sku
        # has stock left.
        if int(dynamic_part.get('end_time')) < int(time.time()):
            is_delete = 1
        else:
            all_stock = sum(item.get('rest_number', 0) for item in true_sku_info)
            is_delete = 1 if all_stock == 0 else 0
        data['is_delete'] = is_delete

        data['all_sell_count'] = dynamic_part.get('buyer_number', '0')
    except Exception as e:
        print('遇到错误如下: ', e)
        return _fail()

    self.result_data = data
    return data
def run_forever(self):
    """Run one refresh pass over the stored Juanpi flash-sale products.

    Rows are read from the database with ``jp_select_str_4`` (after a
    clean-up with ``jp_delete_str_4``). For every row:

    * ``is_recent_time(...) == 0``: the sale is over and the row is deleted;
    * ``is_recent_time(...) == 2``: the row is left untouched;
    * otherwise the tab/page listing is fetched again. The row is deleted
      when the product left the listing, else the product is re-parsed and
      the table row is updated.

    Each row is indexed as ``item[0]`` goods_id, ``item[1]`` JSON string
    holding ``miaosha_begin_time``, ``item[2]`` tab_id, ``item[3]`` page.

    :return: None
    """
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        tmp_sql_server._delete_table(sql_str=jp_delete_str_4, params=None)
        result = list(tmp_sql_server._select_table(sql_str=jp_select_str_4))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is not None:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')

        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        # The parser is recreated periodically (see below) to bound memory.
        juanpi_miaosha = JuanPiParse()

        for item in result:
            begin_time_str = json.loads(item[1]).get('miaosha_begin_time')
            # mktime() returns float seconds; int() truncates without the
            # string slicing the original relied on.
            miaosha_begin_time = int(
                time.mktime(time.strptime(begin_time_str, '%Y-%m-%d %H:%M:%S')))

            if index % 50 == 0:
                # Reconnect every 50 rows to avoid a stale long-lived connection.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if not tmp_sql_server.is_connect_success:
                print('数据库连接失败,数据库可能关闭或者维护中')
                continue

            # Evaluated once per row (the original called it twice).
            time_state = self.is_recent_time(miaosha_begin_time)
            if time_state == 0:
                # params is a real one-element tuple; "(x)" is just x.
                tmp_sql_server._delete_table(
                    sql_str=self.delete_sql_str,
                    params=(item[0],),
                    lock_timeout=2000)
                print('过期的goods_id为(%s)' % item[0],
                      ', 限时秒杀开始时间为(%s), 删除成功!' % begin_time_str)
            elif time_state == 2:
                # Must not break here: rows are not returned in time order.
                pass
            else:
                print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                      % (item[0], index))

                tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                    str(item[2]),
                    str(item[3]),
                )
                data = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
                # NOTE(review): an empty or unparsable listing aborts the whole
                # pass (break), as in the original — confirm this is intended.
                if data == '':
                    break
                try:
                    data = json.loads(data)
                    data = data.get('data', {})
                except Exception:
                    break

                if data.get('goodslist') == []:
                    print('tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(
                        item[2], item[3]))
                else:
                    data = data.get('goodslist', [])
                    if data == []:
                        print('goodslist为[], 此处跳过')
                    else:
                        miaosha_goods_list = self.get_miaoshao_goods_info_list(data=data)
                        # Every goods_id currently listed under this tab_id/page.
                        miaosha_goods_all_goods_id = [
                            i.get('goods_id') for i in miaosha_goods_list
                        ]

                        if item[0] not in miaosha_goods_all_goods_id:
                            # The product left the flash-sale listing.
                            tmp_sql_server._delete_table(
                                sql_str=self.delete_sql_str,
                                params=(item[0],))
                            print('该商品[goods_id为(%s)]已被下架限时秒杀活动,此处将其删除' % item[0])
                        else:
                            for item_1 in miaosha_goods_list:
                                if item_1.get('goods_id', '') != item[0]:
                                    continue

                                juanpi_miaosha.get_goods_data(goods_id=item[0])
                                goods_data = juanpi_miaosha.deal_with_data()
                                if goods_data != {}:
                                    stock_info = item_1.get('stock_info')
                                    goods_data['stock_info'] = stock_info
                                    goods_data['goods_id'] = item_1.get('goods_id')
                                    # Listing prices are only used while sale
                                    # stock remains; null stock counts as 0.
                                    activity_stock = (stock_info or {}).get('activity_stock') or 0
                                    if activity_stock > 0:
                                        goods_data['price'] = item_1.get('price')
                                        goods_data['taobao_price'] = item_1.get('taobao_price')
                                    goods_data['sub_title'] = item_1.get('sub_title', '')
                                    goods_data['miaosha_time'] = item_1.get('miaosha_time')
                                    goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                                        get_miaosha_begin_time_and_miaosha_end_time(
                                            miaosha_time=item_1.get('miaosha_time'))

                                    juanpi_miaosha.to_update_juanpi_xianshimiaosha_table(
                                        data=goods_data, pipeline=tmp_sql_server)
                                # Throttle after every detail fetch.
                                sleep(.3)

            if index % 10 == 0:
                # Recreate the parser every 10 rows to release its memory.
                juanpi_miaosha = JuanPiParse()
                gc.collect()

            index += 1
            gc.collect()

        print('全部数据更新完毕'.center(100, '#'))

    if get_shanghai_time().hour == 0:
        # No updates right after midnight: pause for 5.5 hours.
        sleep(60 * 60 * 5.5)
    gc.collect()
def get_goods_data(self, goods_id):
    """Scrape one Mogujie product from its PC detail page.

    The ``detailInfo`` JavaScript object embedded in
    ``https://shop.mogujie.com/detail/<goods_id>`` is cut into its
    ``itemInfo`` / ``skuInfo`` / ``shopInfo`` parts with regular expressions.
    Properties and description come from the ``mgj.pc.detailinfo`` ajax
    endpoint. (An earlier signed-API approach was abandoned because the
    ``mw-sign`` parameter could not be reproduced.)

    :param goods_id: Mogujie item id; ``''`` is rejected.
    :return: dict of extracted fields, or ``{}`` on any failure.
        ``self.result_data`` is always set to the returned value.
    """
    def _fail(message=None):
        # Reset result_data so a stale result is never stored by the caller.
        if message is not None:
            print(message)
        self.result_data = {}
        return {}

    if goods_id == '':
        return _fail()

    tmp_url = 'https://shop.mogujie.com/detail/' + str(goods_id)
    print('------>>>| 原pc地址为: ', tmp_url)

    data = {}
    body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
    if body == '':
        return _fail('获取到的body为空str!')

    try:
        # [0] raises IndexError when a marker is missing; handled below.
        goods_info = re.compile(r'var detailInfo = (.*?);</script>').findall(body)[0]
        item_info = re.compile(r'itemInfo:(.*?),priceRuleImg').findall(goods_info)[0]
        sku_info = re.compile(r'skuInfo:(.*?),pinTuanInfo').findall(goods_info)[0]
        shop_info = re.compile(r'shopInfo:(.*?),skuInfo').findall(goods_info)[0]

        item_info = json_2_dict(json_str=item_info)
        sku_info = json_2_dict(json_str=sku_info)
        shop_info = json_2_dict(json_str=shop_info)

        data['title'] = item_info.get('title', '')
        if data['title'] == '':
            raise ValueError('title为空!')
        data['sub_title'] = ''
        data['shop_name'] = shop_info.get('name', '')

        # Sample pictures.
        data['all_img_url'] = [{
            'img_url': item
        } for item in item_info.get('topImages', [])]

        # Properties and description share one ajax response.
        p_info_api_url = 'https://shop.mogujie.com/ajax/mgj.pc.detailinfo/v1?_ajax=1&itemId=' + str(goods_id)
        tmp_p_info_body = MyRequests.get_url_body(
            url=p_info_api_url, headers=self.headers, had_referer=True)
        if tmp_p_info_body == '':
            raise ValueError('获取到的tmp_p_info_body为空值, 请检查!')

        # p_info may legitimately be [] for some products.
        data['p_info'] = self.get_goods_p_info(tmp_p_info_body=tmp_p_info_body)

        div_desc = self.get_goods_div_desc(tmp_p_info_body=tmp_p_info_body)
        if div_desc == '':
            return _fail('获取到的div_desc为空str, 请检查!')
        data['div_desc'] = div_desc

        detail_name_list = self.get_goods_detail_name_list(sku_info=sku_info)
        if detail_name_list == '':
            return _fail('获取detail_name_list出错, 请检查!')
        data['detail_name_list'] = detail_name_list

        # Price / spec / stock of every sku.
        price_info_list = self.get_price_info_list(sku_info=sku_info)
        if price_info_list == '':
            raise ValueError('获取price_info_list出错, 请检查!')
        data['price_info_list'] = price_info_list

        # price = highest sku price, taobao_price = lowest sku price.
        tmp_price_list = sorted([
            round(float(item.get('detail_price', '')), 2)
            for item in data['price_info_list']
        ])
        if not tmp_price_list:
            raise ValueError('获取price和taobao_price时出错! 请检查')
        data['price'] = round(Decimal(tmp_price_list[-1]), 2)
        data['taobao_price'] = round(Decimal(tmp_price_list[0]), 2)
    except Exception as e:
        print('遇到错误: ', e)
        return _fail()

    self.result_data = data
    return data
def _get_true_sku_info(self, goods_id, tmp_data): ''' 得到每个规格对应的库存, 价格, 图片等详细信息 :param tmp_data: :return: ''' def _get_other(other_items): other_ = [] for item in other_items: if item.get('type', 0) == 1: # 该规格无库存 continue else: # 该规格有库存 detail_price = item.get('promotion_price', '') # 还是选择所有商品都拿最优惠的价格 # if detail_price == '' or goods_id[0] == 1: # 为空就改为获取vipshop_price字段 if detail_price == '': # 为空就改为获取vipshop_price字段 detail_price = item.get('vipshop_price', '') else: pass normal_price = item.get('market_price', '') if normal_price == '': normal_price = detail_price other_.append({ 'spec_value': item.get('sku_name', ''), 'detail_price': detail_price, 'normal_price': normal_price, 'rest_number': item.get('leavings', 0), # 该规格的剩余库存量 'img_url': '', # 设置默认为空值 }) return other_ multiColor = tmp_data[5].get('result', {}) # pprint(multiColor) ## ** 研究发现multiColor以及productSku中的type为1时,表示该商品规格库存为0 productSku = tmp_data[6].get('result', {}).get('productSku', {}) # pprint(productSku) true_sku_info = [] color_ = None if multiColor == {} or productSku == {}: return [] else: if multiColor.get('items') is None: pass else: tmp_color_items = multiColor.get('items', []) color_ = [] for item in tmp_color_items: if item.get('type', 0) == 1: # 该颜色无库存 continue else: # 为0,表示有库存 # 先获取到有库存的对应规格, 是否有颜色属性后面再判断 color_.append({ 'goods_id': item.get('product_id', ''), 'name': item.get('name', ''), 'img_url': 'https:' + item.get('icon', {}).get('imageUrl', '') }) # pprint(color_) if color_ == []: # 没有规格 也可能是 # 表示没有库存, 买完或者下架 print('获取到的color_为空[], 请检查!') return [] else: if productSku.get('items') is None: print('获取到的others_items为None') return [] else: other_items = productSku.get('items', []) other_ = _get_other(other_items=other_items) if color_ is None: for item_2 in other_: spec_value = item_2.get('spec_value', '') item_2['spec_value'] = spec_value item_2['img_url'] = '' true_sku_info.append(item_2) else: for item in color_: if item.get( 'goods_id') == goods_id[1]: # 表示为原先的那个goods_id if 
item.get('name', '') == '无': # 表示无颜色属性 pass else: for item_2 in other_: spec_value = item.get( 'name', '') + '|' + item_2.get( 'spec_value', '') item_2['spec_value'] = spec_value item_2['img_url'] = item.get('img_url', '') true_sku_info.append(item_2) else: # 表示是其他颜色对应的goods_id '''下面是获取该颜色对应goods_id的所有可售的规格价格信息''' url = 'https://m.vip.com/server.html' params = self._set_params() page = 'product-0-' + str(goods_id[1]) + '.html' post_data = self._set_post_data(page=page) tmp_data_2 = MyRequests.get_url_body( method='post', url=url, headers=self.headers, params=params, data=post_data) # print(tmp_data_2) # 先处理得到dict数据 if tmp_data_2 == '': print('获取其他颜色规格的url的body时为空值') return [] else: tmp_data_2 = json_2_dict(json_str=tmp_data_2) if tmp_data_2 == {}: return [] other_items_2 = tmp_data_2[6].get( 'result', {}).get('productSku', {}).get('items', []) other_2 = _get_other(other_items=other_items_2) for item_4 in other_2: spec_value = item.get( 'name', '') + '|' + item_4.get( 'spec_value', '') item_4['spec_value'] = spec_value item_4['img_url'] = item.get('img_url', '') true_sku_info.append(item_4) return true_sku_info
def get_goods_data(self, goods_id):
    """Fetch one VIP.com product through the mini-program batch endpoint.

    A single POST to ``https://m.vip.com/server.html`` returns a list of
    results; entry 2 is read as the product detail. The sku list is
    delegated to ``self._get_true_sku_info``.

    :param goods_id: list whose second element is the product id; ``[]`` is
        rejected.
    :return: dict of extracted fields (also stored in ``self.result_data``),
        or the value of ``self._error_data_init()`` on any failure.
    """
    if goods_id == []:
        return self._error_data_init()

    data = {}
    url = 'https://m.vip.com/server.html'
    params = self._set_params()
    page = 'product-0-' + str(goods_id[1]) + '.html'
    post_data = self._set_post_data(page=page)

    body = MyRequests.get_url_body(
        method='post',
        url=url,
        headers=self.headers,
        params=params,
        data=post_data)
    if body == '':
        return self._error_data_init()

    tmp_data = json_2_dict(json_str=body)
    if tmp_data == {}:
        return self._error_data_init()

    try:
        detail = tmp_data[2].get('result', {})

        # Validation uses explicit raises: ``assert`` is removed under
        # ``python -O`` and would let empty data through.
        data['title'] = detail.get('product_name', '')
        if data['title'] == '':
            raise ValueError('获取到的title为空值, 请检查!')
        data['sub_title'] = ''
        data['shop_name'] = detail.get('brand_info', {}).get('brand_name', '')

        # Sample pictures (urls are protocol-relative in the response).
        all_img_url = detail.get('img_pre', [])
        if all_img_url == []:
            raise ValueError('获取到的all_img_url为空[], 请检查!')
        data['all_img_url'] = [{
            'img_url': 'https:' + item.get('b_img', '')
        } for item in all_img_url]

        p_info = self._get_p_info(tmp_data=tmp_data)
        if p_info == []:
            raise ValueError('p_info为空list, 请检查!')
        data['p_info'] = p_info

        div_desc = self.get_goods_div_desc(tmp_data=detail.get('detailImages', []))
        if div_desc == '':
            raise ValueError('获取到的div_desc为空值! 请检查')
        data['div_desc'] = div_desc

        # On/off-shelf time window.
        data['sell_time'] = {
            'begin_time': detail.get('sell_time_from', {}),
            'end_time': detail.get('sell_time_to', {}),
        }
        if int(data['sell_time'].get('begin_time')) > int(time.time()):
            # Sale has not started yet: switch to the pre-sale goods_id
            # format ([1, id]) before the sku lookup.
            goods_id = [1, goods_id[1]]

        data['detail_name_list'] = self._get_detail_name_list(tmp_data=tmp_data)

        # Price / spec / stock of every sku.
        true_sku_info = self._get_true_sku_info(goods_id=goods_id, tmp_data=tmp_data)
        if true_sku_info == []:
            # Not fatal: usually means everything is sold out.
            print('获取到的sku_info为空值, 请检查!')
            print('*** 注意可能是卖完了,库存为0 导致!! ***')
        data['price_info_list'] = true_sku_info
    except Exception as e:
        print('遇到错误如下: ', e)
        return self._error_data_init()

    self.result_data = data
    return data
def _get_jd_goods_keywords_goods_id_list(self, keyword):
    """Return the ids of the best-selling JD products for a keyword.

    Queries the JD mobile search endpoint (sorted by 15-day sales, first
    page, 10 results), unwraps the JSONP payload and collects ``wareid`` of
    every result that is not a group-buy (pinGou) product.

    :param keyword: sequence whose second element is the keyword text.
    :return: list of goods ids; ``[]`` on any failure.
    """
    headers = {
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': get_random_pc_ua(),
        'accept': '*/*',
        'authority': 'so.m.jd.com',
    }

    params = (
        ('keyword', keyword[1]),
        ('datatype', '1'),
        ('callback', 'jdSearchResultBkCbA'),
        ('page', '1'),
        ('pagesize', '10'),
        ('ext_attr', 'no'),
        ('brand_col', 'no'),
        ('price_col', 'no'),
        ('color_col', 'no'),
        ('size_col', 'no'),
        ('ext_attr_sort', 'no'),
        ('merge_sku', 'yes'),
        ('multi_suppliers', 'yes'),
        ('area_ids', '1,72,2819'),
        ('sort_type', 'sort_totalsales15_desc'),
        ('qp_disable', 'no'),
        ('fdesc', '\u5317\u4EAC'),
    )

    s_url = 'https://so.m.jd.com/ware/search._m2wq_list'
    body = MyRequests.get_url_body(url=s_url, headers=headers, params=params)
    if body == '':
        return []

    try:
        # Raw string: "\(" in a normal literal is an invalid escape sequence.
        data = re.compile(r'jdSearchResultBkCbA\((.*)\)').findall(body)[0]
    except IndexError:
        self.my_lg.error('获取jd的关键字数据时, IndexError! 出错关键字为{0}'.format(
            (keyword[1])))
        return []

    # The payload contains escapes that break json decoding; the helper
    # repairs them before parsing.
    data = deal_with_JSONDecodeError_about_value_invalid_escape(json_str=data)
    data = json_2_dict(json_str=data, logger=self.my_lg)
    if data == {}:
        # The original message wrongly named Tmall in this JD routine.
        self.my_lg.error('获取到的京东搜索data为空dict! 出错关键字为{0}'.format(
            keyword[1]))
        return []

    data = data.get('data', {}).get('searchm', {}).get('Paragraph', [])
    if data is None or data == []:
        self.my_lg.error('获取到的data为空list, 请检查!')
        return []

    # A non-empty pinGou.bp marks a group-buy product; those are skipped.
    # "or {}" also tolerates an explicit null pinGou.
    return [
        item.get('wareid', '')
        for item in data
        if (item.get('pinGou') or {}).get('bp', '') == ''
    ]
def _get_taobao_goods_keywords_goods_id_list(self, keyword):
    """Return the goods ids Taobao search lists for a keyword.

    Queries ``https://s.taobao.com/search`` sorted by sales, unwraps the
    JSONP payload and reads ``mainInfo.traceInfo.traceData.allNids``.

    :param keyword: ``(id, keyword)``; only the second element is used.
    :return: list of goods ids; ``[]`` on any failure.
    """
    headers = {
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': get_random_pc_ua(),
        'accept': '*/*',
        'authority': 's.taobao.com',
    }

    # Keyword search ordered by sales volume.
    params = (
        ('data-key', 'sort'),
        ('data-value', 'sale-desc'),
        ('ajax', 'true'),
        ('callback', 'jsonp396'),
        ('q', keyword[1]),
        ('imgfile', ''),
        ('commend', 'all'),
        ('ssid', 's5-e'),
        ('search_type', 'item'),
        ('sourceId', 'tb.index'),
        ('ie', 'utf8'),
    )

    s_url = 'https://s.taobao.com/search'
    body = MyRequests.get_url_body(url=s_url, headers=headers, params=params)
    if body == '':
        return []

    try:
        # Raw string: "\(" in a normal literal is an invalid escape sequence.
        data = re.compile(r'\((.*)\)').findall(body)[0]
    except IndexError:
        self.my_lg.error('re获取淘宝data时出错, 出错关键字为{0}'.format(keyword[1]))
        return []

    data = json_2_dict(json_str=data, logger=self.my_lg)
    if data == {}:
        self.my_lg.error('获取到的淘宝搜索data为空dict! 出错关键字为{0}'.format(
            keyword[1]))
        return []

    goods_id_list = data.get('mainInfo', {}).get(
        'traceInfo', {}).get('traceData', {}).get('allNids', [])
    if goods_id_list is None or goods_id_list == []:
        self.my_lg.error(
            '获取淘宝搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
        return []

    return goods_id_list
def get_goods_data(self, goods_id):
    """Scrape one Zhe800 group-buy product from its mobile detail page.

    The ``window.prod_info`` object embedded in the page is decoded, then
    enriched with the description body, the property list and the live
    stock information obtained from the instance's helper methods.

    :param goods_id: zid of the product; ``''`` is rejected.
    :return: the enriched dict (also stored in ``self.result_data``);
        ``str(goods_id)`` when the page reports that the product no longer
        exists; otherwise the value of ``self._data_error_init()``.
    """
    if goods_id == '':
        return self._data_error_init()

    tmp_url = 'https://pina.m.zhe800.com/detail/detail.html?zid=' + str(goods_id)
    print('------>>>| 得到的商品手机版地址为: ', tmp_url)

    body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, high_conceal=True)
    if body == '':
        print('获取到的tmp_url的body为空值, 此处跳过!')
        return self._data_error_init()

    try:
        # "Page gone" is only trusted when the body also has the size of the
        # bare error page, so normal pages quoting the text are unaffected.
        page_gone = re.compile(r'很抱歉,您查看的页面木有了~').findall(body) != [] \
            and (len(body) < 660 and len(body) > 640)
        data = re.compile(r'window.prod_info = (.*?);seajs.use\(.*?\);</script>').findall(body)
    except TypeError:
        # body was not text; treated like a page without product data.
        page_gone = False
        data = []

    if page_gone:
        print('很抱歉,您查看的页面木有了~')
        self.result_data = {}
        return str(goods_id)

    if data == []:
        print('data为空!')
        return self._data_error_init()

    data = json_2_dict(json_str=data[0])
    if data == {}:
        return self._data_error_init()

    # The three failure paths below used to ``return {}`` without resetting
    # self.result_data; they now go through the common error initialiser
    # like every other failure path of this method.
    div_desc_body = self.get_div_desc_body(goods_id=goods_id)
    if div_desc_body == '':
        print('获取到的div_desc_body为空!')
        return self._data_error_init()

    p_info = self.get_p_info_list(goods_id=goods_id)
    if p_info == []:
        return self._data_error_init()

    # Live stock information.
    stock_info = self.get_stock_info_dict(goods_id=goods_id)
    if stock_info == {}:
        print('获取到的库存信息为{}!')
        return self._data_error_init()

    data['div_desc'] = div_desc_body
    data['p_info'] = p_info
    data['stock_info'] = stock_info

    # pin_status 3 is reported for a sold-out group-buy product.
    if stock_info.get('pin_status', 2) == 3:
        print('##### 该拼团商品已经被抢光 ...')
        is_delete = 1
    else:
        is_delete = 0
    data['is_delete'] = is_delete
    data['parent_dir'] = _z8_get_parent_dir(goods_id)

    self.result_data = data
    return data
def test():
    """Manually exercise the VIP.com mini-program batch endpoint.

    Posts one JSON-RPC batch (nine calls, ids 1-9) for a fixed product page
    to ``https://m.vip.com/server.html`` and returns the decoded response.

    :return: whatever ``json_2_dict`` produces for the response body.
    """
    # Request captured from the VIP.com WeChat mini program.
    url = 'https://m.vip.com/server.html'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip',
        'Accept-Language': 'zh-cn',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'm.vip.com',
        'Referer': 'https://servicewechat.com/wxe9714e742209d35f/284/page-frame.html',
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Mobile/15A5341f MicroMessenger/6.6.5 NetType/WIFI Language/zh_CN',
    }

    # Seconds since the epoch followed by three random digits.
    stamp = str(round(time.time())) + str(randint(100, 999))
    params = {
        'serv': 'getGoodsActiveMsg',
        '_xcxid': stamp,
    }

    goods_id = '460143743'
    page = 'product-0-' + str(goods_id) + '.html'

    # One JSON-RPC call per method; ids are simply 1..9 in this order.
    rpc_methods = (
        'getGoodsActiveMsg',
        'getCoupon',
        'getProductDetail',
        'getProductMeta',
        'getProductSlide',
        'getProductMultiColor',
        'getProductSize',
        'getProductCountdown',
        'ProductRpc.getProductLicense',
    )
    batch = []
    for call_id, method_name in enumerate(rpc_methods, start=1):
        batch.append({
            'method': method_name,
            'params': {
                'page': page,
                'query': '',
            },
            'id': call_id,
            'jsonrpc': '2.0',
        })
    data = dumps(batch)

    body = MyRequests.get_url_body(
        method='post', url=url, headers=headers, params=params, data=data)
    data = json_2_dict(json_str=body)

    return data
def _get_target_data(self, **kwargs): ''' 获取目标需求数据 :return: ''' goods_id = kwargs.get('goods_id', '') if goods_id == '': self.my_lg.error('获取到的goods_id为空值!此处跳过!') return self._get_data_error_init() # 小米有品m站抓取 base_url = 'https://home.mi.com/app/shop/pipe' # cookies = self._get_cookies() post_data = self._get_post_data(goods_id=goods_id) m_url = 'https://home.mi.com/detail?gid={0}'.format(goods_id) self.my_lg.info('------>>>| 正在抓取小米有品地址为: {0}'.format(m_url)) write_info = '出错goods_id:{0}, 出错地址: {1}'.format(goods_id, m_url) body = MyRequests.get_url_body(method='post', url=base_url, headers=self.headers, cookies=None, data=post_data) # self.my_lg.info(str(body)) if body == '': self.my_lg.error('获取到的body为空值!' + write_info) return self._get_data_error_init() _ = json_2_dict(json_str=body, logger=self.my_lg).get('result', {}).get('detail', {}).get('data', {}) # pprint(_) if _ == {}: self.my_lg.error('获取到的data为空dict!' + write_info) return self._get_data_error_init() try: _ = self._wash_target_data(_) except Exception: self.my_lg.error('清洗数据时出错!' 
+ write_info, exc_info=True) self._get_data_error_init() # pprint(_) data = {} try: data['title'] = self._wash_sensitive_info(self._get_title(data=_)) data['sub_title'] = self._wash_sensitive_info( self._get_sub_title(data=_)) data['shop_name'] = self._get_shop_name(data=_) data['all_img_url'] = self._get_all_img_url(data=_) data['p_info'] = self._get_p_info(data=_) # 小米有品无p_info data['div_desc'] = self._get_div_desc(data=_) data['sell_time'] = {} # 默认为空 data['detail_name_list'] = self._get_detail_name_list( data=_.get('group', [])) data['price_info_list'] = self._get_price_info_list(data=_) data['price'], data[ 'taobao_price'] = self._get_price_and_taobao_price( price_info_list=data['price_info_list']) if data['price'] == 0 or data['taobao_price'] == 0: # 售罄商品处理 data['is_delete'] = 1 else: data['is_delete'] = self._get_is_delete( price_info_list=data['price_info_list'], data=data, other=_) except Exception: self.my_lg.error('遇到错误:', exc_info=True) self.my_lg.error(write_info) return self._get_data_error_init() if data != {}: self.result_data = data return data else: self.my_lg.info('data为空值') return self._get_data_error_init()
def get_goods_data(self, goods_id):
    """
    Fetch and normalise the raw detail data of one Tmall item.

    :param goods_id: a two-element sequence ``[type, goods_id]``; an empty
        list is treated as "nothing to fetch".
    :return: the cleaned ``data`` dict of the response (also stored in
        ``self.result_data``); ``{}`` on any failure; or the return value of
        ``self.init_pull_off_shelves_goods(type)`` when the item has been
        taken off the shelves.
    """
    if goods_id == []:
        self.result_data = {}
        return {}

    goods_type = goods_id[0]    # Tmall type
    goods_id = goods_id[1]      # Tmall goods_id
    tmp_url = 'https://detail.m.tmall.com/item.htm?id=' + str(goods_id)
    self.my_lg.info('------>>>| 得到的移动端地址为: %s' % tmp_url)

    self.headers.update({'Referer': tmp_url})
    last_url = self._get_last_url(goods_id=goods_id)
    body = MyRequests.get_url_body(
        url=last_url, headers=self.headers, params=None, timeout=14)
    if body == '':
        self.my_lg.error('出错goods_id: {0}'.format(goods_id))
        self.result_data = {}
        return {}

    try:
        # Greedy match: capture everything inside the JSONP wrapper.
        # Raw string so that ``\(`` is a regex escape, not a string escape.
        data = re.compile(r'mtopjsonp3\((.*)\)').findall(body)[0]
    except IndexError as e:
        self.my_lg.exception(e)
        self.result_data = {}
        return {}

    if data == '':
        self.my_lg.error('data为空! 出错type: %s, goods_id: %s'
                         % (str(goods_type), str(goods_id)))
        self.result_data = {}   # reset so stale data is not stored later
        return {}

    data = json_2_dict(json_str=data, logger=self.my_lg)
    if data == {}:
        self.my_lg.error('出错type: %s, goods_id: %s'
                         % (str(goods_type), str(goods_id)))
        self.result_data = {}
        return {}

    detail = data.get('data', {})
    if detail.get('seller', {}).get('evaluates') is None:
        if detail.get('trade', {}).get('redirectUrl', '') != '':
            # The item is off the shelves: the original address is
            # redirected to a new page.
            self.my_lg.info('@@@@@@ 该商品已经下架...')
            tmp_data_s = self.init_pull_off_shelves_goods(goods_type)
            self.result_data = {}
            return tmp_data_s

        # The item was moved or removed, so its page no longer exists.
        self.my_lg.error(
            'data为空, 地址被重定向, 该商品可能已经被转移或下架, 出错type: %s, goods_id: %s'
            % (str(goods_type), str(goods_id)))
        self.result_data = {}
        return {}

    # Blank out the bulky fields that are not needed downstream.
    detail['rate'] = ''                     # item reviews
    detail['resource'] = ''                 # buyer questions
    detail['vertical'] = ''                 # questions and answers
    detail['seller']['evaluates'] = ''      # description/service/logistics scores
    result_data = detail

    api_stack = result_data.get('apiStack', [])
    if not api_stack:
        # Previously an IndexError; treat it like every other bad payload.
        self.my_lg.error('apiStack为空! 出错type: %s, goods_id: %s'
                         % (str(goods_type), str(goods_id)))
        self.result_data = {}
        return {}

    # Clean apiStack[0]['value'] and write the cleaned value back.
    api_stack[0]['value'] = self._wash_result_data_apiStack_value(
        goods_id=goods_id,
        result_data_apiStack_value=api_stack[0].get('value', {}))

    # mockData arrives as a JSON string; parse it (a missing key is handled
    # as an empty payload instead of raising KeyError).
    mock_data_str = result_data.get('mockData', '')
    if mock_data_str != '':
        mock_data = json_2_dict(json_str=mock_data_str, logger=self.my_lg)
    else:
        mock_data = {}
    if mock_data == {}:
        self.my_lg.error('出错type: {0}, goods_id: {1}'.format(
            goods_type, goods_id))
        self.result_data = {}
        return {}
    mock_data['feature'] = ''
    result_data['mockData'] = mock_data

    # The entry can look like {'name': 'esi', 'value': ''}.
    if api_stack[0].get('value', '') == '':
        self.my_lg.error(
            "result_data.get('apiStack', [])[0].get('value', '')的值为空....出错type: %s, goods_id: %s"
            % (str(goods_type), goods_id))
        result_data['trade'] = {}
        self.result_data = {}
        return {}

    # 'trade' is what later code uses to decide whether the item is off
    # the shelves.
    result_data['trade'] = api_stack[0].get('value', {}).get('trade', {})
    result_data['type'] = goods_type
    result_data['goods_id'] = goods_id
    self.result_data = result_data
    return result_data
def get_random_user_id_list():
    """
    Query the Aweme category-list endpoint and collect author user ids.

    :return: sorted list of the distinct ``author_user_id`` values found in
        the response that are not already present in the module-level
        ``all_user_id_list``; an empty list when the response cannot be
        parsed.
    """
    headers = {
        'Host': 'aweme.snssdk.com',
        'Accept': '*/*',
        'User-Agent': 'Aweme/1.7.8 (iPhone; iOS 11.0; Scale/3.00)',
        'Accept-Language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
    }
    params = (
        ('iid', '29797177823'),
        ('device_id', '48592631504'),
        ('os_api', '18'),
        ('app_name', 'aweme'),
        ('channel', 'App Store'),
        ('idfa', 'DA8C3A83-C08C-4881-86A8-1E67849F5BB2'),
        ('device_platform', 'iphone'),
        ('build_number', '17805'),
        ('vid', '855FEC75-BEB7-45A5-BE6A-2699A6864BAC'),
        ('openudid', 'c33813d872541f3bfc4ca174d9fbc5e708dd9ec5'),
        ('device_type', 'iPhone7,1'),
        ('app_version', '1.7.8'),
        ('version_code', '1.7.8'),
        ('os_version', '11.0'),
        ('screen_width', '1242'),
        ('aid', '1128'),
        ('ac', 'WIFI'),
        ('count', '30'),    # varies
        ('cursor', '30'),   # cursor position
        # The 'mas' / 'as' parameters are intentionally not sent.
        # Current epoch seconds followed by three random digits.
        ('ts', str(round(time.time())) + str(randint(100, 999))),
    )

    url = 'https://aweme.snssdk.com/aweme/v1/category/list/'
    body = MyRequests.get_url_body(url=url, headers=headers, params=params)

    try:
        data = json.loads(body).get('category_list', [])
        print('count数:', len(data))
    except (ValueError, TypeError, AttributeError):
        # Invalid JSON, a non-string body, or a payload that is not the
        # expected object: fall back to "no categories".
        data = []
        print('error')

    user_ids = set()
    for category in data:
        if not isinstance(category, dict):
            continue
        awemes = category.get('aweme_list')
        if isinstance(awemes, dict):
            awemes = [awemes]       # a single entry instead of a list
        elif not isinstance(awemes, list):
            continue                # missing / null list: nothing to read
        for aweme in awemes:
            if not isinstance(aweme, dict):
                continue
            user_id = aweme.get('author_user_id', '')
            # Skip absent ids: an empty placeholder is not a usable id and
            # would make sorted() fail if the real ids are integers.
            if user_id is not None and user_id != '':
                user_ids.add(user_id)

    return [uid for uid in sorted(user_ids) if uid not in all_user_id_list]
def _get_goods_data(self, goods_id):
    """
    Collect the goods data for one NetEase Yanxuan item.

    :param goods_id: id of the item; an empty string is an error.
    :return: the assembled data dict on success (also stored in
        ``self.result_data``); on any failure, whatever
        ``self._get_data_error_init()`` returns.
    """
    if goods_id == '':
        self.my_lg.error('获取到的goods_id为空值!此处跳过!')
        return self._get_data_error_init()

    # Yanxuan mobile-site detail endpoint.
    url = 'http://m.you.163.com/item/detail'
    params = self._get_params(goods_id=goods_id)
    m_url = '{0}?id={1}'.format(url, goods_id)
    self.my_lg.info('------>>>| 正在抓取严选地址为: {0}'.format(m_url))

    # Suffix appended to every error log so the failing item can be traced.
    write_info = '出错goods_id:{0}, 出错地址: {1}'.format(goods_id, m_url)
    page = MyRequests.get_url_body(
        url=url, headers=self.headers, params=params)
    if page == '':
        self.my_lg.error('获取到的body为空值!' + write_info)
        return self._get_data_error_init()

    # The item data is embedded in the page as a JavaScript assignment.
    pattern = re.compile('var jsonData=(.*?),policyList=')
    try:
        json_text = pattern.findall(page)[0]
    except IndexError:
        self.my_lg.error('获取body时索引异常!' + write_info, exc_info=True)
        return self._get_data_error_init()

    json_text = nonstandard_json_str_handle(json_str=json_text)
    raw = json_2_dict(json_str=json_text)
    if raw == {}:
        self.my_lg.error('获取到的data为空dict!' + write_info)
        return self._get_data_error_init()

    raw = self._wash_data(raw)

    try:
        data = {
            'title': self._get_title(data=raw),
            'sub_title': self._get_sub_title(data=raw),
            'shop_name': '',
            'all_img_url': self._get_all_img_url(data=raw),
            'p_info': self._get_p_info(data=raw),
            'div_desc': self._get_div_desc(data=raw),
            'sell_time': self._get_sell_time(data=raw),
            'detail_name_list': self._get_detail_name_list(
                data=raw.get('skuSpecList', [])),
            'price_info_list': self._get_price_info_list(
                data=raw.get('skuList', [])),
        }
        price, taobao_price = self._get_price_and_taobao_price(
            price_info_list=data['price_info_list'])
        data['price'] = price
        data['taobao_price'] = taobao_price
        data['is_delete'] = self._get_is_delete(
            price_info_list=data['price_info_list'],
            data=data,
            other=raw)
    except Exception:
        self.my_lg.error('遇到错误:', exc_info=True)
        self.my_lg.error(write_info)
        return self._get_data_error_init()

    if data == {}:
        self.my_lg.info('data为空值')
        return self._get_data_error_init()

    self.result_data = data
    return data
def get_goods_data(self, goods_id):
    """
    Fetch the data of one zhe800 group-buy item.

    Loads the mobile detail page, extracts the embedded ``window.prod_info``
    JSON, then adds the description HTML, the property list and the
    real-time stock info obtained through the matching ``self.get_*``
    helpers.

    :param goods_id: id (zid) of the item; an empty string is rejected.
    :return: the data dict on success (also stored in ``self.result_data``);
        ``{}`` on any failure, in which case ``self.result_data`` is reset
        to ``{}`` as well.
    """
    if goods_id == '':
        self.result_data = {}
        return {}

    tmp_url = 'https://pina.m.zhe800.com/detail/detail.html?zid=' + str(
        goods_id)
    print('------>>>| 得到的商品手机版地址为: ', tmp_url)

    body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
    if body == '':
        print('获取到的tmp_url的body为空值, 此处跳过!')
        self.result_data = {}
        return {}

    try:
        data = re.compile(
            r'window.prod_info = (.*?);seajs.use\(.*?\);</script>'
        ).findall(body)
    except TypeError:
        # Only a non-string body can make findall() fail.
        data = []

    if data == []:
        print('data为空!')
        self.result_data = {}   # reset so stale data is not stored later
        return {}

    data = json_2_dict(json_str=data[0])
    if data == {}:
        self.result_data = {}
        return {}

    # Description HTML of the item.
    div_desc_url = 'https://pina.m.zhe800.com/nnc/product/detail_content.json?zid=' + str(
        goods_id)
    div_desc_body = self.get_div_desc_body(div_desc_url=div_desc_url)
    if div_desc_body == '':
        print('获取到的div_desc_body为空!')
        # Reset here too: the original left the previous item's data in
        # place on this and the next two failure exits.
        self.result_data = {}
        return {}

    # Property list of the item.
    p_info_url = 'https://pina.m.zhe800.com/cns/products/get_product_properties_list.json?productId=' + str(
        goods_id)
    p_info = self.get_p_info_list(p_info_url=p_info_url)
    if p_info == []:
        self.result_data = {}
        return {}

    # Real-time stock info of the item.
    stock_info_url = 'https://pina.m.zhe800.com/cns/products/' + str(
        goods_id) + '/realtime_info.json'
    stock_info = self.get_stock_info_dict(stock_info_url=stock_info_url)
    if stock_info == {}:
        print('获取到的库存信息为{}!')
        self.result_data = {}
        return {}

    data['div_desc'] = div_desc_body
    data['p_info'] = p_info
    data['stock_info'] = stock_info

    # pin_status == 3 is reported as "sold out" and flags the item deleted.
    if stock_info.get('pin_status', 2) == 3:
        print('##### 该拼团商品已经被抢光 ...')
        data['is_delete'] = 1
    else:
        data['is_delete'] = 0

    self.result_data = data
    return data