def get_id_info(cate, i):
    """Fetch one page of 1688 premium-offer results and resolve each goods id.

    The search endpoint is queried for ``cate``; every ``eurl`` found in the
    response is then requested and the goods id is read from that page's
    ``b2c_auction`` meta tag.

    Args:
        cate: Search keyword; sent as ``keywords`` and passed to
            ``get_headers2``.
        i: Page number, sent as ``beginpage``.

    Returns:
        A list of goods-id strings, in the order the ``eurl`` entries appear.
        The list is empty when the response's ``data`` object is empty
        (previously this path left the result list undefined).

    Raises:
        AttributeError: If the search response or a landing page does not
            contain the expected markers (``re.search`` found nothing).
    """
    now_ms = int(round(time.time() * 1000))
    params2 = {
        'beginpage': '%s' % i,
        'asyncreq': '1',
        'keywords': '%s' % cate,
        'sortType': '',
        'descendOrder': '',
        'province': '',
        'city': '',
        'priceStart': '',
        'priceEnd': '',
        'dis': '',
        'spm': 'a2609.11209760.it2i6j8a.30.44292de113BNUL',
        'cosite': 'baidujj_pz',
        'trackid': '{trackid}',
        'location': 're',
        'pageid': '17145fa7ralgjD',
        'p4pid': 'f5abf68bdcb94f5dab3c43c91ea6af09',
        'callback': 'jsonp_{}_51591'.format(now_ms),
        '_': '%s' % now_ms,
    }
    url = 'https://data.p4psearch.1688.com/data/ajax/get_premium_offer_list.json'

    # Requests below use verify=False; silence the resulting warning once.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    res = requests.get(url, headers=get_headers2(cate), params=params2,
                       verify=False).text

    # The body is JSONP; cut out the span between "data" and "ret" and
    # re-wrap it so it parses as a JSON object.
    res_temp = '{"data' + re.search(
        'data(.*?)ret', res, re.S).group(1).rstrip('"').rstrip(',') + '}'
    res_eve = json.loads(res_temp)

    goods_id_list = []
    if res_eve["data"] == {}:
        return goods_id_list

    for eve in re.findall(r'\"eurl\":\"(.*?)\"', res, re.S):
        print(eve)
        landing = requests.get(eve, headers=get_headers3(), verify=False).text
        goods_id = re.search(r'<meta.*?b2c_auction.*?content="(\d+)".*?>',
                             landing, re.S).group(1)
        print(goods_id)
        goods_id_list.append(goods_id)
    return goods_id_list
def get_detail(url, base_dir='C:/Users/admin/Desktop/1688'):
    """Scrape one 1688 offer page, save its assets to disk and store the record.

    Files are written below ``<base_dir>/4/<goods_id>/``:
    ``product_img/<n>.jpg``, ``product_video/<goods_id>.mp4`` (only when the
    page carries a non-zero ``videoId``), ``product_specifications/color/``
    (a JSON price table plus one ``.jpg`` per specification),
    ``product_specifications/size/`` (one marker ``.txt`` per option, only
    when there are two or more spec groups) and ``<goods_id>.html`` produced
    by ``format_html``. The collected dict is printed and passed to
    ``mongo_info_alibaba.insert_item``.

    Unlike the earlier version this function writes through absolute paths
    and does not change the process working directory.

    Args:
        url: Offer detail page URL.
        base_dir: Root output directory; it must contain
            ``html_template.html``. Defaults to the previously hard-coded
            location.

    Returns:
        None.

    Raises:
        AttributeError: If a marker required outside the price/specification
            section (shop name, goods id, member id, video id, video address,
            rich-text URL) is missing from the page.
    """
    print(url)
    # All requests use verify=False; silence the resulting warning once.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    res = requests.get(url, headers=get_headers3(), verify=False).text

    product_info = {}
    product_info['title'] = ''.join(
        etree.HTML(res).xpath("//html[@lang='zh-CN']/head/title/text()"))
    product_info['shop_name'] = re.search(
        r'<meta.*?og:product:nick.*?name=(.*?);.*?>', res, re.S).group(1)
    goods_id = re.search(
        r'<meta.*?b2c_auction.*?content="(\d+)".*?>', res, re.S).group(1)
    product_info['goods_id'] = goods_id
    product_info['source'] = "https://detail.1688.com/offer/{}.html".format(
        goods_id)

    goods_dir = '{}/4/{}'.format(base_dir, goods_id)

    # Product images -> product_img/<n>.jpg, numbered from 1.
    product_info['imgsSrc'] = re.findall(
        r'<li.*?tab-trigger.*?original":"(.*?)"', res, re.S)
    img_dir = goods_dir + '/product_img'
    os.makedirs(img_dir, exist_ok=True)
    for index, img_url in enumerate(product_info['imgsSrc'], start=1):
        image = requests.get(img_url, verify=False).content
        with open('{}/{}.jpg'.format(img_dir, index), 'wb') as f:
            f.write(image)

    # Product video -> product_video/<goods_id>.mp4.
    memberId = re.search(r'member_id.*?"(.*?)"', res, re.S).group(1)
    videoId = re.search(r'videoId.*?"(\d+)"', res, re.S).group(1)
    if videoId != '0':
        res2 = requests.get(
            'https://apps.1688.com/event/app/videoInfo/getVideoById.htm',
            params=get_params3(videoId, memberId),
            headers=get_headers4(),
            verify=False).text
        # The address must be resolved before the download; the previous code
        # read product_info['videoUrl'] before it had ever been assigned.
        product_info['videoUrl'] = re.search(
            r'address":"(.*?)"', res2, re.S).group(1)
        video_dir = goods_dir + '/product_video'
        os.makedirs(video_dir, exist_ok=True)
        video = requests.get(product_info['videoUrl'], verify=False).content
        with open('{}/{}.mp4'.format(video_dir, goods_id), 'wb') as f:
            f.write(video)

    try:
        # Distribution / drop-shipping (consign) view of the offer.
        url3 = 'https://detail.1688.com/offer/{}.html?sk=consign'.format(
            goods_id)
        res3 = requests.get(url3,
                            headers=get_headers7(goods_id),
                            verify=False,
                            allow_redirects=False,
                            timeout=None).text

        skuProps = '[' + re.search(
            r'skuProps.*?\[(.*?)skuMap', res3,
            re.S).group(1).rstrip('"').strip().rstrip(',')
        sku_map_raw = re.search(r'skuMap.*?\{(.*?)end', res3, re.S).group(1)
        Specifications1 = json.loads(skuProps)
        # The skuMap tail differs between page variants: try the trimming
        # that also drops a trailing '}', then fall back to the lighter one.
        try:
            Specifications2 = json.loads(
                '{' + sku_map_raw.rstrip('"').strip().rstrip(',').rstrip(
                    '}').strip().rstrip(','))
        except json.decoder.JSONDecodeError:
            Specifications2 = json.loads(
                '{' + sku_map_raw.strip().rstrip(','))
        product_info['spcification_amount'] = skuProps

        base_price = re.search(r'consignBasePrice":"(.*?)"', res3,
                               re.S).group(1)
        # For a "low-high" range the part after the dash is used.
        if '-' in base_price:
            base_price_eve = base_price.split('-')[1]
        else:
            base_price_eve = base_price

        # Freight widget.
        url4 = 'https://laputa.1688.com/offer/ajax/widgetList.do'
        res4 = requests.get(url4,
                            headers=get_headers9(goods_id),
                            params=get_params4(goods_id),
                            verify=False).text
        res_temp = '{"data' + re.search(
            'data(.*?)message', res4,
            re.S).group(1).rstrip('"').rstrip(',') + '}'
        postage_info = json.loads(
            res_temp)['data']['data']['offerdetail_ditto_postage']
        if postage_info['showFreightCost'] == False:
            fee = 10
        elif postage_info['freightCost'] == []:
            fee = 0
        else:
            fee = postage_info['freightCost'][0]['costItems'][0]['value']

        # Final price = consign base price + freight.
        product_info['price'] = float(base_price_eve) + float(fee)

        # Spec options that carry an image (no price yet).
        img_dict = {}
        for spec_group in Specifications1:
            for option in spec_group['value']:
                if "imageUrl" in option:
                    img_dict["%s" % option["name"]] = {
                        "price": "",
                        "url": option['imageUrl'],
                    }

        # SKU combinations with a price (no image yet). Characters that are
        # unsuitable for file names are stripped from the key.
        price_dict = {}
        for k, v in Specifications2.items():
            spec_name = k.replace('gt;', '').replace("/", "").replace("*", "")
            price_dict[spec_name] = {
                'price': v['price'] if "price" in v else product_info['price'],
                'url': '',
            }

        # Attach an image to every SKU whose name contains the option name.
        for img_name, img in img_dict.items():
            for spec_name, spec in price_dict.items():
                if img_name in spec_name:
                    spec['url'] = img['url']
        product_info['spcification'] = price_dict

        # product_specifications/color: JSON table plus one .jpg per SKU.
        color_dir = goods_dir + '/product_specifications/color'
        os.makedirs(color_dir, exist_ok=True)
        with open('{}/{}.json'.format(color_dir, goods_id), 'w') as fp:
            fp.write(json.dumps(price_dict, indent=4, ensure_ascii=False))
        for spec_name, spec in price_dict.items():
            # The file is created even without an image URL (it stays empty),
            # matching the previous behaviour.
            with open('{}/{}.jpg'.format(color_dir, spec_name), 'wb') as f:
                if "https" in spec['url']:
                    f.write(requests.get(spec['url'], verify=False).content)

        # product_specifications/size: one marker file per option name.
        if len(Specifications1) >= 2:
            size_dir = goods_dir + '/product_specifications/size'
            os.makedirs(size_dir, exist_ok=True)
            for spec_group in Specifications1:
                for option in spec_group['value']:
                    marker = option['name'].replace("/", "").replace("*", "")
                    with open('{}/{}.txt'.format(size_dir, marker), 'w') as f:
                        f.write('1')
    except AttributeError:
        # Best effort: a page without the expected consign/SKU markers makes
        # re.search return None; price and specification data are then
        # simply left out of the record.
        pass

    # Rich-text description: collect its image links and render the template.
    html_url = re.search(r'data-tfs-url="(.*?)"', res, re.S).group(1)
    params = {'t': '%s' % int(round(time.time() * 1000))}
    res1 = requests.get(html_url,
                        headers=get_headers5(),
                        params=params,
                        verify=False).text.replace('\\', '')
    result = re.findall(r'alt.*?src="(.*?)"', res1, re.S)
    file = base_dir + '/html_template.html'
    saveHtmlFilePath = '{}/{}.html'.format(goods_dir, goods_id)
    # Arguments: template path, output path, image list.
    format_html(file, saveHtmlFilePath, result)

    print(product_info)
    mongo_info_alibaba.insert_item(product_info)
# coding: utf-8 import os import re import time from handle_template import format_html from info_mongoimport import mongo_info_alibaba from lxml import etree from headers_lists import get_headers3, get_headers4, get_headers5, get_headers7, get_headers8, get_headers9, get_headers11 from params_lists import get_params2, get_params3, get_params4 import requests import json import urllib.parse import urllib3 url = 'https://dj.1688.com/ci_bb?a=1923980011&e=lAG-AAzlwOsUaTpZjJ16uzysGfgiPdCGznc8lxNj34GE78gq37QZz7eqnGWUmWh03sLDFALx9QABiYu8uNqj7k.-mXVYZTkKC7pweePwvxXJAH5D-y.1.M-1omwCs9MWgXfCUV5fJOxZ6es5M-yIyVaw3u9tLWLbFlxvFroRca9CsIEq-t2YAMtylY7adRJU92kC.yZpBxYFPyAkdnylcUMq.ggBO7XRpyfFgQ8t4UK.0UDzTsXJfHl7Aa3Rd2LkmIQiN8IbVBtlqp6imHFZz7kwGSM7MSgNatSNuO-NIOhKVSNkrfZBGvc.K7fFYeVx0KdzQaamASVUap55vQnE5wDxwQK7xHkWtNVNc1n-XvQaoi8grnABg9GaCVc1R9.HAOhct7foY3gYgD77GCHAmDRB9KM5s2KMQYGe8pIMTqohKU-YPJDEWWzAk9ihFiPwXwA.Hww2n.nwfA99vOkKs0W54NXOZfuvgSKh2sNZe-A1X2uKEjae9qyaAnneA.iMDzMFIxs1UOXZcfSg3p3TqDyLVIw-5GvoWhkQpsRtBd-ye25DwrYmRvGE1nJsIVnCHwFYwLc4ly.g0s9QMP8cgBpGplGkrJdvXwb0xRqAgcbVCF0B-JlP9p85k58lLB4qHrGBttlA-pG4UQ5biM9wot64Ks1SvawoBrEpp4jjxwtgmfioW7dCyXNAs7QxXVlkEmTHb4dNh1rcQTVhvKYroZFm.8NNSEzvYQ.xfgFFYAbfvsCf.Sl-nnYMvgv4QKDD3wR.09cpujKnN.dwDI98HFxcI4wDFOSnIcIqxH9pgAzUHeYnIZ5BlhtpAfbejhLaCCGsCHTxYJy.-UTr62SkpQ3tCCyo0-vpCiA8m27oNwmSlzoZfuYothm5Ye9rmOUJIDZKaZk-8tom0OoyuyCKBe6.xgZeviNJ&v=4&ap=1&rp=1' urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) res = requests.get(url, headers=get_headers3(), verify=False).text print(res)
def detail_three(good_id):
    """Scrape a Taobao item page and return its product record.

    Args:
        good_id: Taobao item id; interpolated into the item URL and passed to
            ``get_params1`` for the postage request.

    Returns:
        dict with the keys ``title``, ``shop_name``, ``goods_id``, ``source``,
        ``imgsSrc``, ``price`` (base price + postage), ``spcification``
        (maps ``str(list_of_spec_names)`` to ``{'price', 'url'}``) and, when
        the page carries a video id, ``videoUrl``.

    Raises:
        AttributeError: If a required marker (postage model, skuMap, ...) is
            missing from the fetched pages.
        ValueError: If an extracted fragment is not valid JSON / a number.
    """
    import ast  # local import: only needed for safe literal parsing below

    url = "https://item.taobao.com/item.htm?spm=a219r.lm874.14.1.45cb36a7Nv5owE&id={}&ns=1&abbucket=12".format(
        good_id)
    res = requests.get(url, headers=get_headers3(), verify=False).text
    page = etree.HTML(res)  # parse once, reuse for every XPath query
    product_info = {}

    # Title: main heading, falling back to the keywords meta tag.
    title_nodes = (page.xpath("//h3[@class='tb-main-title']/text()")
                   or page.xpath("//meta[@name='keywords']/@content"))
    product_info['title'] = ''.join(title_nodes).strip()

    # Shop name: two page layouts (the fallback value is not stripped).
    shop_nodes = page.xpath(
        "//div[@class='tb-shop-name']/dl/dd/strong/a/@title")
    if shop_nodes:
        product_info['shop_name'] = ''.join(shop_nodes).strip()
    else:
        product_info['shop_name'] = ''.join(
            page.xpath('//a[@class="slogo-shopname"]/strong/text()'))
    product_info['goods_id'] = good_id
    product_info['source'] = url

    # Product images: thumbnail list, else the inline auctionImages array.
    thumbs = page.xpath("//ul[@class='tb-thumb tm-clear']/li/a/img/@src")
    if thumbs:
        product_info['imgsSrc'] = thumbs
    else:
        # SECURITY: this text comes from a remote page. The previous code ran
        # eval() on it; literal_eval accepts only Python literals.
        product_info['imgsSrc'] = ast.literal_eval(
            '[' + re.search(r'auctionImages.*?\[(.*?)\]', res,
                            re.S).group(1) + ']')

    # Video URL, only when the page carries a video id.
    video_match = re.search(r'imgVedioID.*?"(\d+)"', res, re.S)
    if video_match is not None:
        imgVedioID = video_match.group(1)
        sellerId = re.search(r'sellerId.*?(\d+)', res, re.S).group(1)
        product_info[
            'videoUrl'] = "https://cloud.video.taobao.com/play/u/{}/p/1/e/6/t/1/{}.mp4".format(
                sellerId, imgVedioID)

    # Postage.
    url1 = "https://mdskip.taobao.com/core/initItemDetail.htm"
    res1 = requests.get(url1,
                        headers=get_headers4(url),
                        params=get_params1(good_id)).text
    res_temp = '{' + re.search(
        r'defaultModel.*?\{(.*?)isSuccess', res1,
        re.S).group(1).rstrip('"').strip().rstrip(',')
    delivery = json.loads(res_temp)['deliveryDO']['deliverySkuMap'][
        'default'][0]
    if delivery['postageFree'] == False:
        postage = float(delivery['postage'].split(':')[1])
    else:
        postage = float(delivery['money'])

    # Final price = upper bound of the default price range + postage. When no
    # "low-high" range is present the displayed price element is used instead.
    try:
        base_price = float(
            re.search(r'defaultItemPrice.*?"(.*?)"', res,
                      re.S).group(1).split('-')[1].strip())
    except (AttributeError, IndexError, ValueError):
        base_price = float(''.join(
            page.xpath("//em[@class='tb-rmb-num']/text()")))
    product_info['price'] = base_price + postage

    # Spec names and their data-value codes (two page layouts). The previous
    # code assigned the result of an "== []" comparison to `key`, which made
    # zip() fail whenever the first layout matched.
    key = (page.xpath("//div[@class='tb-skin']/dl/dd/ul/li/a/span/text()")
           or page.xpath(
               "//div[@class='tb-skin']/div/dl/dd/ul/li/a/span/text()"))
    value = (page.xpath("//div[@class='tb-skin']/dl/dd/ul/li/@data-value")
             or page.xpath(
                 "//div[@class='tb-skin']/div/dl/dd/ul/li/@data-value"))

    # Code -> spec name.
    result_dict = {}
    for name, mark in zip(key, value):
        result_dict['%s' % mark.split(':')[1]] = name

    # skuMap is terminated by a different key depending on the page variant.
    try:
        skumap = '{' + re.search(r'skuMap.*?\{(.*?)propertyMemoMap', res,
                                 re.S).group(1).strip().rstrip(',')
        skuMap = json.loads(skumap)
    except (AttributeError, ValueError):
        skumap = '{' + re.search(
            r'skuMap.*?\{(.*?)salesProp', res,
            re.S).group(1).rstrip('"').strip().rstrip(',')
        skuMap = json.loads(skumap)

    # Tuple of spec codes -> price.
    sku_prices = {}
    for k, v in skuMap.items():
        sku_prices[tuple(re.findall(r'\:(\d+)\;', k, re.S))] = v['price']

    # str(list of spec names) -> (names, price); unknown codes stay as codes.
    labelled = {}
    for codes, price in sku_prices.items():
        names = [result_dict[x] if x in result_dict else x for x in codes]
        labelled['%s' % names] = (names, price)

    # Spec images, read from the style attribute of each swatch.
    other_dict = {}
    swatch_styles = page.xpath(
        "//ul[contains(@class,'J_TSaleProp tb-img')]/li/a/@style")
    if swatch_styles:
        color = page.xpath(
            "//ul[contains(@class,'J_TSaleProp tb-img')]/li/a/span/text()")
        color_dict = {}
        for style, color_name in zip(swatch_styles, color):
            color_dict['%s' % color_name] = 'https:' + re.search(
                r'\((.*?)\)', style, re.S).group(1)
        # Only combinations containing a swatch name are kept here.
        for label, (names, price) in labelled.items():
            for y in names:
                if y in color_dict:
                    other_dict[label] = {
                        'price': '%s' % price,
                        'url': '%s' % color_dict[y],
                    }
    else:
        for label, (names, price) in labelled.items():
            other_dict[label] = {"price": price, "url": ''}

    product_info['spcification'] = other_dict
    return product_info
def get_params(cate, i):
    """Fetch one page of 1688 premium offers and scrape each offer's details.

    For every ``eurl`` in the search response the goods id is resolved, the
    consign (``?sk=consign``) detail page is fetched and a product record is
    built from it. Nothing is written to disk.

    Args:
        cate: Search keyword; sent as ``keywords`` and passed to
            ``get_headers2``.
        i: Page number, sent as ``beginpage``.

    Returns:
        A list of product dicts with ``title``, ``shop_name``, ``goods_id``,
        ``source``, ``imgsSrc``, optionally ``videoUrl`` and, when the
        consign/SKU markers are present, ``price`` and ``spcification``.
        The list is empty when the response's ``data`` object is empty
        (previously this path left the result list undefined).

    Raises:
        AttributeError: If a marker required outside the price/specification
            section is missing from a fetched page.
    """
    now_ms = int(round(time.time() * 1000))
    params2 = {
        'beginpage': '%s' % i,
        'asyncreq': '1',
        'keywords': '%s' % cate,
        'sortType': '',
        'descendOrder': '',
        'province': '',
        'city': '',
        'priceStart': '',
        'priceEnd': '',
        'dis': '',
        'spm': 'a2609.11209760.it2i6j8a.30.44292de113BNUL',
        'cosite': 'baidujj_pz',
        'trackid': '{trackid}',
        'location': 're',
        'pageid': '17145fa7ralgjD',
        'p4pid': 'f5abf68bdcb94f5dab3c43c91ea6af09',
        'callback': 'jsonp_{}_51591'.format(now_ms),
        '_': '%s' % now_ms,
    }
    url = 'https://data.p4psearch.1688.com/data/ajax/get_premium_offer_list.json'

    # Requests below use verify=False; silence the resulting warning once.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    res = requests.get(url, headers=get_headers2(cate), params=params2,
                       verify=False).text

    # The body is JSONP; cut out the span between "data" and "ret" and
    # re-wrap it so it parses as a JSON object.
    res_temp = '{"data' + re.search(
        'data(.*?)ret', res, re.S).group(1).rstrip('"').rstrip(',') + '}'
    res_eve = json.loads(res_temp)

    product_info_list = []
    if res_eve["data"] == {}:
        return product_info_list

    goods_id_pattern = r'<meta.*?b2c_auction.*?content="(\d+)".*?>'
    for eve in re.findall(r'\"eurl\":\"(.*?)\"', res, re.S):
        print(eve)
        landing = requests.get(eve, headers=get_headers3(),
                               verify=False).text
        goods_id = re.search(goods_id_pattern, landing, re.S).group(1)
        print(goods_id)

        url1 = 'https://detail.1688.com/offer/{}.html?sk=consign'.format(
            goods_id)
        print(url1)
        detail = requests.get(url1, headers=get_headers3(),
                              verify=False).text

        product_info = {}
        product_info['title'] = ''.join(
            etree.HTML(detail).xpath(
                "//html[@lang='zh-CN']/head/title/text()"))
        product_info['shop_name'] = re.search(
            r'<meta.*?og:product:nick.*?name=(.*?);.*?>', detail,
            re.S).group(1)
        product_info['goods_id'] = re.search(goods_id_pattern, detail,
                                             re.S).group(1)
        product_info['source'] = url1
        # Product image URLs.
        product_info['imgsSrc'] = re.findall(
            r'<li.*?tab-trigger.*?original":"(.*?)"', detail, re.S)

        # Video address, only when the page carries a non-zero video id.
        memberId = re.search(r'member_id.*?"(.*?)"', detail, re.S).group(1)
        videoId = re.search(r'videoId.*?"(\d+)"', detail, re.S).group(1)
        if videoId != '0':
            res2 = requests.get(
                'https://apps.1688.com/event/app/videoInfo/getVideoById.htm',
                params=get_params3(videoId, memberId),
                headers=get_headers4(),
                verify=False).text
            product_info['videoUrl'] = re.search(r'address":"(.*?)"', res2,
                                                 re.S).group(1)

        try:
            skuProps = '[' + re.search(
                r'skuProps.*?\[(.*?)skuMap', detail,
                re.S).group(1).rstrip('"').strip().rstrip(',')
            sku_map_raw = re.search(r'skuMap.*?\{(.*?)end', detail,
                                    re.S).group(1)
            Specifications1 = json.loads(skuProps)
            # The skuMap tail differs between page variants: try the trimming
            # that also drops a trailing '}', then the lighter one.
            try:
                Specifications2 = json.loads(
                    '{' + sku_map_raw.rstrip('"').strip().rstrip(',').rstrip(
                        '}').strip().rstrip(','))
            except json.decoder.JSONDecodeError:
                Specifications2 = json.loads(
                    '{' + sku_map_raw.strip().rstrip(','))

            base_price = re.search(r'consignBasePrice":"(.*?)"', detail,
                                   re.S).group(1)
            # For a "low-high" range the part after the dash is used.
            if '-' in base_price:
                base_price_eve = base_price.split('-')[1]
            else:
                base_price_eve = base_price

            # Freight widget.
            url4 = 'https://laputa.1688.com/offer/ajax/widgetList.do'
            res4 = requests.get(
                url4,
                headers=get_headers9(product_info['goods_id']),
                params=get_params4(product_info['goods_id']),
                verify=False).text
            freight_json = '{"data' + re.search(
                'data(.*?)message', res4,
                re.S).group(1).rstrip('"').rstrip(',') + '}'
            postage_info = json.loads(
                freight_json)['data']['data']['offerdetail_ditto_postage']
            if postage_info['showFreightCost'] == False:
                fee = 10
            elif postage_info['freightCost'] == []:
                fee = 0
            else:
                fee = postage_info['freightCost'][0]['costItems'][0]['value']

            # Final price = consign base price + freight.
            product_info['price'] = float(base_price_eve) + float(fee)

            # Spec options that carry an image (no price yet).
            img_dict = {}
            for spec_group in Specifications1:
                for option in spec_group['value']:
                    if "imageUrl" in option:
                        img_dict["%s" % option["name"]] = {
                            "price": "",
                            "url": option['imageUrl'],
                        }

            # SKU combinations with a price (no image yet).
            price_dict = {}
            for k, v in Specifications2.items():
                spec_name = k.replace('gt;', '').replace("/", "").replace(
                    "*", "")
                price_dict[spec_name] = {
                    'price':
                    v['price'] if "price" in v else product_info['price'],
                    'url': '',
                }

            # Attach an image to every SKU whose name contains the option.
            for img_name, img in img_dict.items():
                for spec_name, spec in price_dict.items():
                    if img_name in spec_name:
                        spec['url'] = img['url']
            product_info['spcification'] = price_dict
        except AttributeError:
            # Best effort: a page without the expected consign/SKU markers
            # makes re.search return None; the record is kept without price
            # and specification data.
            pass

        product_info_list.append(product_info)
    return product_info_list
def two_detail(cate):
    """Scrape every item listed under one Taobao category.

    The category index is downloaded from ``tce.alicdn.com``, the link for
    ``cate`` is looked up, the item ids on that category page are read from
    its ``allNids`` array and each item page is scraped into a product dict.
    Each record is printed, and the function sleeps 5 seconds between items.

    Args:
        cate: Category name as it appears in the category index.

    Returns:
        A list of product dicts (``title``, ``shop_name``, ``goods_id``,
        ``source``, ``imgsSrc``, ``price``, optionally ``videoUrl`` and
        ``spcification``). Earlier versions built this list but returned
        ``None``.

    Raises:
        KeyError: If ``cate`` is not present in the category index.
        AttributeError: If a required marker is missing from a fetched page
            outside the specification section.
    """
    import ast  # local import: only needed for safe literal parsing below

    url = "https://tce.alicdn.com/api/data.htm?ids=222887%2C222890%2C222889%2C222886%2C222906%2C222898%2C222907%2C222885%2C222895%2C222878%2C222908%2C222879%2C222893%2C222896%2C222918%2C222917%2C222888%2C222902%2C222880%2C222913%2C222910%2C222882%2C222883%2C222921%2C222899%2C222905%2C222881%2C222911%2C222894%2C222920%2C222914%2C222877%2C222919%2C222915%2C222922%2C222884%2C222912%2C222892%2C222900%2C222923%2C222909%2C222897%2C222891%2C222903%2C222901%2C222904%2C222916%2C222924&callback=tbh_service_cat"
    # Requests below use verify=False; silence the resulting warning once.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    res = requests.get(url, headers=get_headers1(), verify=False).text
    # JSONP: the payload is whatever sits between the parentheses.
    index = json.loads(re.search(r'\((.*?)\)', res, re.S).group(1))

    # Only entries with exactly three fields are collected as categories.
    cate_list = []
    for eve in index.values():
        for item in eve['value']['list']:
            if len(item) == 3:
                cate_list.append({'name': item['name'], 'link': item['link']})
    eve_dict = {}
    for ee in cate_list:
        eve_dict['%s' % ee['name']] = ee['link']

    # Category page -> item ids.
    res3 = requests.get(eve_dict['%s' % cate],
                        headers=get_headers2(),
                        verify=False).text
    # SECURITY: this text comes from a remote page. The previous code ran
    # eval() on it; literal_eval accepts only Python literals.
    good_ids = ast.literal_eval(
        '[' + re.search(r'allNids.*?\[(.*?)\]', res3, re.S).group(1) + ']')

    product_info_list = []
    for good_id in good_ids:
        # Item detail page.
        url1 = "https://item.taobao.com/item.htm?spm=a219r.lm874.14.1.45cb36a7Nv5owE&id={}&ns=1&abbucket=12".format(
            good_id)
        print(url1)
        res1 = requests.get(url1, headers=get_headers3(), verify=False).text
        page = etree.HTML(res1)  # parse once, reuse for every XPath query
        product_info = {}

        # Title: main heading, falling back to the keywords meta tag.
        title_nodes = (page.xpath("//h3[@class='tb-main-title']/text()")
                       or page.xpath("//meta[@name='keywords']/@content"))
        product_info['title'] = ''.join(title_nodes).strip()

        # Shop name: two page layouts (the fallback value is not stripped).
        shop_nodes = page.xpath(
            "//div[@class='tb-shop-name']/dl/dd/strong/a/@title")
        if shop_nodes:
            product_info['shop_name'] = ''.join(shop_nodes).strip()
        else:
            product_info['shop_name'] = ''.join(
                page.xpath('//a[@class="slogo-shopname"]/strong/text()'))
        product_info['goods_id'] = good_id
        product_info['source'] = url1

        # Product images: thumbnail list, else the inline auctionImages array.
        thumbs = page.xpath("//ul[@class='tb-thumb tm-clear']/li/a/img/@src")
        if thumbs:
            product_info['imgsSrc'] = thumbs
        else:
            # SECURITY: remote text; literal_eval instead of eval (see above).
            product_info['imgsSrc'] = ast.literal_eval(
                '[' + re.search(r'auctionImages.*?\[(.*?)\]', res1,
                                re.S).group(1) + ']')

        # Video URL, only when the page carries a video id.
        video_match = re.search(r'imgVedioID.*?"(\d+)"', res1, re.S)
        if video_match is not None:
            imgVedioID = video_match.group(1)
            sellerId = re.search(r'sellerId.*?(\d+)', res1, re.S).group(1)
            product_info[
                'videoUrl'] = "https://cloud.video.taobao.com/play/u/{}/p/1/e/6/t/1/{}.mp4".format(
                    sellerId, imgVedioID)

        # Postage.
        url2 = "https://mdskip.taobao.com/core/initItemDetail.htm"
        res2 = requests.get(url2,
                            headers=get_headers4(url1),
                            params=get_params1(good_id),
                            verify=False).text
        res_temp = '{' + re.search(
            r'defaultModel.*?\{(.*?)isSuccess', res2,
            re.S).group(1).rstrip('"').strip().rstrip(',')
        delivery = json.loads(res_temp)['deliveryDO']['deliverySkuMap'][
            'default'][0]
        if delivery['postageFree'] == False:
            postage = float(delivery['postage'].split(':')[1])
        else:
            postage = float(delivery['money'])

        # Final price = upper bound of the price (range) + postage.
        price_match = re.search(r'defaultItemPrice".*?"(.*?)"', res1, re.S)
        if price_match is not None:
            price_text = price_match.group(1)
            if '-' in price_text:
                base_price = float(price_text.split('-')[1].strip())
            else:
                base_price = float(price_text.strip())
        else:
            price_text = ''.join(
                page.xpath("//em[@class='tb-rmb-num']/text()"))
            if '-' in price_text:
                base_price = float(price_text.split('-')[1].strip())
            else:
                base_price = float(price_text)
        product_info['price'] = base_price + postage

        try:
            # Spec names and their data-value codes (two page layouts). The
            # previous code assigned the result of an "== []" comparison to
            # `key`, so zip() raised TypeError whenever the first layout
            # matched and the specification was silently skipped.
            key = (page.xpath(
                "//div[@class='tb-skin']/dl/dd/ul/li/a/span/text()")
                   or page.xpath(
                       "//div[@class='tb-skin']/div/dl/dd/ul/li/a/span/text()"
                   ))
            value = (page.xpath(
                "//div[@class='tb-skin']/dl/dd/ul/li/@data-value")
                     or page.xpath(
                         "//div[@class='tb-skin']/div/dl/dd/ul/li/@data-value"
                     ))

            # Code -> spec name.
            result_dict = {}
            for name, mark in zip(key, value):
                result_dict['%s' % mark.split(':')[1]] = name

            # skuMap is terminated by a different key per page variant.
            sku_match = re.search(r'skuMap.*?\{(.*?)propertyMemoMap', res1,
                                  re.S)
            if sku_match is not None:
                skumap = '{' + sku_match.group(1).strip().rstrip(',')
            else:
                skumap = '{' + re.search(
                    r'skuMap.*?\{(.*?)salesProp', res1,
                    re.S).group(1).rstrip('"').strip().rstrip(',')
            skuMap = json.loads(skumap)

            # Tuple of spec codes -> price.
            sku_prices = {}
            for k, v in skuMap.items():
                sku_prices[tuple(re.findall(r'\:(\d+)\;', k,
                                            re.S))] = v['price']

            # str(list of spec names) -> (names, price).
            labelled = {}
            for codes, price in sku_prices.items():
                names = [
                    result_dict[x] if x in result_dict else x for x in codes
                ]
                labelled['%s' % names] = (names, price)

            # Spec images, read from the style attribute of each swatch.
            other_dict = {}
            swatch_styles = page.xpath(
                "//ul[contains(@class,'J_TSaleProp tb-img')]/li/a/@style")
            if swatch_styles:
                color = page.xpath(
                    "//ul[contains(@class,'J_TSaleProp tb-img')]/li/a/span/text()"
                )
                color_dict = {}
                for style, color_name in zip(swatch_styles, color):
                    color_dict['%s' % color_name] = 'https:' + re.search(
                        r'\((.*?)\)', style, re.S).group(1)
                # Only combinations containing a swatch name are kept here.
                for label, (names, price) in labelled.items():
                    for y in names:
                        if y in color_dict:
                            other_dict[label] = {
                                'price': '%s' % price,
                                'url': '%s' % color_dict[y],
                            }
            else:
                for label, (names, price) in labelled.items():
                    other_dict[label] = {"price": price, "url": ''}
            product_info['spcification'] = other_dict
        except (TypeError, AttributeError, ValueError, LookupError) as exc:
            # Best effort, as before: an item whose specification block is
            # missing or malformed is kept without 'spcification' instead of
            # aborting the whole category.
            print('specification skipped for {}: {!r}'.format(good_id, exc))

        print(product_info)
        product_info_list.append(product_info)
        time.sleep(5)  # pause between item requests

    print(cate_list)
    return product_info_list