Пример #1
0
def get_id_info(cate, i):
    """Return the goods ids advertised on one page of the 1688 premium-offer search.

    Queries the ``get_premium_offer_list.json`` endpoint for keyword *cate*
    and page *i*, then opens every ``eurl`` link found in the response and
    reads the numeric id from the ``b2c_auction`` meta tag of that page.

    Args:
        cate: Search keyword; also handed to ``get_headers2``.
        i: Page number sent as ``beginpage``.

    Returns:
        A list of goods-id strings, or ``None`` when the ``data`` object of
        the search response is empty.

    Raises:
        AttributeError: If one of the expected patterns is missing from a
            response (``re.search`` returned ``None``).
    """
    query = {
        'beginpage': '%s' % i,
        'asyncreq': '1',
        'keywords': '%s' % cate,
        'sortType': '',
        'descendOrder': '',
        'province': '',
        'city': '',
        'priceStart': '',
        'priceEnd': '',
        'dis': '',
        'spm': 'a2609.11209760.it2i6j8a.30.44292de113BNUL',
        'cosite': 'baidujj_pz',
        'trackid': '{trackid}',
        'location': 're',
        'pageid': '17145fa7ralgjD',
        'p4pid': 'f5abf68bdcb94f5dab3c43c91ea6af09',
        # Both values are millisecond timestamps taken at call time.
        'callback': 'jsonp_{}_51591'.format(int(round(time.time() * 1000))),
        '_': '%s' % int(round(time.time() * 1000)),
    }
    search_headers = get_headers2(cate)
    search_url = 'https://data.p4psearch.1688.com/data/ajax/get_premium_offer_list.json'
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    listing = requests.get(search_url,
                           headers=search_headers,
                           params=query,
                           verify=False).text

    # Cut the "data ... ret" slice out of the JSONP wrapper and re-wrap it so
    # that it parses as a standalone JSON object.
    data_slice = re.search('data(.*?)ret', listing, re.S).group(1)
    payload = json.loads('{"data' + data_slice.rstrip('"').rstrip(',') + '}')
    if payload["data"] == {}:
        return None

    goods_id_list = []
    for offer_url in re.findall(r'\"eurl\":\"(.*?)\"', listing, re.S):
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        print(offer_url)
        offer_html = requests.get(offer_url,
                                  headers=get_headers3(),
                                  verify=False).text
        id_match = re.search(r'<meta.*?b2c_auction.*?content="(\d+)".*?>',
                             offer_html, re.S)
        goods_id = id_match.group(1)
        print(goods_id)
        goods_id_list.append(goods_id)
    return goods_id_list
Пример #2
0
def get_detail(url):
    """Scrape one 1688 offer page, mirror its assets to disk and store the record.

    Steps, in order: fetch *url*; extract title, shop name, goods id and
    gallery image URLs; download the gallery images and, when the page
    carries a non-zero ``videoId``, the product video; fetch the
    ``sk=consign`` page and the freight widget to compute a price and a
    per-specification table; write the specification files; render the
    rich-text description through ``format_html``; finally print the record
    and pass it to ``mongo_info_alibaba.insert_item``.

    All files are written below ``C:/Users/admin/Desktop/1688/4/<goods_id>``
    and the process working directory is changed along the way. If that
    folder already exists the function returns without doing anything else.

    Args:
        url: Offer page URL to scrape.

    Returns:
        None.

    Raises:
        AttributeError: If a pattern needed outside the best-effort
            specification step is missing from a response.
    """
    print(url)
    # One call is enough: it installs an "ignore" filter for the warning.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    res = requests.get(url, headers=get_headers3(), verify=False).text
    product_info = {}
    product_info['title'] = ''.join(
        etree.HTML(res).xpath("//html[@lang='zh-CN']/head/title/text()"))
    product_info['shop_name'] = re.search(
        '<meta.*?og:product:nick.*?name=(.*?);.*?>', res, re.S).group(1)
    product_info['goods_id'] = re.search(
        r'<meta.*?b2c_auction.*?content="(\d+)".*?>', res, re.S).group(1)
    product_info['source'] = "https://detail.1688.com/offer/{}.html".format(
        product_info['goods_id'])
    product_info['imgsSrc'] = re.findall(
        r'<li.*?tab-trigger.*?original"\:"(.*?)"', res, re.S)

    goods_id = product_info['goods_id']
    root_dir = 'C:/Users/admin/Desktop/1688'
    goods_dir = root_dir + '/4/' + goods_id

    os.chdir(root_dir)
    if os.path.exists('./4/' + goods_id):
        # This offer was mirrored before; the original code skipped it too.
        return

    # Gallery images: create the product_img folder and download into it.
    os.makedirs('./4/' + goods_id + '/product_img')
    os.chdir(goods_dir + '/product_img')
    for index, img_url in enumerate(product_info['imgsSrc'], start=1):
        with open('%s' % index + '.jpg', 'wb') as f:
            f.write(requests.get(img_url, verify=False).content)

    # Video: create the product_video folder and download the clip.
    memberId = re.search(r'member_id.*?"(.*?)"', res, re.S).group(1)
    videoId = re.search(r'videoId.*?"(\d+)"', res, re.S).group(1)
    if videoId != '0':
        res2 = requests.get(
            'https://apps.1688.com/event/app/videoInfo/getVideoById.htm',
            params=get_params3(videoId, memberId),
            headers=get_headers4(),
            verify=False).text
        # Fix: the address must be resolved BEFORE the download. The previous
        # code read product_info['videoUrl'] before it was ever assigned,
        # which raised KeyError for every offer that has a video. Its
        # follow-up file-size check only re-assigned this same value, so it
        # is no longer needed.
        product_info['videoUrl'] = re.search(r'address"\:"(.*?)"', res2,
                                             re.S).group(1)
        os.chdir(goods_dir)
        os.mkdir('./product_video')
        os.chdir(goods_dir + '/product_video')
        video = requests.get(product_info['videoUrl'], verify=False).content
        with open(goods_id + '.mp4', 'wb') as f:
            f.write(video)

    try:
        # Distribution / drop-shipping ("consign") page.
        url3 = 'https://detail.1688.com/offer/{}.html?sk=consign'.format(
            goods_id)
        res3 = requests.get(url3,
                            headers=get_headers7(goods_id),
                            verify=False,
                            allow_redirects=False,
                            timeout=None).text

        # Both fragments are sliced out of inline script text. A missing
        # pattern raises AttributeError, which the outer handler treats as
        # "no specification data".
        skuProps = '[' + re.search(
            r'skuProps.*?\[(.*?)skuMap', res3,
            re.S).group(1).rstrip('"').strip().rstrip(',')
        sku_map_body = re.search(r'skuMap.*?\{(.*?)end', res3,
                                 re.S).group(1)
        Specifications1 = json.loads(skuProps)
        # The page comes in two layouts, so the trailing characters to trim
        # differ; try the aggressive trim first, then the light one.
        try:
            Specifications2 = json.loads(
                '{' + sku_map_body.rstrip('"').strip().rstrip(',').rstrip(
                    '}').strip().rstrip(','))
        except json.decoder.JSONDecodeError:
            Specifications2 = json.loads(
                '{' + sku_map_body.strip().rstrip(','))

        product_info['spcification_amount'] = skuProps

        base_price = re.search(r'consignBasePrice"\:"(.*?)"', res3,
                               re.S).group(1)
        # For a "low-high" range the upper bound is used.
        if '-' in base_price:
            base_price_eve = base_price.split('-')[1]
        else:
            base_price_eve = base_price

        # Freight widget.
        url4 = 'https://laputa.1688.com/offer/ajax/widgetList.do'
        res4 = requests.get(url4,
                            headers=get_headers9(goods_id),
                            params=get_params4(goods_id),
                            verify=False).text
        res_temp = '{"data' + re.search(
            'data(.*?)message', res4,
            re.S).group(1).rstrip('"').rstrip(',') + '}'
        postage_info = json.loads(
            res_temp)['data']['data']['offerdetail_ditto_postage']
        if postage_info['showFreightCost'] == False:  # noqa: E712 - keep ==
            fee = 10
        elif postage_info['freightCost'] == []:
            fee = 0
        else:
            fee = postage_info['freightCost'][0]['costItems'][0]['value']
        # Final price = consign price + freight.
        product_info['price'] = float(base_price_eve) + float(fee)

        # Specification values that carry an image (no price here).
        img_dict = {}
        for spec_group in Specifications1:
            for spec_value in spec_group['value']:
                if "imageUrl" in spec_value:
                    img_dict["%s" % spec_value["name"]] = {
                        "price": "",
                        "url": spec_value['imageUrl']
                    }

        # SKU rows that carry a price (no image here). Keys are cleaned of
        # 'gt;', '/' and '*' because they are later used as file names.
        price_dict = {}
        for sku_key, sku in Specifications2.items():
            clean_key = sku_key.replace('gt;', '').replace("/", "").replace(
                "*", "")
            price_dict[clean_key] = {
                'price': sku['price'] if "price" in sku
                else product_info['price'],
                'url': ''
            }

        # Merge: attach an image to every SKU whose key contains the name.
        for img_name, img_entry in img_dict.items():
            for spec_name in price_dict:
                if img_name in spec_name:
                    price_dict[spec_name]['url'] = img_entry['url']

        product_info['spcification'] = price_dict

        # Specification folder "color": one JSON summary plus one image file
        # per SKU (the file stays empty when the SKU has no https image).
        os.chdir(goods_dir)
        os.makedirs('./product_specifications/color')
        os.chdir(goods_dir + '/product_specifications/color')
        with open(goods_id + '.json', 'w') as fp:
            fp.write(json.dumps(price_dict, indent=4, ensure_ascii=False))
        for spec_name, spec_entry in price_dict.items():
            with open('%s' % spec_name + '.jpg', 'wb') as f:
                if "https" in spec_entry['url']:
                    f.write(
                        requests.get(spec_entry['url'], verify=False).content)

        # Specification folder "size": one marker file per option name, only
        # when there are at least two specification groups.
        if len(Specifications1) >= 2:
            os.chdir(goods_dir)
            os.makedirs('./product_specifications/size')
            os.chdir(goods_dir + '/product_specifications/size')
            for spec_group in Specifications1:
                for spec_value in spec_group['value']:
                    marker_name = spec_value['name'].replace("/", "").replace(
                        "*", "") + '.txt'
                    with open(marker_name, 'w') as f:
                        f.write('1')
    except AttributeError:
        # Deliberate best-effort: a page without the expected sku/price
        # patterns simply gets no specification data.
        pass

    # Rich-text description: the page only links to it.
    html_url = re.search(r'data-tfs-url="(.*?)"', res, re.S).group(1)
    params = {'t': '%s' % int(round(time.time() * 1000))}
    res1 = requests.get(html_url,
                        headers=get_headers5(),
                        params=params,
                        verify=False).text.replace('\\', '')

    # Image URLs used to build the rich-text document.
    result = re.findall(r'alt.*?src="(.*?)"', res1, re.S)
    file = 'C:/Users/admin/Desktop/1688/html_template.html'
    saveHtmlFilePath = goods_dir + '/' + goods_id + '.html'
    # Arguments: template path, output path, image list.
    format_html(file, saveHtmlFilePath, result)
    print(product_info)
    mongo_info_alibaba.insert_item(product_info)
Пример #3
0
# coding: utf-8

import os
import re
import time
from handle_template import format_html
from info_mongoimport import mongo_info_alibaba
from lxml import etree
from headers_lists import get_headers3, get_headers4, get_headers5, get_headers7, get_headers8, get_headers9, get_headers11
from params_lists import get_params2, get_params3, get_params4
import requests
import json
import urllib.parse
import urllib3

# Ad-hoc probe: request one dj.1688.com click-through link with the shared
# detail-page headers and dump the raw response body to stdout.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
url = 'https://dj.1688.com/ci_bb?a=1923980011&e=lAG-AAzlwOsUaTpZjJ16uzysGfgiPdCGznc8lxNj34GE78gq37QZz7eqnGWUmWh03sLDFALx9QABiYu8uNqj7k.-mXVYZTkKC7pweePwvxXJAH5D-y.1.M-1omwCs9MWgXfCUV5fJOxZ6es5M-yIyVaw3u9tLWLbFlxvFroRca9CsIEq-t2YAMtylY7adRJU92kC.yZpBxYFPyAkdnylcUMq.ggBO7XRpyfFgQ8t4UK.0UDzTsXJfHl7Aa3Rd2LkmIQiN8IbVBtlqp6imHFZz7kwGSM7MSgNatSNuO-NIOhKVSNkrfZBGvc.K7fFYeVx0KdzQaamASVUap55vQnE5wDxwQK7xHkWtNVNc1n-XvQaoi8grnABg9GaCVc1R9.HAOhct7foY3gYgD77GCHAmDRB9KM5s2KMQYGe8pIMTqohKU-YPJDEWWzAk9ihFiPwXwA.Hww2n.nwfA99vOkKs0W54NXOZfuvgSKh2sNZe-A1X2uKEjae9qyaAnneA.iMDzMFIxs1UOXZcfSg3p3TqDyLVIw-5GvoWhkQpsRtBd-ye25DwrYmRvGE1nJsIVnCHwFYwLc4ly.g0s9QMP8cgBpGplGkrJdvXwb0xRqAgcbVCF0B-JlP9p85k58lLB4qHrGBttlA-pG4UQ5biM9wot64Ks1SvawoBrEpp4jjxwtgmfioW7dCyXNAs7QxXVlkEmTHb4dNh1rcQTVhvKYroZFm.8NNSEzvYQ.xfgFFYAbfvsCf.Sl-nnYMvgv4QKDD3wR.09cpujKnN.dwDI98HFxcI4wDFOSnIcIqxH9pgAzUHeYnIZ5BlhtpAfbejhLaCCGsCHTxYJy.-UTr62SkpQ3tCCyo0-vpCiA8m27oNwmSlzoZfuYothm5Ye9rmOUJIDZKaZk-8tom0OoyuyCKBe6.xgZeviNJ&v=4&ap=1&rp=1'
res = requests.get(
    url,
    headers=get_headers3(),
    verify=False,
).text
print(res)
Пример #4
0
def detail_three(good_id):
    """Scrape one Taobao item page into a product-info dict.

    Fetches the item page for *good_id*, then the ``initItemDetail`` endpoint
    for postage, and combines them into a record containing title, shop
    name, gallery images, optional video URL, price (item price plus
    postage) and a per-SKU specification table.

    Args:
        good_id: Taobao item id, interpolated into the item URL.

    Returns:
        dict with keys ``title``, ``shop_name``, ``goods_id``, ``source``,
        ``imgsSrc``, ``price``, ``spcification`` and, when the page embeds an
        ``imgVedioID``, ``videoUrl``.

    Raises:
        AttributeError: If a required pattern is missing from a response.
        ValueError: If a sliced fragment is not valid JSON / not a number.
    """
    import ast  # local import: only needed to parse a scraped array literal

    url = "https://item.taobao.com/item.htm?spm=a219r.lm874.14.1.45cb36a7Nv5owE&id={}&ns=1&abbucket=12".format(
        good_id)
    res = requests.get(url, headers=get_headers3(), verify=False).text
    # Parse once; every XPath below runs against the same tree.
    page = etree.HTML(res)
    product_info = {}

    # Title, falling back to the keywords meta tag.
    title_nodes = page.xpath("//h3[@class='tb-main-title']/text()")
    if not title_nodes:
        title_nodes = page.xpath("//meta[@name='keywords']/@content")
    product_info['title'] = ''.join(title_nodes).strip()

    # Shop name; note that only the primary layout is stripped.
    shop_nodes = page.xpath(
        "//div[@class='tb-shop-name']/dl/dd/strong/a/@title")
    if shop_nodes:
        product_info['shop_name'] = ''.join(shop_nodes).strip()
    else:
        product_info['shop_name'] = ''.join(
            page.xpath('//a[@class="slogo-shopname"]/strong/text()'))
    product_info['goods_id'] = good_id
    product_info['source'] = url

    # Gallery images.
    thumbs = page.xpath("//ul[@class='tb-thumb tm-clear']/li/a/img/@src")
    if thumbs:
        product_info['imgsSrc'] = thumbs
    else:
        # SECURITY: this text comes from a remote page. It used to be passed
        # to eval(); literal_eval accepts the same quoted-string list without
        # executing arbitrary expressions.
        raw_images = re.search(r'auctionImages.*?\[(.*?)\]', res,
                               re.S).group(1)
        product_info['imgsSrc'] = ast.literal_eval('[' + raw_images + ']')

    # Video, only when the page embeds a video id.
    video_match = re.search(r'imgVedioID.*?"(\d+)"', res, re.S)
    if video_match is not None:
        imgVedioID = video_match.group(1)
        sellerId = re.search(r'sellerId.*?(\d+)', res, re.S).group(1)
        product_info[
            'videoUrl'] = "https://cloud.video.taobao.com/play/u/{}/p/1/e/6/t/1/{}.mp4".format(
                sellerId, imgVedioID)

    # Postage.
    url1 = "https://mdskip.taobao.com/core/initItemDetail.htm"
    res1 = requests.get(url1,
                        headers=get_headers4(url),
                        params=get_params1(good_id)).text
    res_temp = '{' + re.search(r'defaultModel.*?\{(.*?)isSuccess', res1,
                               re.S).group(1).rstrip('"').strip().rstrip(',')
    delivery = json.loads(
        res_temp)['deliveryDO']['deliverySkuMap']['default'][0]
    if delivery['postageFree'] == False:  # noqa: E712 - keep == semantics
        postage = float(delivery['postage'].split(':')[1])
    else:
        postage = float(delivery['money'])

    # Final price = upper bound of the price range + postage. When the
    # inline price is absent or is not a "low-high" range, fall back to the
    # price shown in the page body.
    try:
        base_price = float(
            re.search(r'defaultItemPrice.*?"(.*?)"', res,
                      re.S).group(1).split('-')[1].strip())
    except (AttributeError, IndexError, ValueError):
        base_price = float(''.join(
            page.xpath("//em[@class='tb-rmb-num']/text()")))
    product_info['price'] = base_price + postage

    # Specification option names and their data-value codes; the page has
    # two layouts, the second nests the list one <div> deeper.
    # Fix: the primary-layout branch used to assign ``xpath(...) == []`` (a
    # bool) to ``key``, which made the zip() below raise TypeError.
    key = page.xpath("//div[@class='tb-skin']/dl/dd/ul/li/a/span/text()")
    if not key:
        key = page.xpath(
            "//div[@class='tb-skin']/div/dl/dd/ul/li/a/span/text()")
    value = page.xpath("//div[@class='tb-skin']/dl/dd/ul/li/@data-value")
    if not value:
        value = page.xpath(
            "//div[@class='tb-skin']/div/dl/dd/ul/li/@data-value")

    # Option code -> option display name.
    result_dict = {}
    for name, mark in zip(key, value):
        result_dict[mark.split(':')[1]] = name

    # The SKU map is terminated by a different marker depending on layout.
    try:
        skumap = '{' + re.search(r'skuMap.*?\{(.*?)propertyMemoMap', res,
                                 re.S).group(1).strip().rstrip(',')
        skuMap = json.loads(skumap)
    except (AttributeError, ValueError):
        skumap = '{' + re.search(r'skuMap.*?\{(.*?)salesProp', res,
                                 re.S).group(1).rstrip('"').strip().rstrip(',')
        skuMap = json.loads(skumap)

    # Option-code tuple -> price. The lists are kept as data instead of
    # being round-tripped through repr()/eval() as before.
    sku_prices = {}
    for sku_key, sku in skuMap.items():
        codes = tuple(re.findall(r'\:(\d+)\;', sku_key, re.S))
        sku_prices[codes] = sku['price']

    # str(list of option names) -> (names, price); the string form is the
    # key format the output table has always used.
    labelled = {}
    for codes, price in sku_prices.items():
        names = [result_dict.get(code, code) for code in codes]
        labelled[str(names)] = (names, price)

    # Swatch images, if the page has image-style options.
    swatch_styles = page.xpath(
        "//ul[contains(@class,'J_TSaleProp tb-img')]/li/a/@style")
    other_dict = {}
    if swatch_styles:
        color = page.xpath(
            "//ul[contains(@class,'J_TSaleProp tb-img')]/li/a/span/text()")
        color_dict = {}
        for style, color_name in zip(swatch_styles, color):
            color_dict['%s' % color_name] = 'https:' + re.search(
                r'\((.*?)\)', style, re.S).group(1)

        # Only SKUs with at least one option that has a swatch are kept.
        for label, (names, price) in labelled.items():
            for option in names:
                if option in color_dict:
                    other_dict[label] = {
                        'price': '%s' % price,
                        'url': '%s' % color_dict[option]
                    }
    else:
        for label, (names, price) in labelled.items():
            other_dict[label] = {"price": price, "url": ''}
    product_info['spcification'] = other_dict

    return product_info
Пример #5
0
def get_params(cate, i):
    """Collect product records for one page of the 1688 premium-offer search.

    Queries the ``get_premium_offer_list.json`` endpoint for keyword *cate*
    and page *i*. For every ``eurl`` link in the response it resolves the
    goods id, loads the ``sk=consign`` detail page and builds a record with
    title, shop name, goods id, source URL, gallery images, optional video
    URL and, when the page exposes them, price and specification table.

    Args:
        cate: Search keyword; also handed to ``get_headers2``.
        i: Page number sent as ``beginpage``.

    Returns:
        A list of product-info dicts, or ``None`` when the ``data`` object
        of the search response is empty.
    """
    query = {
        'beginpage': '%s' % i,
        'asyncreq': '1',
        'keywords': '%s' % cate,
        'sortType': '',
        'descendOrder': '',
        'province': '',
        'city': '',
        'priceStart': '',
        'priceEnd': '',
        'dis': '',
        'spm': 'a2609.11209760.it2i6j8a.30.44292de113BNUL',
        'cosite': 'baidujj_pz',
        'trackid': '{trackid}',
        'location': 're',
        'pageid': '17145fa7ralgjD',
        'p4pid': 'f5abf68bdcb94f5dab3c43c91ea6af09',
        # Both values are millisecond timestamps taken at call time.
        'callback': 'jsonp_{}_51591'.format(int(round(time.time() * 1000))),
        '_': '%s' % int(round(time.time() * 1000)),
    }
    search_headers = get_headers2(cate)
    search_url = 'https://data.p4psearch.1688.com/data/ajax/get_premium_offer_list.json'
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    listing = requests.get(search_url,
                           headers=search_headers,
                           params=query,
                           verify=False).text

    # Cut the "data ... ret" slice out of the JSONP wrapper and re-wrap it so
    # that it parses as a standalone JSON object.
    data_slice = re.search('data(.*?)ret', listing, re.S).group(1)
    payload = json.loads('{"data' + data_slice.rstrip('"').rstrip(',') + '}')
    if payload["data"] == {}:
        return None

    product_info_list = []
    for offer_url in re.findall(r'\"eurl\":\"(.*?)\"', listing, re.S):
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        print(offer_url)
        landing_html = requests.get(offer_url,
                                    headers=get_headers3(),
                                    verify=False).text
        goods_id = re.search(r'<meta.*?b2c_auction.*?content="(\d+)".*?>',
                             landing_html, re.S).group(1)
        print(goods_id)
        url1 = 'https://detail.1688.com/offer/{}.html?sk=consign'.format(
            goods_id)
        print(url1)
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        detail_html = requests.get(url1,
                                   headers=get_headers3(),
                                   verify=False).text

        product_info = {}
        product_info['title'] = ''.join(
            etree.HTML(detail_html).xpath(
                "//html[@lang='zh-CN']/head/title/text()"))
        product_info['shop_name'] = re.search(
            '<meta.*?og:product:nick.*?name=(.*?);.*?>', detail_html,
            re.S).group(1)
        product_info['goods_id'] = re.search(
            r'<meta.*?b2c_auction.*?content="(\d+)".*?>', detail_html,
            re.S).group(1)
        product_info['source'] = url1
        # Gallery image URLs (collected only, nothing is downloaded here).
        product_info['imgsSrc'] = re.findall(
            r'<li.*?tab-trigger.*?original"\:"(.*?)"', detail_html, re.S)

        # Video address, looked up only for a non-zero video id.
        memberId = re.search(r'member_id.*?"(.*?)"', detail_html,
                             re.S).group(1)
        videoId = re.search(r'videoId.*?"(\d+)"', detail_html, re.S).group(1)
        if videoId != '0':
            video_json = requests.get(
                'https://apps.1688.com/event/app/videoInfo/getVideoById.htm',
                params=get_params3(videoId, memberId),
                headers=get_headers4(),
                verify=False).text
            product_info['videoUrl'] = re.search(r'address"\:"(.*?)"',
                                                 video_json, re.S).group(1)

        try:
            # Slice both sku fragments out of the inline script text.
            sku_props_text = '[' + re.search(
                r'skuProps.*?\[(.*?)skuMap', detail_html,
                re.S).group(1).rstrip('"').strip().rstrip(',')
            sku_map_body = re.search(r'skuMap.*?\{(.*?)end', detail_html,
                                     re.S).group(1)
            Specifications1 = json.loads(sku_props_text)
            # Two page layouts: try the aggressive trim, then the light one.
            try:
                Specifications2 = json.loads(
                    '{' + sku_map_body.rstrip('"').strip().rstrip(
                        ',').rstrip('}').strip().rstrip(','))
            except json.decoder.JSONDecodeError:
                Specifications2 = json.loads(
                    '{' + sku_map_body.strip().rstrip(','))

            price_text = re.search(r'consignBasePrice"\:"(.*?)"',
                                   detail_html, re.S).group(1)
            # For a "low-high" range the upper bound is used.
            base_price_eve = (price_text.split('-')[1]
                              if '-' in price_text else price_text)

            # Freight widget.
            url4 = 'https://laputa.1688.com/offer/ajax/widgetList.do'
            freight_text = requests.get(
                url4,
                headers=get_headers9(product_info['goods_id']),
                params=get_params4(product_info['goods_id']),
                verify=False).text
            freight_slice = re.search('data(.*?)message', freight_text,
                                      re.S).group(1)
            freight = json.loads('{"data' +
                                 freight_slice.rstrip('"').rstrip(',') + '}')
            if freight['data']['data']['offerdetail_ditto_postage'][
                    'showFreightCost'] == False:
                fee = 10
            elif freight['data']['data']['offerdetail_ditto_postage'][
                    'freightCost'] == []:
                fee = 0
            else:
                fee = freight['data']['data']['offerdetail_ditto_postage'][
                    'freightCost'][0]['costItems'][0]['value']
            # Final price = consign price + freight.
            product_info['price'] = float(base_price_eve) + float(fee)

            # Specification values that carry an image (no price here).
            img_dict = {}
            for spec_group in Specifications1:
                for spec_value in spec_group['value']:
                    if "imageUrl" in spec_value.keys():
                        img_dict["%s" % spec_value["name"]] = {
                            "price": "",
                            "url": spec_value['imageUrl']
                        }

            # SKU rows that carry a price (no image here).
            price_dict = {}
            for sku_key, sku in Specifications2.items():
                clean_key = sku_key.replace('gt;', '').replace(
                    "/", "").replace("*", "")
                if "price" in sku.keys():
                    row_price = sku['price']
                else:
                    row_price = product_info['price']
                price_dict[clean_key] = {'price': row_price, 'url': ''}

            # Merge: attach an image to every SKU whose key contains the
            # image's option name.
            for img_name, img_entry in img_dict.items():
                for spec_name in price_dict:
                    if img_name in spec_name:
                        price_dict[spec_name]['url'] = img_entry['url']

            product_info['spcification'] = price_dict
        except AttributeError:
            # Best-effort: a missing pattern leaves the record as it is.
            pass
        product_info_list.append(product_info)
    return product_info_list
Пример #6
0
def two_detail(cate):
    """Scrape every item of one Taobao category into product-info dicts.

    Loads the category list from the ``tce.alicdn.com`` JSONP endpoint,
    looks up the link whose name equals *cate*, reads the ``allNids`` item
    ids from that category page and scrapes each item page (title, shop
    name, images, optional video URL, price plus postage, specification
    table). Sleeps five seconds between items and prints progress.

    Args:
        cate: Category name; must match a ``name`` in the category list.

    Returns:
        The list of product-info dicts. (Previously the list was built but
        never returned.)

    Raises:
        KeyError: If *cate* is not among the category names.
        AttributeError: If a required pattern is missing from a response.
    """
    import ast  # local import: only needed to parse a scraped array literal

    url = "https://tce.alicdn.com/api/data.htm?ids=222887%2C222890%2C222889%2C222886%2C222906%2C222898%2C222907%2C222885%2C222895%2C222878%2C222908%2C222879%2C222893%2C222896%2C222918%2C222917%2C222888%2C222902%2C222880%2C222913%2C222910%2C222882%2C222883%2C222921%2C222899%2C222905%2C222881%2C222911%2C222894%2C222920%2C222914%2C222877%2C222919%2C222915%2C222922%2C222884%2C222912%2C222892%2C222900%2C222923%2C222909%2C222897%2C222891%2C222903%2C222901%2C222904%2C222916%2C222924&callback=tbh_service_cat"
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    res = requests.get(url, headers=get_headers1(), verify=False).text
    # Strip the JSONP wrapper: keep what is between the first "(" and ")".
    temp = json.loads(re.search(r'\((.*?)\)', res, re.S).group(1))
    # Only entries with exactly three fields are treated as categories.
    cate_list = [{
        'name': item['name'],
        'link': item['link']
    } for eve in temp.values() for item in eve['value']['list']
                 if len(item) == 3]
    eve_dict = {'%s' % entry['name']: entry['link'] for entry in cate_list}

    # Category page: collect the item ids.
    res3 = requests.get(eve_dict['%s' % cate],
                        headers=get_headers2(),
                        verify=False).text
    # SECURITY: this text comes from a remote page. It used to be passed to
    # eval(); literal_eval accepts the same literal list without executing
    # arbitrary expressions.
    good_ids = ast.literal_eval(
        '[' + re.search(r'allNids.*?\[(.*?)\]', res3, re.S).group(1) + ']')

    product_info_list = []
    for good_id in good_ids:
        # Item detail page.
        url1 = "https://item.taobao.com/item.htm?spm=a219r.lm874.14.1.45cb36a7Nv5owE&id={}&ns=1&abbucket=12".format(
            good_id)
        print(url1)
        res1 = requests.get(url1, headers=get_headers3(), verify=False).text
        # Parse once; every XPath below runs against the same tree.
        page = etree.HTML(res1)
        product_info = {}

        # Title, falling back to the keywords meta tag.
        title_nodes = page.xpath("//h3[@class='tb-main-title']/text()")
        if not title_nodes:
            title_nodes = page.xpath("//meta[@name='keywords']/@content")
        product_info['title'] = ''.join(title_nodes).strip()

        # Shop name; note that only the primary layout is stripped.
        shop_nodes = page.xpath(
            "//div[@class='tb-shop-name']/dl/dd/strong/a/@title")
        if shop_nodes:
            product_info['shop_name'] = ''.join(shop_nodes).strip()
        else:
            product_info['shop_name'] = ''.join(
                page.xpath('//a[@class="slogo-shopname"]/strong/text()'))
        product_info['goods_id'] = good_id
        product_info['source'] = url1

        # Gallery images.
        thumbs = page.xpath("//ul[@class='tb-thumb tm-clear']/li/a/img/@src")
        if thumbs:
            product_info['imgsSrc'] = thumbs
        else:
            # SECURITY: scraped text, see the note on allNids above.
            raw_images = re.search(r'auctionImages.*?\[(.*?)\]', res1,
                                   re.S).group(1)
            product_info['imgsSrc'] = ast.literal_eval('[' + raw_images +
                                                       ']')

        # Video, only when the page embeds a video id.
        video_match = re.search(r'imgVedioID.*?"(\d+)"', res1, re.S)
        if video_match is not None:
            imgVedioID = video_match.group(1)
            sellerId = re.search(r'sellerId.*?(\d+)', res1, re.S).group(1)
            product_info[
                'videoUrl'] = "https://cloud.video.taobao.com/play/u/{}/p/1/e/6/t/1/{}.mp4".format(
                    sellerId, imgVedioID)

        # Postage.
        url2 = "https://mdskip.taobao.com/core/initItemDetail.htm"
        res2 = requests.get(url2,
                            headers=get_headers4(url1),
                            params=get_params1(good_id),
                            verify=False).text
        res_temp = '{' + re.search(
            r'defaultModel.*?\{(.*?)isSuccess', res2,
            re.S).group(1).rstrip('"').strip().rstrip(',')
        delivery = json.loads(
            res_temp)['deliveryDO']['deliverySkuMap']['default'][0]
        if delivery['postageFree'] == False:  # noqa: E712 - keep == semantics
            postage = float(delivery['postage'].split(':')[1])
        else:
            postage = float(delivery['money'])

        # Final price = upper bound of the price range (or the single
        # price) + postage. float() ignores surrounding whitespace.
        price_match = re.search(r'defaultItemPrice".*?"(.*?)"', res1, re.S)
        if price_match is not None:
            price_text = price_match.group(1)
        else:
            price_text = ''.join(
                page.xpath("//em[@class='tb-rmb-num']/text()"))
        if '-' in price_text:
            price_text = price_text.split('-')[1]
        base_price = float(price_text)
        product_info['price'] = base_price + postage

        try:
            # Specification option names and their data-value codes; the
            # page has two layouts, the second nests the list one <div>
            # deeper.
            # Fix: the primary-layout branch used to assign
            # ``xpath(...) == []`` (a bool) to ``key``; the resulting
            # TypeError was swallowed below, so those items silently lost
            # their specification table.
            key = page.xpath(
                "//div[@class='tb-skin']/dl/dd/ul/li/a/span/text()")
            if not key:
                key = page.xpath(
                    "//div[@class='tb-skin']/div/dl/dd/ul/li/a/span/text()")
            value = page.xpath(
                "//div[@class='tb-skin']/dl/dd/ul/li/@data-value")
            if not value:
                value = page.xpath(
                    "//div[@class='tb-skin']/div/dl/dd/ul/li/@data-value")

            # Option code -> option display name.
            result_dict = {}
            for name, mark in zip(key, value):
                result_dict[mark.split(':')[1]] = name

            # The SKU map is terminated by a different marker per layout.
            sku_match = re.search(r'skuMap.*?\{(.*?)propertyMemoMap', res1,
                                  re.S)
            if sku_match is not None:
                skumap = '{' + sku_match.group(1).strip().rstrip(',')
            else:
                skumap = '{' + re.search(
                    r'skuMap.*?\{(.*?)salesProp', res1,
                    re.S).group(1).rstrip('"').strip().rstrip(',')
            skuMap = json.loads(skumap)

            # Option-code tuple -> price. The lists are kept as data
            # instead of being round-tripped through repr()/eval().
            sku_prices = {}
            for sku_key, sku in skuMap.items():
                codes = tuple(re.findall(r'\:(\d+)\;', sku_key, re.S))
                sku_prices[codes] = sku['price']

            # str(list of option names) -> (names, price); the string form
            # is the key format the output table has always used.
            labelled = {}
            for codes, price in sku_prices.items():
                names = [result_dict.get(code, code) for code in codes]
                labelled[str(names)] = (names, price)

            # Swatch images, if the page has image-style options.
            swatch_styles = page.xpath(
                "//ul[contains(@class,'J_TSaleProp tb-img')]/li/a/@style")
            other_dict = {}
            if swatch_styles:
                color = page.xpath(
                    "//ul[contains(@class,'J_TSaleProp tb-img')]/li/a/span/text()"
                )
                color_dict = {}
                for style, color_name in zip(swatch_styles, color):
                    color_dict['%s' % color_name] = 'https:' + re.search(
                        r'\((.*?)\)', style, re.S).group(1)

                # Only SKUs with at least one swatch option are kept.
                for label, (names, price) in labelled.items():
                    for option in names:
                        if option in color_dict:
                            other_dict[label] = {
                                'price': '%s' % price,
                                'url': '%s' % color_dict[option]
                            }
            else:
                for label, (names, price) in labelled.items():
                    other_dict[label] = {"price": price, "url": ''}
            product_info['spcification'] = other_dict
        except (TypeError, AttributeError, ValueError, KeyError,
                IndexError) as exc:
            # Best-effort, as before: an item whose specification data
            # cannot be parsed is stored without it. The handler is wider
            # than the old ``except TypeError`` because the code above now
            # actually runs for every layout; the reason is reported
            # instead of being dropped silently.
            print('specification table skipped for {}: {!r}'.format(
                good_id, exc))
        print(product_info)
        product_info_list.append(product_info)
        # Pause between items.
        time.sleep(5)
    print(cate_list)
    return product_info_list