Example #1
def parse_more(item=None, response=None):
    if not item or not response:
        return -404
    root = lxml.html.fromstring(response.text.encode('utf-8'))
    data = {}
    # family_sn
    match = family_sn_pattern.search(response.url)
    data['family_sn'] = match.group(1) if match else item['goods_name']
    # catlog
    breadcrumb = root.xpath('//p[@class="breadcrumb"]/a')
    data['catlog'] = []
    for catlog in breadcrumb:
        catlog_name = util.cleartext(catlog.text_content())
        catlog_url = util.urljoin(response.url, catlog.xpath('./@href')[0])
        if catlog_name and catlog_url:
            data['catlog'].append([catlog_name, catlog_url])
        else:
            data['catlog'] = []
            break
    else:
        data['catlog'].append([data['family_sn'], response.url])
    # doc
    doc = root.xpath('//li[@class="pdf"]/a[@class="doclink"]/@title')
    data['doc'] = "http://cds.linear.com/docs/en/datasheet/{title}".format(
        title=doc[0]) if doc else ''

    item.update(data)
    return item
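
Example #1 is a fragment of a larger module: `lxml.html`, `family_sn_pattern`, and `util` are defined elsewhere. A minimal sketch of what those names might look like (hypothetical stand-ins, not the project's real code), so the function above can run on its own:

import re
from urllib.parse import urljoin as _urljoin

import lxml.html

# Hypothetical stand-ins for the module-level names parse_more() expects.
family_sn_pattern = re.compile(r'/product/([^/?#&]+)', re.I)

class util:
    @staticmethod
    def cleartext(text, *removed):
        # Strip the given substrings, then collapse runs of whitespace.
        text = text or ''
        for token in removed:
            text = text.replace(token, '')
        return ' '.join(text.split())

    @staticmethod
    def urljoin(base, url):
        return _urljoin(base, url)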
Example #2
    def parse_model_detail(self, response):
        '''Parse the product detail page.'''
        json_html = re.findall(
            r'<script type="application/ld\+json">(.*?)</script>',
            response.text, re.S)
        if not json_html:
            raise DropItem('Failed to match page source content, please check: {0}'.format(response.url))
        json_data = json.loads(json_html[0])
        product_list = json_data['offers']
        pre_url = 'https://www.ti.com.cn/product/cn/{}'.format(
            json_data['mpn'])
        description = json_data['description']
        doc_url = urljoin(
            self.base_url,
            response.xpath(
                '//div/a[@data-navtitle="data sheet"]/@href').extract_first())
        attrs_items = response.xpath(
            '//ti-multicolumn-list/ti-multicolumn-list-row')
        attr_list = []
        # Build the attribute list
        for attrs_item in attrs_items:
            attr = attrs_item.xpath(
                './ti-multicolumn-list-cell/span/text()').extract()
            if not attr:
                continue
            key = util.cleartext(attr[0])
            val = util.cleartext(attr[1])
            if key and val:
                attr_list.append((key, val))
        # Build the category list
        cat_list = []
        cat_items = response.xpath(
            '//ti-breadcrumb/ti-breadcrumb-section/a')[1:]
        for cat_item in cat_items:
            ckey = util.cleartext(cat_item.xpath('./text()').extract_first())
            cval = urljoin(self.base_url,
                           cat_item.xpath('./@href').extract_first())
            cat_list.append((ckey, cval))

        for data in product_list:
            item = GoodsItem()
            data = data['itemOffered']
            item['url'] = pre_url
            item['goods_sn'] = data['sku']
            item['goods_other_name'] = item['goods_name'] = data['mpn']
            item['provider_name'] = data['brand']
            item['provider_url'] = ''
            item['goods_desc'] = description
            item['goods_img'] = item['goods_thumb'] = ''
            item['doc'] = doc_url
            item['rohs'] = 0
            shop_price = data['offers'].get('price')
            item['tiered'] = []
            if not shop_price:
                item['stock'] = [0, 1]  # stock
                item['increment'] = 1
            else:
                # Check the stock level
                if not data['offers'].get('inventoryLevel'):
                    item['stock'] = [0, 1]
                else:
                    item['stock'] = [
                        util.intval(data['offers']['inventoryLevel']), 1
                    ]  # stock
                for price_item in data['offers']['priceSpecification']:
                    pnum = price_item['eligibleQuantity']['minValue']
                    pval = price_item['price']
                    item['tiered'].append(
                        (util.intval(pnum), util.floatval(pval)))
                item['increment'] = item['tiered'][0][0]
            if not item['tiered']:
                item['tiered'] = [[0, 0.00]]
            # attributes
            item['attr'] = attr_list
            # categories
            item['catlog'] = cat_list
            yield item
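
The shape of the `application/ld+json` payload that `parse_model_detail` expects can be read off the field accesses above. An illustrative payload with placeholder values (not real TI data; the live schema on www.ti.com.cn may differ):

example_ld_json = {
    'mpn': 'EXAMPLE-PART',                 # used to build pre_url
    'description': 'example part description',
    'offers': [                            # becomes product_list
        {
            'itemOffered': {
                'sku': 'EXAMPLE-PART-SKU',
                'mpn': 'EXAMPLE-PART-SKU',
                'brand': 'Example Brand',
                'offers': {
                    'price': '1.10',
                    'inventoryLevel': '5000',
                    'priceSpecification': [
                        {'eligibleQuantity': {'minValue': 1}, 'price': '1.10'},
                        {'eligibleQuantity': {'minValue': 100}, 'price': '0.95'},
                    ],
                },
            }
        }
    ],
}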
Example #3
def _parse_detail_data(resp, headers=None, **kwargs):
    """
    解析详情数据,独立出来

    @param  data    页面数据
    @param  url     解析的页面url(方便记录异常)
    @param  kwargs  扩展参数
    """
    item = {}
    root = lxml.html.fromstring(resp.text.encode('utf-8'))
    # goods_name
    goods_name = root.xpath('//td[@class="lnk11b-colorOff"]')
    item['goods_name'] = util.cleartext(
        goods_name[0].text) if goods_name else ''
    # goods_sn
    match = goods_sn_pattern.search(resp.url)
    item['goods_sn'] = match.group(1) if match else ''
    if not item['goods_name'] or not item['goods_sn']:
        logger.debug("无法解析goods_name和goods_sn URL:{url}".format(url=resp.url))
        return -404
    # goods_desc
    goods_desc = root.xpath('//td[@class="txt11"]/text()')
    item['desc'] = util.cleartext(goods_desc[0], '\n',
                                  '\t') if goods_desc else ''
    # tiered
    tiered = []
    price_list = root.xpath('//td[@class="texttable"]')
    for x in range(0, len(price_list), 2):
        qty = util.intval(price_list[x].text_content())
        price = util.floatval(price_list[x + 1].text_content())
        if qty and price:
            tiered.append([qty, price])
        else:
            tiered = [[0, 0.00]]
            break
    if not tiered:
        price = root.xpath('//td[@class="txt18b-red"]/text()')
        price = util.floatval(price[0]) if price else 0
        if price:
            tiered = [[1, price]]
        else:
            tiered = []

    item['tiered'] = tiered if tiered else [[0, 0.00]]
    # stock
    qty = root.xpath('//input[@id="qty"]/@value')
    qty = util.intval(qty[0]) if qty else 1
    stock = root.xpath('//input[@id="custcol7"]/@value')
    stock = util.intval(stock[0]) if stock else 0
    item['stock'] = [stock, qty]
    # url
    item['url'] = resp.url
    # provider_name
    item['provider_name'] = 'LINEAR'
    item['provider_url'] = ''
    # doc catlog
    item['doc'] = ''
    item['catlog'] = ''
    # attr
    item['attr'] = []
    # rohs
    item['rohs'] = -1
    item['goods_other_name'] = ''
    # increment
    item['increment'] = 1
    # img
    item['goods_img'] = ''
    item['goods_thumb'] = ''
    # Some fields have to be fetched from linear.com.cn
    return handle_of_redirects(item)
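
A minimal invocation sketch for Example #3, assuming the surrounding module also defines `goods_sn_pattern`, `util`, `logger`, and `handle_of_redirects` (none of which are shown here), and using `requests` to fetch the page:

import requests

# Hypothetical detail-page URL; the real LINEAR URL pattern is not shown above.
resp = requests.get('http://www.linear.com/product/EXAMPLE')
result = _parse_detail_data(resp)
if result == -404:
    print('detail page could not be parsed:', resp.url)
else:
    print(result)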
Example #4
def _parse_detail_data(resp, headers=None, **kwargs):
    """
    解析详情数据,独立出来

    @param  data    页面数据
    @param  url     解析的页面url(方便记录异常)
    @param  kwargs  扩展参数
    """
    items = {'list': []}
    item = {}
    """解析系列型号数据"""
    # gpn
    pattern_gpn = re.compile(r'/product/([^/\?\.%&]+)')
    gpn = pattern_gpn.search(resp.url)
    if not gpn:
        logger.debug('status: -403; failed to parse product detail, url: %s', str(resp.url))
        return -403
    gpn = gpn.group(1)
    soup = BeautifulSoup(resp.text.encode('utf-8'), 'lxml')
    # family_sn
    item['family_sn'] = gpn.upper()
    item['product_id'] = item['family_sn']
    # category
    breadcrumb_div = soup.find('div', class_='breadcrumb')
    cat_log = []
    if breadcrumb_div:
        for a in breadcrumb_div.find_all('a'):
            if 'TI Home' in a.get_text(strip=True):
                continue
            cat_log.append([a.get_text(strip=True), a['href']])
    item['catlog'] = cat_log if cat_log else []
    # goods_img, goods_thumb
    img_div = soup.find('div', class_='image')
    img = img_div.img['src'] if img_div else ''
    item['goods_img'] = img
    item['goods_thumb'] = img
    # attr
    attr = []
    params_table = soup.find('table', id='paramsName')
    data_table = soup.find('table', id='parametricdata')
    if params_table and data_table:
        attr_params = params_table.find_all('td')[0:-1]
        attr_data = data_table.find_all('td', class_='on')[0:-1]
        pattern_blank = re.compile(r'\s+')
        for k, v in zip(attr_params, attr_data):
            k = pattern_blank.sub(' ', k.get_text(strip=True))
            v = pattern_blank.sub(' ', v.get_text(strip=True))
            attr.append([k, v])
    item['attr'] = attr
    # doc
    doc_url = soup.find('a', class_='local')
    item['doc'] = util.cleartext(doc_url.get('href')) if doc_url else ''
    # description
    desc = soup.find('h1', class_='productTitle')
    item['desc'] = desc.get_text(strip=True) if desc else ''
    for p in get_detail(gpn, **kwargs):
        item['goods_sn'] = p.get('goods_sn', '')
        if not item['goods_sn']:
            continue
        item['goods_name'] = p.get('goods_sn', '')
        item['goods_other_name'] = ''
        item['url'] = p.get('url', '')
        # item['doc'] = get_data_sheet(gpn, **kwargs)
        item['stock'] = p.get('stock', [0, 1])
        item['tiered'] = p.get('tiered', [[0, 0.0]])
        # Add the supplier brand
        item['provider_name'] = p.get('provider_name', '')
        item['provider_url'] = ''
        item['increment'] = 1
        item['rohs'] = -1
        items['list'].append(copy.copy(item))
    if not items['list']:
        logger.debug('status: -403; failed to parse product detail, url: %s', str(resp.url))
        return -403
    return items
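
Example #4 consumes a `get_detail(gpn, **kwargs)` helper that is not shown. Judging from the `p.get(...)` calls in the loop above, each element it yields looks like a per-SKU dict; a hypothetical illustration of that shape:

example_sku = {
    'goods_sn': 'EXAMPLE-SKU',
    'url': 'https://www.ti.com/product/EXAMPLE',
    'stock': [1500, 1],                   # assumed [stock count, minimum qty]
    'tiered': [[1, 1.25], [100, 0.98]],   # [[qty, unit price], ...]
    'provider_name': 'Example Brand',
}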