def parseProductPage(product, need_img_urls=False):
    """Visit the product detail page and add scraped fields to ``product``.

    When ``product['sku_id']`` is truthy, the page
    ``https://detail.1688.com/offer/<sku_id>.html`` is fetched through
    ``fetchContent`` and these keys are written into ``product``:

    * ``reviews``     - text of ``p.satisfaction-number > a > em.value``
    * ``star``        - ``class`` attribute of ``p.star-level > i``
    * ``total_sales`` - text of ``p.bargain-number > a > em.value``
    * ``img_urls``    - ``', '``-joined result of ``get_img_urls(content)``
      when ``need_img_urls`` is true, otherwise ``''``
    * ``color`` / ``size`` - text of the table cell that follows the
      ``de-feature`` label cell reading u'颜色' / u'尺寸'; ``''`` when no
      such label (or no following cell) exists

    :param product: dict describing the product; must contain ``'sku_id'``.
        It is modified in place.
    :param need_img_urls: whether to collect image URLs from the page.
    :returns: the same ``product`` dict (untouched when ``sku_id`` is falsy).
    :raises KeyError: if ``product`` has no ``'sku_id'`` key.
    """
    if product['sku_id']:
        content = fetchContent(
            "https://detail.1688.com/offer/" + product['sku_id'] + ".html")
        doc = PyQuery(content)

        # NOTE: 'delivery' (selector "div.cost-entries-type > p > em.value")
        # is not scraped: the shipping cost is filled in dynamically by
        # JavaScript, so it is absent from the fetched HTML.
        product['reviews'] = doc('p.satisfaction-number > a > em.value').text()
        product['star'] = doc('p.star-level > i').attr("class")
        product['total_sales'] = doc('p.bargain-number > a > em.value').text()

        if need_img_urls:
            product['img_urls'] = ', '.join(get_img_urls(content))
        else:
            product['img_urls'] = ''

        product['color'], product['size'] = '', ''

        # Run the selector once; the attribute table is laid out as
        # label cell / value cell pairs.
        cells = doc('div.obj-content > table > tbody > tr > td')
        label_to_key = {u'颜色': 'color', u'尺寸': 'size'}
        last_index = len(cells) - 1
        for index, td in enumerate(cells):
            if index >= last_index:
                # A label in the final cell has no value cell after it;
                # stop instead of indexing past the end.
                break
            tdQ = PyQuery(td)
            if tdQ.attr('class') != 'de-feature':
                continue
            key = label_to_key.get(tdQ.text().strip())
            if key is not None:
                product[key] = PyQuery(cells[index + 1]).text()
    return product
def parseProductPage(product, need_img_urls=False):
    """Visit the product detail page and add scraped fields to ``product``.

    When ``product['product_url']`` is truthy, that URL is fetched with
    ``fetchContent(url, False)`` and these keys are written into ``product``:

    * ``reviews``     - text of ``p.satisfaction-number > a > em.value``
    * ``star``        - ``class`` attribute of ``p.star-level > i``
    * ``total_sales`` - text of ``p.bargain-number > a > em.value``
    * ``img_urls``    - ``', '``-joined result of ``get_img_urls(content)``
      when ``need_img_urls`` is true, otherwise ``''``
    * ``color`` / ``size`` - text of the table cell that follows the
      ``de-feature`` label cell reading u'颜色' / u'尺寸'; ``''`` when no
      such label (or no following cell) exists
    * ``MOQ``         - ``extractNum`` of the first price-ladder amount
      (presumably the minimum order quantity - confirm with the caller)
    * ``sku_size``, ``sku_color``, ``sku_price``, ``sku_amount`` - only when
      ``MOQ == 1``; taken from the first spec operator / first SKU table row
      and also printed to stdout for inspection

    :param product: dict describing the product; must contain
        ``'product_url'`` (and ``'sku_id'`` when ``MOQ == 1``).
        It is modified in place.
    :param need_img_urls: whether to collect image URLs from the page.
    :returns: the same ``product`` dict (untouched when ``product_url`` is
        falsy).
    :raises KeyError: if a required key is missing from ``product``.
    """
    if product['product_url']:
        content = fetchContent(product['product_url'], False)
        doc = PyQuery(content)

        # NOTE: 'delivery' (selector "div.cost-entries-type > p > em.value")
        # is not scraped: the shipping cost is filled in dynamically by
        # JavaScript, so it is absent from the fetched HTML.
        product['reviews'] = doc('p.satisfaction-number > a > em.value').text()
        product['star'] = doc('p.star-level > i').attr("class")
        product['total_sales'] = doc('p.bargain-number > a > em.value').text()

        if need_img_urls:
            product['img_urls'] = ', '.join(get_img_urls(content))
        else:
            product['img_urls'] = ''

        product['color'], product['size'] = '', ''

        # Run the selector once; the attribute table is laid out as
        # label cell / value cell pairs.
        cells = doc('div.obj-content > table > tbody > tr > td')
        label_to_key = {u'颜色': 'color', u'尺寸': 'size'}
        last_index = len(cells) - 1
        for index, td in enumerate(cells):
            if index >= last_index:
                # A label in the final cell has no value cell after it;
                # stop instead of indexing past the end.
                break
            tdQ = PyQuery(td)
            if tdQ.attr('class') != 'de-feature':
                continue
            key = label_to_key.get(tdQ.text().strip())
            if key is not None:
                product[key] = PyQuery(cells[index + 1]).text()

        # First attempt: the dedicated first-ladder cell, e.g. u"≥1".
        ladder_text = doc('tr.amount > td.ladder-1-1 > span.value').text()
        product['MOQ'] = extractNum(ladder_text.replace(u"≥", ""))
        if not product['MOQ']:
            # Fallback: drop the title cell from the amount row and read the
            # first remaining cell.
            first_amount_cell = (doc('tr.amount')
                                 .remove('td.amount-title')
                                 .children('td')
                                 .eq(0))
            product['MOQ'] = extractNum(
                PyQuery(first_amount_cell)('span.value').text())

        if product['MOQ'] == 1:
            product['sku_size'] = PyQuery(
                doc('div.unit-detail-spec-operator').eq(0))('span.text').text()
            first_sku_row = PyQuery(doc('table.table-sku > tr').eq(0))
            product['sku_color'] = first_sku_row('td.name').text()
            product['sku_price'] = first_sku_row('td.price').text()
            product['sku_amount'] = first_sku_row(
                'td.count > span > em.value').text()
            # Single-argument print works on both Python 2 and 3; the format
            # reproduces what the old Python 2 print statement emitted for
            # `print a, '\t', b, ...` (a space before each tab, none after).
            print("%s \t%s \t%s \t%s \t%s" % (
                product['sku_id'],
                product['sku_size'],
                product['sku_color'],
                product['sku_price'],
                product['sku_amount'],
            ))
    return product
def parseProductPage(product, need_img_urls=False):
    """Visit the product detail page and add scraped fields to ``product``.

    When ``product["sku_id"]`` is truthy, the page
    ``https://detail.1688.com/offer/<sku_id>.html`` is fetched through
    ``fetchContent`` and these keys are written into ``product``:

    * ``reviews``     - text of ``p.satisfaction-number > a > em.value``
    * ``star``        - ``class`` attribute of ``p.star-level > i``
    * ``total_sales`` - text of ``p.bargain-number > a > em.value``
    * ``img_urls``    - ``", "``-joined result of ``get_img_urls(content)``
      when ``need_img_urls`` is true, otherwise ``""``
    * ``color`` / ``size`` - text of the table cell that follows the
      ``de-feature`` label cell reading u"颜色" / u"尺寸"; ``""`` when no
      such label (or no following cell) exists

    :param product: dict describing the product; must contain ``"sku_id"``.
        It is modified in place.
    :param need_img_urls: whether to collect image URLs from the page.
    :returns: the same ``product`` dict (untouched when ``sku_id`` is falsy).
    :raises KeyError: if ``product`` has no ``"sku_id"`` key.
    """
    if product["sku_id"]:
        page_url = "https://detail.1688.com/offer/" + product["sku_id"] + ".html"
        content = fetchContent(page_url)
        doc = PyQuery(content)

        # NOTE: "delivery" (selector "div.cost-entries-type > p > em.value")
        # is not scraped: the shipping cost is filled in dynamically by
        # JavaScript, so it is absent from the fetched HTML.
        product["reviews"] = doc("p.satisfaction-number > a > em.value").text()
        product["star"] = doc("p.star-level > i").attr("class")
        product["total_sales"] = doc("p.bargain-number > a > em.value").text()

        if need_img_urls:
            product["img_urls"] = ", ".join(get_img_urls(content))
        else:
            product["img_urls"] = ""

        product["color"], product["size"] = "", ""

        # Run the selector once; the attribute table is laid out as
        # label cell / value cell pairs.
        cells = doc("div.obj-content > table > tbody > tr > td")
        label_to_key = {u"颜色": "color", u"尺寸": "size"}
        cell_count = len(cells)
        for index, td in enumerate(cells):
            value_index = index + 1
            if value_index >= cell_count:
                # A label in the final cell has no value cell after it;
                # stop instead of indexing past the end.
                break
            tdQ = PyQuery(td)
            if tdQ.attr("class") != "de-feature":
                continue
            key = label_to_key.get(tdQ.text().strip())
            if key is not None:
                product[key] = PyQuery(cells[value_index]).text()
    return product