def parse_detail(self, resp):
    """Fill the detail-page fields of the item carried in the request meta.

    The partially built item is read from ``resp.request.meta['item']`` and
    updated in place with the values scraped from the response body.

    :param resp: response object exposing ``text``, ``url`` and
        ``request.meta``.
    :return: the updated item, or ``None`` when the request meta carries no
        ``'item'`` entry.
    """
    if 'item' not in resp.request.meta:
        return None

    root = lxml.html.fromstring(resp.text.encode('utf-8'))
    item = resp.request.meta.get('item')

    # goods_desc
    goods_desc = root.xpath('//td[@class="txt11"]/text()')
    item['goods_desc'] = goods_desc[0].replace('\n', '').replace(
        '\t', '') if goods_desc else ''

    # goods_name
    goods_name = root.xpath('//td[@class="lnk11b-colorOff"]')
    item['goods_name'] = util.clear_text(
        goods_name[0].text) if goods_name else ''

    # goods_sn
    match = self.goods_sn_pattern.search(resp.url)
    item['goods_sn'] = match.group(1) if match else ''

    # tiered: the cells alternate quantity / price.  Pairing them with zip()
    # ignores an unpaired trailing cell instead of raising IndexError.
    tiered = []
    price_cells = root.xpath('//td[@class="texttable"]')
    for qty_cell, price_cell in zip(price_cells[0::2], price_cells[1::2]):
        qty = util.intval(qty_cell.text_content())
        price = util.floatval(price_cell.text_content())
        if not (qty and price):
            # One unparsable row invalidates the whole price table.
            tiered = [[0, 0.00]]
            break
        tiered.append([qty, price])
    if not tiered:
        # No price-break table: fall back to the single highlighted price.
        single_price = root.xpath('//td[@class="txt18b-red"]/text()')
        single_price = util.floatval(single_price[0]) if single_price else 0
        if single_price:
            # Must be a list of [qty, price] pairs like every other value
            # stored under 'tiered' (the original stored a flat list here).
            tiered = [[1, single_price]]
    item['tiered'] = tiered if tiered else [[0, 0.00]]

    # stock: [available stock, order quantity]
    qty = root.xpath('//input[@id="qty"]/@value')
    qty = util.intval(qty[0]) if qty else 1
    stock = root.xpath('//input[@id="custcol7"]/@value')
    stock = util.intval(stock[0]) if stock else 0
    item['stock'] = [stock, qty]

    # url
    item['url'] = resp.url
    # provider_name
    item['provider_name'] = 'LINEAR'
    item['provider_url'] = ''
    # attr
    item['attr'] = []
    # rohs
    item['rohs'] = -1
    item['goods_other_name'] = ''
    # increment
    item['increment'] = 1
    # img
    item['goods_img'] = ''
    item['goods_thumb'] = ''
    return item
def parse_detail(self, resp):
    """Parse a product detail page into a ``GoodsItem``.

    :param resp: response object exposing ``text`` and ``url``.
    :return: the populated item; ``None`` when no product id can be
        extracted from the URL; or a new ``Request`` for the same URL when
        the page cannot be loaded into BeautifulSoup.
    """
    item = GoodsItem()
    try:
        soup = BeautifulSoup(resp.text.encode('utf-8'), 'lxml')
    except Exception:
        logger.debug(u"初始化BS4对象失败,重试一次 URL:{url}".format(url=resp.url))
        # Retry once.
        return Request(url=resp.url, headers=self.headers, cookies=self.cookies)

    # goods_sn
    product_id = (self.product_id_pattern_1.search(resp.url)
                  or self.product_id_pattern_2.search(resp.url))
    goods_sn = product_id.group(1) if product_id else ''
    item['goods_sn'] = goods_sn
    if not goods_sn:
        logger.debug(u"获取goods_sn失败 URL:{url}".format(url=resp.url))
        return None

    try:
        # goods_name: taken from the manufacturer-part-number line of the
        # <p class="ref"> block, which is split on <br/>.
        product_ref = soup.find('p', class_='ref')
        goods_name = ''
        if product_ref:
            # Same pattern as the former ur'' literal, spelled without the
            # Python-2-only prefix.
            goods_name_pattern = re.compile(
                u'<b>制造商零件编号:</b>\\s*([^"\'<>/]+)')
            for fragment in unicode(product_ref).split('<br/>'):
                match = goods_name_pattern.search(fragment)
                if match:
                    goods_name = match.group(1)
                    break
        item['goods_name'] = goods_name
        # goods_other_name
        item['goods_other_name'] = ''
    except Exception:
        logger.debug(u"获取goods_name失败 URL:{url}".format(url=resp.url))
        item['goods_name'] = ''
        item['goods_other_name'] = ''

    # goods_desc
    goods_desc = soup.find('p', class_='desc')
    if not goods_desc:
        logger.debug(u"获取goods_desc失败 URL:{url}".format(url=resp.url))
    item['goods_desc'] = goods_desc.get_text(
        strip=True) if goods_desc else ''

    # provider_name and provider_url
    logo = soup.find('img', id='ctl00_PlaceHolderMain_mfrLogo')
    provider_name = logo.get('title', '') if logo else ''
    if not provider_name:
        # The logo image gave no name: fall back to the <h2> in the
        # product-desc block, tolerating either element being absent.
        desc_div = soup.find('div', id='product-desc')
        heading = desc_div.find('h2') if desc_div else None
        provider_name = heading.get_text(strip=True) if heading else ''
    item['provider_name'] = provider_name
    item['provider_url'] = ''

    # url
    item['url'] = resp.url

    # doc
    doc = soup.find(
        'a',
        id='ctl00_PlaceHolderMain_csDownloadCenter_linkDatasheetUrlJustText'
    )
    item['doc'] = doc.get('href', '') if doc else ''

    # goods_img and goods_thumb
    goods_img = soup.find('img', id='previewedMEDImage')
    item['goods_img'] = goods_img.get('src', '') if goods_img else ''
    goods_thumb = soup.find('img', id='thumbnail-1')
    item['goods_thumb'] = goods_thumb.get('src', '') if goods_thumb else ''

    # catlog: [name, absolute url] for every breadcrumb link
    item['catlog'] = []
    catlog = soup.find('ul', id='breadcrumb-navigation')
    catlog_list = catlog.find_all('a') if catlog else []
    for a in catlog_list:
        breadcrumb_name = a.get_text(strip=True)
        breadcrumb_url = urlparse.urljoin(resp.url, a.get('href', ''))
        item['catlog'].append([breadcrumb_name, breadcrumb_url])

    # attr: "name:value" list entries
    item['attr'] = []
    product_attr_div = soup.find('div',
                                 id='product-details-overview-highlights')
    product_attr_list = product_attr_div.find_all(
        'li') if product_attr_div else []
    for li in product_attr_list:
        # partition() splits on the first colon only, so values that contain
        # a colon are kept whole; entries without a colon are skipped.
        attr_name, separator, attr_value = li.get_text(
            strip=True).partition(':')
        if not separator:
            continue
        item['attr'].append([attr_name, attr_value])

    # tiered: list of [qty, price] pairs, [[0, 0.00]] when unavailable
    tiered = []
    try:
        price_table = soup.find('table', class_='product-prices')
        for tr in price_table.find_all('tr', class_='price-break'):
            qty_th = tr.find('th')
            qty = qty_th.get_text(strip=True) if qty_th else 0
            qty = box.intval(qty)
            price_span = tr.find('span')
            price = price_span.get_text(strip=True) if price_span else 0.00
            price = box.floatval(price)
            if qty and price:
                tiered.append([qty, price])
            else:
                # One unparsable row invalidates the whole table; stop so
                # that later rows are not appended to the placeholder.
                tiered = []
                break
    except Exception:
        logger.debug(u"获取tiered失败 URL:{url}".format(url=resp.url))
        tiered = []
    item['tiered'] = tiered if tiered else [[0, 0.00]]

    # stock, increment, min_qty
    try:
        stock_div = soup.find('div', id='product-qty-content')
        stock_tr = stock_div.find('tr', class_='qtyInStock')
        increment_tr = stock_div.find('tr', class_='multipleOf')
        min_qty_tr = stock_div.find('tr', class_='minOrderQty')
        stock = stock_tr.find('td', class_='qty').get_text(
            strip=True) if stock_tr else 0
        stock = box.intval(stock)
        increment = increment_tr.find('td', class_='qty').get_text(
            strip=True) if increment_tr else 1
        increment = box.intval(increment)
        min_qty = min_qty_tr.find('td', class_='qty').get_text(
            strip=True) if min_qty_tr else 1
        min_qty = box.intval(min_qty)
        item['stock'] = [stock, min_qty]
        item['increment'] = increment
    except Exception:
        logger.debug(u"获取stock失败 URL:{url}".format(url=resp.url))
        item['stock'] = [0, 1]
        item['increment'] = 1

    # rohs
    rohs_div = soup.find('div', id='ctl00_PlaceHolderMain_imgRoHS')
    item['rohs'] = 1 if rohs_div else -1
    return item
def get_page_detail(url=None, category=None):
    """Yield one data dict per product listed on a catalogue page.

    The page is fetched with ``fetcher`` and every ``<li>`` inside the
    ``div.shengpin`` container that carries a ``data-gdsid`` attribute is
    turned into a dict.

    :param url: address of the listing page; also stored as ``data['url']``.
    :param category: value stored under ``data['category']`` (``[]`` when
        falsy).
    :return: generator of dicts with the keys ``goods_sn``, ``category``,
        ``goods_other_name``, ``goods_name``, ``goods_desc``, ``tiered``,
        ``stock``, ``url`` and ``goods_img``.  Nothing is yielded when the
        page cannot be fetched or has no product container.
    """
    try:
        response = fetcher(url=url, return_response=True)
        soup = BeautifulSoup(response.content, 'lxml')
    except Exception:
        print('Failed')
        # Without a parsed page there is nothing to iterate over; the
        # original fell through and raised NameError on ``soup``.
        return

    goods_div = soup.find('div', class_='shengpin')
    if not goods_div:
        return

    for goods in goods_div.find_all('li'):
        if not goods:
            continue
        # goods_sn: md5 of "<gdsid>-<PN2>"; entries without an id are skipped
        gid = goods.attrs.get('data-gdsid')
        if not gid:
            continue
        data = {}
        _sn = ('%s-%s' % (gid, PN2)).encode('utf-8')
        data['goods_sn'] = hashlib.md5(_sn).hexdigest()

        # category
        data['category'] = category if category else []

        # goods_other_name
        other_name_span = goods.find('span', class_='top_1')
        data['goods_other_name'] = other_name_span.get_text(
            strip=True) if other_name_span else ''

        # goods_name: text after the first colon of the second p_21 span
        goods_name = ''
        name_spans = goods.find_all('span', class_='p_21')
        if len(name_spans) > 1:
            name_parts = name_spans[1].get_text(strip=True).split(u':')
            if len(name_parts) > 1:
                goods_name = name_parts[1]
        # Always set the key so consumers never hit a missing 'goods_name'.
        data['goods_name'] = goods_name
        if name_spans:
            print(data['goods_name'])

        # goods_desc: text of the first p_21 span
        goods_type_span = goods.find('span', class_='p_21')
        data['goods_desc'] = goods_type_span.get_text(
            strip=True) if goods_type_span else ''

        # tiered: only cn_price is scraped; the other two prices stay 0.0
        price_span = goods.find('span', class_='p_23')
        hk_price = 0.0
        oversea_price = 0.0
        cn_price = box.floatval(price_span.get_text(
            strip=True)) if price_span else 0.00
        data['tiered'] = [[1, hk_price, cn_price, oversea_price]]

        # stock
        stock_span = goods.find('span', class_='p_24')
        data['stock'] = box.intval(stock_span.get_text(
            strip=True)) if stock_span else 0

        # url
        data['url'] = url
        # img
        data['goods_img'] = ''
        yield data
def parse_detail(self, resp):
    """Parse a part detail page into a ``GoodsItem``.

    The page layout is chosen from the number of ``table.partdetail``
    elements: one table (or any page containing the text
    "Product Change Notice") uses the single-table layout, two tables use
    the split stock/info layout.

    :param resp: response object exposing ``text`` and ``url``.
    :return: the populated item, or ``None`` when the part number cannot be
        read.  For any other table count the item is returned with only its
        default fields set.
    """

    def _parse_tiered(price_table):
        """Return [[qty, price], ...] from the first price-break table."""
        tiered = []
        if price_table:
            for tr in price_table[0].findall('tr'):
                tds = tr.findall('td')
                if len(tds) < 2:
                    # Row without a qty/price pair of <td> cells.
                    continue
                qty = util.intval(tds[0].text)
                price = util.floatval(tds[1].text, places=5)
                if price == 0 or qty == 0:
                    break
                tiered.append([qty, price])
        return tiered if tiered else [[0, 0.00]]

    item = GoodsItem()
    page = resp.text.encode('utf-8')
    root = lxml.html.fromstring(page)
    item.update({
        'goods_img': '',
        'goods_thumb': '',
        'provider_url': '',
        'attr': [],
        'catlog': [],
        'rohs': -1,
    })
    _table = root.xpath('//table[@class="partdetail"]')
    select_parse_mode = len(_table)
    flag = 'Product Change Notice' in page
    if not _table:
        # Nothing to parse.  Previously this raised IndexError when the
        # change-notice text was present but no table matched.
        # NOTE(review): the item is returned with defaults only, as before
        # for the no-table case - confirm callers expect that rather than None.
        return item

    if select_parse_mode == 1 or flag:
        detail_table = _table[0]
        info_table = detail_table.xpath('//table[@id="partinfo"]')
        goods_sn = info_table[0].xpath(
            './/td[@class="txtleft"]/h4/text()') if info_table else None
        if not goods_sn:
            return None
        item['goods_sn'] = goods_sn[0].strip()
        item['goods_name'] = item['goods_sn']
        # goods_other_name
        goods_other_name = info_table[0].xpath('.//tr[2]/td[2]/text()')
        item['goods_other_name'] = goods_other_name[0].strip(
        ) if goods_other_name else ''
        # provider_name
        provider_name = info_table[0].xpath('.//tr[3]/td[2]/text()')
        item['provider_name'] = provider_name[0].strip(
        ) if provider_name else ''
        # goods_desc
        goods_desc = info_table[0].xpath('.//tr[4]/td[2]/text()')
        item['goods_desc'] = goods_desc[0].strip() if goods_desc else ''
        # doc
        doc = info_table[0].xpath('.//tr[5]//h4/a/@href')
        item['doc'] = urlparse.urljoin(resp.url, doc[0]) if doc else ''
        # url
        item['url'] = resp.url
        # increment
        item['increment'] = 1
        # tiered
        item['tiered'] = _parse_tiered(
            detail_table.xpath('.//table[@class="price-break"]'))
        # stock
        available = detail_table.xpath('./tr[2]/td[2]/text()')
        stock = util.intval(available[0].strip()) if available else 0
        # qty
        quantity_cell = detail_table.xpath('./tr[2]/td[4]')
        if quantity_cell and quantity_cell[0].findall('input'):
            # NOTE(review): the leading '//' searches the whole document,
            # not just this cell - confirm that is intended.
            input_value = quantity_cell[0].xpath(
                '//input[@class="textbox"]/@value')
            # Convert to int; the original stored the raw xpath result list.
            quantity = util.intval(input_value[0]) if input_value else 1
        elif quantity_cell:
            quantity = util.intval(quantity_cell[0].text)
        else:
            quantity = 1
        item['stock'] = [stock, quantity]
    elif select_parse_mode == 2:
        stock_table = _table[0].xpath('./tr[2]/td')
        info_table = _table[1]
        if len(stock_table) < 7:
            # Cells 0..6 are read below; a shorter row cannot be parsed.
            return None
        # Strip before testing so a whitespace-only cell counts as missing.
        goods_sn = stock_table[0].text_content().strip()
        if not goods_sn:
            return None
        item['goods_sn'] = goods_sn
        item['goods_name'] = item['goods_sn']
        # url
        item['url'] = resp.url
        # tiered
        item['tiered'] = _parse_tiered(
            stock_table[5].xpath('.//table[@class="price-break"]'))
        # stock
        available = stock_table[1].text_content()
        stock = util.intval(available) if available.strip() else 0
        # qty
        quantity_cell = stock_table[6]
        if quantity_cell.findall('input'):
            # NOTE(review): the leading '//' searches the whole document,
            # not just this cell - confirm that is intended.
            input_value = quantity_cell.xpath(
                '//input[@class="textbox"]/@value')
            quantity = util.intval(input_value[0]) if input_value else 1
        else:
            # No input box: use the first price-break quantity as minimum.
            quantity = item['tiered'][0][0] if item['tiered'][0][0] != 0 else 1
        item['stock'] = [stock, quantity]
        # increment
        increment = stock_table[4].text_content()
        item['increment'] = util.intval(increment, index=999)
        # goods_other_name
        goods_other_name = info_table.xpath('./tr[3]/td[2]/text()')
        item['goods_other_name'] = goods_other_name[0].strip(
        ) if goods_other_name else ''
        # provider_name
        provider_name = info_table.xpath('./tr[4]/td[2]/text()')
        item['provider_name'] = provider_name[0].strip(
        ) if provider_name else ''
        # goods_desc
        goods_desc = info_table.xpath('./tr[5]/td[2]/text()')
        item['goods_desc'] = goods_desc[0].strip() if goods_desc else ''
        # doc
        doc = info_table.xpath('./tr[7]//a/@href')
        item['doc'] = urlparse.urljoin(resp.url, doc[0]) if doc else ''
        # rohs
        rohs = info_table.xpath('./tr[8]//img')
        item['rohs'] = 1 if rohs else -1
    return item
def parse_detail(self, resp):
    """Parse a product page and chain a search request for the missing data.

    :param resp: response object exposing ``text``, ``url`` and
        ``request.meta``.
    :return: a ``Request`` to the linear.com.cn search page carrying the
        parsed item in ``meta['item']``; a retry ``Request`` for the same
        URL when name or part number cannot be parsed on the first attempt;
        ``None`` when they are still missing on the retry.
    """
    item = GoodsItem()
    root = lxml.html.fromstring(resp.text.encode('utf-8'))

    # goods_name
    goods_name = root.xpath('//td[@class="lnk11b-colorOff"]')
    item['goods_name'] = util.clear_text(
        goods_name[0].text) if goods_name else ''
    # goods_sn
    match = self.goods_sn_pattern.search(resp.url)
    item['goods_sn'] = match.group(1) if match else ''
    if not item['goods_name'] or not item['goods_sn']:
        logger.debug(
            "无法解析goods_name和goods_sn URL:{url}".format(url=resp.url))
        if resp.request.meta.get('retry', None):
            return None
        # NOTE(review): this repeats an already-seen URL without
        # dont_filter=True - confirm the duplicate filter lets it through.
        return Request(url=resp.url,
                       headers=self.headers,
                       meta={'retry': 1})

    # goods_desc
    goods_desc = root.xpath('//td[@class="txt11"]/text()')
    item['goods_desc'] = goods_desc[0].replace('\n', '').replace(
        '\t', '') if goods_desc else ''

    # tiered: the cells alternate quantity / price.  Pairing them with zip()
    # ignores an unpaired trailing cell instead of raising IndexError.
    tiered = []
    price_cells = root.xpath('//td[@class="texttable"]')
    for qty_cell, price_cell in zip(price_cells[0::2], price_cells[1::2]):
        qty = util.intval(qty_cell.text_content())
        price = util.floatval(price_cell.text_content())
        if not (qty and price):
            # One unparsable row invalidates the whole price table.
            tiered = [[0, 0.00]]
            break
        tiered.append([qty, price])
    if not tiered:
        # No price-break table: fall back to the single highlighted price.
        single_price = root.xpath('//td[@class="txt18b-red"]/text()')
        single_price = util.floatval(single_price[0]) if single_price else 0
        if single_price:
            # Must be a list of [qty, price] pairs (the original stored a
            # flat [1, price] here).
            tiered = [[1, single_price]]
    item['tiered'] = tiered if tiered else [[0, 0.00]]

    # stock: [available stock, order quantity]
    qty = root.xpath('//input[@id="qty"]/@value')
    qty = util.intval(qty[0]) if qty else 1
    stock = root.xpath('//input[@id="custcol7"]/@value')
    stock = util.intval(stock[0]) if stock else 0
    item['stock'] = [stock, qty]

    # url
    item['url'] = resp.url
    # provider_name
    item['provider_name'] = 'LINEAR'
    item['provider_url'] = ''
    # doc catlog
    item['doc'] = ''
    item['catlog'] = ''
    # attr
    item['attr'] = []
    # rohs
    item['rohs'] = -1
    item['goods_other_name'] = ''
    # increment
    item['increment'] = 1
    # img
    item['goods_img'] = ''
    item['goods_thumb'] = ''

    # Some of the information has to be fetched from linear.com.cn.
    search_url = 'http://www.linear.com.cn/search/index.php?q={search}'.format(
        search=item['goods_name'])
    # Work on a copy: updating self.headers in place would leak the Host
    # override into every later request built from the shared headers.
    _headers = dict(self.headers)
    _headers.update({'Host': 'www.linear.com.cn'})
    return Request(url=search_url,
                   headers=_headers,
                   meta={
                       'item': item,
                       'dont_redirect': True,
                       'handle_httpstatus_list': [302]
                   },
                   callback=self.manual_handle_of_redirects)
def get_detail():
    """Parse the locally saved detail page ``html/detail3.html``.

    Offline counterpart of the spider's detail parser: the HTML is read
    from disk (a saved copy of ``target``) and parsed into a plain dict.
    The layout is chosen from the number of ``table.partdetail`` elements.

    :return: the populated dict, or ``None`` when the part number cannot be
        read.  For a table count other than 1 or 2 the dict is returned
        with only its default fields set.
    """

    def _parse_tiered(price_table):
        """Return [[qty, price], ...] from the first price-break table."""
        tiered = []
        if price_table:
            for tr in price_table[0].findall('tr'):
                tds = tr.findall('td')
                if len(tds) < 2:
                    # Row without a qty/price pair of <td> cells.
                    continue
                qty = util.intval(tds[0].text)
                price = util.floatval(tds[1].text, places=5)
                if price == 0 or qty == 0:
                    break
                tiered.append([qty, price])
        return tiered if tiered else [[0, 0.00]]

    # Only used to resolve relative document links; the page itself is read
    # from the saved file below.
    target = 'https://estore.heilind.com/2BA-AL-36/POM2BA-AL-36.html'
    with open('html/detail3.html', 'r') as fp:
        html = fp.read()
    root = lxml.html.fromstring(html)
    item = {
        'goods_img': '',
        'goods_thumb': '',
        'provider_url': '',
        'attr': [],
        'catlog': [],
        'rohs': -1,
    }
    _table = root.xpath('//table[@class="partdetail"]')
    select_parse_mode = len(_table)
    if select_parse_mode == 1:
        detail_table = _table[0]
        info_table = detail_table.xpath('//table[@id="partinfo"]')
        goods_sn = info_table[0].xpath(
            './/td[@class="txtleft"]/h4/text()') if info_table else None
        if not goods_sn:
            return None
        item['goods_sn'] = goods_sn[0].strip()
        item['goods_name'] = item['goods_sn']
        # goods_other_name
        goods_other_name = info_table[0].xpath('.//tr[2]/td[2]/text()')
        item['goods_other_name'] = goods_other_name[0].strip(
        ) if goods_other_name else ''
        # provider_name
        provider_name = info_table[0].xpath('.//tr[3]/td[2]/text()')
        item['provider_name'] = provider_name[0].strip(
        ) if provider_name else ''
        # goods_desc
        goods_desc = info_table[0].xpath('.//tr[4]/td[2]/text()')
        item['goods_desc'] = goods_desc[0].strip() if goods_desc else ''
        # doc
        doc = info_table[0].xpath('.//tr[5]//h4/a/@href')
        item['doc'] = urlparse.urljoin(target, doc[0]) if doc else ''
        # url
        item['url'] = ''
        # increment
        item['increment'] = 1
        # tiered
        item['tiered'] = _parse_tiered(
            detail_table.xpath('.//table[@class="price-break"]'))
        # stock
        available = detail_table.xpath('./tr[2]/td[2]/text()')
        stock = util.intval(available[0].strip()) if available else 0
        # qty
        quantity_cell = detail_table.xpath('./tr[2]/td[4]')
        if quantity_cell and quantity_cell[0].findall('input'):
            # NOTE(review): the leading '//' searches the whole document,
            # not just this cell - confirm that is intended.
            input_value = quantity_cell[0].xpath(
                '//input[@class="textbox"]/@value')
            # Convert to int; the original stored the raw xpath result list.
            quantity = util.intval(input_value[0]) if input_value else 1
        elif quantity_cell:
            quantity = util.intval(quantity_cell[0].text)
        else:
            quantity = 1
        item['stock'] = [stock, quantity]
    elif select_parse_mode == 2:
        stock_table = _table[0].xpath('./tr[2]/td')
        info_table = _table[1]
        if len(stock_table) < 7:
            # Cells 0..6 are read below; a shorter row cannot be parsed.
            return None
        # Strip before testing so a whitespace-only cell counts as missing.
        goods_sn = stock_table[0].text_content().strip()
        if not goods_sn:
            return None
        item['goods_sn'] = goods_sn
        item['goods_name'] = item['goods_sn']
        # url
        item['url'] = ''
        # tiered
        item['tiered'] = _parse_tiered(
            stock_table[5].xpath('.//table[@class="price-break"]'))
        # stock
        available = stock_table[1].text_content()
        stock = util.intval(available) if available.strip() else 0
        # qty
        quantity_cell = stock_table[6]
        if quantity_cell.findall('input'):
            # NOTE(review): the leading '//' searches the whole document,
            # not just this cell - confirm that is intended.
            input_value = quantity_cell.xpath(
                '//input[@class="textbox"]/@value')
            quantity = util.intval(input_value[0]) if input_value else 1
        else:
            # No input box: use the first price-break quantity as minimum.
            quantity = item['tiered'][0][0] if item['tiered'][0][0] != 0 else 1
        item['stock'] = [stock, quantity]
        # increment
        increment = stock_table[4].text_content()
        item['increment'] = util.intval(increment, index=999)
        # goods_other_name
        goods_other_name = info_table.xpath('./tr[3]/td[2]/text()')
        item['goods_other_name'] = goods_other_name[0].strip(
        ) if goods_other_name else ''
        # provider_name
        provider_name = info_table.xpath('./tr[4]/td[2]/text()')
        item['provider_name'] = provider_name[0].strip(
        ) if provider_name else ''
        # goods_desc
        goods_desc = info_table.xpath('./tr[5]/td[2]/text()')
        item['goods_desc'] = goods_desc[0].strip() if goods_desc else ''
        # doc
        doc = info_table.xpath('./tr[7]//a/@href')
        item['doc'] = urlparse.urljoin(target, doc[0]) if doc else ''
        # rohs
        rohs = info_table.xpath('./tr[8]//img')
        item['rohs'] = 1 if rohs else -1
    return item
def parse_detail(self, resp):
    """Parse a product page (identified by ``productId=`` in the URL).

    :param resp: response object exposing ``text``, ``url`` and
        ``request.meta``.
    :return: the populated ``GoodsItem``; a retry ``Request`` when the title
        or the detail ``div`` is missing; ``None`` when the URL has no
        product id or the detail ``div`` is still missing on the retry.
    """
    item = GoodsItem()
    root = lxml.html.fromstring(resp.text.encode('utf-8'))

    # goods_sn
    goods_sn_match = re.search(r'productId=(\d+)', resp.url)
    if not goods_sn_match:
        logger.debug(u"解析 goods_sn 失败,重试URL:{url}".format(url=resp.url))
        return None
    item['goods_sn'] = goods_sn_match.group(1)

    # goods_name, provider_name, goods_desc
    try:
        title = root.xpath('//span[@class="ContentTitle"]')[0]
        item['goods_name'] = util.cleartext(title.text)
        provider_name = title.xpath('a')
        item['goods_desc'] = title.text_content().strip(' ')
        item['provider_name'] = util.cleartext(
            provider_name[0].text) if provider_name else ''
        item['provider_url'] = ''
    except IndexError:
        logger.debug(u"解析 goods_name 失败,重试URL:{url}".format(url=resp.url))
        return Request(url=resp.url, headers=self.headers)

    # goods_other_name: default to '' and only overwrite on a match, so a
    # later non-matching bold span cannot erase an earlier hit.
    item['goods_other_name'] = ''
    for span in root.xpath('//span[@style="font-weight:bold;"]'):
        match = re.search(r'MFG\s*Part\s*Number:\s*([^\s]+)',
                          span.text or '', re.IGNORECASE)
        if match:
            item['goods_other_name'] = match.group(1)

    # url
    item['url'] = resp.url

    # catlog: breadcrumb links, minus the home / "all categories" entries
    item['catlog'] = []
    for catlog in root.xpath('//div[@class="breadcrumb"]//a'):
        hrefs = catlog.xpath('./@href')
        if not hrefs:
            # Anchor without an href cannot yield a URL.
            continue
        catlog_name = util.cleartext(catlog.text)
        catlog_url = util.urljoin(resp.url, hrefs[0])
        if catlog_name and catlog_url:
            if '/Pages/Home.aspx' in catlog_url or 'productCategory=All' in catlog_url:
                continue
            item['catlog'].append([catlog_name, catlog_url])

    # attr and tiered div
    div = root.xpath('//div[@id="div2"]')
    if not div:
        if resp.request.meta.get('retry'):
            # Still incomplete after the retry; the original went on to
            # raise IndexError on div[0].
            return None
        # Retry once when the div cannot be found.
        logger.debug(u'网页加载不完整。重试一次 URL:{url}'.format(url=resp.url))
        return Request(url=resp.url,
                       headers=self.headers,
                       meta={'retry': 1})
    detail_div = div[0]

    # rohs
    rohs_img = detail_div.xpath('.//img[contains(@title, "ROHS")]/@src')
    item['rohs'] = 1 if rohs_img else -1

    # img
    img_thumb = detail_div.xpath('.//table[@align="Right"]//img/@src')
    item['goods_thumb'] = util.urljoin(resp.url,
                                       img_thumb[0]) if img_thumb else ''
    img_large = detail_div.xpath(
        './/table[@align="Right"]//a[@id="imgFull"]/@href')
    item['goods_img'] = util.urljoin(resp.url,
                                     img_large[0]) if img_large else ''

    # attr: the cells alternate key / value
    item['attr'] = []
    try:
        attr_table = detail_div.xpath(
            './/td[@align="left"]//table[@class="PDTable"]//td')
        for x in range(0, len(attr_table), 2):
            attr_key = attr_table[x].text
            attr_value = attr_table[x + 1].text
            if not attr_key:
                break
            attr_key = attr_key.strip(' ')
            attr_value = attr_value.strip(' ') if attr_value else ''
            if attr_value:
                item['attr'].append([attr_key, attr_value])
    except IndexError:
        logger.debug(u"无法查找到属性列表 URL:{url}".format(url=resp.url))

    # tiered / stock: single-cell rows carry stock figures (or the
    # "Quote Required" marker), two-cell rows carry a qty / price pair.
    item['tiered'] = []
    stock = []
    price_table = detail_div.xpath(
        './/td[@align="center"]//table[@class="PDTable"]/tr')
    for tr in price_table:
        td = tr.findall('td')
        if len(td) == 1:
            if "Quote Required" in (td[0].text or ''):
                item['tiered'] = [[0, 0.00]]
                break
            stock.append(util.intval(td[0].text))
        elif len(td) == 2:
            qty = util.intval(td[0].text)
            price = util.floatval(td[1].text)
            if price:
                item['tiered'].append([qty, price])
    if item['tiered']:
        # Manufacturer stock may not be shown in the table; pad with 0 so
        # the list always has a stock and a manufacturer-stock entry.
        while len(stock) < 2:
            stock.append(0)
        # Minimum order quantity comes from the first price break.
        min_qty = item['tiered'][0][0] if item['tiered'][0][0] else 1
        stock.insert(1, min_qty)
        item['stock'] = stock
    else:
        logger.debug(u"无法正确解析价格列表 URL:{url}".format(url=resp.url))
        item['stock'] = [0, 1, 0]
        item['tiered'] = [[0, 0.00]]

    # doc
    doc_link = root.xpath('//a[@id="docDown"]/@href')
    item['doc'] = doc_link[0] if doc_link else ''
    # increment
    item['increment'] = 1
    return item