def parse_price(self, resp):
    """Parse stock/price data for the items carried in the request meta."""
    items = resp.meta.get('items')
    if not items:
        logger.error('request meta data error, url: %s', resp.url)
        return
    prices = {}
    try:
        data = json.loads(resp.body)
        for entprice in data['EntitledPrice']:
            tiered = []
            for vo in entprice.get('RangePrice', []):
                qty = util.intval(vo['minimumQuantity']['value']) if 'minimumQuantity' in vo else 1
                price = util.floatval(vo['priceInRange']['value']) if 'priceInRange' in vo else 0
                # Skip malformed rows and rows that would break the ascending quantity order.
                if not qty or (tiered and qty < tiered[-1][0]):
                    continue
                tiered.append([qty, price])
            if not tiered:
                tiered.append([0, 0.0])
            prices[entprice['productId']] = tiered
    except Exception:
        logger.exception('parse stock price error, url: %s---price_Json_error---%s', resp.url, resp.body)
    for item in items:
        if item['goods_sn'] in prices:
            item['tiered'] = prices[item['goods_sn']]
        yield item
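# A minimal sketch (not part of the original spider) of how the price request
# feeding parse_price might be issued: the item batch rides along in
# Request.meta under the 'items' key that parse_price reads back. The helper
# name and price_url parameter are assumptions for illustration only.
def request_prices_sketch(self, items, price_url):
    return Request(
        url=price_url,                 # hypothetical pricing endpoint
        meta={'items': items},         # parse_price expects this key
        callback=self.parse_price,
    )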
def parse_model_detail(self, response):
    """Parse the product detail page."""
    json_html = re.findall(
        r'<script type="application/ld\+json">(.*?)</script>',
        response.body, re.S)
    if not json_html:
        raise DropItem('unexpected page source, please check: {0}'.format(response.url))
    json_data = json.loads(json_html[0])
    product_list = json_data['offers']
    pre_url = 'https://www.ti.com.cn/product/cn/{}'.format(json_data['mpn'])
    description = json_data['description']
    doc_url = urljoin(
        self.base_url,
        response.xpath('//div/a[@data-navtitle="data sheet"]/@href').extract_first())
    attrs_items = response.xpath('//ti-multicolumn-list/ti-multicolumn-list-row')
    # Collect the attribute list.
    attr_list = []
    for attrs_item in attrs_items:
        attr = attrs_item.xpath('./ti-multicolumn-list-cell/span/text()').extract()
        if not attr:
            continue
        key = util.cleartext(attr[0])
        val = util.cleartext(attr[1])
        if key and val:
            attr_list.append((key, val))
    # Collect the category breadcrumb (skip the leading home entry).
    cat_list = []
    cat_items = response.xpath('//ti-breadcrumb/ti-breadcrumb-section/a')[1:]
    for cat_item in cat_items:
        ckey = util.cleartext(cat_item.xpath('./text()').extract_first())
        cval = urljoin(self.base_url, cat_item.xpath('./@href').extract_first())
        cat_list.append((ckey, cval))
    for data in product_list:
        item = GoodsItem()
        data = data['itemOffered']
        item['url'] = pre_url
        item['goods_sn'] = data['sku']
        item['goods_other_name'] = item['goods_name'] = data['mpn']
        item['provider_name'] = data['brand']
        item['provider_url'] = ''
        item['goods_desc'] = description
        item['goods_img'] = item['goods_thumb'] = ''
        item['doc'] = doc_url
        item['rohs'] = 0
        shop_price = data['offers'].get('price')
        item['tiered'] = []
        if not shop_price:
            item['stock'] = [0, 1]  # [stock, min_qty]
            item['increment'] = 1
        else:
            # Stock check.
            if not data['offers'].get('inventoryLevel'):
                item['stock'] = [0, 1]
            else:
                item['stock'] = [util.intval(data['offers']['inventoryLevel']), 1]
            for price_item in data['offers']['priceSpecification']:
                pnum = price_item['eligibleQuantity']['minValue']
                pval = price_item['price']
                item['tiered'].append((util.intval(pnum), util.floatval(pval)))
            # Guard against an empty priceSpecification list before indexing.
            item['increment'] = item['tiered'][0][0] if item['tiered'] else 1
        if not item['tiered']:
            item['tiered'] = [[0, 0.00]]
        # Attributes.
        item['attr'] = attr_list
        # Categories.
        item['catlog'] = cat_list
        yield item
def _parse_detail_data(resp, headers=None, **kwargs):
    """
    Parse detail-page data (factored out).
    @param resp     response whose body is parsed (resp.url kept for error logging)
    @param headers  optional request headers
    @param kwargs   extra arguments
    """
    item = {}
    root = lxml.html.fromstring(resp.text.encode('utf-8'))
    # goods_name
    goods_name = root.xpath('//td[@class="lnk11b-colorOff"]')
    item['goods_name'] = util.cleartext(goods_name[0].text) if goods_name else ''
    # goods_sn
    match = goods_sn_pattern.search(resp.url)
    item['goods_sn'] = match.group(1) if match else ''
    if not item['goods_name'] or not item['goods_sn']:
        logger.debug("cannot parse goods_name/goods_sn URL:{url}".format(url=resp.url))
        return -404
    # goods_desc
    goods_desc = root.xpath('//td[@class="txt11"]/text()')
    item['desc'] = util.cleartext(goods_desc[0], '\n', '\t') if goods_desc else ''
    # tiered
    tiered = []
    price_list = root.xpath('//td[@class="texttable"]')
    for x in range(0, len(price_list), 2):
        qty = util.intval(price_list[x].text_content())
        price = util.floatval(price_list[x + 1].text_content())
        if qty and price:
            tiered.append([qty, price])
        else:
            tiered = [[0, 0.00]]
            break
    if not tiered:
        price = root.xpath('//td[@class="txt18b-red"]/text()')
        price = util.floatval(price[0]) if price else 0
        # A single list price becomes a one-entry tier; keep the nesting consistent.
        tiered = [[1, price]] if price else []
    item['tiered'] = tiered if tiered else [[0, 0.00]]
    # stock
    qty = root.xpath('//input[@id="qty"]/@value')
    qty = util.intval(qty[0]) if qty else 1
    stock = root.xpath('//input[@id="custcol7"]/@value')
    stock = util.intval(stock[0]) if stock else 0
    item['stock'] = [stock, qty]
    # url
    item['url'] = resp.url
    # provider
    item['provider_name'] = 'LINEAR'
    item['provider_url'] = ''
    # doc / catlog / attr / rohs defaults
    item['doc'] = ''
    item['catlog'] = ''
    item['attr'] = []
    item['rohs'] = -1
    item['goods_other_name'] = ''
    # increment
    item['increment'] = 1
    # images
    item['goods_img'] = ''
    item['goods_thumb'] = ''
    # Some fields must be fetched from linear.com.cn.
    return handle_of_redirects(item)
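# The parsers in this module lean on util.intval / util.floatval to coerce
# scraped strings ("1,000", "$1.25 (USD)") into numbers. A minimal sketch of
# the assumed behavior, for readers without access to the util module -- the
# real implementation may differ:
import re as _re

def intval_sketch(text, default=0):
    """Extract the first integer from text, ignoring thousands separators."""
    m = _re.search(r'\d[\d,]*', str(text or ''))
    return int(m.group(0).replace(',', '')) if m else default

def floatval_sketch(text, default=0.0):
    """Extract the first decimal number from text, ignoring separators."""
    m = _re.search(r'\d[\d,]*(?:\.\d+)?', str(text or ''))
    return float(m.group(0).replace(',', '')) if m else default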
def parse_detail(self, resp):
    """Parse the detail page for a series part number."""
    item = GoodsItem()
    try:
        soup = BeautifulSoup(resp.text.encode('utf-8'), 'lxml')
    except Exception as e:
        logger.debug(u"failed to build BS4 tree, retrying once URL:{url}".format(url=resp.url))
        # Retry once.
        return Request(url=resp.url, headers=self.headers, cookies=self.cookies)
    # goods_sn
    product_id = self.product_id_pattern_1.search(resp.url) or self.product_id_pattern_2.search(resp.url)
    goods_sn = product_id.group(1) if product_id else ''
    item['goods_sn'] = goods_sn
    if not goods_sn:
        logger.debug(u"failed to get goods_sn URL:{url}".format(url=resp.url))
        return None
    try:
        # goods_name: the page label reads "制造商零件编号" (manufacturer part number).
        product_ref = soup.find('p', class_='ref')
        goods_name = ''
        if product_ref:
            goods_name_pattern = re.compile(ur'<b>制造商零件编号:</b>\s*([^\"\'<>/]+)')
            product_ref_list = unicode(product_ref).split('<br/>')
            for x in product_ref_list:
                match = goods_name_pattern.search(x)
                if match:
                    goods_name = match.group(1)
                    break
        item['goods_name'] = goods_name
        # goods_other_name
        item['goods_other_name'] = ''
    except:
        logger.debug(u"failed to get goods_name URL:{url}".format(url=resp.url))
        item['goods_name'] = ''
        item['goods_other_name'] = ''
    # goods_desc
    goods_desc = soup.find('p', class_='desc')
    if not goods_desc:
        logger.debug(u"failed to get goods_desc URL:{url}".format(url=resp.url))
    item['goods_desc'] = goods_desc.get_text(strip=True) if goods_desc else ''
    # provider_name and provider_url
    provider_name = soup.find('img', id='ctl00_PlaceHolderMain_mfrLogo')
    item['provider_name'] = provider_name.get('title', '') if provider_name else ''
    # If the brand logo yields no provider_name, fall back to product-desc.
    if not item['provider_name']:
        desc_div = soup.find('div', id='product-desc')
        provider_name = desc_div.find('h2') if desc_div else None
        item['provider_name'] = provider_name.get_text(strip=True) if provider_name else ''
    item['provider_url'] = ''
    # url
    item['url'] = resp.url
    # doc
    doc = soup.find('a', id='ctl00_PlaceHolderMain_csDownloadCenter_linkDatasheetUrlJustText')
    item['doc'] = doc.get('href', '') if doc else ''
    # goods_img and goods_thumb
    goods_img = soup.find('img', id='previewedMEDImage')
    item['goods_img'] = goods_img.get('src', '') if goods_img else ''
    goods_thumb = soup.find('img', id='thumbnail-1')
    item['goods_thumb'] = goods_thumb.get('src', '') if goods_thumb else ''
    # catlog
    item['catlog'] = []
    catlog = soup.find('ul', id='breadcrumb-navigation')
    catlog_list = catlog.find_all('a') if catlog else []
    for a in catlog_list:
        breadcrumb_name = a.get_text(strip=True)
        breadcrumb_url = util.urljoin(resp.url, a.get('href', ''))
        item['catlog'].append([breadcrumb_name, breadcrumb_url])
    # attr
    item['attr'] = []
    product_attr_div = soup.find('div', id='product-details-overview-highlights')
    product_attr_list = product_attr_div.find_all('li') if product_attr_div else []
    for li in product_attr_list:
        attr_parts = li.get_text(strip=True).split(':')
        if len(attr_parts) == 2:
            item['attr'].append([attr_parts[0], attr_parts[1]])
    # tiered
    try:
        item['tiered'] = []
        price_table = soup.find('table', class_='product-prices')
        price_tr_list = price_table.find_all('tr', class_='price-break')
        for tr in price_tr_list:
            qty_th = tr.find('th')
            qty = qty_th.get_text(strip=True) if qty_th else 0
            qty = util.intval(qty)
            price_span = tr.find('span')
            price = price_span.get_text(strip=True) if price_span else 0.00
            price = util.floatval(price)
            if qty and price:
                item['tiered'].append([qty, price])
            else:
                item['tiered'] = [[0, 0.00]]
                break
    except:
        logger.debug(u"failed to get tiered URL:{url}".format(url=resp.url))
        item['tiered'] = [[0, 0.00]]
    # stock, increment, min_qty
    try:
        stock_div = soup.find('div', id='product-qty-content')
        stock_tr = stock_div.find('tr', class_='qtyInStock')
        increment_tr = stock_div.find('tr', class_='multipleOf')
        min_qty_tr = stock_div.find('tr', class_='minOrderQty')
        stock = stock_tr.find('td', class_='qty').get_text(strip=True) if stock_tr else 0
        stock = util.intval(stock)
        increment = increment_tr.find('td', class_='qty').get_text(strip=True) if increment_tr else 1
        increment = util.intval(increment)
        min_qty = min_qty_tr.find('td', class_='qty').get_text(strip=True) if min_qty_tr else 1
        min_qty = util.intval(min_qty)
        item['stock'] = [stock, min_qty]
        item['increment'] = increment
    except:
        logger.debug(u"failed to get stock URL:{url}".format(url=resp.url))
        item['stock'] = [0, 1]
        item['increment'] = 1
    # rohs
    rohs_div = soup.find('div', id='ctl00_PlaceHolderMain_imgRoHS')
    item['rohs'] = 1 if rohs_div else -1
    return item
def get_detail(gpn=None, **kwargs):
    data = dict()
    if not gpn:
        yield data
        return
    url = "http://www.ti.com/product/%s/samplebuy" % gpn
    try:
        proxies = kwargs.get('proxies')
        html = requests.get(url=url, headers=default_headers, timeout=30, proxies=proxies)
        if 'Page not found' in html.content:
            return
    except:
        return
    if html.status_code != 200:
        return
    soup = BeautifulSoup(html.content, "lxml")
    # category
    breadcrumb_div = soup.find('div', class_='breadcrumb')
    breadcrumb_div = breadcrumb_div.find_all('a') if breadcrumb_div else []
    cat_log = []
    for a in breadcrumb_div:
        if 'TI Home' in a.get_text(strip=True):
            continue
        cat_log.append([a.get_text(strip=True), a['href']])
    data['catlog'] = cat_log if cat_log else []
    # goods_img, goods_thumb
    img_div = soup.find('div', class_='image')
    img = img_div.img['src'] if img_div else ''
    data['goods_img'] = img
    data['goods_thumb'] = img
    # parts table
    table = soup.find('table', id='tblBuy')
    # Some GPN product groups carry no part list; yield defaults and stop.
    if not table:
        data['goods_sn'] = gpn
        data['tiered'] = [[0, 0.00]]
        data['stock'] = [0, 1]
        yield data
        return
    body_div = table.tbody
    # Stop if the part list body cannot be located.
    if not body_div:
        return
    ths = table.find_all('th')
    th_td = dict()
    for idx, th in enumerate(ths):
        text = th.get_text(strip=True)
        if 'Part' in text:
            th_td['PartNum'] = idx
        if 'Price' in text:
            th_td['Price'] = idx
        if 'Inventory' in text:
            th_td['Inventory'] = idx
    tds = body_div.find_all('td')
    step = len(ths)
    tr = [tds[x:x + step] for x in range(0, len(tds), step)]
    total_parts = len(tr)
    for index, td in enumerate(tr):
        logger.info("GPN:%s has %d parts in total, crawling part %d." % (gpn.encode('utf-8'), total_parts, index + 1))
        # tiered
        price = th_td.get('Price')
        pattern_price = re.compile(r'\s*(\d+\.\d+)\s*\|\s*(\d+)ku\s*')
        if td[price].script:
            td[price].script.extract()
        tiered = pattern_price.search(td[price].get_text())
        if tiered:
            price = tiered.group(1)
            qty = int(tiered.group(2)) * 1000
            data['tiered'] = [[util.intval(qty), util.floatval(price)]]
        else:
            data['tiered'] = [[0, 0.00]]
        # goods_sn
        part_num = th_td.get('PartNum')
        data['goods_sn'] = ''
        for x in td[part_num].contents:
            if x.name == 'script':
                continue
            elif x.name == 'a':
                data['goods_sn'] = str(x.string).strip()
                # data['tiered'] = get_tiered(x['href'], **kwargs)
                stock, tiered = get_stock(data['goods_sn'], x['href'], **kwargs)
                data['tiered'] = tiered
                data['url'] = x['href']
                data['provider_name'] = 'TI'
                data['stock'] = [util.intval(stock), 1]
            elif x.string and str(x.string).strip():
                data['goods_sn'] = str(x.string).strip()
                data['stock'] = [0, 1]
                data['provider_name'] = ''
                # data['url'] = "https://store.ti.com/%s.aspx" % data['goods_sn']
                data['url'] = "http://www.ti.com/product/%s" % gpn
        yield data
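# Usage sketch (assumed, not part of the original module): get_detail is a
# generator that yields one dict per part row; the proxies kwarg is passed
# straight through to requests.get. The GPN below is only an example.
if __name__ == '__main__':
    for part in get_detail(gpn='TPS54331'):  # hypothetical GPN
        print part.get('goods_sn'), part.get('tiered')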
def add_to_cart(url, only_session, **kwargs):
    form_data = {
        "ctl00$ctl00$ScriptManager1": "ctl00$ctl00$NestedMaster$PageContent$ctl00$BuyProductDialog1$BuyProductPanel|ctl00$ctl00$NestedMaster$PageContent$ctl00$BuyProductDialog1$btnBuyPaid",
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "__VIEWSTATE": "",
        "__VIEWSTATEGENERATOR": "",
        "__VIEWSTATEENCRYPTED": "",
        "ctl00$ctl00$NestedMaster$PageHeader$StoreHeader_H$SearchPhrase": "",
        "ctl00$ctl00$NestedMaster$PageHeader$StoreHeader_H$hiLastHeaderAction": "none",
        "ctl00$ctl00$NestedMaster$PageHeader$StoreHeader_H$hiSearchFilterValue": "none",
        "__ASYNCPOST": "true",
        "ctl00$ctl00$NestedMaster$PageContent$ctl00$BuyProductDialog1$btnBuyPaid": "Buy",
    }
    proxies = kwargs.get('proxies')
    try:
        stock_page = only_session.get(url=url, proxies=proxies)
    except:
        return 0
    if stock_page.status_code == 200:
        soup = BeautifulSoup(stock_page.content, 'lxml')
        # The ASP.NET form state must be echoed back for the POST to be accepted.
        view_state = soup.find('input', id="__VIEWSTATE")
        form_data['__VIEWSTATE'] = view_state.get('value', '') if view_state else ''
        view_state_generator = soup.find('input', id="__VIEWSTATEGENERATOR")
        form_data['__VIEWSTATEGENERATOR'] = view_state_generator.get('value', '') if view_state_generator else ''
        # tiered
        tiered = []
        table = soup.find(
            'table',
            id='ctl00_ctl00_NestedMaster_PageContent_ctl00_BuyProductDialog1_PricingTierList')
        if table:
            for tr in table.find_all('tr')[1:]:
                tds = tr.find_all('td')
                qty = tds[0].get_text(strip=True)
                price = tds[1].get_text(strip=True)
                tiered.append([util.intval(qty), util.floatval(price)])
        else:
            tiered = [[0, 0.00]]
    else:
        return 0
    # Submit the add-to-cart POST; its response body is not used further.
    try:
        resp = only_session.post(url=url, data=form_data, proxies=proxies)
    except:
        return 0
    # print resp.content
    return tiered
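# Usage sketch (assumed, not from the original source): add_to_cart expects a
# live requests.Session so the ASP.NET cookies set by the GET carry over into
# the POST. The product URL is a placeholder; the return value is 0 on any
# network/HTTP failure, otherwise the parsed tier list.
import requests

def fetch_tiered_via_cart(product_url, proxies=None):
    session = requests.Session()
    tiered = add_to_cart(product_url, session, proxies=proxies)
    return tiered if tiered else [[0, 0.00]]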
def _parse_tool_detail(resp, **kwargs):
    items = {'list': []}
    item = {}
    pattern_gpn = re.compile(r'/tool/([^/\?\.%]+)')
    # gpn
    gpn = pattern_gpn.search(resp.url).group(1)
    try:
        soup = BeautifulSoup(resp.text.encode('utf-8'), 'lxml')
    except:
        logger.exception('Parse Error Product URL: %s' % resp.url)
        return -403
    # category
    breadcrumb_div = soup.find('div', class_='breadcrumbs')
    cat_log = []
    if breadcrumb_div:
        for a in breadcrumb_div.find_all('a'):
            if 'TI Home' in a.get_text(strip=True):
                continue
            cat_log.append([a.get_text(strip=True), a['href']])
    item['catlog'] = cat_log if cat_log else []
    # Defaults.
    item['provider_name'] = ''
    item['provider_url'] = ''
    item['increment'] = 1
    item['rohs'] = -1
    item['attr'] = []
    item['doc'] = ''
    item['url'] = resp.url
    item['goods_img'] = ''
    item['goods_thumb'] = ''
    # parts table
    table = soup.find('table', attrs={'class': 'tblstandard'})
    if not table:
        logger.error('No Product in URL: %s' % resp.url)
        return
    trs = table.find_all('tr')[1:]
    for tr in trs:
        # goods_sn:description
        if 'Contact a Distributor' in tr.get_text(strip=True):
            break
        try:
            part = tr.find('h2').get_text(strip=True).split(':')
        except:
            desc = soup.find('h1', id="mainHeader")
            desc = desc.get_text(strip=True) if desc else ''
            part = [gpn, desc]
        item['goods_sn'] = part[0]
        item['goods_name'] = part[0]
        item['goods_other_name'] = ''
        item['desc'] = part[1] if len(part) > 1 else ''
        # price
        price = re.search(r'\$(\d+\.?\d+)\(USD\)', tr.get_text(strip=True))
        price = util.floatval(price.group(1)) if price else 0.00
        item['provider_name'] = 'TI' if price else ''
        item['tiered'] = [[1, price]] if price else [[0, 0.00]]
        # Quote required: the page exposes no stock figure.
        item['stock'] = [0, 1]
        items['list'].append(copy.copy(item))
    if not items['list']:
        logger.debug('status: -403; failed to parse product detail, url: %s', str(resp.url))
        return -403
    return items
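# The parsers in this module report failure with sentinel codes (-400/-403/-404)
# rather than exceptions. A caller sketch (assumed, not from the original
# source) showing how that convention is consumed:
def handle_tool_detail(resp, **kwargs):
    result = _parse_tool_detail(resp, **kwargs)
    if not isinstance(result, dict):
        # -403, None, etc.: parsing failed; the caller may retry or drop the URL.
        return []
    return result['list']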
def _parse_detail_data(resp, headers=None, **kwargs):
    """
    Parse detail-page data (factored out).
    @param resp     response whose body is parsed (resp.url kept for error logging)
    @param headers  optional request headers
    @param kwargs   extra arguments
    """
    item = {}
    try:
        soup = BeautifulSoup(resp.text, 'lxml')
    except Exception as e:
        logger.debug('failed to build detail page tree URL: %s ERROR: %s',
                     resp.url, util.traceback_info(e))
        return -404
    # goods_sn
    url_path_list = resp.url.split('/')
    goods_sn_pattern = re.compile(r'.*-\d{19}')
    for path in url_path_list[::-1]:
        if goods_sn_pattern.findall(path):
            item['goods_sn'] = path
            break
    if not item.get('goods_sn', False):
        logger.debug("cannot parse goods_sn from URL: {url}".format(url=resp.url))
        return -400
    # goods_name
    goods_info_div = soup.find('div', class_='section-left')
    item['goods_name'] = goods_info_div.find('h1').get_text(strip=True) if goods_info_div else item['goods_sn']
    # url
    item['url'] = resp.url
    # goods_img / goods_thumb
    img_div = soup.find('div', id="outer-div1")
    img = img_div.find('img') if img_div else None
    item['goods_img'] = util.urljoin(resp.url, img.get('src')) if img else ''
    item['goods_thumb'] = item['goods_img']
    # desc
    desc_p = soup.find('p', class_='RB-pdp_short_Desc')
    item['desc'] = desc_p.get_text(strip=True) if desc_p else ''
    # provider
    item['provider_name'] = "AVNET"
    item['provider_url'] = ''
    # attr: [[name, value]]
    attr = []
    attr_body = soup.find('div', id="techAttr")
    attr_div = attr_body.find_all('div', class_='pdpDescriptionsBodyContent') if attr_body else []
    for content in attr_div:
        att_name = content.find('div', class_='pdpDescriptionColumn')
        attr_value = content.find('div', class_='pdpValueColumn')
        if att_name and attr_value:
            attr.append([att_name.get_text(strip=True), attr_value.get_text(strip=True)])
    item['attr'] = attr
    # tiered: [[qty, price]]
    tiered = []
    tiered_span = soup.find_all('span', class_='usdpart1')
    for span in tiered_span:
        qty_span = span.find('span', class_='pdpTierMinQty')
        qty = qty_span.get_text(strip=True) if qty_span else 0
        price_p = span.find('p')
        price = price_p.get_text(strip=True) if price_p else 0.00
        if qty and price:
            tiered.append([util.intval(qty), util.floatval(price)])
        else:
            tiered = [[0, 0.00]]
            break
    item['tiered'] = tiered if tiered else [[0, 0.00]]
    # stock: [stock, min_qty]
    stock_input = soup.find('input', id='inStock')
    stock = util.intval(stock_input.get('value')) if stock_input else 0
    min_qty_input = soup.find('input', attrs={'name': 'min'})
    min_qty = util.intval(min_qty_input.get('value')) if min_qty_input else 1
    item['stock'] = [stock, min_qty] if stock else [0, 1]
    # increment
    multi_input = soup.find('input', attrs={'name': 'mult'})
    item['increment'] = util.intval(multi_input.get('value')) if multi_input else 1
    # doc
    doc_div = soup.find('div', class_='pdfcontent')
    doc_url = doc_div.find('a', class_='datasheet_align') if doc_div else None
    item['doc'] = doc_url.get('href') if doc_url else ''
    # rohs: -1
    rohs_div = soup.find('div', class_='leafcontent')
    item['rohs'] = 1 if rohs_div else -1
    # catlog: [[name, url]]
    catlog = []
    nav = soup.find('nav', class_='breadcrumb')
    if nav is not None:
        for a in nav.find_all('a'):
            cat_name = a.get_text(strip=True)
            cat_url = util.urljoin(resp.url, a.get('href'))
            if cat_name and cat_url:
                catlog.append([cat_name, cat_url])
    item['catlog'] = catlog
    # goods_other_name
    item['goods_other_name'] = ''
    # product_id
    # family_sn
    return item
def parse_detail(self, resp):
    """Parse the part records for a series."""
    # with open('1.html', 'w') as fp:
    #     fp.write(resp.text.encode('utf-8'))
    systems_catalog = resp.meta.get('systemsCatalog')
    try:
        product_dict = json.loads(resp.text.encode('utf-8'))
        # Product records on the current page.
        item_list = product_dict.get('parts').get('records', [])
        for it in item_list:
            # A fresh item per record, so previously yielded items are not mutated.
            item = GoodsItem()
            # Part identifiers.
            item['goods_sn'] = it.get('partsNumber', '')
            item['goods_name'] = it.get('mfrPartNumber', '')
            item['goods_other_name'] = it.get('partsNumber', '')
            # Description.
            item['goods_desc'] = it.get('abbreviatedPartsDescriptionHTML', '')
            # Manufacturer: take the first manufacturer key matched by a name token.
            item['provider_name'] = it.get('manufacturer', '')
            item['provider_url'] = ''
            for x in item['provider_name'].split():
                for k in self.manufacturers.keys():
                    if x.lower() in k and not item['provider_url']:
                        item['provider_url'] = self.manufacturers.get(k)
            # Images.
            item['goods_img'] = it.get('prefixedLocalImageLink', '')
            item['goods_thumb'] = it.get('prefixedThumbnailLocalImageLink', '')
            # Datasheet.
            item['doc'] = it.get('datasheetURL', '')
            # rohs
            item['rohs'] = 1 if it.get('roHSTTI') == 'Y' else -1
            # [stock, minimum order quantity]
            item['stock'] = [it.get('ttiWebAtsInt', 0), it.get('ttiSalesMinInt', 0)]
            # Order increment.
            item['increment'] = it.get('ttiSalesMultInt')
            if item['stock'][0] == 0:
                item['increment'] = 1
            # Price tiers.
            item['tiered'] = []
            prices_list = it.get('prices', [])
            for prices in prices_list:
                item['tiered'].append([prices.get('quantity'), util.floatval(prices.get('price'))])
            if not item['tiered']:
                item['tiered'] = [[0, 0.00]]
            # Attributes.
            item['attr'] = []
            attr_dict = it.get('parametricMap', {})
            for k, v in attr_dict.items():
                item['attr'].append([k, v])
            # Categories.
            breadcrumb = product_dict.get('breadcrumbOptions').get('producttype').get('All Systems Catalog')
            item['catlog'] = []
            for vo in breadcrumb:
                catalog_text = vo.get('displayText')
                catalog_value = vo.get('submitValue')
                catalog_url = util.urljoin(
                    self.tti,
                    '/content/ttiinc/en/apps/part-search.html?manufacturers=&'
                    ';searchTerms=&systemsCatalog=%s' % catalog_value)
                item['catlog'].append([catalog_text, catalog_url])
            # url
            mfrShortname = it.get('mfgShortname', '')
            partsNumber = it.get('partsNumber')
            minQty = it.get('ttiSalesMin')
            product_url = ('/content/ttiinc/en/apps/part-detail.html?'
                           'mfrShortname=%s&partsNumber=%s&customerPartNumber=&minQty=%s&customerId='
                           % (mfrShortname, partsNumber, minQty))
            item['url'] = util.urljoin(self.tti, product_url)
            yield item
    except:
        # Dump the offending response body for post-mortem debugging.
        with open('worry.htm', 'w') as fp:
            fp.write(resp.text.encode('utf-8'))
        logger.exception('Parse error, systemsCatalog: %s', systems_catalog)