def handle_of_redirects(item=None):
    """Probe the search URL and dispatch on the redirect target."""
    if not item:
        return -404
    search_url = 'http://www.linear.com.cn/search/index.php?q={search}'.format(
        search=item['goods_name'])
    _headers = copy.copy(default_headers)
    _headers.update({'Host': 'www.linear.com.cn'})
    resp = requests.get(url=search_url, headers=_headers,
                        allow_redirects=False)
    location = util.urljoin(resp.url, resp.headers.get('Location'))
    if 'product/' in location or 'solutions/' in location:
        try:
            response = requests.get(url=location, headers=_headers)
        except Exception:
            logger.error("Failed to fetch category and datasheet, URL: {url}".format(
                url=location))
            return -404
        return parse_more(item, response)
    elif 'search.php' in location:
        try:
            response = requests.get(url=location, headers=_headers)
        except Exception:
            logger.error("Failed to fetch search result list, URL: {url}".format(
                url=location))
            return -404
        return filter_search_result(item, response)
    # Any other redirect target is treated as not found.
    return -404
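# Illustration (not part of the spider): a self-contained sketch of the
# redirect-probe pattern used above. requests follows redirects by default,
# so passing allow_redirects=False exposes the 3xx Location header and lets
# the caller pick a parser. The function name and URL handling here are
# hypothetical, for demonstration only.
import requests
try:
    from urlparse import urljoin        # Python 2
except ImportError:
    from urllib.parse import urljoin    # Python 3

def probe_redirect(search_url, headers=None):
    resp = requests.get(search_url, headers=headers, allow_redirects=False)
    location = resp.headers.get('Location', '')
    # Location may be relative; resolve it against the request URL.
    return urljoin(resp.url, location) if location else resp.url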
def parse_more(item=None, response=None):
    if not item or not response:
        return -404
    root = lxml.html.fromstring(response.text.encode('utf-8'))
    data = {}
    # family_sn
    match = family_sn_pattern.search(response.url)
    data['family_sn'] = match.group(1) if match else item['goods_name']
    # catlog (breadcrumb trail)
    breadcrumb = root.xpath('//p[@class="breadcrumb"]/a')
    data['catlog'] = []
    for catlog in breadcrumb:
        catlog_name = util.cleartext(catlog.text_content())
        catlog_url = util.urljoin(response.url, catlog.xpath('./@href')[0])
        if catlog_name and catlog_url:
            data['catlog'].append([catlog_name, catlog_url])
        else:
            data['catlog'] = []
            break
    else:
        # for/else: runs only when no crumb failed; append the leaf node.
        data['catlog'].append([data['family_sn'], response.url])
    # doc
    doc = root.xpath('//li[@class="pdf"]/a[@class="doclink"]/@title')
    data['doc'] = "http://cds.linear.com/docs/en/datasheet/{title}".format(
        title=doc[0]) if doc else ''
    item.update(data)
    return item
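# `family_sn_pattern` is defined elsewhere in this module. A minimal sketch of
# a pattern that would capture the family id from product/solutions URLs (an
# assumption for illustration, deliberately renamed so it cannot shadow the
# real definition):
import re
_family_sn_pattern_sketch = re.compile(r'/(?:product|solutions)/([\w.\-]+)')
_m = _family_sn_pattern_sketch.search('http://www.linear.com.cn/product/LTC2057')
assert _m.group(1) == 'LTC2057'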
def parse_detail(self, data, category=None):
    """Parse series part-number data."""
    if category is None:
        category = {}
    item = GoodsItem()
    item['url'] = urlparse.urljoin(self.base_url, data['avn_pdp_seo_path'])
    item['goods_sn'] = data['uniqueID']
    item['goods_name'] = data['mfPartNumber_ntk'].upper()
    if not item['goods_name']:
        return None
    # The original tested `'packageTypeCode' in item`, which can never be
    # true; the package code lives in the source dict.
    if 'packageTypeCode' in data:
        item['goods_other_name'] = '{0}/{1}'.format(
            item['goods_name'], data['packageTypeCode']).upper()
    item['provider_name'] = data['manufacturer']
    item['provider_url'] = ''
    item['goods_desc'] = data.get('shortDescription', '')
    if data.get('avn_thumbnail'):
        item['goods_thumb'] = util.urljoin(self.base_url, data['avn_thumbnail'])
    else:
        item['goods_thumb'] = ''
    item['goods_img'] = item['goods_thumb'].replace('icon_thumb', 'icon_web')
    item['doc'] = data.get('auxDescription2') or ''
    min_qty = int(data['xcatField1']) if 'xcatField1' in data else 1
    increment = int(data['multQuantity']) if 'multQuantity' in data else 1
    stock_qty = util.intval(data['inv_strlocqty']) if 'inv_strlocqty' in data else 0
    item['rohs'] = 1 if data.get('ROHSComplianceCode') == 'Y' else 0
    item['tiered'] = [[0, 0.0]]
    item['stock'] = [stock_qty, min_qty]  # [stock, minimum order quantity]
    item['increment'] = increment
    # attributes
    item['attr'] = []
    for vo in data.get('attributes', []):
        try:
            item['attr'].append([vo['name'], vo['values'][0]['value']])
        except (KeyError, IndexError):
            pass
    # categories: the id path looks like '<store>_<id>:<id>:...'
    item['catlog'] = []
    catelogs = data['parentCatgroup_id_path'].split('_')[-1].split(':')
    for vo in catelogs:
        if vo not in category:
            continue
        item['catlog'].append((category[vo], vo))
    item['region'] = 'AMERICAS'
    item['id'] = 16
    return item
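# How the category chain above is unpacked: the last '_' segment of
# parentCatgroup_id_path holds a ':'-separated id chain. The id values below
# are made up for illustration.
_path = '10001_4294964232:4294964231'
assert _path.split('_')[-1].split(':') == ['4294964232', '4294964231']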
def start_requests(self):
    match = []
    url = self.start_urls[0]
    rs = requests.get(url, headers=self.headers)
    js_cookies = {}
    for vo in rs.cookies:
        js_cookies[vo.name] = vo.value
    rs = requests.get(url, headers=self.headers, cookies=js_cookies)
    # Solve the Incapsula anti-bot challenge to obtain valid session cookies.
    js_cookies = _parse_incapsula_page(rs.text, cookies=js_cookies,
                                       headers=self.headers)
    resp = requests.get(
        url='https://www.ttiinc.com/content/ttiinc/en/manufacturers.html',
        headers=self.headers, cookies=js_cookies)
    manufacturers = re.findall(
        r'(/content/ttiinc/en/manufacturers/.*/(.*).html)',
        resp.text.encode('utf-8'))
    for v, k in manufacturers:
        self.manufacturers[k] = util.urljoin(self.tti, v)
    rs = requests.get(url, headers=self.headers, cookies=js_cookies)
    match = re.findall(r'/.*/part-search.html.*systemsCatalog=(\d+)',
                       rs.text.encode('utf-8'))
    # Fallback: read cached systemsCatalog values from disk.
    # if not match:
    #     with open(os.path.split(os.path.realpath(__file__))[0] +
    #               r'\tti_category_values.txt', 'r') as fp:
    #         for line in fp.readlines():
    #             match.append(line.strip())
    for systems_catalog in match:
        try:
            self.form_data['systemsCatalog'] = systems_catalog
            yield Request(url=self.processData_url, method='POST',
                          headers=self.headers,
                          body=json.dumps(self.form_data),
                          meta={'systemsCatalog': systems_catalog})
        except Exception:
            logger.exception('Request error, systemsCatalog: %s',
                             systems_catalog)
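# Why the loop unpacks `for v, k in manufacturers`: re.findall with two
# capture groups returns (outer, inner) tuples. The HTML snippet below is
# made up for illustration.
import re
_html = '<a href="/content/ttiinc/en/manufacturers/v-brands/vishay.html">'
_pairs = re.findall(r'(/content/ttiinc/en/manufacturers/.*/(.*).html)', _html)
assert _pairs == [('/content/ttiinc/en/manufacturers/v-brands/vishay.html',
                   'vishay')]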
def fetch_search_data(keyword=None, id=None, data_dict=None, headers=None,
                      proxy=None, **kwargs):
    """Fetch search data (richardsonrfpd)."""
    if keyword:
        print 'Fetching data for keyword %s from richardsonrfpd' % keyword
        url = 'http://www.richardsonrfpd.com/Pages/AdvanceSearch.aspx'
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = kwargs.get('proxies')
        if proxies is None and proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        response = requests.get(url, headers=_headers, timeout=30,
                                proxies=proxies)
        resp = do_search(response, keyword)
        if isinstance(resp, int):
            raise ValueError
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:request exception, %s ; URL:%s'
                     % (util.traceback_info(e), url))
        if 'Invalid URL' not in str(e):
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400
    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:invalid product; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:request error, HTTP status %s ; '
                     'PROXY:%s ; URL:%s'
                     % (resp.status_code,
                        proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    # Start parsing the response: collect the search hits.
    if 'Search-Results.aspx' in resp.url:
        product_list = analyse_product_url(resp)
    root = lxml.html.fromstring(resp.text.encode('utf-8'))
    product_list = root.xpath('//tr[@valign="top"][@height=85]')
    if len(product_list) <= 0:
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    for product in product_list:
        detail = product.xpath('.//a[@class="lnk12b-blackOff"]')
        detail_url = util.urljoin(
            resp.url, detail[0].xpath('./@href')[0]) if detail else ''
        match = goods_sn_pattern.search(detail_url)
        # Bail out on any failed match: the original only returned when
        # detail_url was non-empty and then crashed on match.group(1).
        if not match:
            logger.debug(u"Cannot match goods_sn in URL: {url}".format(
                url=detail_url))
            return -404
        goods_sn = match.group(1)
        goods_name = detail[0].text_content() if detail else ''
        data_dict['url'].append({
            'id': id,
            'url': detail_url,
            'goods_sn': goods_sn,
            'goods_name': goods_name,
        })
    if 'showMore=true' in url:
        return 200
    # Result count drives pagination: 10 rows per page.
    count = root.xpath('//td[@class="medtext"]')
    count = util.number_format(count[0].text, places=0, index=999,
                               smart=True) if count else 0
    page_num = int(math.ceil(count / 10.0))
    if page_num <= 1:
        return 200
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    for x in xrange(1, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        page_url = ('http://shopping.netsuite.com/s.nl/c.402442/sc.2/.f'
                    '?search={search}&range={start}%2C{end}%2C{total}').format(
                        search=keyword, start=x * 10 + 1, end=(x + 1) * 10,
                        total=count)
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200
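# Pagination sketch (made-up count): 10 rows per page, and x = 1 requests
# rows 11-20, i.e. the loop enumerates every page after the first.
import math
_count = 43
_page_num = int(math.ceil(_count / 10.0))
assert _page_num == 5
assert [(x * 10 + 1, (x + 1) * 10) for x in range(1, 3)] == [(11, 20), (21, 30)]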
def parse_detail(self, resp):
    """Parse series part-number data."""
    item = GoodsItem()
    try:
        soup = BeautifulSoup(resp.text.encode('utf-8'), 'lxml')
    except Exception:
        logger.debug(u"Failed to build the BS4 tree, retrying once URL:{url}".format(
            url=resp.url))
        # Retry once.
        return Request(url=resp.url, headers=self.headers, cookies=self.cookies)
    # goods_sn
    product_id = self.product_id_pattern_1.search(
        resp.url) or self.product_id_pattern_2.search(resp.url)
    goods_sn = product_id.group(1) if product_id else ''
    item['goods_sn'] = goods_sn
    if not goods_sn:
        logger.debug(u"Failed to get goods_sn URL:{url}".format(url=resp.url))
        return None
    try:
        # goods_name: the label text below is part of the localized page
        # HTML ("manufacturer part number"), so it must stay in Chinese.
        product_ref = soup.find('p', class_='ref')
        goods_name = ''
        if product_ref:
            goods_name_pattern = re.compile(
                ur'<b>制造商零件编号:</b>\s*([^\"\'<>/]+)')
            product_ref_list = unicode(product_ref).split('<br/>')
            for x in product_ref_list:
                match = goods_name_pattern.search(x)
                if match:
                    goods_name = match.group(1)
                    break
        item['goods_name'] = goods_name
        # goods_other_name
        item['goods_other_name'] = ''
    except Exception:
        logger.debug(u"Failed to get goods_name URL:{url}".format(url=resp.url))
        item['goods_name'] = ''
        item['goods_other_name'] = ''
    # goods_desc
    goods_desc = soup.find('p', class_='desc')
    if not goods_desc:
        logger.debug(u"Failed to get goods_desc URL:{url}".format(url=resp.url))
    item['goods_desc'] = goods_desc.get_text(strip=True) if goods_desc else ''
    # provider_name and provider_url
    provider_img = soup.find('img', id='ctl00_PlaceHolderMain_mfrLogo')
    item['provider_name'] = provider_img.get('title', '') if provider_img else ''
    # If provider_name cannot be read from the logo image, try product-desc.
    if not item['provider_name']:
        desc_div = soup.find('div', id='product-desc')
        provider_name = desc_div.find('h2') if desc_div else None
        item['provider_name'] = provider_name.get_text(
            strip=True) if provider_name else ''
    item['provider_url'] = ''
    # url
    item['url'] = resp.url
    # doc
    doc = soup.find(
        'a',
        id='ctl00_PlaceHolderMain_csDownloadCenter_linkDatasheetUrlJustText')
    item['doc'] = doc.get('href', '') if doc else ''
    # goods_img and goods_thumb
    goods_img = soup.find('img', id='previewedMEDImage')
    item['goods_img'] = goods_img.get('src', '') if goods_img else ''
    goods_thumb = soup.find('img', id='thumbnail-1')
    item['goods_thumb'] = goods_thumb.get('src', '') if goods_thumb else ''
    # catlog
    item['catlog'] = []
    catlog = soup.find('ul', id='breadcrumb-navigation')
    catlog_list = catlog.find_all('a') if catlog else []
    for a in catlog_list:
        breadcrumb_name = a.get_text(strip=True)
        breadcrumb_url = util.urljoin(resp.url, a.get('href', ''))
        item['catlog'].append([breadcrumb_name, breadcrumb_url])
    # attr
    item['attr'] = []
    product_attr_div = soup.find('div',
                                 id='product-details-overview-highlights')
    product_attr_list = product_attr_div.find_all(
        'li') if product_attr_div else []
    for li in product_attr_list:
        # split(':', 1) keeps colons inside the value intact.
        parts = li.get_text(strip=True).split(':', 1)
        if len(parts) == 2:
            item['attr'].append([parts[0], parts[1]])
    # tiered
    try:
        item['tiered'] = []
        price_table = soup.find('table', class_='product-prices')
        price_tr_list = price_table.find_all('tr', class_='price-break')
        for tr in price_tr_list:
            qty_th = tr.find('th')
            qty = qty_th.get_text(strip=True) if qty_th else 0
            qty = util.intval(qty)
            price_span = tr.find('span')
            price = price_span.get_text(strip=True) if price_span else 0.00
            price = util.floatval(price)
            if qty and price:
                item['tiered'].append([qty, price])
            else:
                item['tiered'] = [[0, 0.00]]
                break
    except Exception:
        logger.debug(u"Failed to get tiered URL:{url}".format(url=resp.url))
        item['tiered'] = [[0, 0.00]]
    # stock, increment, min_qty
    try:
        stock_div = soup.find('div', id='product-qty-content')
        stock_tr = stock_div.find('tr', class_='qtyInStock')
        increment_tr = stock_div.find('tr', class_='multipleOf')
        min_qty_tr = stock_div.find('tr', class_='minOrderQty')
        stock = stock_tr.find('td', class_='qty').get_text(
            strip=True) if stock_tr else 0
        stock = util.intval(stock)
        increment = increment_tr.find('td', class_='qty').get_text(
            strip=True) if increment_tr else 1
        increment = util.intval(increment)
        min_qty = min_qty_tr.find('td', class_='qty').get_text(
            strip=True) if min_qty_tr else 1
        min_qty = util.intval(min_qty)
        item['stock'] = [stock, min_qty]
        item['increment'] = increment
    except Exception:
        logger.debug(u"Failed to get stock URL:{url}".format(url=resp.url))
        item['stock'] = [0, 1]
        item['increment'] = 1
    # rohs
    rohs_div = soup.find('div', id='ctl00_PlaceHolderMain_imgRoHS')
    item['rohs'] = 1 if rohs_div else -1
    return item
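# Why split(':', 1) in the attr loop above: attribute rows read "Name: Value"
# and the value itself may contain colons (made-up example below).
_parts = u'Ratio: 1:10'.split(':', 1)
assert _parts == [u'Ratio', u' 1:10']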
    except Exception as e:
        logger.exception(u'Parse failed, product detail URL: %s' % url)
        return -400
    if html.status_code == 200:
        item = {}
        _desc = soup.find(
            'tr',
            id='ctl00_ctl00_NestedMaster_PageContent_ctl00_BuyProductDialog1_trSku')
        item['goods_name'] = _desc.find('h1').get_text(
            strip=True) if _desc else ''
        item['goods_sn'] = item['goods_name']
        item['desc'] = _desc.get_text(strip=True) if _desc else ''
        _img = soup.find('img', id='ProductImage')
        item['goods_img'] = util.urljoin(url, _img.get('src')) if _img else ''
        stock_info = get_stock(goods_sn=goods_sn, url=url)
        if stock_info:
            item['stock'] = [util.intval(stock_info[0]), 1]
            item['tiered'] = stock_info[1]
        else:
            item['stock'] = [0, 1]
            item['tiered'] = [[0, 0.00]]
        item['provider_name'] = 'TI'
        item['increment'] = 1
        item['url'] = url
        return item


# def _parse_store_ti_com(url, **kwargs):
#     """
def fetch_search_data(keyword=None, id=None, data_dict=None, headers=None,
                      proxy=None, **kwargs):
    """Fetch search data (avnet)."""
    if keyword:
        print 'Fetching data for keyword %s from avnet' % keyword
        url = ('https://www.avnet.com/search/resources/store/715839038/'
               'productview/bySearchTerm/select?searchType=102'
               '&profileName=Avn_findProductsBySearchTermCatNav_Ajax'
               '&searchSource=Q&landingPage=true&storeId=715839038'
               '&catalogId=10001&langId=-1&currency=USD&orgEntityId=-2000'
               '&responseFormat=json&pageSize=20&pageNumber=1'
               '&_wcf.search.internal.boostquery=price_USD:{{0.00001+TO+*}}'
               '^499999.0+inStock:%22true%22^9000.0+topSellerFlag:%22Yes%22'
               '^0.085+newProductFlag:%22Yes%22^0.080+packageTypeCode:%22BKN%22'
               '^0.075&_wcf.search.internal.filterquery=-newProductFlag%3ANPI'
               '&q={keyword}&intentSearchTerm={keyword}&searchTerm={keyword}'
               '&wt=json').format(keyword=keyword)
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = kwargs.get('proxies')
        if proxies is None and proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:request exception, %s ; URL:%s'
                     % (util.traceback_info(e), url))
        if 'Invalid URL' not in str(e):
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400
    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:invalid product; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:request error, HTTP status %s ; '
                     'PROXY:%s ; URL:%s'
                     % (resp.status_code,
                        proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    # Parse the JSON response and read the result count.
    search_dict = {}
    try:
        search_dict = json.loads(resp.text.encode('utf-8'))
        product_list = search_dict.get('catalogEntryView', [])
    except Exception:
        product_list = []
        logger.debug('STATUS:-404 ; INFO:malformed data ; URL:%s' % url)
    if len(product_list) <= 0:
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    for product in product_list:
        goods_sn = product.get('seo_token_ntk', '')
        base_url = 'https://www.avnet.com/shop/apac/'
        product_url = product.get('avn_pdp_seo_path', '')
        data_dict['url'].append({
            'id': id,
            'url': util.urljoin(base_url, product_url),
            'goods_sn': goods_sn
        })
    if 'showMore=true' in url:
        return 200
    count = search_dict.get('recordSetTotal', 0)
    page_num = int(math.ceil(count / 20.0))
    if page_num <= 1:
        return 200
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    for x in xrange(2, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        page_url = ('https://www.avnet.com/search/resources/store/715839038/'
                    'productview/bySearchTerm/select?searchType=102'
                    '&profileName=Avn_findProductsBySearchTermCatNav_More_Ajax'
                    '&searchSource=Q&landingPage=true&storeId=715839038'
                    '&catalogId=10001&langId=-1&currency=USD&orgEntityId=-2000'
                    '&responseFormat=json&pageSize=20&pageNumber={next_page}'
                    '&_wcf.search.internal.boostquery=price_USD:{{0.00001+TO+*}}'
                    '^499999.0+inStock:%22true%22^9000.0+topSellerFlag:%22Yes%22'
                    '^0.085+newProductFlag:%22Yes%22^0.080+packageTypeCode:'
                    '%22BKN%22^0.075&_wcf.search.internal.filterquery='
                    '-newProductFlag:NPI&q={keyword}&intentSearchTerm={keyword}'
                    '&searchTerm={keyword}&showMore=true&wt=json').format(
                        next_page=x, keyword=keyword)
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200
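# Minimal sketch of the response shape this parser expects: the field names
# are taken from the code above, the values are made up for illustration.
import json
_payload = json.loads('{"recordSetTotal": 41, "catalogEntryView": '
                      '[{"seo_token_ntk": "ABC123", '
                      '"avn_pdp_seo_path": "p/abc123"}]}')
assert _payload.get('recordSetTotal') == 41
assert _payload['catalogEntryView'][0]['seo_token_ntk'] == 'ABC123'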
def _parse_detail_data(resp, headers=None, **kwargs):
    """
    Parse detail-page data (split out as a standalone helper).
    @param resp     page response (resp.url is used when logging errors)
    @param headers  extra request headers
    @param kwargs   extension parameters
    """
    item = {}
    try:
        soup = BeautifulSoup(resp.text, 'lxml')
    except Exception as e:
        logger.debug('Failed to initialize the product detail page URL: %s '
                     'ERROR: %s', resp.url, util.traceback_info(e))
        return -404
    # goods_sn: the URL path segment that ends in a 19-digit id
    url_path_list = resp.url.split('/')
    goods_sn_pattern = re.compile(r'.*-\d{19}')
    for path in url_path_list[::-1]:
        if goods_sn_pattern.findall(path):
            item['goods_sn'] = path
            break
    if not item.get('goods_sn', False):
        logger.debug("Cannot parse goods_sn from URL: {url}".format(url=resp.url))
        return -400
    # goods_name
    goods_info_div = soup.find('div', class_='section-left')
    item['goods_name'] = goods_info_div.find('h1').get_text(
        strip=True) if goods_info_div else item['goods_sn']
    # url
    item['url'] = resp.url
    # goods_img / goods_thumb
    img_div = soup.find('div', id='outer-div1')
    img = img_div.find('img') if img_div else None
    item['goods_img'] = util.urljoin(resp.url, img.get('src')) if img else ''
    item['goods_thumb'] = item['goods_img']
    # desc
    desc_p = soup.find('p', class_='RB-pdp_short_Desc')
    item['desc'] = desc_p.get_text(strip=True) if desc_p else ''
    # provider_name / provider_url
    item['provider_name'] = 'AVNET'
    item['provider_url'] = ''
    # attr: [[name, value]]
    attr_body = soup.find('div', id='techAttr')
    attr = []
    attr_div = attr_body.find_all(
        'div', class_='pdpDescriptionsBodyContent') if attr_body else []
    for content in attr_div:
        att_name = content.find('div', class_='pdpDescriptionColumn')
        attr_value = content.find('div', class_='pdpValueColumn')
        if att_name and attr_value:
            attr.append([att_name.get_text(strip=True),
                         attr_value.get_text(strip=True)])
    item['attr'] = attr
    # tiered: [[qty, price]]
    tiered = []
    for span in soup.find_all('span', class_='usdpart1'):
        qty_span = span.find('span', class_='pdpTierMinQty')
        qty = qty_span.get_text(strip=True) if qty_span else 0
        price_p = span.find('p')
        price = price_p.get_text(strip=True) if price_p else 0.00
        if qty and price:
            tiered.append([util.intval(qty), util.floatval(price)])
        else:
            tiered = [[0, 0.00]]
            break
    item['tiered'] = tiered if tiered else [[0, 0.00]]
    # stock: [stock, min_qty]
    stock_input = soup.find('input', id='inStock')
    stock = util.intval(stock_input.get('value')) if stock_input else 0
    min_qty_input = soup.find('input', attrs={'name': 'min'})
    min_qty = util.intval(min_qty_input.get('value')) if min_qty_input else 1
    item['stock'] = [stock, min_qty] if stock else [0, 1]
    # increment
    multi_input = soup.find('input', attrs={'name': 'mult'})
    item['increment'] = util.intval(
        multi_input.get('value')) if multi_input else 1
    # doc
    doc_div = soup.find('div', class_='pdfcontent')
    if doc_div is not None:
        doc_url = doc_div.find('a', class_='datasheet_align')
        item['doc'] = doc_url.get('href') if doc_url else ''
    else:
        item['doc'] = ''
    # rohs: 1 / -1
    rohs_div = soup.find('div', class_='leafcontent')
    item['rohs'] = 1 if rohs_div else -1
    # catlog: [[name, url]]
    catlog = []
    nav = soup.find('nav', class_='breadcrumb')
    if nav is not None:
        for a in nav.find_all('a'):
            cat_name = a.get_text(strip=True)
            cat_url = util.urljoin(resp.url, a.get('href'))
            if cat_name and cat_url:
                catlog.append([cat_name, cat_url])
    item['catlog'] = catlog
    # goods_other_name
    item['goods_other_name'] = ''
    # product_id
    # family_sn
    return item
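# The goods_sn heuristic above keeps the last URL path segment ending in a
# 19-digit id. The segment below is made up for illustration.
import re
_goods_sn_pattern_demo = re.compile(r'.*-\d{19}')
_segments = ['shop', 'p', 'lm317t-stmicroelectronics-1234567890123456789']
_matched = [s for s in _segments[::-1] if _goods_sn_pattern_demo.findall(s)]
assert _matched and _matched[0].endswith('1234567890123456789')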
def parse_detail(self, resp):
    """Parse series part-number data."""
    try:
        systems_catalog = resp.meta.get('systemsCatalog')
        product_dict = json.loads(resp.text.encode('utf-8'))
        # Product records for the current page.
        item_list = product_dict.get('parts').get('records', [])
        for it in item_list:
            # A fresh item per record: yielding one shared mutable item would
            # let later iterations overwrite earlier results.
            item = GoodsItem()
            # part identifiers
            item['goods_sn'] = it.get('partsNumber', '')
            item['goods_name'] = it.get('mfrPartNumber', '')
            item['goods_other_name'] = it.get('partsNumber', '')
            # description
            item['goods_desc'] = it.get('abbreviatedPartsDescriptionHTML', '')
            # manufacturer
            item['provider_name'] = it.get('manufacturer', '')
            item['provider_url'] = ''
            for x in item['provider_name'].split():
                for k in self.manufacturers.keys():
                    if x.lower() in k:
                        if not item['provider_url']:
                            item['provider_url'] = self.manufacturers.get(k)
                        else:
                            break
            # images
            item['goods_img'] = it.get('prefixedLocalImageLink', '')
            item['goods_thumb'] = it.get('prefixedThumbnailLocalImageLink', '')
            # datasheet
            item['doc'] = it.get('datasheetURL', '')
            # rohs
            item['rohs'] = 1 if it.get('roHSTTI') == 'Y' else -1
            # [stock, minimum order quantity]
            item['stock'] = [
                it.get('ttiWebAtsInt', 0),
                it.get('ttiSalesMinInt', 0)
            ]
            # order increment
            item['increment'] = it.get('ttiSalesMultInt')
            if item['stock'][0] == 0:
                item['increment'] = 1
            # price tiers
            item['tiered'] = []
            for prices in it.get('prices', []):
                item['tiered'].append([
                    prices.get('quantity'),
                    util.floatval(prices.get('price'))
                ])
            if not item['tiered']:
                item['tiered'] = [[0, 0.00]]
            # attributes
            item['attr'] = []
            for k, v in it.get('parametricMap', {}).items():
                item['attr'].append([k, v])
            # categories
            breadcrumb = product_dict.get('breadcrumbOptions').get(
                'producttype').get('All Systems Catalog')
            item['catlog'] = []
            for vo in breadcrumb:
                catalog_text = vo.get('displayText')
                catalog_value = vo.get('submitValue')
                catalog_url = util.urljoin(
                    self.tti,
                    '/content/ttiinc/en/apps/part-search.html?manufacturers='
                    '&searchTerms=&systemsCatalog=%s' % catalog_value)
                item['catlog'].append([catalog_text, catalog_url])
            # url
            mfrShortname = it.get('mfgShortname', '')
            partsNumber = it.get('partsNumber')
            minQty = it.get('ttiSalesMin')
            product_url = ('/content/ttiinc/en/apps/part-detail.html'
                           '?mfrShortname=%s&partsNumber=%s'
                           '&customerPartNumber=&minQty=%s&customerId=' % (
                               mfrShortname, partsNumber, minQty))
            item['url'] = util.urljoin(self.tti, product_url)
            yield item
    except Exception:
        # Dump the offending response for post-mortem debugging.
        with open('worry.htm', 'w') as fp:
            fp.write(resp.text.encode('utf-8'))
        logger.exception('Parse error, systemsCatalog: %s', systems_catalog)
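# Why GoodsItem() must be created inside the loop above: yielding one mutable
# item repeatedly hands every consumer a reference to the same object, so
# later mutations overwrite earlier yields. Minimal demonstration with a dict:
def _yield_shared_item():
    item = {}
    for n in (1, 2, 3):
        item['n'] = n
        yield item

def _yield_fresh_item():
    for n in (1, 2, 3):
        yield {'n': n}

assert [d['n'] for d in list(_yield_shared_item())] == [3, 3, 3]  # aliases
assert [d['n'] for d in list(_yield_fresh_item())] == [1, 2, 3]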