def parse(self, response):
    """Walk non-empty category links, queue product details, then paginate."""
    # Categories whose item count (with spaces removed) is not "0".
    for anchor in response.xpath(
            '//div[@class="count-box"]/a[translate(normalize-space(text())," ","")!="0"]'):
        cat_url = strip(anchor.xpath('./@href').get())
        if not cat_url:
            continue
        yield Request(
            urljoin(self.base_url, cat_url),
            callback=self.parse,
            meta={'parent': strip(anchor.xpath('./text()').get())},
        )

    parent = response.meta.get('parent')
    for prd_url in response.xpath('//h3[@class="prodname"]/a/@href').getall():
        yield Request(
            urljoin(self.base_url, prd_url),
            callback=self.parse_detail,
            meta={'parent': parent},
        )

    next_page = strip(response.xpath(
        '//span[@class="selectedpage"]/../following-sibling::li/a[not(parent::li/span)]/text()'
    ).get())
    if next_page:
        base, *_ = response.url.split('?')
        query = urlencode({'custguid': '', 'custclsid': '', 'pn': next_page})
        yield Request(f'{base}?{query}', callback=self.parse, meta={'parent': parent})
def parse_detail(self, response):
    """Extract one product and its package rows (prices in RMB)."""
    xpath_tpl = '//th[contains(text(), {!r})]/following-sibling::td/text()'

    def field(label):
        return response.xpath(xpath_tpl.format(label)).get()

    img_src = response.xpath('//div[@class="pic"]/img/@src').get()
    cat_no = response.xpath('//div/span[@style]/text()').get()
    yield RawData(
        brand=self.brand,
        cat_no=cat_no,
        en_name=response.xpath('//div/span/@data-nameen').get(),
        cas=field("CAS:"),
        mdl=field("MDL:"),
        mf=formula_trans(strip(field("分子式:"))),
        mw=field("分子量:"),
        smiles=field("SMILES code:"),
        purity=field("化学纯度:"),
        img_url=img_src and urljoin(response.url, img_src),
        prd_url=response.url,
    )
    for row in response.xpath('//div[@class="table-1"]//tbody/tr'):
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=row.xpath('./td[1]/text()').get(),
            price=strip(row.xpath('./td[2]/text()').get()),
            stock_num=row.xpath('./td[5]/text()').get(),
            currency='RMB',
        )
def parse_list(self, response):
    """Follow each product row to detail parsing, carrying list-page fields; then paginate."""
    parent = response.meta.get('parent')
    for row in response.xpath('//li/form'):
        rel_url = row.xpath('.//span[@class="title"]/a/@href').get()
        # The list page already shows cat_no/name/package; pass them along in meta.
        row_meta = {
            'parent': parent,
            'cat_no': strip(row.xpath('./span[@class="number"]/text()').get()),
            'en_name': strip(row.xpath('./span[@class="title"]/a/text()').get()),
            'package': strip(row.xpath('./span[@class="size"]/text()').get()),
        }
        yield Request(urljoin(self.base_url, rel_url),
                      callback=self.parse_detail, meta=row_meta)
    next_page = response.xpath(
        '//div[contains(text(),"Page")]/a[contains(@class,"current")]/following-sibling::a/@href'
    ).get()
    if next_page:
        yield Request(urljoin(self.base_url, next_page),
                      callback=self.parse_list, meta={'parent': parent})
def parse_table(self, response):
    """Merge size/price/stock fields into the product info carried in meta."""
    extra = {
        'info3': strip(response.xpath('//td[@class="skusize"]/text()').get()),
        'info4': strip(response.xpath('//span[@class="price"]/text()').get()),
        'stock_info': strip(
            response.xpath('//span[contains(@class, "stockstatus")]/text()').get()),
    }
    prd_info = response.meta.get('prd_info', {})
    yield RawData(**prd_info, **extra)
def parse_detail(self, response):
    """Extract product fields ('NA' placeholders become None) and package rows."""
    tpl = 'normalize-space(//td[contains(div/text(), {!r})]/following-sibling::td/text())'

    def cell(label):
        return strip(response.xpath(tpl.format(label)).get())

    rel_img = response.xpath('//div[@class="c_c_p"]//div/img/@src').get()
    cat_no = cell("产品号/Catalog#")
    d = {
        'brand': self.brand,
        'parent': response.meta.get('parent'),
        'cat_no': cat_no,
        'en_name': cell("Product Name:"),
        'chs_name': cell("产品名称:"),
        'cas': cell("CAS#:"),
        'mf': cell("分子式/Formula:"),
        'mw': cell("分子量/MW:"),
        'purity': cell("纯度/Purity (%):"),
        'info1': cell("Synonyms:"),
        'info2': cell("储藏条件/Storage:"),
        'appearance': cell("颜色/Color:"),
        'img_url': rel_img and urljoin(response.url, rel_img),
        'prd_url': response.url,
    }
    # The site uses the literal string 'NA' as a placeholder; normalise to None.
    d = {key: (value if value != 'NA' else None) for key, value in d.items()}
    yield RawData(**d)
    for row in response.xpath('//table[@class="c_p_size"]//tr[td and td/text()!="NA"]'):
        # NOTE(review): price reads td[1] — the same cell as package.
        # Possibly meant to be td[2]; confirm against the page layout.
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=row.xpath('./td[1]/text()').get(),
            price=row.xpath('./td[1]/text()').get(),
        )
def detail_parse(self, response):
    """Extract product details, then unpack the JS package matrix into packages."""
    tpl = '//th[contains(text(),{0!r})]/following-sibling::td/descendant-or-self::text()'

    def cell(label):
        return response.xpath(tpl.format(label))

    img_url = response.xpath(
        '//th[contains(text(),"Structure")]/following-sibling::td/img/@src').get()
    cat_no = strip(cell("Product No.").get())
    yield RawData(
        brand=self.brand,
        cat_no=cat_no,
        parent=cell("Category").get(),
        info1="".join(cell("Synonym(s)").extract()),
        mw=cell("Molecular Weight").get(),
        mf="".join(cell("Formula").extract()),
        cas=cell("CAS Number").get(),
        en_name=strip("".join(response.xpath(
            '//div[@class="product-name"]/span/descendant-or-self::text()').extract())),
        img_url=img_url and urljoin(self.base_url, img_url),
        stock_info=response.xpath(
            '//table[@id="product-matrix"]//td[@class="unit-price"]/text()').get(),
        prd_url=response.url,
    )
    # Package data lives in an inline JS variable rather than the DOM.
    matrix = first(
        re.findall(r'var matrixChildrenProducts = ({.+});', response.text), None)
    if not matrix:
        return
    for item in json.loads(matrix).values():
        sku = item.get('sku')
        if not sku:
            continue
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            cat_no_unit=sku,
            package=strip(sku.replace(f'{cat_no}-', '')),
            price=item.get('price'),
            currency='USD',
            delivery_time='In-stock' if item.get('is_in_stock') else None,
        )
def parse(self, response):
    """Follow each leaf navigation link (li without a sub-ul) into list parsing."""
    for anchor in response.xpath('//div[@id="lnav"]//li[not(child::ul)]/a'):
        yield Request(
            urljoin(self.base_url, strip(anchor.xpath('./@href').get())),
            callback=self.parse_list,
            meta={'parent': strip(anchor.xpath('./text()').get())},
        )
def parse_detail(self, response):
    """Extract a product; the name may live in <p> blocks or in plain spans."""
    tpl = '//td[contains(./span/text(), {!r})]/following-sibling::td//span//text()'
    tpl_p = '//td[contains(./span/text(), {!r})]/following-sibling::td/p[{}]/span/text()'

    def field(label):
        return strip(response.xpath(tpl.format(label)).get())

    def joined(label):
        return strip(''.join(response.xpath(tpl.format(label)).getall()))

    # Prefer the first <p> variant of the name; fall back to the span form.
    en_name = strip(response.xpath(tpl_p.format("Product Name", 1)).get()) or \
        field("Product Name")
    yield RawData(
        brand=self.brand,
        cat_no=en_name,
        en_name=en_name,
        chs_name=strip(response.xpath(tpl_p.format("Product Name", 2)).get()),
        cas=field("Cas No."),
        info1=field("Sequence"),
        mf=joined("Molecular Formula"),
        mw=field("Molar Mass"),
        purity=joined("Purity"),
        info2=field("Storage Temperature"),
        img_url=response.xpath(
            '//div[contains(@class, "slick-slide")][1]/a/img/@src').get(),
        prd_url=response.url,
    )
def parse_detail(self, response):
    """Extract product info; follow the sku-list widget when its JS params exist."""
    tpl = '//span[contains(text(), {!r})]/following-sibling::span//text()'

    def field(label):
        return strip(response.xpath(tpl.format(label)).get())

    d = {
        'brand': 'chemimpex',
        'parent': response.meta.get('parent'),
        'cat_no': response.xpath(tpl.format("Catalog Number:")).get(),
        'en_name': strip(''.join(response.xpath(
            '//h1[@itemprop="name"]//text()[not(parent::span)]').getall())),
        'purity': strip(response.xpath(
            '//h1[@itemprop="name"]/span[@style]/text()').get()),
        'mf': strip(''.join(response.xpath(tpl.format('Molecular Formula:')).getall())),
        'mw': field('Molecular Weight:'),
        'cas': field('CAS No:'),
        'appearance': field('Appearance:'),
        'info1': strip(';'.join(response.xpath(tpl.format('Synonyms:')).getall())),
        'info2': field('Storage Temp:'),
        'img_url': strip(response.xpath('//div[@id="catalog_content"]/img/@src').get()),
        'prd_url': response.url,
    }
    # The sku-list widget parameters are pushed into a JS array inline.
    m = re.search(r'push\(({.+\})\);', response.text)
    if not m:
        yield RawData(**d)
        return
    j_obj = json.loads(m.group(1))
    params = [j_obj.get(f'param{i}', '') for i in range(1, 7)]
    url = ('https://www.chemimpex.com/Widgets-product/gethtml_skulist/'
           '{}/{}/{}/{}/{}/{}').format(*params)
    yield Request(url, callback=self.parse_table, meta={'prd_info': d})
def detail_parse(self, response):
    """Extract one synpharmatech product from its detail page."""
    tpl = 'normalize-space(//div[@class="product1_l"]//span[contains(text(), "{}")]/../text())'

    def field(label):
        return strip(response.xpath(tpl.format(label)).get())

    rel_img = response.xpath('//div[@class="product1"]/img/@src').get()
    yield RawData(
        brand="synpharmatech",
        cat_no=field("Cat. No"),
        en_name=strip(response.xpath('//div[@class="product1_l"]//h1/text()').get()),
        info1=field("Synonyms"),
        cas=field("CAS No"),
        mf=field("Formula"),
        mw=field("F.W"),
        purity=field("Purity"),
        stock_info=strip(response.xpath(
            'normalize-space(//div[@class="product2"]//tr[position()>1]/td[4]/text())'
        ).get()) or None,
        prd_url=response.url,
        img_url=urljoin(self.base_url, rel_img) if rel_img else None,
    )
def parse_detail(self, response):
    """Extract one cprd product from its detail page."""
    tpl = '//strong[contains(text(),{!r})]/following-sibling::text()'

    def field(label):
        return strip(response.xpath(tpl.format(label)).get())

    rel_img = response.xpath('//article//a/img/@src').get()
    yield RawData(
        brand="cprd",
        cat_no=field("Catalogue Number:"),
        cas=field("CAS Number:"),
        en_name=field("Chemical Name:"),
        img_url=rel_img and urljoin(response.url, rel_img),
        mf=field("Molecular Formula:"),
        mw=field("Molecular Weight:"),
        prd_url=response.url,
    )
def parse_detail(self, response):
    """Combine list-page meta with CAS numbers scraped from the detail table."""
    raw_cas = response.xpath('//tr[@class="style17"]/td[3]/text()').getall()
    cas_values = tuple(c for c in (strip(i) for i in raw_cas) if c)
    yield RawData(
        brand=self.brand,
        parent=response.meta.get('parent'),
        cat_no=response.meta.get('cat_no'),
        en_name=response.meta.get('en_name'),
        # Only a single unambiguous CAS goes to 'cas'; the full set lands in info1.
        cas=cas_values[0] if len(cas_values) == 1 else None,
        info1=';'.join(set(cas_values)),
        info3=response.meta.get('package'),
        info4=response.xpath('//p[contains(text(), "Price:")]/strong/text()').get(),
        prd_url=response.url,
    )
def parse_list(self, response):
    """Queue every product image link on the page for detail parsing."""
    parent = strip(response.xpath('//strong//text()').get())
    for url in response.xpath('//div[@class="iproimg"]/a/@href').getall():
        yield Request(url, callback=self.parse_detail, meta={'parent': parent})
def parse_detail(self, response):
    """Extract one product and a single USD price row.

    Fix: the old price extraction mapped a dead ``m is not None`` check over
    ``re.finditer`` (finditer never yields None) just to take the first match;
    a plain ``re.search`` expresses the same thing directly.
    """
    tpl = '//td[contains(text(), {!r})]/following-sibling::td/text()'
    cat_no = response.xpath(tpl.format('Catalog #')).get()
    yield RawData(
        brand=self.brand,
        cat_no=cat_no,
        en_name=response.xpath('//td[@class="pageTitle"]/text()').get(),
        cas=response.xpath(tpl.format('CAS#')).get(),
        stock_info=response.xpath(tpl.format('In Stock')).get(),
        prd_url=response.url,
    )
    raw_price = strip(response.xpath(
        'normalize-space(//td[contains(text(), "Retail Price:")]/following-sibling::td/text())'
    ).get())
    price = None
    if raw_price:
        # Collapse whitespace, then take the first numeric token as the price.
        raw_price = re.sub(r'\s+', ' ', raw_price)
        match = re.search(r'\d+(\.\d+)?', raw_price)
        price = match.group(0) if match else None
    yield ProductPackage(
        brand=self.brand,
        cat_no=cat_no,
        price=price,
        currency='USD',
        info=raw_price,
        delivery_time=response.xpath(tpl.format('In Stock')).get(),
    )
def detail_parse(self, response):
    """Extract a product plus its single package row.

    The base cat_no is the leading "ABC-123" portion of the sku; the package
    size is whatever follows the last '-' in the title (kits have none).
    """
    cat_no_unit = response.xpath('//span[@itemprop="sku"]/text()').get("")
    sku_match = re.match(r'[A-Z]{3}-\d+', cat_no_unit)
    cat_no = sku_match.group(0) if sku_match else cat_no_unit
    rel_img = response.xpath('//img[@class="zoomImg"]/@src').get()
    full_name = response.xpath('//h1[@itemprop="name"][1]/text()').get("").title()
    fallback_name = response.xpath(
        '//div[@itemprop="description"]/text()').get("").title()
    if '-' in full_name:
        en_name, package = full_name.rsplit('-', 1)
    elif '-' in fallback_name:
        en_name, package = fallback_name.rsplit('-', 1)
    else:
        en_name, package = full_name, 'kit'
    yield RawData(
        brand=self.brand,
        parent=self.extract_value(response, "Chemical Family: "),
        cat_no=cat_no,
        en_name=strip(en_name),
        cas=self.extract_value(response, "CAS: "),
        mf=self.extract_value(response, "Chemical Formula: "),
        mw=self.extract_value(response, "Formula Weight: "),
        info2=self.extract_value(response, "Long Term Storage: "),
        appearance=self.extract_value(response, "Appearance: "),
        purity=self.extract_value(response, "Purity: "),
        img_url=rel_img and urljoin(self.base_url, rel_img),
        prd_url=response.url,
    )
    stock_num = response.xpath('//div[@class="items_left"]//em/text()').get()
    package = strip(package)
    yield ProductPackage(
        brand=self.brand,
        cat_no_unit=cat_no_unit,
        cat_no=cat_no,
        package=package and package.lower(),
        price=response.xpath('//span[@itemprop="price"]/@content').get(),
        currency='USD',
        stock_num=stock_num and first(re.findall(r'\d+', stock_num), None),
    )
def parse(self, response):
    """Follow every alphabetical index link into list parsing."""
    for anchor in response.xpath('//a[@class="sort-alpha"]'):
        rel_url = anchor.xpath('./@href').get()
        if not rel_url:
            continue
        yield Request(
            urljoin(response.url, rel_url),
            callback=self.parse_list,
            meta={'parent': strip(anchor.xpath('./text()').get())},
        )
def process_item(self, item, spider):
    """Normalise placeholder CAS values ('n/a', 'na', 'null', '') to None.

    Fix: the explicit ``cas is None`` test was redundant — ``None`` already
    fails the ``isinstance(cas, str)`` check that followed it.
    """
    adapter = ItemAdapter(item)
    cas = strip(adapter.get('cas'))
    # Non-string (including None) CAS values are left untouched.
    if not isinstance(cas, str):
        return item
    adapter['cas'] = None if cas.lower() in {'n/a', 'na', 'null', ''} else cas
    return item
def parse_detail(self, response):
    """Extract one medicalisotopes product; package/price live in a nested table."""
    tpl = '//td[contains(text(), {!r})]/following-sibling::td//text()'

    def field(label):
        return strip(response.xpath(tpl.format(label)).get())

    package = strip(
        response.xpath('normalize-space(//td/table//td[1]/text())').get())
    yield RawData(
        brand='medicalisotopes',
        parent=response.meta.get('parent'),
        cat_no=field("Catalog Number:"),
        en_name=strip(response.xpath(
            '//th[contains(text(), "Product:")]/following-sibling::th/text()').get()),
        cas=field("CAS Number:"),
        mf=strip(''.join(response.xpath(tpl.format("Formula:")).getall())),
        mw=field("Molecular Weight:"),
        # The trailing non-breaking space and '=' are layout artefacts.
        info3=package and package.rstrip('\xa0='),
        info4=strip(response.xpath('//td/table//td[2]/text()').get()),
        prd_url=response.url,
    )
def parse_detail(self, response):
    """Extract one cpachem product from its detail page.

    Fixes: ``.lstrip('Price: ')`` strips a *character set* ({'P','r','i','c',
    'e',' ',':'}), not the literal prefix, so it could eat leading value
    characters; and it raised AttributeError when the price node was missing
    (``strip(...)`` returning None). The prefix is now removed explicitly and
    a missing node is tolerated.
    """
    tpl = '//b[text()={!r}]/following-sibling::text()'

    def field(label):
        return strip(response.xpath(tpl.format(label)).get())

    category = strip(response.xpath(
        '//b[text()="Category:"]/following-sibling::a/text()').get())
    price = strip(response.xpath(
        '//h3[contains(text(), "Price:")]/text()').get())
    if price and price.startswith('Price:'):
        price = price[len('Price:'):].strip()
    yield RawData(
        brand='cpachem',
        parent=category,
        cat_no=field("Ref Num:"),
        en_name=field("Full Name:"),
        info2=field("Shelf Life on Ship Date:"),
        info3=field("Vol.:"),
        info4=price,
        stock_info=strip(response.xpath(
            '//p[@style="padding:15px 0px 5px 0px;"]/text()').get()),
        prd_url=response.url,
    )
def parse_list(self, response):
    """Parse product boxes, request per-product size data, then paginate.

    Fix: ``cat_no`` used a bare ``.get()`` before ``.split`` — a box missing
    that <li> raised AttributeError on None; it now uses ``.get('')`` like
    the sibling fields.
    """
    for box in response.xpath("//table[@id]//div[@class='PRODUCT_box']"):
        div = box.xpath(".//div[2][@class='left_right mulu_text']")
        brand = strip(
            div.xpath('.//li[@id="ctl00_cph_Content_li_lt_Brand"]/text()').get(), '')
        rel_url = div.xpath('.//a[@class="name"]/@href').get()
        img_url = div.xpath('.//img/@src').get()
        d = {
            'brand': brand.replace('-', '') or None,
            "purity": div.xpath(".//li[1]/text()").get('').split(u":")[-1].strip(),
            "cas": strip(div.xpath(".//li[2]//a/text()").get()),
            "cat_no": div.xpath(".//li[4]/text()").get('').split(u":")[-1].strip(),
            "en_name": strip(box.xpath(".//a[@class='name']/text()").get()),
            "cn_name": strip(box.xpath(".//a[@class='name']//span[1]/text()").get()),
            'prd_url': rel_url and urljoin(response.url, rel_url),
            'img_url': img_url and urljoin(response.url, img_url),
        }
        # The size/price endpoint is keyed by these two data attributes.
        data_jkid = box.xpath(".//div[@data-jkid]/@data-jkid").get()
        data_cid = box.xpath(".//div[@data-cid]/@data-cid").get()
        yield Request(
            self.prd_size_url.format(value=data_jkid, cid=data_cid, ts=int(time())),
            body=u"",
            meta={"prd_data": d},
            callback=self.parse_package,
        )
    next_page = response.xpath('//a[contains(text(), "下一页")]/@href').get()
    if next_page:
        yield Request(urljoin(response.url, next_page), callback=self.parse_list)
def parse_detail(self, response):
    """Extract basic product info, then POST to the AjaxPro endpoint for prices."""
    d = {
        'brand': '海岸鸿蒙',
        'parent': response.meta.get('parent'),
        'cat_no': strip(response.xpath(
            '//span[contains(@class, "kj_customno")]/text()').get()),
        'cas': strip(response.xpath(
            '//p/text()[contains(self::text(), "CAS")]/following-sibling::span/text()').get()),
        'cn_name': strip(response.xpath('//h4[@class="c red1"]/text()').get()),
        'prd_url': response.url,
    }
    pd_id = response.xpath('//input[@id="nowproductid"]/@value').get()
    if not pd_id:
        return
    yield Request(
        'http://www.bjhongmeng.com/ajaxpro/Web960.Web.index,Web960.Web.ashx',
        method='POST',
        body=json.dumps({'pd_id': pd_id}),
        headers={'X-AjaxPro-Method': 'LoadGoods'},
        callback=self.parse_price,
        meta={'product': d},
    )
def parse_detail(self, response):
    """Extract one pharmaffiliates product from its detail page."""
    tpl = '//td[contains(text(), {!r})]/following-sibling::td//text()'

    def field(label):
        return response.xpath(tpl.format(label)).get()

    parent = response.meta.get('parent')
    name = strip(response.xpath('//h1[@class]/text()').get())
    chemical_name = field("Chemical name")
    yield RawData(
        brand='pharmaffiliates',
        parent=parent and parent.title(),
        cat_no=field("Catalogue number"),
        # Fall back to the chemical name when the page title is empty.
        en_name=name or chemical_name,
        cas=strip(response.xpath(
            '//h2[contains(text(), "CAS Number")]/../following-sibling::td//text()').get()),
        mf=''.join(response.xpath(tpl.format("Molecular form")).getall()),
        mw=field("Mol. Weight"),
        appearance=field("Appearance"),
        info1=field("Synonyms") or chemical_name,
        info2=strip(field("Storage")),
        img_url=response.xpath('//img[@id="mainimg"]/@src').get(),
        prd_url=response.url,
    )
def parse_detail(self, response):
    """Extract one sdd product; several fields come from fixed table cells."""
    tpl = '//div[contains(text(), {!r})]/following-sibling::div/text()'

    def field(label):
        return strip(response.xpath(tpl.format(label)).get())

    rel_img = response.xpath('//img[@class="pic"]/@src').get()
    yield RawData(
        brand='sdd',
        cat_no=response.xpath('//tr/td[1]/text()').get(),
        en_name=response.xpath('//div[@class="row"]//dl/dd/text()').get(),
        chs_name=response.xpath('//div[@class="row"]//dl/dt/text()').get(),
        cas=field("CAS NO."),
        mf=field("分子式"),
        mw=field("分子量"),
        info1=field("英文异名"),
        info2=response.xpath(
            '//td[contains(text(), "存储条件")]/following-sibling::td[1]/text()').get(),
        info3=response.xpath('//tr/td[6]/text()').get(),
        info4=response.xpath('//tr/td[5]/text()').get(),
        stock_info=response.xpath('//tr/td[7]/text()').get(),
        appearance=response.xpath(
            '//td[contains(text(), "性状")]/following-sibling::td[1]/text()').get(),
        img_url=rel_img and urljoin(self.base_url, rel_img),
        prd_url=response.url,
    )
def parse_detail(self, response):
    """Extract one product from a Chinese-labelled detail table."""
    tpl = '//td[contains(text(), {!r})]/following-sibling::td//text()'

    def field(label):
        return strip(response.xpath(tpl.format(label)).get())

    yield RawData(
        brand=self.brand,
        parent=response.meta.get('parent'),
        cat_no=field("产品编号:"),
        en_name=strip(response.xpath('//div[@class="proinftit_t"]/text()').get()),
        cas=field("CAS号:"),
        mf=strip(''.join(response.xpath(tpl.format("分子式:")).getall())),
        mw=field("分子量:"),
        info1=field("化学名:"),
        img_url=response.xpath('//div[@class="proinfotableimg"]/img/@src').get(),
        prd_url=response.url,
    )
def detail_parse(self, response):
    """Extract one qcc product; the image is skipped when it is the bare Uploads/ path."""
    tpl = '//td[contains(descendant-or-self::text(), "{}")]//following-sibling::td/text()'

    def field(label):
        return strip(response.xpath(tpl.format(label)).get())

    d = {
        "brand": "qcc",
        "parent": response.meta.get('parent'),
        "cat_no": response.xpath(tpl.format("QCC Cat No.:")).get(),
        "cas": field("CAS No.:"),
        "en_name": field("Chemical Name:"),
        "info1": field("Synonyms:"),
        "mf": field("Molecular Formula:"),
        "mw": field("Molecular Weight:"),
        "prd_url": response.url,
    }
    img_url = urljoin(
        self.base_url,
        response.xpath('//table//td/div[@style and not(div)]//img/@src').get())
    # A URL ending in 'Uploads/' means the page has no real structure image.
    if img_url and not img_url.endswith('Uploads/'):
        d['img_url'] = img_url
    yield RawData(**d)
def parse_detail(self, response):
    """Extract one product and its per-package rows ('bulk' rows skipped).

    Fixes: label values were produced with ``str.lstrip(label)``, which strips
    a *character set* (any leading character occurring in the label), so it
    could eat leading value characters; the exact label prefix is now removed
    instead. Also replaces the assigned lambda (PEP 8 E731) with a local
    function.
    """
    tpl = '//li[contains(text(), {!r})]/text()'

    def labelled(label):
        # The <li> text is "<label><value>"; drop the literal label prefix.
        text = response.xpath(tpl.format(label)).get('')
        if text.startswith(label):
            text = text[len(label):]
        return text or None

    rel_img = response.xpath('//td/img/@src').get()
    cat_no = response.xpath('//tr[@id][1]/td[2]/text()').get()
    if not cat_no:
        return
    yield RawData(
        brand=self.brand,
        cat_no=cat_no,
        parent=response.meta.get('parent'),
        en_name=strip(response.xpath('//h2/text()[1]').get()),
        chs_name=strip(response.xpath('//h2/text()[2]').get()),
        cas=labelled('CAS号:'),
        mf=labelled('分子式:'),
        mw=labelled('分子量:'),
        purity=labelled('韶远库存批次纯度:'),
        info3=response.xpath('//tr[@id][1]/td[4]/text()').get(),
        info4=response.xpath('//tr[@id][1]/td[5]/text()').get(),
        stock_info=response.xpath('//tr[@id][1]/td[8]/text()').get(),
        img_url=rel_img and urljoin(self.base_url, rel_img),
        prd_url=response.url,
    )
    for tr in response.xpath('//tr[@id]'):
        package = tr.xpath('./td[4]/text()').get()
        if package == 'bulk':
            continue
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=package,
            price=tr.xpath('./td[5]/text()').get(),
            currency='RMB',
            delivery_time=tr.xpath('./td[8]/text()').get(),
        )
def parse_detail(self, response):
    """Extract one product and the rows of its pricing table."""
    tpl_span = '//span[@class={!r}]/text()'
    tpl_td = '//td[contains(text(), {!r})]/following-sibling::td/text()'
    cat_no = response.xpath(tpl_span.format("code productVal")).get()
    mw = strip(response.xpath(tpl_td.format("分子式/分子量")).get())
    img_rel = response.xpath('//div[@data-attr]/@data-attr').get()
    yield RawData(
        brand=self.brand,
        parent='_'.join(response.xpath(
            '//div[@class="subCategory clearfix"][1]//span[@class="startPoint"]//a/text()'
        ).getall()),
        cat_no=cat_no,
        en_name=''.join(response.xpath('//h1[@class="name"]//text()').getall()),
        cas=response.xpath(tpl_span.format("cas productVal")).get(),
        mf=''.join(response.xpath(
            '//span[@id="molecularFormula"]//text()').getall()).replace('_', ''),
        mw=mw and mw.replace('=', ''),
        purity=response.xpath(tpl_td.format("纯度/分析方法")).get(),
        appearance=response.xpath(tpl_td.format("外观与形状")).get(),
        info2=response.xpath(tpl_td.format("储存温度")).get(),
        mdl=response.xpath(tpl_td.format("MDL编号")).get(),
        img_url=img_rel and urljoin(self.base_url, img_rel),
        prd_url=response.url,
    )
    for row in response.xpath('//table[@id="PricingTable"]/tbody/tr'):
        stock_num = strip(row.xpath('./td[3]/text()').get())
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=row.xpath('./td[1]/text()').get(),
            delivery_time='现货' if stock_num != '0' else None,
            price=strip(row.xpath('./td[2]/div/text()').get()),
            stock_num=stock_num,
            currency='RMB',
        )
def parse(self, response):
    """Dispatch navigation links to detail or list parsing.

    Fix: an anchor without an ``href`` made ``rel_url.startswith('..')``
    raise AttributeError on None; such anchors are now skipped, consistent
    with the guards used by the sibling ``parse`` methods.
    """
    for anchor in response.xpath('//ul[not(@id) and not(@class)]/li/a'):
        parent = strip(anchor.xpath('./text()').get())
        rel_url = anchor.xpath('./@href').get()
        if not rel_url:
            continue
        url = urljoin(response.url, rel_url)
        # Relative '..' links point directly at detail pages; others are lists.
        callback = self.parse_detail if rel_url.startswith('..') else self.parse_list
        yield Request(url, callback=callback, meta={'parent': parent})
def parse_list(self, response):
    """Parse the product tables of a list page into items.

    Fix: the assembled item dict was built and then silently discarded —
    the function had no ``yield``; it now emits ``RawData`` like the sibling
    parsers. Also removes the dead ``tmp`` split/filter locals that were
    never used.
    """
    for table in response.xpath('//table'):
        en_name = table.xpath(
            './/td[@class="info"]/h5[not(@class)]/strong//text()').get('')
        short_desc = table.xpath(
            'normalize-space(.//td[@class="info"]/h5[@class="short_desc"]/strong//text())'
        ).get('')
        en_name = en_name.strip(' :')
        # CAS / Mol. Wt. / formula are embedded in the ';'-separated short description.
        m_cas = re.search(r'\d+-\d{2}-\d', short_desc)
        m_mw = re.search(r'Mol\. Wt\.: ([^;]+);', short_desc)
        m_mf = re.search(r'CAS : [^;]+; ([^;]+)', short_desc)
        yield RawData(
            brand='srinidhiindsynth',
            parent=response.meta.get('parent'),
            cat_no=en_name,
            en_name=en_name,
            cas=m_cas and m_cas.group(),
            mf=m_mf and strip(m_mf.group(1)),
            mw=m_mw and strip(m_mw.group(1)),
            img_url=table.xpath('.//img/@src').get(),
            prd_url=response.url,
        )
def parse_detail(self, response):
    """Extract one syninnova product plus its first price/stock row."""
    mf = strip(''.join(response.xpath(
        '//label[text()="Mol. Formula : "]/..//text()[not(parent::label)]').getall()))
    first_row = response.xpath(
        '//div[not(@style)]/table[@class="table table-condensed"]/tbody/tr[position()=1 and position()!=last()]'
    )
    price = first_row.xpath('./td[2]/text()').get()
    cas = strip(response.xpath(
        '//b[contains(text(), "CAS")]/../following-sibling::div/text()').get())
    yield RawData(
        brand='syninnova',
        parent=response.meta.get('category'),
        cat_no=response.xpath('//div[contains(@class, "productinfo")]/h2[1]/text()').get(),
        en_name=response.xpath('//div[contains(@class, "productinfo")]/h2[2]/text()').get(),
        # 't' is presumably a module-level str.translate table defined elsewhere
        # in the file — verify before refactoring.
        cas=cas and cas.translate(t),
        mf=mf,
        mw=strip(response.xpath(
            '//label[text()="Mol. Weight : "]/following-sibling::text()').get()),
        appearance=strip(response.xpath(
            '//label[text()="Appearance : "]/following-sibling::text()').get()),
        info3=first_row.xpath('./td[1]/text()').get(),
        info4=price and f'USD {price}',
        stock_info=first_row.xpath('./td[4]/text()').get(),
        img_url=response.xpath('//div[@class="prodImage"]/img/@src').get(),
        prd_url=response.url,
    )