def parse_detail(self, response):
    """Extract one product record and its price rows from a detail page.

    Yields:
        A ``RawData`` item built from the labelled ``<th>``/``<td>`` pairs,
        then one ``ProductPackage`` per row of the ``table-1`` table.
    """
    label_xpath = '//th[contains(text(), {!r})]/following-sibling::td/text()'

    def labelled(label):
        # Text of the <td> following the <th> whose text contains ``label``.
        return response.xpath(label_xpath.format(label)).get()

    image_src = response.xpath('//div[@class="pic"]/img/@src').get()
    cat_no = response.xpath('//div/span[@style]/text()').get()

    yield RawData(
        brand=self.brand,
        cat_no=cat_no,
        en_name=response.xpath('//div/span/@data-nameen').get(),
        cas=labelled("CAS:"),
        mdl=labelled("MDL:"),
        mf=formula_trans(strip(labelled("分子式:"))),
        mw=labelled("分子量:"),
        smiles=labelled("SMILES code:"),
        purity=labelled("化学纯度:"),
        # Only resolve the image URL when a src attribute was found.
        img_url=image_src and urljoin(response.url, image_src),
        prd_url=response.url,
    )

    for tr in response.xpath('//div[@class="table-1"]//tbody/tr'):
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=tr.xpath('./td[1]/text()').get(),
            price=strip(tr.xpath('./td[2]/text()').get()),
            stock_num=tr.xpath('./td[5]/text()').get(),
            currency='RMB',
        )
def parse_detail(self, response):
    """Parse a product page into one ``RawData`` and one ``ProductPackage``.

    Field values are read from the ``<td>`` that follows the ``<td>``
    containing a given label. The price is the first decimal number found in
    the "Retail Price:" cell; the whitespace-collapsed cell text is kept in
    ``info``.

    Yields:
        ``RawData``, then ``ProductPackage`` (``price`` is ``None`` when the
        price cell is empty or holds no number).
    """
    tmp = '//td[contains(text(), {!r})]/following-sibling::td/text()'

    def labelled(label):
        # Text of the cell next to the cell containing ``label``.
        return response.xpath(tmp.format(label)).get()

    cat_no = labelled('Catalog #')
    # The same cell feeds both items, so query it once.
    in_stock = labelled('In Stock')

    yield RawData(
        brand=self.brand,
        cat_no=cat_no,
        en_name=response.xpath('//td[@class="pageTitle"]/text()').get(),
        cas=labelled('CAS#'),
        stock_info=in_stock,
        prd_url=response.url,
    )

    raw_price = strip(response.xpath(
        'normalize-space(//td[contains(text(), "Retail Price:")]/following-sibling::td/text())'
    ).get())
    price = None
    if raw_price:
        raw_price = re.sub(r'\s+', ' ', raw_price)
        # re.search returns the first match directly; no need to map over
        # finditer and guard against a None match that can never occur.
        number = re.search(r'(\d+(\.\d+)?)', raw_price)
        price = number.group(0) if number else None

    yield ProductPackage(
        brand=self.brand,
        cat_no=cat_no,
        price=price,
        currency='USD',
        info=raw_price,
        delivery_time=in_stock,
    )
def parse(self, response):
    """Turn every ``<Reference>`` element of the XML body into items.

    Yields:
        For each reference, a ``RawData`` record followed by a
        ``ProductPackage`` whose price has the euro sign removed.
    """

    def child_text(node, tag, default=None):
        # First text node of the direct child ``tag``, or ``default``.
        return first(node.xpath(f'./{tag}/text()'), default)

    document = XML(response.body)
    for ref in document.xpath('//Reference'):
        cat_no = child_text(ref, 'Order_Code')
        quantity = child_text(ref, 'Quantity_per_vial')
        price = child_text(ref, 'Price')
        # The URL falls back to an empty code rather than the text "None".
        url_code = child_text(ref, 'Order_Code', '')

        yield RawData(
            brand=self.brand,
            cat_no=cat_no,
            cas=child_text(ref, 'CAS_Registry_Number'),
            en_name=child_text(ref, 'Reference_Standard'),
            info2=child_text(ref, 'Storage'),
            info3=quantity,
            info4=price,
            prd_url=f"https://crs.edqm.eu/db/4DCGI/View={url_code}",
        )
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=quantity,
            price=price and price.replace('€', ''),
            currency='EUR',
        )
def parse_detail(self, response):
    """Collect product attributes and the price table from a detail page.

    Yields:
        One ``RawData`` item, then one ``ProductPackage`` for every row of
        the ``table-responsive`` table except the first.
    """
    label_xpath = '//div[contains(*/text(), {!r})]/following-sibling::div/*/text()'

    def labelled(label):
        # Value found in the <div> following the <div> labelled ``label``.
        return response.xpath(label_xpath.format(label)).get()

    cat_no = response.xpath('//span[@id="catalogNo"]/text()').get()
    image_src = response.xpath('//input[@id="image"]/@value').get()
    breadcrumb = response.xpath(
        '//li[@class="active"]/following-sibling::li/a/text()').getall()

    yield RawData(
        brand=self.brand,
        parent='_'.join(breadcrumb),
        cat_no=cat_no,
        en_name=response.xpath('//h2/span/text()').get(),
        purity=response.xpath('//span[@class="d-purity"]/text()').get(),
        cas=labelled("CAS 号"),
        mf=labelled("分子式"),
        mw=labelled("分子量"),
        smiles=labelled("Smiles Code"),
        info2=labelled("存储条件"),
        mdl=labelled("MDL 号"),
        img_url=image_src and urljoin(response.url, image_src),
        prd_url=response.url,
    )

    price_rows = response.xpath(
        '//div[@class="table-responsive"]//tr[position()!=1]')
    for tr in price_rows:
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=tr.xpath('./td[@id="packing"]/text()').get(),
            price=tr.xpath('./td[@id="money"]/text()').get(),
            currency='RMB',
            stock_num=tr.xpath('./td[@id="stock"]/text()').get(),
        )
def parse_detail(self, response):
    """Build a ``RawData`` record and package rows from a product page.

    Every attribute is read with ``normalize-space`` from the ``<td>`` that
    follows the labelled ``<td>``; a value equal to the literal ``'NA'`` is
    stored as ``None``.

    Yields:
        One ``RawData`` item, then one ``ProductPackage`` per matching row
        of the ``c_p_size`` table.
    """
    cell_xpath = 'normalize-space(//td[contains(div/text(), {!r})]/following-sibling::td/text())'

    def cell(label):
        # Stripped, whitespace-normalised text of the cell next to ``label``.
        return strip(response.xpath(cell_xpath.format(label)).get())

    image_src = response.xpath('//div[@class="c_c_p"]//div/img/@src').get()
    cat_no = cell("产品号/Catalog#")

    record = {
        'brand': self.brand,
        'parent': response.meta.get('parent'),
        'cat_no': cat_no,
        'en_name': cell("Product Name:"),
        'chs_name': cell("产品名称:"),
        'cas': cell("CAS#:"),
        'mf': cell("分子式/Formula:"),
        'mw': cell("分子量/MW:"),
        'purity': cell("纯度/Purity (%):"),
        'info1': cell("Synonyms:"),
        'info2': cell("储藏条件/Storage:"),
        'appearance': cell("颜色/Color:"),
        'img_url': image_src and urljoin(response.url, image_src),
        'prd_url': response.url,
    }
    # Replace the 'NA' placeholder with None in every field.
    record = {
        key: (None if value == 'NA' else value)
        for key, value in record.items()
    }
    yield RawData(**record)

    size_rows = response.xpath(
        '//table[@class="c_p_size"]//tr[td and td/text()!="NA"]')
    for tr in size_rows:
        first_cell = './td[1]/text()'
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=tr.xpath(first_cell).get(),
            # NOTE(review): price reads the same cell as package (td[1]);
            # this looks like a copy-paste slip - confirm the price column.
            price=tr.xpath(first_cell).get(),
        )
def parse_detail(self, response):
    """Parse a product page into a ``RawData`` item and its package options.

    The catalogue number is the part of the ``variant-sku`` text between
    ``SKU:`` and the last hyphen. Package options come from the
    ``<option>`` texts of the product select box, each split into
    ``package`` and ``price`` on its last hyphen.

    Yields:
        One ``RawData`` item, then one ``ProductPackage`` per option that
        contains a hyphen.
    """
    td_xpath = '//td[contains(text(), "{}")]/following-sibling::td/text()'

    def labelled(label):
        # Text of the <td> following the <td> whose text contains ``label``.
        return response.xpath(td_xpath.format(label)).get()

    parent = response.meta.get('parent')
    sku_text = response.xpath("//span[@class='variant-sku']//text()").get()
    # A page without the SKU span yields None; re.findall would raise
    # TypeError on it, so fall back to an empty string (cat_no -> None).
    cat_no = first(re.findall(r'SKU:(.+)-', sku_text or ''), None)
    image_src = response.xpath('//noscript/img/@src').get()

    yield RawData(
        brand=self.name,
        parent=parent,
        en_name=response.xpath("//h1[@class='product-header']/text()").get(),
        cat_no=cat_no,
        prd_url=response.url,
        mf=labelled("Molecular Formula:"),
        mw=labelled("Molecular Weight:"),
        cas=labelled("CAS Number:"),
        smiles=labelled("SMILES:"),
        purity=labelled("Purity (HPLC):"),
        info1=labelled("Synonyms:"),
        info2=labelled("Storage Conditions:"),
        img_url=image_src and urljoin(response.url, image_src),
    )

    options = response.xpath(
        '//select[@id="product-select-product-template"]/option/text()'
    ).getall()
    for option in options:
        # Split on the LAST hyphen: unpacking ``option.split("-")`` raised
        # ValueError for texts with zero or several hyphens.
        package, hyphen, price = option.rpartition("-")
        if not hyphen:
            # No "package-price" separator: nothing usable in this option.
            continue
        yield ProductPackage(
            brand=self.name,
            cat_no=cat_no,
            package=package,
            currency="USD",
            price=price.replace("$", ''),
        )
def detail_parse(self, response):
    """Extract a product record plus the packages listed in the page script.

    Package data is read from the JSON object assigned to
    ``matrixChildrenProducts`` in the page source; when that assignment is
    absent only the ``RawData`` item is produced.

    Yields:
        One ``RawData`` item, then one ``ProductPackage`` per matrix entry
        that has a ``sku``.
    """
    label_xpath = '//th[contains(text(),{0!r})]/following-sibling::td/descendant-or-self::text()'

    def texts(label):
        # Selector list of the text nodes in the <td> next to ``label``.
        return response.xpath(label_xpath.format(label))

    structure_src = response.xpath(
        '//th[contains(text(),"Structure")]/following-sibling::td/img/@src'
    ).get()
    cat_no = strip(texts("Product No.").get())
    name_parts = response.xpath(
        '//div[@class="product-name"]/span/descendant-or-self::text()'
    ).extract()

    yield RawData(
        brand=self.brand,
        cat_no=cat_no,
        parent=texts("Category").get(),
        info1="".join(texts("Synonym(s)").extract()),
        mw=texts("Molecular Weight").get(),
        mf="".join(texts("Formula").extract()),
        cas=texts("CAS Number").get(),
        en_name=strip("".join(name_parts)),
        img_url=structure_src and urljoin(self.base_url, structure_src),
        stock_info=response.xpath(
            '//table[@id="product-matrix"]//td[@class="unit-price"]/text()'
        ).get(),
        prd_url=response.url,
    )

    matrix_sources = re.findall(
        r'var matrixChildrenProducts = ({.+});', response.text)
    matrix_json = first(matrix_sources, None)
    if not matrix_json:
        return

    sku_prefix = f'{cat_no}-'
    for entry in json.loads(matrix_json).values():
        sku = entry.get('sku')
        if sku:
            # The package size is whatever remains of the SKU once the
            # catalogue-number prefix is removed.
            yield ProductPackage(
                brand=self.brand,
                cat_no=cat_no,
                cat_no_unit=sku,
                package=strip(sku.replace(sku_prefix, '')),
                price=entry.get('price'),
                currency='USD',
                delivery_time='In-stock' if entry.get('is_in_stock') else None,
            )
def parse_detail(self, response):
    """Read product attributes and price options from a detail page.

    Yields:
        One ``RawData`` item, then one ``ProductPackage`` for each row after
        the first in the ``option-list`` table.
    """
    summary_xpath = (
        "//div[@class='short-description']//strong[contains(text(), '{}')]"
        "/following-sibling::span/text()"
    )
    spec_xpath = "//b[contains(text(), '{}')]/parent::td/following-sibling::td/text()"

    def summary(label):
        # <span> following the bold label in the short description.
        return response.xpath(summary_xpath.format(label)).get()

    def spec(label):
        # Cell following the cell that holds the bold ``label``.
        return response.xpath(spec_xpath.format(label)).get()

    parent = response.xpath(
        "//div[@class='breadcrumb']//li[last()]/strong[@class='current-item']/text()"
    ).get()
    cat_no = summary('Catalog:')

    yield RawData(
        brand=self.name,
        parent=parent,
        cat_no=cat_no,
        en_name=response.xpath("//h1[@itemprop='name']/text()").get(),
        cas=summary('CAS:'),
        smiles=spec('Smiles: '),
        mf=spec('Formula:'),
        mw=spec('Mol Weight: '),
        prd_url=response.url,
        img_url=response.xpath("//div[@class='picture']//img/@src").get(),
    )

    for tr in response.xpath("//ul[@class='option-list']//tr[position()>1]"):
        yield ProductPackage(
            brand=self.name,
            cat_no=cat_no,
            package=tr.xpath(".//td[@class='attribute_name']/span/text()").get(),
            price=tr.xpath(".//td[@class='attribute_price']/input/@value").get(),
            currency="USD",
        )
def parse_detail(self, response):
    """Parse a product page into a ``RawData`` item and its price rows.

    Attributes are read from the ``<td>`` following the ``<td>`` whose text
    contains a given label.

    Yields:
        One ``RawData`` item, then one ``ProductPackage`` for each row after
        the first inside the ``detail`` block. ``img_url`` and ``price`` are
        ``None`` when the page lacks the corresponding node.
    """
    td_xpath = "//td[contains(text(), '{}')]/following-sibling::td/text()"

    def labelled(label):
        # Text of the cell next to the cell containing ``label``.
        return response.xpath(td_xpath.format(label)).get()

    img_url = response.xpath("//div[@class='detail_img']/img/@src").get()
    cat_no = labelled('Catalog Number')

    yield RawData(
        brand=self.name,
        prd_url=response.url,
        en_name=response.xpath("//div[@class='detail_des']/h2/text()").get(),
        # urljoin(base, None) returns the base URL itself, which would store
        # the site root as the image; keep None when there is no image.
        img_url=img_url and urljoin(self.base_url, img_url),
        cat_no=cat_no,
        mdl=labelled('MDL Number'),
        smiles=labelled('SMILES'),
        info1=labelled('Chemical Name'),
        cas=labelled('CAS Number'),
        mf=labelled('Molecular Formula'),
        mw=labelled('Molecular Weight'),
    )

    for row in response.xpath("//div[@class='detail']//tr[position()>1]"):
        price = row.xpath('./td[3]/text()').get()
        yield ProductPackage(
            brand=self.name,
            cat_no=cat_no,
            package=row.xpath('./td[1]/text()').get(),
            currency="USD",
            # A row without a third cell gives None; calling .replace on it
            # raised AttributeError and aborted the remaining rows.
            price=price and price.replace("$", ''),
        )
def parse_detail(self, response):
    """Parse a product page into a ``RawData`` item and its price rows.

    Attributes are read from the ``<td>`` following the ``<td>`` whose text
    contains a given label.

    Yields:
        One ``RawData`` item, then one ``ProductPackage`` per body row of
        the ``q_table`` table (``price`` is ``None`` when the fifth cell is
        missing).
    """
    td_xpath = "//td[contains(text(), '{}')]/following-sibling::td/text()"

    def labelled(label):
        # Text of the cell next to the cell containing ``label``.
        return response.xpath(td_xpath.format(label)).get()

    cat_no = labelled('Catalog Number:')

    yield RawData(
        brand=self.name,
        parent=response.xpath("//div[@class='crumbs']//a[last()]/text()").get(),
        cat_no=cat_no,
        en_name=labelled('Chemical Name:'),
        cas=labelled('CAS Number:'),
        smiles=labelled('SMILES:'),
        mf=labelled('Molecular Formula:'),
        mw=labelled('Molecular Weight:'),
        prd_url=response.url,
        img_url=response.xpath("//div[@class='pd_f1']/img/@src").get(),
        info1=labelled('IUPAC Name:'),
    )

    rows = response.xpath(
        "//table[@class='q_table']//tbody//tr[position()>0]")
    for row in rows:
        price = row.xpath(".//td[5]/text()").get()
        yield ProductPackage(
            brand=self.name,
            cat_no=cat_no,
            package=row.xpath(".//td[1]/text()").get(),
            # A row without a fifth cell gives None; calling .replace on it
            # raised AttributeError and aborted the remaining rows.
            price=price and price.replace("$", ''),
            currency='USD',
        )
def detail_parse(self, response):
    """Produce a ``RawData`` item and a single ``ProductPackage`` for a page.

    The catalogue number is the leading ``AAA-123`` style part of the SKU
    when present, otherwise the whole SKU. The product name and package are
    obtained by splitting the title-cased page title (or, failing that, the
    title-cased description) on its last hyphen; with no hyphen in either,
    the package is ``'kit'``.
    """
    sku = response.xpath('//span[@itemprop="sku"]/text()').get("")
    sku_prefix = re.match(r'[A-Z]{3}-\d+', sku)
    cat_no = sku if sku_prefix is None else sku_prefix.group(0)

    image_src = response.xpath('//img[@class="zoomImg"]/@src').get()
    title = response.xpath(
        '//h1[@itemprop="name"][1]/text()').get("").title()
    description = response.xpath(
        '//div[@itemprop="description"]/text()').get("").title()

    # Prefer the title; use the description only if the title has no hyphen.
    for candidate in (title, description):
        if '-' in candidate:
            en_name, _, package = candidate.rpartition('-')
            break
    else:
        en_name, package = title, 'kit'

    yield RawData(
        brand=self.brand,
        parent=self.extract_value(response, "Chemical Family: "),
        cat_no=cat_no,
        en_name=strip(en_name),
        cas=self.extract_value(response, "CAS: "),
        mf=self.extract_value(response, "Chemical Formula: "),
        mw=self.extract_value(response, "Formula Weight: "),
        info2=self.extract_value(response, "Long Term Storage: "),
        appearance=self.extract_value(response, "Appearance: "),
        purity=self.extract_value(response, "Purity: "),
        img_url=image_src and urljoin(self.base_url, image_src),
        prd_url=response.url,
    )

    stock_text = response.xpath(
        '//div[@class="items_left"]//em/text()').get()
    package = strip(package)
    yield ProductPackage(
        brand=self.brand,
        cat_no_unit=sku,
        cat_no=cat_no,
        package=package and package.lower(),
        price=response.xpath('//span[@itemprop="price"]/@content').get(),
        currency='USD',
        # First run of digits in the "items left" text, if any.
        stock_num=stock_text and first(re.findall(r'\d+', stock_text), None),
    )
def parse_list(self, response):
    """Parse one page of the JSON product list and request the next page.

    ``table2`` holds the products; ``table1`` holds the paging information
    (``pagecount``). The current query parameters are expected in
    ``response.meta['params']``.

    Yields:
        For each product a ``RawData`` and a ``ProductPackage`` item, then a
        ``Request`` for the following page while the current page number is
        below ``pagecount``.
    """
    j_obj = json.loads(response.text)
    parent = response.meta.get('parent')
    tmp = 'http://www.bepurestandards.com/show/{}/{}/Y/true'

    for product in j_obj.get('table2', []):
        name = product.get('name')
        # The CAS number is embedded in the name; a product without a name
        # must not crash re.findall with a None argument.
        cas = first(re.findall(r'\d+-\d{2}-\d', name or ''), None)
        cat_no = product.get('code')
        yield RawData(
            brand=self.brand,
            cat_no=cat_no,
            en_name=product.get('name2'),
            chs_name=name,
            stock_info=product.get('cnum'),
            cas=cas,
            purity=product.get('purity'),
            info3=product.get('pack'),
            info4=product.get('price'),
            expiry_date=product.get('enddate'),
            prd_url=tmp.format(product.get('id'), quote(parent)),
        )
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=product.get('pack'),
            price=product.get('price'),
            currency='RMB',
        )

    page_table = first(j_obj.get('table1'), {})
    total_page = int(page_table.get('pagecount', 0))
    params = response.meta.get('params')
    cur_page = int(params.get('page', 1))
    if cur_page >= total_page:
        return

    # Derive the next page from cur_page (which already defaults to 1)
    # instead of indexing params['page'], and build a new dict so the one
    # stored in this response's meta is not mutated.
    next_params = {**params, 'page': str(cur_page + 1)}
    yield Request(
        self.api_url + urlencode(next_params),
        callback=self.parse_list,
        meta={
            'parent': parent,
            'params': next_params,
        },
    )
def parse_detail(self, response):
    """Parse a product page into one ``RawData`` and one ``ProductPackage``.

    A response with status 521 is delegated to ``self.handle_521``. Stock,
    expiry date and price are taken from the ``goodObj`` object literal
    embedded in the page source; the rest comes from ``el-form-item``
    elements selected by their ``label`` attribute.
    """
    if response.status == 521:
        yield from self.handle_521(response, callback=self.parse_detail)
        return

    item_xpath = '//el-form-item[contains(@label, {!r})]/span/text()'

    def form_text(label):
        # Raw text of the <span> inside the form item labelled ``label``.
        return response.xpath(item_xpath.format(label)).get()

    sub_brand = strip(form_text("品牌"), "")
    brand = '_'.join(('Tanmo', sub_brand)).lower()
    cat_no = strip(form_text("产品编号"))

    good_literals = re.findall(r'goodObj: ({[^}]+}),', response.text)
    good_obj = demjson.decode(first(good_literals, '{}'))

    # These two values appear in both items.
    spec = strip(form_text("规格"))
    price = good_obj.get('price', '咨询')

    yield RawData(
        brand=brand,
        cat_no=cat_no,
        chs_name=strip(
            response.xpath('//h2[@class="p-right-title"]/text()').get()),
        cas=strip(form_text("CAS号")),
        stock_info=good_obj.get('number', 0),
        expiry_date=good_obj.get('date', 0),
        purity=strip(form_text("标准值")),
        info2=strip(form_text("储存条件")),
        info3=spec,
        info4=price,
        prd_url=response.url,
    )
    yield ProductPackage(
        brand=brand,
        cat_no=cat_no,
        package=spec,
        price=price,
        currency='RMB',
    )
def parse_detail(self, response):
    """Parse a product page, skipping products of another brand.

    Values are the text nodes that follow a labelled ``<span>``. When the
    brand text does not contain ``amatek`` (case-insensitive) a message is
    printed and nothing is yielded.

    Yields:
        One ``RawData`` item, then one ``ProductPackage`` for each priced
        row (rows with no price or the price ``'Inquire'`` are skipped).
    """
    after_span = '//span[contains(text(), {!r})]/following-sibling::text()'

    def labelled(label):
        # Stripped text that follows the <span> containing ``label``.
        return strip(response.xpath(after_span.format(label)).get())

    cat_no = labelled("产品编号:")
    sub_brand = response.xpath(after_span.format("品牌:")).get('')
    if 'amatek' not in sub_brand.lower():
        print(f'{cat_no}, have weird brand')
        return

    image_src = response.xpath('//div[@class="riliimg-aa"]/img/@src').get()
    yield RawData(
        brand=self.brand,
        parent=response.meta.get('parent'),
        cat_no=cat_no,
        en_name=strip(response.xpath('//div[@class="tit-aa"]/text()').get()),
        chs_name=labelled('中文名称:'),
        cas=labelled('CAS No:'),
        mf=labelled('分子式:'),
        mw=labelled('分子量:'),
        purity=labelled('纯度:'),
        mdl=labelled('MDL号:'),
        img_url=image_src and urljoin(self.base_url, image_src),
        prd_url=response.url,
    )

    for tr in response.xpath('//div[@class="tablpp"]//tr[position()>1]'):
        price = tr.xpath('./td[3]/text()').get()
        if price is None or price == 'Inquire':
            continue
        stock_num = tr.xpath('./td[2]/text()').get('')
        # Only a positive integer stock count counts as "in stock".
        has_stock = stock_num.isdigit() and int(stock_num)
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=tr.xpath('./td[1]/text()').get(),
            price=price,
            currency='RMB',
            delivery_time='in-stock' if has_stock else None,
            stock_num=stock_num,
        )
def parse_detail(self, response):
    """Parse a product page into a ``RawData`` item and its package rows.

    ``cat_no`` and ``parent`` come from ``response.meta``. Nothing is
    yielded when the page contains the slider-verification prompt.

    Yields:
        One ``RawData`` item, then one ``ProductPackage`` per row of the
        ``shopping`` table, skipping rows without a unit catalogue number
        and rows whose package is ``'bulk'``.
    """
    tmp = '//th[contains(text(), {!r})]/following-sibling::td[1]//text()'

    def labelled(label):
        # First text node of the first <td> after the <th> with ``label``.
        return strip(response.xpath(tmp.format(label)).get())

    cat_no = response.meta.get('cat_no')
    parent = response.meta.get('parent')
    if response.xpath('//span[contains(text(), "请按住滑块,拖动到最右边")]'):
        return

    yield RawData(
        brand=self.brand,
        parent=parent,
        cat_no=cat_no,
        en_name=strip(response.xpath(
            '//div[@class="product-general"]/span/text()').get()),
        chs_name=labelled("别名:") or response.xpath('//h1/text()').get(),
        cas=labelled("Cas号:"),
        mf=strip(''.join(response.xpath(tmp.format("分子式:")).getall())),
        mw=labelled("分子量:"),
        einecs=labelled("EINECS编号:"),
        mdl=labelled("MDL号:"),
        info2=labelled("储存条件:"),
        appearance=labelled("颜色:"),
        img_url=response.xpath('//td/img/@src').get(),
        prd_url=response.url,
    )

    for row in response.xpath('//div[@class="shopping"]//tbody/tr'):
        cat_no_unit = strip(row.xpath('./td[1]/text()').get())
        if cat_no_unit is None:
            # Row without a unit catalogue number: .replace() below would
            # raise AttributeError and abort the remaining rows.
            continue
        package = cat_no_unit.replace(f'{cat_no}-', '')
        if package == 'bulk':
            # Skip only the bulk row; a ``return`` here dropped every row
            # listed after it.
            continue
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=package,
            cat_no_unit=cat_no_unit,
            price=strip(row.xpath('./td[5]/text()').get()),
            currency='RMB',
        )
def parse_detail(self, response):
    """Parse a product page into a ``RawData`` item and its package rows.

    Attribute values live in ``<li>`` elements of the form
    ``<label><value>``; the label prefix is removed to obtain the value.
    Nothing is yielded when no catalogue number is found.

    Yields:
        One ``RawData`` item (package, price and stock fields come from the
        first ``<tr id=...>``), then one ``ProductPackage`` per
        ``<tr id=...>`` row whose package is not ``'bulk'``.
    """
    tmp = '//li[contains(text(), {!r})]/text()'

    def labelled_value(label):
        """Return the text after ``label`` in its <li>, or None if empty."""
        text = response.xpath(tmp.format(label)).get('')
        # Remove the label as a PREFIX. str.lstrip(label) treats its
        # argument as a set of characters and would also eat leading
        # characters of the value that happen to occur in the label.
        if text.startswith(label):
            text = text[len(label):]
        return text or None

    img_rel = response.xpath('//td/img/@src').get()
    cat_no = response.xpath('//tr[@id][1]/td[2]/text()').get()
    if not cat_no:
        return

    yield RawData(
        brand=self.brand,
        cat_no=cat_no,
        parent=response.meta.get('parent'),
        en_name=strip(response.xpath('//h2/text()[1]').get()),
        chs_name=strip(response.xpath('//h2/text()[2]').get()),
        cas=labelled_value('CAS号:'),
        mf=labelled_value('分子式:'),
        mw=labelled_value('分子量:'),
        purity=labelled_value('韶远库存批次纯度:'),
        info3=response.xpath('//tr[@id][1]/td[4]/text()').get(),
        info4=response.xpath('//tr[@id][1]/td[5]/text()').get(),
        stock_info=response.xpath('//tr[@id][1]/td[8]/text()').get(),
        img_url=img_rel and urljoin(self.base_url, img_rel),
        prd_url=response.url,
    )

    for tr in response.xpath('//tr[@id]'):
        package = tr.xpath('./td[4]/text()').get()
        if package == 'bulk':
            continue
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=package,
            price=tr.xpath('./td[5]/text()').get(),
            currency='RMB',
            delivery_time=tr.xpath('./td[8]/text()').get(),
        )
def parse_detail(self, response):
    """Parse a product page into a ``RawData`` item and its pricing rows.

    ``cat_no`` and ``en_name`` come from ``response.meta``; the remaining
    attributes are read from the ``<td>`` following a labelled ``<th>``.

    Yields:
        One ``RawData`` item, then one ``ProductPackage`` for each row after
        the first of the ``tblPricing`` table. ``price`` is the first
        decimal number in the third cell, or ``None`` if it contains none.
    """
    tmp = '//th[contains(text(), {!r})]/following-sibling::td/text()'
    p = re.compile(r'(\d+(\.\d+)?)')

    def labelled(label):
        # Stripped text of the <td> next to the <th> containing ``label``.
        return strip(response.xpath(tmp.format(label)).get())

    cat_no = response.meta.get('cat_no')

    yield RawData(
        brand=self.brand,
        cat_no=cat_no,
        en_name=response.meta.get('en_name'),
        cas=labelled("CAS Number"),
        mf=labelled("Molecular Formula"),
        mw=labelled("Molecular Weight"),
        purity=labelled("Purity"),
        mdl=labelled("MDL Number"),
        prd_url=response.url,
        img_url=response.xpath('//div[@id="tabs-Structure"]/img/@src').get(),
    )

    for row in response.xpath('//table[@id="tblPricing"]//tr[position()>1]'):
        raw_price = row.xpath('./td[3]/text()').get()
        if raw_price:
            # Pattern.search gives the first number directly. The previous
            # first(first(findall(...), None), None) passed None to the
            # outer first() whenever the cell held no number.
            number = p.search(raw_price)
            price = number.group(1) if number else None
        else:
            # Keep a missing/empty cell value as it is.
            price = raw_price
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=strip(row.xpath('./td[1]/text()').get()),
            price=price,
            stock_num=strip(row.xpath('./td[4]/text()').get()),
            currency='GBP',
        )
def parse_detail(self, response):
    """Parse a product page into a ``RawData`` item and its pricing rows.

    Yields:
        One ``RawData`` item, then one ``ProductPackage`` per body row of
        the ``PricingTable`` table.
    """
    by_class = '//span[@class={!r}]/text()'
    by_label = '//td[contains(text(), {!r})]/following-sibling::td/text()'

    def span_text(css_class):
        # Text of the <span> whose class attribute equals ``css_class``.
        return response.xpath(by_class.format(css_class)).get()

    def cell_text(label):
        # Text of the cell next to the cell containing ``label``.
        return response.xpath(by_label.format(label)).get()

    cat_no = span_text("code productVal")
    mw = strip(cell_text("分子式/分子量"))
    image_src = response.xpath('//div[@data-attr]/@data-attr').get()
    category_names = response.xpath(
        '//div[@class="subCategory clearfix"][1]//span[@class="startPoint"]//a/text()'
    ).getall()
    name_parts = response.xpath('//h1[@class="name"]//text()').getall()
    formula_parts = response.xpath(
        '//span[@id="molecularFormula"]//text()').getall()

    yield RawData(
        brand=self.brand,
        parent='_'.join(category_names),
        cat_no=cat_no,
        en_name=''.join(name_parts),
        cas=span_text("cas productVal"),
        mf=''.join(formula_parts).replace('_', ''),
        mw=mw and mw.replace('=', ''),
        purity=cell_text("纯度/分析方法"),
        appearance=cell_text("外观与形状"),
        info2=cell_text("储存温度"),
        mdl=cell_text("MDL编号"),
        img_url=image_src and urljoin(self.base_url, image_src),
        prd_url=response.url,
    )

    for tr in response.xpath('//table[@id="PricingTable"]/tbody/tr'):
        stock_num = strip(tr.xpath('./td[3]/text()').get())
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=tr.xpath('./td[1]/text()').get(),
            # NOTE(review): any value other than '0' - including a missing
            # stock cell - is reported as in stock; confirm this is intended.
            delivery_time=None if stock_num == '0' else '现货',
            price=strip(tr.xpath('./td[2]/div/text()').get()),
            stock_num=stock_num,
            currency='RMB',
        )
def parse(self, response):
    """Turn each classed table row after the second into a pair of items.

    Yields:
        For every selected row, a ``RawData`` record followed by a
        ``ProductPackage`` built from the same cells.
    """

    def cell(tr, index):
        # Text of the ``index``-th <td> of the row (1-based, as in XPath).
        return tr.xpath(f'./td[{index}]/text()').get()

    for tr in response.xpath('//table//tr[position()>2 and @class]'):
        cat_no = tr.xpath('./td[2]/a/text()').get()
        detail_href = tr.xpath('./td[2]/a/@href').get()
        # Package size and price are shared by both items.
        size = cell(tr, 4)
        price = strip(cell(tr, 5))

        yield RawData(
            brand=self.brand,
            cat_no=cat_no,
            en_name=cell(tr, 3),
            info3=size,
            info4=price,
            prd_url=urljoin(response.url, detail_href),
            expiry_date=cell(tr, 6),
        )
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=size,
            price=price,
            currency='USD',
        )
def parse_detail(self, response):
    """Parse a product page into a ``RawData`` item and its price rows.

    Price rows are the ``<tr>`` elements whose ``pro_price_3`` cell holds a
    ``<span>`` without a class attribute. The first such row also supplies
    ``info3``/``info4`` of the ``RawData`` item.

    Yields:
        One ``RawData`` item; then, only if a catalogue number was found,
        one ``ProductPackage`` per price row.
    """
    spec_xpath = '//th[contains(text(), {!r})]/following-sibling::td//p//text()'
    price_row = '//tr[td and td[@class="pro_price_3"]/span[not(@class)]]'
    size_cell = price_row + '/td[@class="pro_price_1"]'

    def spec(label):
        # Selector list of the text nodes under the <td> next to ``label``.
        return response.xpath(spec_xpath.format(label))

    def tidy(text):
        # Swap non-breaking spaces for plain ones; pass None/'' through.
        return text and text.replace('\xa0', ' ')

    image_src = response.xpath(
        '//div[@class="struct-img-wrapper"]/img/@src').get()
    cat_no = response.xpath('//dt/span/text()').get('')
    for prefix in ('Cat. No.: ', '目录号: '):
        cat_no = cat_no.replace(prefix, '')
    first_size = strip(
        response.xpath(f'normalize-space({size_cell}/text())').get())

    yield RawData(
        brand=self.brand,
        parent=response.meta.get('parent'),
        cat_no=cat_no,
        en_name=response.xpath('//h1/strong/text()').get(),
        cas=strip(spec("CAS No.").get()),
        mf=formula_trans(strip(spec("Formula").get())),
        mw=strip(spec("Molecular Weight").get()),
        smiles=strip(''.join(spec("SMILES").getall())),
        info3=tidy(first_size),
        info4=strip(response.xpath(
            f'{size_cell}/following-sibling::td[1]/text()').get()),
        img_url=image_src and urljoin(response.url, image_src),
        prd_url=response.url,
    )

    if not cat_no:
        return

    for tr in response.xpath(price_row):
        price = strip(tr.xpath('./td[@class="pro_price_2"]/text()').get())
        size = strip(tr.xpath(
            'normalize-space(./td[@class="pro_price_1"]/text())').get())
        availability = ''.join(
            tr.xpath('./td[@class="pro_price_3"]/span//text()').getall())
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=tidy(size),
            price=price and price.strip('¥'),
            delivery_time=strip(availability) or None,
            currency='RMB',
        )
def parse_detail(self, response):
    """Parse a WooCommerce-style product page into items.

    Several attributes can appear in two places: as a ``Label: value``
    paragraph inside the description tab, or as a row of the product
    attributes table. The description value is preferred and the table
    value is the fallback.

    Yields:
        One ``RawData`` item, then one ``ProductPackage`` per row of the
        grouped-product table. ``package`` is the label text up to the
        first ``(``, or ``None`` when the row has no label.
    """
    desc_panel = "//div[@class='woocommerce-Tabs-panel woocommerce-Tabs-panel--description panel entry-content wc-tab']"

    def desc_line(label):
        # Text node in the description tab that contains ``label``.
        return response.xpath(
            f"{desc_panel}//p/text()[contains(self::text(),'{label}')]"
        ).get()

    def after_label(line, pattern):
        # Captured value of ``pattern`` in ``line``; falsy when unavailable.
        return line and first(re.findall(pattern, line), None)

    mw = desc_line('Molecular Weight:')
    mw2 = response.xpath(
        "//tr[contains(@class, 'woocommerce-product-attributes-item--attribute_pa_mw')]//p/text()"
    ).get()
    cas = response.xpath(
        f"{desc_panel}/p[contains(text(),'CAS Number:')]/text()"
    ).get()
    cas2 = response.xpath(
        '//tr[@class="woocommerce-product-attributes-item woocommerce-product-attributes-item--attribute_pa_cas"]//td[@class="woocommerce-product-attributes-item__value"]//p/text()'
    ).get()
    purity = desc_line('Purity:')
    purity2 = response.xpath(
        "//tr[@class='woocommerce-product-attributes-item woocommerce-product-attributes-item--attribute_pa_purity']//p/text()"
    ).get()
    cat_no = response.xpath(
        "//div[@class='woocommerce-product-details__short-description']//p/text()"
    ).get('')
    # Pass flags by keyword: positional count/flags for re.sub are
    # deprecated since Python 3.13 and easy to mix up.
    cat_no = re.sub(r'Product Number:', '', cat_no, flags=re.IGNORECASE)
    info = desc_line('Storage:')
    info2 = response.xpath(
        "//tr[@class='woocommerce-product-attributes-item woocommerce-product-attributes-item--attribute_pa_storage']//td[@class='woocommerce-product-attributes-item__value']//p/text()"
    ).get()

    d = {
        "brand": self.name,
        "cat_no": cat_no,
        "parent": response.xpath(
            '//tr[@class="woocommerce-product-attributes-item woocommerce-product-attributes-item--attribute_pa_categories"]//td[@class="woocommerce-product-attributes-item__value"]/p/text()'
        ).get(),
        "cas": after_label(cas, r'CAS Number: (.+)') or cas2,
        "mf": response.xpath(
            "//tr[@class='woocommerce-product-attributes-item woocommerce-product-attributes-item--attribute_pa_molecular-formula']//td[@class='woocommerce-product-attributes-item__value']//p/text()"
        ).get(),
        'mw': after_label(mw, r'Molecular Weight: (.+)') or mw2,
        'purity': after_label(purity, r'Purity: (.+)') or purity2,
        'img_url': response.xpath(
            "//div[@class='woocommerce-product-gallery woocommerce-product-gallery--with-images woocommerce-product-gallery--columns-4 images']//a/@href"
        ).get(),
        'prd_url': response.url,
        'en_name': response.xpath(
            "//h1[@class='product_title entry-title']//text()").get(),
        "info2": after_label(info, r'Storage: (.+)') or info2,
    }
    yield RawData(**d)

    rows = response.xpath(
        "//table[@class='woocommerce-grouped-product-list group_table']//tr"
    )
    for row in rows:
        label = row.xpath(
            ".//td[@class='woocommerce-grouped-product-list-item__label']/label/text()"
        ).get()
        dd = {
            "brand": self.name,
            "cat_no": cat_no,
            # Keep the part before the first "("; a row without a label
            # gives None, which re.findall must not receive.
            "package": first(re.findall(r'[^(]+', label or ''), None),
            "currency": 'USD',
            "price": row.xpath(
                ".//span[@class='woocommerce-Price-currencySymbol']//parent::bdi/text()"
            ).get(),
        }
        yield ProductPackage(**dd)