Exemplo n.º 1
0
    def parse_detail(self, response):
        """Scrape a product page into one RawData item and its price rows.

        Yields a RawData built from the labelled <th>/<td> cells of the page,
        then one ProductPackage (currency RMB) for every row found under the
        ``table-1`` div.
        """
        label_xpath = '//th[contains(text(), {!r})]/following-sibling::td/text()'

        def by_label(label):
            # First text node of the <td> that follows the <th> containing ``label``.
            return response.xpath(label_xpath.format(label)).get()

        rel_img = response.xpath('//div[@class="pic"]/img/@src').get()
        cat_no = response.xpath('//div/span[@style]/text()').get()

        yield RawData(
            brand=self.brand,
            cat_no=cat_no,
            en_name=response.xpath('//div/span/@data-nameen').get(),
            cas=by_label("CAS:"),
            mdl=by_label("MDL:"),
            mf=formula_trans(strip(by_label("分子式:"))),
            mw=by_label("分子量:"),
            smiles=by_label("SMILES code:"),
            purity=by_label("化学纯度:"),
            # Stay falsy when the page has no picture instead of joining None.
            img_url=rel_img and urljoin(response.url, rel_img),
            prd_url=response.url,
        )

        for tr in response.xpath('//div[@class="table-1"]//tbody/tr'):
            yield ProductPackage(
                brand=self.brand,
                cat_no=cat_no,
                package=tr.xpath('./td[1]/text()').get(),
                price=strip(tr.xpath('./td[2]/text()').get()),
                stock_num=tr.xpath('./td[5]/text()').get(),
                currency='RMB',
            )
Exemplo n.º 2
0
    def parse_detail(self, response):
        """Yield the product's RawData and a single USD ProductPackage.

        Attribute values are read from the <td> following a <td> whose text
        contains the given label. The price is the first decimal number found
        in the whitespace-normalised "Retail Price:" cell; the normalised text
        itself is kept in ``info``.
        """
        cell_xpath = '//td[contains(text(), {!r})]/following-sibling::td/text()'

        def cell(label):
            # First text node of the cell to the right of the labelled cell.
            return response.xpath(cell_xpath.format(label)).get()

        cat_no = cell('Catalog #')
        yield RawData(
            brand=self.brand,
            cat_no=cat_no,
            en_name=response.xpath('//td[@class="pageTitle"]/text()').get(),
            cas=cell('CAS#'),
            stock_info=cell('In Stock'),
            prd_url=response.url,
        )

        raw_price = strip(response.xpath(
            'normalize-space(//td[contains(text(), "Retail Price:")]/following-sibling::td/text())'
        ).get())
        price = None
        if raw_price:
            raw_price = re.sub(r'\s+', ' ', raw_price)
            number = re.search(r'(\d+(\.\d+)?)', raw_price)
            if number is not None:
                price = number.group(0)

        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            price=price,
            currency='USD',
            info=raw_price,
            delivery_time=cell('In Stock'),
        )
Exemplo n.º 3
0
    def parse(self, response):
        """Convert every <Reference> element of the XML body into items.

        For each reference a RawData is yielded first, followed by one
        ProductPackage priced in EUR with the euro sign removed from the price.
        """
        root = XML(response.body)

        def child_text(node, tag):
            # First text node of the child element ``tag``, or None when absent.
            return first(node.xpath(f'./{tag}/text()'), None)

        for ref in root.xpath('//Reference'):
            cat_no = child_text(ref, 'Order_Code')
            quantity = child_text(ref, 'Quantity_per_vial')
            price = child_text(ref, 'Price')
            # The product URL uses an empty code when Order_Code is missing.
            order_code = '' if cat_no is None else cat_no

            yield RawData(
                brand=self.brand,
                cat_no=cat_no,
                cas=child_text(ref, 'CAS_Registry_Number'),
                en_name=child_text(ref, 'Reference_Standard'),
                info2=child_text(ref, 'Storage'),
                info3=quantity,
                info4=price,
                prd_url=f"https://crs.edqm.eu/db/4DCGI/View={order_code}",
            )

            yield ProductPackage(
                brand=self.brand,
                cat_no=cat_no,
                package=quantity,
                price=price and price.replace('€', ''),
                currency='EUR',
            )
Exemplo n.º 4
0
    def parse_detail(self, response):
        """Extract product attributes and the price table from a detail page.

        Yields one RawData for the product, then one ProductPackage (currency
        RMB) for every row of the ``table-responsive`` table except the first.
        """
        value_xpath = '//div[contains(*/text(), {!r})]/following-sibling::div/*/text()'

        def value_of(label):
            # First text found in the div that follows the div labelled ``label``.
            return response.xpath(value_xpath.format(label)).get()

        cat_no = response.xpath('//span[@id="catalogNo"]/text()').get()
        rel_img = response.xpath('//input[@id="image"]/@value').get()
        trail = response.xpath(
            '//li[@class="active"]/following-sibling::li/a/text()').getall()

        yield RawData(
            brand=self.brand,
            parent='_'.join(trail),
            cat_no=cat_no,
            en_name=response.xpath('//h2/span/text()').get(),
            purity=response.xpath('//span[@class="d-purity"]/text()').get(),
            cas=value_of("CAS 号"),
            mf=value_of("分子式"),
            mw=value_of("分子量"),
            smiles=value_of("Smiles Code"),
            info2=value_of("存储条件"),
            mdl=value_of("MDL 号"),
            img_url=rel_img and urljoin(response.url, rel_img),
            prd_url=response.url,
        )

        for tr in response.xpath('//div[@class="table-responsive"]//tr[position()!=1]'):
            yield ProductPackage(
                brand=self.brand,
                cat_no=cat_no,
                package=tr.xpath('./td[@id="packing"]/text()').get(),
                price=tr.xpath('./td[@id="money"]/text()').get(),
                currency='RMB',
                stock_num=tr.xpath('./td[@id="stock"]/text()').get(),
            )
Exemplo n.º 5
0
 def parse_detail(self, response):
     """Build a RawData from the labelled table cells and yield size rows.

     Every attribute whose value is exactly 'NA' is replaced by None before
     the RawData is yielded. One ProductPackage is then yielded for each row
     of the ``c_p_size`` table selected by the row XPath below.
     """
     cell_xpath = 'normalize-space(//td[contains(div/text(), {!r})]/following-sibling::td/text())'

     def cell(label):
         # Whitespace-normalised text of the cell next to the labelled cell.
         return strip(response.xpath(cell_xpath.format(label)).get())

     rel_img = response.xpath('//div[@class="c_c_p"]//div/img/@src').get()
     cat_no = cell("产品号/Catalog#")
     fields = {
         'brand': self.brand,
         'parent': response.meta.get('parent'),
         'cat_no': cat_no,
         'en_name': cell("Product Name:"),
         'chs_name': cell("产品名称:"),
         'cas': cell("CAS#:"),
         'mf': cell("分子式/Formula:"),
         'mw': cell("分子量/MW:"),
         'purity': cell("纯度/Purity (%):"),
         'info1': cell("Synonyms:"),
         'info2': cell("储藏条件/Storage:"),
         'appearance': cell("颜色/Color:"),
         'img_url': rel_img and urljoin(response.url, rel_img),
         'prd_url': response.url,
     }
     # The site prints 'NA' for unknown values; store those as None.
     cleaned = {
         key: (None if value == 'NA' else value)
         for key, value in fields.items()
     }
     yield RawData(**cleaned)

     size_rows = response.xpath(
         '//table[@class="c_p_size"]//tr[td and td/text()!="NA"]')
     for tr in size_rows:
         # NOTE(review): package and price both read td[1]; the price is
         # presumably in another column - confirm against the live page.
         yield ProductPackage(
             brand=self.brand,
             cat_no=cat_no,
             package=tr.xpath('./td[1]/text()').get(),
             price=tr.xpath('./td[1]/text()').get(),
         )
    def parse_detail(self, response):
        """Scrape a product page and the options of its variant <select>.

        Yields one RawData for the product, then one ProductPackage (USD) per
        option of the ``product-select-product-template`` select. Option text
        is split on its last hyphen into package and price.
        """
        td_xpath = '//td[contains(text(), "{}")]/following-sibling::td/text()'

        def cell(label):
            # First text node of the <td> that follows the labelled <td>.
            return response.xpath(td_xpath.format(label)).get()

        parent = response.meta.get('parent')
        sku_text = response.xpath("//span[@class='variant-sku']//text()").get()
        # .get() returns None when the SKU span is missing, and re.findall
        # raises TypeError on None, so search an empty string instead.
        cat_no = first(re.findall(r'SKU:(.+)-', sku_text or ''), None)
        rel_img = response.xpath('//noscript/img/@src').get()

        d = {
            "brand": self.name,
            "parent": parent,
            "en_name": response.xpath("//h1[@class='product-header']/text()").get(),
            "cat_no": cat_no,
            "prd_url": response.url,
            "mf": cell("Molecular Formula:"),
            "mw": cell("Molecular Weight:"),
            "cas": cell("CAS Number:"),
            "smiles": cell("SMILES:"),
            "purity": cell("Purity (HPLC):"),
            "info1": cell("Synonyms:"),
            "info2": cell("Storage Conditions:"),
            "img_url": rel_img and urljoin(response.url, rel_img),
        }
        yield RawData(**d)

        options = response.xpath(
            '//select[@id="product-select-product-template"]/option/text()').getall()
        for option in options:
            if '-' in option:
                # Split on the LAST hyphen so a hyphen inside the package
                # text no longer breaks the two-value unpacking.
                package, price = option.rsplit('-', 1)
                price = price.replace("$", '')
            else:
                # No separator: keep the text as the package, price unknown.
                package, price = option, None
            dd = {
                "brand": self.name,
                "cat_no": cat_no,
                "package": package,
                "currency": "USD",
                "price": price,
            }
            yield ProductPackage(**dd)
Exemplo n.º 7
0
    def detail_parse(self, response):
        """Yield the product's RawData and the packages embedded in the page JS.

        Packages come from the ``matrixChildrenProducts`` JavaScript object in
        the page source; entries without a ``sku`` are skipped. The package
        label is the SKU with the ``<cat_no>-`` text removed.
        """
        th_xpath = '//th[contains(text(),{0!r})]/following-sibling::td/descendant-or-self::text()'

        def texts(label):
            # All text nodes under the <td> next to the <th> labelled ``label``.
            return response.xpath(th_xpath.format(label))

        img_url = response.xpath(
            '//th[contains(text(),"Structure")]/following-sibling::td/img/@src'
        ).get()
        cat_no = strip(texts("Product No.").get())
        name_parts = response.xpath(
            '//div[@class="product-name"]/span/descendant-or-self::text()'
        ).extract()
        unit_price = response.xpath(
            '//table[@id="product-matrix"]//td[@class="unit-price"]/text()'
        ).get()

        yield RawData(
            brand=self.brand,
            cat_no=cat_no,
            parent=texts("Category").get(),
            info1="".join(texts("Synonym(s)").extract()),
            mw=texts("Molecular Weight").get(),
            mf="".join(texts("Formula").extract()),
            cas=texts("CAS Number").get(),
            en_name=strip("".join(name_parts)),
            img_url=img_url and urljoin(self.base_url, img_url),
            stock_info=unit_price,
            prd_url=response.url,
        )

        found = re.findall(r'var matrixChildrenProducts = ({.+});', response.text)
        matrix = first(found, None)
        if not matrix:
            return

        sku_prefix = f'{cat_no}-'
        for item in json.loads(matrix).values():
            sku = item.get('sku')
            if not sku:
                continue
            yield ProductPackage(
                brand=self.brand,
                cat_no=cat_no,
                cat_no_unit=sku,
                package=strip(sku.replace(sku_prefix, '')),
                price=item.get('price'),
                currency='USD',
                delivery_time='In-stock' if item.get('is_in_stock') else None,
            )
Exemplo n.º 8
0
    def parse_detail(self, response):
        """Scrape a product page into a RawData and its option-list packages.

        Catalog and CAS come from the ``short-description`` block, the other
        chemical properties from table cells labelled with a <b> element.
        One USD ProductPackage is yielded per option-list row after the first.
        """
        summary_xpath = (
            "//div[@class='short-description']"
            "//strong[contains(text(), '{}')]/following-sibling::span/text()"
        )
        bold_xpath = "//b[contains(text(), '{}')]/parent::td/following-sibling::td/text()"

        def summary(label):
            # Value span next to a <strong> label in the short description.
            return response.xpath(summary_xpath.format(label)).get()

        def bold_cell(label):
            # Cell to the right of the cell whose <b> contains ``label``.
            return response.xpath(bold_xpath.format(label)).get()

        parent = response.xpath(
            "//div[@class='breadcrumb']//li[last()]/strong[@class='current-item']/text()"
        ).get()
        cat_no = summary('Catalog:')

        yield RawData(
            brand=self.name,
            parent=parent,
            cat_no=cat_no,
            en_name=response.xpath("//h1[@itemprop='name']/text()").get(),
            cas=summary('CAS:'),
            smiles=bold_cell('Smiles:  '),
            mf=bold_cell('Formula:'),
            mw=bold_cell('Mol Weight: '),
            prd_url=response.url,
            img_url=response.xpath("//div[@class='picture']//img/@src").get(),
        )

        for tr in response.xpath("//ul[@class='option-list']//tr[position()>1]"):
            yield ProductPackage(
                brand=self.name,
                cat_no=cat_no,
                package=tr.xpath(".//td[@class='attribute_name']/span/text()").get(),
                price=tr.xpath(".//td[@class='attribute_price']/input/@value").get(),
                currency="USD",
            )
    def parse_detail(self, response):
        """Scrape a product page into a RawData and its price-table packages.

        Yields one RawData, then one USD ProductPackage for every row of the
        ``detail`` table after the first. ``img_url`` and ``price`` are None
        when the page lacks the image or the price cell.
        """
        td_xpath = "//td[contains(text(), '{}')]/following-sibling::td/text()"

        def cell(label):
            # First text node of the <td> following the labelled <td>.
            return response.xpath(td_xpath.format(label)).get()

        img_url = response.xpath("//div[@class='detail_img']/img/@src").get()
        cat_no = cell('Catalog Number')
        d = {
            "brand": self.name,
            "prd_url": response.url,
            "en_name": response.xpath("//div[@class='detail_des']/h2/text()").get(),
            # urljoin(base, None) returns the bare base URL, which would be
            # stored as a bogus image link; keep None when there is no image.
            "img_url": img_url and urljoin(self.base_url, img_url),
            "cat_no": cat_no,
            "mdl": cell('MDL Number'),
            "smiles": cell('SMILES'),
            "info1": cell('Chemical Name'),
            "cas": cell('CAS Number'),
            "mf": cell('Molecular Formula'),
            "mw": cell('Molecular Weight'),
        }
        yield RawData(**d)

        rows = response.xpath("//div[@class='detail']//tr[position()>1]")
        for row in rows:
            price = row.xpath('./td[3]/text()').get()
            # A row without a third cell gives None; do not call .replace on it.
            price = price and price.replace("$", '')
            dd = {
                "brand": self.name,
                "cat_no": cat_no,
                "package": row.xpath('./td[1]/text()').get(),
                "currency": "USD",
                "price": price,
            }
            yield ProductPackage(**dd)
Exemplo n.º 10
0
    def parse_detail(self, response):
        """Scrape a product page into a RawData and its ``q_table`` packages.

        Yields one RawData, then one USD ProductPackage per body row of the
        ``q_table`` table. ``price`` is None when a row has no fifth cell.
        """
        td_xpath = "//td[contains(text(), '{}')]/following-sibling::td/text()"

        def cell(label):
            # First text node of the <td> following the labelled <td>.
            return response.xpath(td_xpath.format(label)).get()

        cat_no = cell('Catalog Number:')
        d = {
            "brand": self.name,
            "parent": response.xpath("//div[@class='crumbs']//a[last()]/text()").get(),
            "cat_no": cat_no,
            "en_name": cell('Chemical Name:'),
            "cas": cell('CAS Number:'),
            "smiles": cell('SMILES:'),
            "mf": cell('Molecular Formula:'),
            "mw": cell('Molecular Weight:'),
            "prd_url": response.url,
            "img_url": response.xpath("//div[@class='pd_f1']/img/@src").get(),
            "info1": cell('IUPAC Name:'),
        }
        yield RawData(**d)

        # position()>0 is true for every row, so all body rows are selected.
        rows = response.xpath(
            "//table[@class='q_table']//tbody//tr[position()>0]")
        for row in rows:
            price = row.xpath(".//td[5]/text()").get()
            # .get() is None for rows lacking the cell; avoid AttributeError.
            price = price and price.replace("$", '')
            dd = {
                "brand": self.name,
                "cat_no": cat_no,
                "package": row.xpath(".//td[1]/text()").get(),
                "price": price,
                "currency": 'USD',
            }
            yield ProductPackage(**dd)
Exemplo n.º 11
0
    def detail_parse(self, response):
        """Yield a RawData and one ProductPackage for a single-SKU page.

        The catalog number is the leading ``AAA-123`` part of the SKU when it
        matches that shape, otherwise the whole SKU. Name and package are
        split on the last hyphen of the title (or, failing that, of the
        description); with no hyphen in either the package is 'kit'.
        """
        cat_no_unit = response.xpath('//span[@itemprop="sku"]/text()').get("")
        sku_match = re.match(r'[A-Z]{3}-\d+', cat_no_unit)
        if sku_match:
            cat_no = sku_match.group(0)
        else:
            cat_no = cat_no_unit

        rel_img = response.xpath('//img[@class="zoomImg"]/@src').get()
        title = response.xpath('//h1[@itemprop="name"][1]/text()').get("").title()
        description = response.xpath(
            '//div[@itemprop="description"]/text()').get("").title()

        # Prefer the title; fall back to the description; else no package info.
        for candidate in (title, description):
            if '-' in candidate:
                en_name, package = candidate.rsplit('-', 1)
                break
        else:
            en_name, package = title, 'kit'

        yield RawData(
            brand=self.brand,
            parent=self.extract_value(response, "Chemical Family: "),
            cat_no=cat_no,
            en_name=strip(en_name),
            cas=self.extract_value(response, "CAS: "),
            mf=self.extract_value(response, "Chemical Formula: "),
            mw=self.extract_value(response, "Formula Weight: "),
            info2=self.extract_value(response, "Long Term Storage: "),
            appearance=self.extract_value(response, "Appearance: "),
            purity=self.extract_value(response, "Purity: "),
            img_url=rel_img and urljoin(self.base_url, rel_img),
            prd_url=response.url,
        )

        stock_text = response.xpath(
            '//div[@class="items_left"]//em/text()').get()
        package = strip(package)
        yield ProductPackage(
            brand=self.brand,
            cat_no_unit=cat_no_unit,
            cat_no=cat_no,
            package=package and package.lower(),
            price=response.xpath('//span[@itemprop="price"]/@content').get(),
            currency='USD',
            # First run of digits in the "items left" text, if any.
            stock_num=stock_text and first(re.findall(r'\d+', stock_text), None),
        )
Exemplo n.º 12
0
    def parse_list(self, response):
        """Parse one page of the JSON product listing and follow pagination.

        For every entry of ``table2`` a RawData and an RMB ProductPackage are
        yielded. While the current page (``params['page']``, default 1) is
        below ``table1[0]['pagecount']``, a Request for the next page is
        yielded with the same ``parent`` and updated ``params`` in its meta.
        """
        j_obj = json.loads(response.text)
        parent = response.meta.get('parent')
        tmp = 'http://www.bepurestandards.com/show/{}/{}/Y/true'
        # quote(None) raises TypeError; use an empty segment without a parent.
        quoted_parent = quote(parent) if parent else ''

        # ``or []`` also covers a key that is present but null in the JSON.
        for product in j_obj.get('table2') or []:
            name = product.get('name')
            # re.findall raises TypeError on None, so search '' when unnamed.
            cas = first(re.findall(r'\d+-\d{2}-\d', name or ''), None)
            cat_no = product.get('code')
            d = {
                'brand': self.brand,
                'cat_no': cat_no,
                'en_name': product.get('name2'),
                'chs_name': name,
                'stock_info': product.get('cnum'),
                'cas': cas,
                'purity': product.get('purity'),
                'info3': product.get('pack'),
                'info4': product.get('price'),
                'expiry_date': product.get('enddate'),
                'prd_url': tmp.format(product.get('id'), quoted_parent),
            }
            yield RawData(**d)

            dd = {
                'brand': self.brand,
                'cat_no': cat_no,
                'package': product.get('pack'),
                'price': product.get('price'),
                'currency': 'RMB',
            }
            yield ProductPackage(**dd)

        page_table = first(j_obj.get('table1') or [], {})
        total_page = int(page_table.get('pagecount', 0))
        params = response.meta.get('params') or {}
        cur_page = int(params.get('page', 1))
        if cur_page >= total_page:
            return
        # Build a fresh dict rather than mutating the one shared through
        # response.meta, and derive the next page from cur_page so a missing
        # 'page' key no longer raises KeyError.
        next_params = {**params, 'page': str(cur_page + 1)}
        yield Request(self.api_url + urlencode(next_params),
                      callback=self.parse_list,
                      meta={
                          'parent': parent,
                          'params': next_params,
                      })
Exemplo n.º 13
0
    def parse_detail(self, response):
        """Yield a RawData and one RMB ProductPackage for a product page.

        A response with status 521 is delegated to ``self.handle_521`` and
        nothing else is produced. Stock, expiry date and price are read from
        the ``goodObj`` object literal embedded in the page source.
        """
        if response.status == 521:
            yield from self.handle_521(response, callback=self.parse_detail)
            return

        label_xpath = '//el-form-item[contains(@label, {!r})]/span/text()'

        def labelled(label):
            # Span text of the form item whose label contains ``label``.
            return response.xpath(label_xpath.format(label)).get()

        sub_brand = strip(labelled("品牌"), "")
        brand = '_'.join(('Tanmo', sub_brand)).lower()
        cat_no = strip(labelled("产品编号"))

        literal = first(re.findall(r'goodObj: ({[^}]+}),', response.text), '{}')
        good_obj = demjson.decode(literal)
        price = good_obj.get('price', '咨询')
        spec = strip(labelled("规格"))

        yield RawData(
            brand=brand,
            cat_no=cat_no,
            chs_name=strip(response.xpath('//h2[@class="p-right-title"]/text()').get()),
            cas=strip(labelled("CAS号")),
            stock_info=good_obj.get('number', 0),
            expiry_date=good_obj.get('date', 0),
            purity=strip(labelled("标准值")),
            info2=strip(labelled("储存条件")),
            info3=spec,
            info4=price,
            prd_url=response.url,
        )

        yield ProductPackage(
            brand=brand,
            cat_no=cat_no,
            package=spec,
            price=price,
            currency='RMB',
        )
Exemplo n.º 14
0
    def parse_detail(self, response):
        """Scrape an Amatek-branded product page; skip other brands.

        When the page's brand text does not contain 'amatek' a message is
        printed and nothing is yielded. Otherwise a RawData is yielded, then
        one RMB ProductPackage per price row whose price is present and is
        not 'Inquire'.
        """
        sibling_xpath = '//span[contains(text(), {!r})]/following-sibling::text()'

        def after_span(label):
            # Text nodes that directly follow the <span> labelled ``label``.
            return response.xpath(sibling_xpath.format(label))

        cat_no = strip(after_span("产品编号:").get())
        sub_brand = after_span("品牌:").get('')
        rel_img = response.xpath('//div[@class="riliimg-aa"]/img/@src').get()

        item = RawData(
            brand=self.brand,
            parent=response.meta.get('parent'),
            cat_no=cat_no,
            en_name=strip(response.xpath('//div[@class="tit-aa"]/text()').get()),
            chs_name=strip(after_span('中文名称:').get()),
            cas=strip(after_span('CAS No:').get()),
            mf=strip(after_span('分子式:').get()),
            mw=strip(after_span('分子量:').get()),
            purity=strip(after_span('纯度:').get()),
            mdl=strip(after_span('MDL号:').get()),
            img_url=rel_img and urljoin(self.base_url, rel_img),
            prd_url=response.url,
        ) if 'amatek' in sub_brand.lower() else None
        if item is None:
            print(f'{cat_no}, have weird brand')
            return
        yield item

        for tr in response.xpath('//div[@class="tablpp"]//tr[position()>1]'):
            price = tr.xpath('./td[3]/text()').get()
            if price in (None, 'Inquire'):
                continue
            stock_num = tr.xpath('./td[2]/text()').get('')
            # Only a positive integer stock count counts as in stock.
            if stock_num.isdigit() and int(stock_num):
                delivery_time = 'in-stock'
            else:
                delivery_time = None
            yield ProductPackage(
                brand=self.brand,
                cat_no=cat_no,
                package=tr.xpath('./td[1]/text()').get(),
                price=price,
                currency='RMB',
                delivery_time=delivery_time,
                stock_num=stock_num,
            )
Exemplo n.º 15
0
    def parse_detail(self, response):
        """Scrape a product page into a RawData and its priced packages.

        ``cat_no`` and ``parent`` are taken from ``response.meta``. Nothing is
        yielded when the page contains the slider-captcha prompt. Price rows
        whose package is 'bulk', or whose first cell has no text, are skipped.
        """
        tmp = '//th[contains(text(), {!r})]/following-sibling::td[1]//text()'

        def cell(label):
            # First text node inside the first <td> after the labelled <th>.
            return response.xpath(tmp.format(label)).get()

        cat_no = response.meta.get('cat_no')
        parent = response.meta.get('parent')
        # The site served its slider captcha instead of the product page.
        if response.xpath('//span[contains(text(), "请按住滑块,拖动到最右边")]'):
            return
        d = {
            'brand': self.brand,
            'parent': parent,
            'cat_no': cat_no,
            'en_name': strip(response.xpath('//div[@class="product-general"]/span/text()').get()),
            'chs_name': strip(cell("别名:")) or response.xpath('//h1/text()').get(),
            'cas': strip(cell("Cas号:")),
            'mf': strip(''.join(response.xpath(tmp.format("分子式:")).getall())),
            'mw': strip(cell("分子量:")),
            'einecs': strip(cell("EINECS编号:")),
            'mdl': strip(cell("MDL号:")),
            'info2': strip(cell("储存条件:")),
            'appearance': strip(cell("颜色:")),

            'img_url': response.xpath('//td/img/@src').get(),
            'prd_url': response.url,
        }
        yield RawData(**d)

        unit_prefix = f'{cat_no}-'
        rows = response.xpath('//div[@class="shopping"]//tbody/tr')
        for row in rows:
            cat_no_unit = strip(row.xpath('./td[1]/text()').get())
            if not cat_no_unit:
                # Row without a unit code in its first cell; calling .replace
                # on the missing value would raise, so skip the row.
                continue
            package = cat_no_unit.replace(unit_prefix, '')
            if package == 'bulk':
                # Skip only this row: a ``return`` here would silently drop
                # every price row listed after the bulk entry.
                continue
            dd = {
                'brand': self.brand,
                'cat_no': cat_no,
                'package': package,
                'cat_no_unit': cat_no_unit,
                'price': strip(row.xpath('./td[5]/text()').get()),
                'currency': 'RMB',
            }
            yield ProductPackage(**dd)
Exemplo n.º 16
0
    def parse_detail(self, response):
        """Scrape a product page into a RawData and its non-bulk packages.

        Nothing is yielded when the first ``tr[@id]`` row has no catalog
        number. Otherwise a RawData is yielded, followed by one RMB
        ProductPackage per ``tr[@id]`` row whose package is not 'bulk'.
        """
        tmp = '//li[contains(text(), {!r})]/text()'

        def labelled(label):
            # Text of the <li> mentioning ``label`` with that label removed
            # from the front; None when the item is missing or empty.
            text = response.xpath(tmp.format(label)).get('')
            # str.lstrip(label) strips any leading characters that occur in
            # ``label`` (a character set), so it could eat the start of the
            # value; remove the exact prefix instead.
            if text.startswith(label):
                text = text[len(label):]
            return text or None

        img_rel = response.xpath('//td/img/@src').get()

        cat_no = response.xpath('//tr[@id][1]/td[2]/text()').get()
        if not cat_no:
            return
        d = {
            'brand': self.brand,
            'cat_no': cat_no,
            'parent': response.meta.get('parent'),
            'en_name': strip(response.xpath('//h2/text()[1]').get()),
            'chs_name': strip(response.xpath('//h2/text()[2]').get()),
            'cas': labelled('CAS号:'),
            'mf': labelled('分子式:'),
            'mw': labelled('分子量:'),
            'purity': labelled('韶远库存批次纯度:'),
            'info3': response.xpath('//tr[@id][1]/td[4]/text()').get(),
            'info4': response.xpath('//tr[@id][1]/td[5]/text()').get(),
            'stock_info': response.xpath('//tr[@id][1]/td[8]/text()').get(),
            'img_url': img_rel and urljoin(self.base_url, img_rel),
            'prd_url': response.url,
        }
        yield RawData(**d)

        for tr in response.xpath('//tr[@id]'):
            d_package = {
                'brand': self.brand,
                'cat_no': cat_no,
                'package': tr.xpath('./td[4]/text()').get(),
                'price': tr.xpath('./td[5]/text()').get(),
                'currency': 'RMB',
                'delivery_time': tr.xpath('./td[8]/text()').get(),
            }
            if d_package['package'] == 'bulk':
                continue
            yield ProductPackage(**d_package)
Exemplo n.º 17
0
    def parse_detail(self, response):
        """Scrape a product page into a RawData and its GBP price rows.

        ``cat_no`` and ``en_name`` come from ``response.meta``. For each row
        of ``tblPricing`` after the first, the price is the first decimal
        number in the third cell, or None when that cell holds no number.
        """
        tmp = '//th[contains(text(), {!r})]/following-sibling::td/text()'
        number = re.compile(r'(\d+(\.\d+)?)')

        def cell(label):
            # Stripped text of the <td> that follows the labelled <th>.
            return strip(response.xpath(tmp.format(label)).get())

        cat_no = response.meta.get('cat_no')
        d = {
            'brand': self.brand,
            'cat_no': cat_no,
            'en_name': response.meta.get('en_name'),
            'cas': cell("CAS Number"),
            'mf': cell("Molecular Formula"),
            'mw': cell("Molecular Weight"),
            'purity': cell("Purity"),
            'mdl': cell("MDL Number"),
            'prd_url': response.url,
            'img_url': response.xpath('//div[@id="tabs-Structure"]/img/@src').get(),
        }
        yield RawData(**d)

        rows = response.xpath('//table[@id="tblPricing"]//tr[position()>1]')
        for row in rows:
            price = row.xpath('./td[3]/text()').get()
            if price:
                # The nested first(first(findall(...), None), None) passed
                # None to the outer first() when the text had no number;
                # search() gives the same first match without that failure.
                found = number.search(price)
                price = found.group(1) if found else None
            dd = {
                'brand': self.brand,
                'cat_no': cat_no,
                'package': strip(row.xpath('./td[1]/text()').get()),
                'price': price,
                'stock_num': strip(row.xpath('./td[4]/text()').get()),
                'currency': 'GBP',
            }
            yield ProductPackage(**dd)
Exemplo n.º 18
0
    def parse_detail(self, response):
        """Scrape a product page into a RawData and its pricing-table rows.

        Yields one RawData, then one RMB ProductPackage per body row of
        ``PricingTable``. A row is marked '现货' unless its stripped stock
        cell equals '0'.
        """
        span_xpath = '//span[@class={!r}]/text()'
        td_xpath = '//td[contains(text(), {!r})]/following-sibling::td/text()'

        def span(css_class):
            # Text of the first <span> whose class attribute equals ``css_class``.
            return response.xpath(span_xpath.format(css_class)).get()

        def cell(label):
            # Text of the <td> that follows the <td> containing ``label``.
            return response.xpath(td_xpath.format(label)).get()

        cat_no = span("code productVal")
        mw = strip(cell("分子式/分子量"))
        img_rel = response.xpath('//div[@data-attr]/@data-attr').get()
        categories = response.xpath(
            '//div[@class="subCategory clearfix"][1]//span[@class="startPoint"]//a/text()').getall()
        name_parts = response.xpath('//h1[@class="name"]//text()').getall()
        formula_parts = response.xpath('//span[@id="molecularFormula"]//text()').getall()

        yield RawData(
            brand=self.brand,
            parent='_'.join(categories),
            cat_no=cat_no,
            en_name=''.join(name_parts),
            cas=span("cas productVal"),
            mf=''.join(formula_parts).replace('_', ''),
            mw=mw and mw.replace('=', ''),
            purity=cell("纯度/分析方法"),
            appearance=cell("外观与形状"),
            info2=cell("储存温度"),
            mdl=cell("MDL编号"),
            img_url=img_rel and urljoin(self.base_url, img_rel),
            prd_url=response.url,
        )

        for tr in response.xpath('//table[@id="PricingTable"]/tbody/tr'):
            stock_num = strip(tr.xpath('./td[3]/text()').get())
            if stock_num != '0':
                delivery_time = '现货'
            else:
                delivery_time = None
            yield ProductPackage(
                brand=self.brand,
                cat_no=cat_no,
                package=tr.xpath('./td[1]/text()').get(),
                delivery_time=delivery_time,
                price=strip(tr.xpath('./td[2]/div/text()').get()),
                stock_num=stock_num,
                currency='RMB',
            )
Exemplo n.º 19
0
    def parse(self, response):
        """Yield a RawData and a USD ProductPackage for each listing row.

        Rows are the table rows after the second one that carry a ``class``
        attribute. Package size and price are read from the fourth and fifth
        cells and reused for both items.
        """
        for tr in response.xpath('//table//tr[position()>2 and @class]'):
            link = tr.xpath('./td[2]/a')
            cat_no = link.xpath('./text()').get()
            rel_url = link.xpath('./@href').get()
            pack = tr.xpath('./td[4]/text()').get()
            price = strip(tr.xpath('./td[5]/text()').get())

            yield RawData(
                brand=self.brand,
                cat_no=cat_no,
                en_name=tr.xpath('./td[3]/text()').get(),
                info3=pack,
                info4=price,
                prd_url=urljoin(response.url, rel_url),
                expiry_date=tr.xpath('./td[6]/text()').get(),
            )

            yield ProductPackage(
                brand=self.brand,
                cat_no=cat_no,
                package=pack,
                price=price,
                currency='USD',
            )
Exemplo n.º 20
0
 def parse_detail(self, response):
     """Extract a product and its priced packages from a detail page.

     Yields one ``RawData`` item, then, provided a catalogue number was
     found, one ``ProductPackage`` per matching price row.
     """
     field_xpath = '//th[contains(text(), {!r})]/following-sibling::td//p//text()'
     # Price rows: <tr> with cells whose "pro_price_3" cell holds a <span>
     # without a class attribute.
     priced_row = '//tr[td and td[@class="pro_price_3"]/span[not(@class)]]'
     size_cell = f'{priced_row}/td[@class="pro_price_1"]'

     def field(label):
         # Stripped first text node of the table cell that follows label.
         return strip(response.xpath(field_xpath.format(label)).get())

     def nbsp_to_space(text):
         # Replace non-breaking spaces; falsy values pass through untouched.
         return text.replace('\xa0', ' ') if text else text

     image_path = response.xpath(
         '//div[@class="struct-img-wrapper"]/img/@src').get()

     cat_no = response.xpath('//dt/span/text()').get('')
     for prefix in ('Cat. No.: ', '目录号: '):
         cat_no = cat_no.replace(prefix, '')

     first_size = strip(
         response.xpath(f'normalize-space({size_cell}/text())').get())
     first_price = strip(
         response.xpath(f'{size_cell}/following-sibling::td[1]/text()').get())
     smiles_parts = response.xpath(field_xpath.format("SMILES")).getall()

     yield RawData(
         brand=self.brand,
         parent=response.meta.get('parent'),
         cat_no=cat_no,
         en_name=response.xpath('//h1/strong/text()').get(),
         cas=field("CAS No."),
         mf=formula_trans(field("Formula")),
         mw=field("Molecular Weight"),
         smiles=strip(''.join(smiles_parts)),
         info3=nbsp_to_space(first_size),
         info4=first_price,
         img_url=urljoin(response.url, image_path) if image_path else image_path,
         prd_url=response.url,
     )

     # No package items are emitted without a catalogue number.
     if not cat_no:
         return

     for tr in response.xpath(priced_row):
         price = strip(tr.xpath('./td[@class="pro_price_2"]/text()').get())
         size = strip(
             tr.xpath('normalize-space(./td[@class="pro_price_1"]/text())'
                      ).get())
         lead_time = strip(''.join(
             tr.xpath('./td[@class="pro_price_3"]/span//text()').getall()))
         yield ProductPackage(
             brand=self.brand,
             cat_no=cat_no,
             package=nbsp_to_space(size),
             # Remove the currency sign from both ends of the price text.
             price=price.strip('¥') if price else price,
             delivery_time=lead_time or None,
             currency='RMB',
         )
Exemplo n.º 21
0
    def parse_detail(self, response):
        """Scrape a product page into a ``RawData`` item and its packages.

        CAS number, molecular weight, purity and storage are each looked up
        in two places: a "<Label>: value" line in the description tab panel
        and, as a fallback, the matching row of the product-attributes table.

        :param response: response for the product detail page.
        :yields: one ``RawData`` item, then one ``ProductPackage`` for every
            ``<tr>`` of the grouped-product table.
        """
        panel = ("//div[@class='woocommerce-Tabs-panel "
                 "woocommerce-Tabs-panel--description panel entry-content "
                 "wc-tab']")
        # Text node inside the description panel that contains "<label>:".
        labelled = panel + "//p/text()[contains(self::text(),'{}:')]"
        attr_row = ("//tr[@class='woocommerce-product-attributes-item "
                    "woocommerce-product-attributes-item--attribute_pa_{}']")
        value_cell = "//td[@class='woocommerce-product-attributes-item__value']"

        def after_label(raw, label, fallback):
            # Text following "<label>: " in raw; fallback when raw is missing,
            # does not match, or yields an empty value.
            value = raw and first(re.findall(label + r': (.+)', raw), None)
            return value or fallback

        mw = response.xpath(labelled.format('Molecular Weight')).get()
        mw2 = response.xpath(
            "//tr[contains(@class, "
            "'woocommerce-product-attributes-item--attribute_pa_mw')]"
            "//p/text()"
        ).get()
        cas = response.xpath(
            panel + "/p[contains(text(),'CAS Number:')]/text()").get()
        cas2 = response.xpath(
            attr_row.format('cas') + value_cell + '//p/text()').get()
        purity = response.xpath(labelled.format('Purity')).get()
        purity2 = response.xpath(
            attr_row.format('purity') + '//p/text()').get()
        info = response.xpath(labelled.format('Storage')).get()
        info2 = response.xpath(
            attr_row.format('storage') + value_cell + '//p/text()').get()

        cat_no = response.xpath(
            "//div[@class='woocommerce-product-details__short-description']"
            "//p/text()"
        ).get('')
        # count/flags are passed by keyword: positional use is deprecated
        # since Python 3.13.
        cat_no = re.sub(r'Product Number:', '', cat_no, flags=re.IGNORECASE)

        d = {
            "brand": self.name,
            "cat_no": cat_no,
            "parent": response.xpath(
                attr_row.format('categories') + value_cell + '/p/text()'
            ).get(),
            "cas": after_label(cas, 'CAS Number', cas2),
            "mf": response.xpath(
                attr_row.format('molecular-formula') + value_cell
                + '//p/text()'
            ).get(),
            'mw': after_label(mw, 'Molecular Weight', mw2),
            'purity': after_label(purity, 'Purity', purity2),
            'img_url': response.xpath(
                "//div[@class='woocommerce-product-gallery "
                "woocommerce-product-gallery--with-images "
                "woocommerce-product-gallery--columns-4 images']//a/@href"
            ).get(),
            'prd_url': response.url,
            'en_name': response.xpath(
                "//h1[@class='product_title entry-title']//text()").get(),
            "info2": after_label(info, 'Storage', info2),
        }

        yield RawData(**d)

        rows = response.xpath(
            "//table[@class='woocommerce-grouped-product-list group_table']//tr"
        )
        for row in rows:
            label = row.xpath(
                ".//td[@class='woocommerce-grouped-product-list-item__label']"
                "/label/text()"
            ).get()
            # Keep only the text before the first "(". A row without a label
            # yields None here; passing None to re.findall would raise
            # TypeError and abort the remaining rows.
            if label is None:
                package = None
            else:
                package = first(re.findall(r'[^(]+', label), None)

            dd = {
                "brand": self.name,
                "cat_no": cat_no,
                "package": package,
                "currency": 'USD',
                "price": row.xpath(
                    ".//span[@class='woocommerce-Price-currencySymbol']"
                    "//parent::bdi/text()"
                ).get(),
            }

            yield ProductPackage(**dd)