def parse(self, response):
    """Parse one Solr XML results page from caymanchem.com.

    Yields one Caymanchem item per ``<doc>`` element.  On the first
    (depth-0) response only, reads ``numFound`` and schedules paginated
    follow-up requests covering the whole result set.
    """
    response_content = response.body  # raw bytes: sidesteps charset mis-decoding
    # NOTE: some categories are huge, e.g. ~90k rows like
    # https://www.chemicalbook.com/ShowSupplierProductsList6187/51100.htm
    cat_name = response.meta.get('cat_name')
    segs = crawlerTool.getXpath('//doc', response_content, xml_type='XML')
    for seg in segs:
        name = crawlerTool.getXpath1('//str[@name="name"]/text()', seg)
        cas = crawlerTool.getXpath1('//str[@name="casNumber"]/text()', seg)
        function = crawlerTool.getXpath1('//str[@name="tagline"]/text()', seg)
        data_obj = Caymanchem()
        data_obj['name'] = name
        data_obj['cas'] = cas
        data_obj['function'] = function
        data_obj['cat'] = cat_name
        # Synthetic dedup key: this Solr response carries no product URL,
        # so a name+cat+cas composite stands in for one.
        data_obj['url'] = name + cat_name + cas
        yield data_obj
    totalnum = int(
        crawlerTool.getXpath1('//result[@name="response"]//@numFound',
                              response_content, xml_type='XML'))
    if not response.meta.get('depth'):
        print(totalnum)
        # BUG FIX: the query requests rows=10 per page, so the `start`
        # offset must advance in steps of 10.  The old range(1, totalnum, 1)
        # issued one request per *row*, re-fetching every page ~10 times.
        for i in range(10, totalnum, 10):
            url = 'https://www.caymanchem.com/solr/cchProduct/select?facet=true&facet.field=raptas'+\
                '&facet.field=newProduct&facet.limit=100000&fl=isEUSellable%2Cname%2CmarkupName%2CcatalogNum%2CproductImage%2Csynonyms%2CcasNumber%2Ctagline%2Cscore%2CitemGroupId%2CprimaryVendorId&spellcheck=true&spellcheck.collate=true&spellcheck.count=10&spellcheck.extendedResults=true&spellcheck.onlyMorePopular=false&facet.mincount=1&rows=10&version=2.2&json.nl=map&'+\
                'q=*%3A*&start='+str(i)+'&fq=('+cats[cat_name]+')AND(!raptas%3ARAP000101%20AND%20websiteNotSearchable%3Afalse)'
            # depth=1 marks follow-up pages so they don't re-trigger pagination.
            yield scrapy.Request(url, callback=self.parse, meta={
                'cat_name': cat_name,
                'depth': 1
            })
def parse1(self, response):
    """Parse a TRC Canada product-list page and yield one Trc_Item per row.

    ``response.meta['cat_name']`` carries the category/API name this
    listing was reached from; it is stored on each item as 'api_name'.
    """
    response_content = response.body  # raw bytes: sidesteps charset mis-decoding
    # NOTE: some listings are huge, e.g. ~90k entries like
    # https://www.chemicalbook.com/ShowSupplierProductsList6187/51100.htm
    cat_name = response.meta.get('cat_name')
    segs = crawlerTool.getXpath('//div[@class="product_list_left_in"]//li',
                                response_content)
    for seg in segs:
        # Reset per-row fields; any label missing from this row stays ''.
        ChemicalName = CASNumber = MolFormula = Synonyms = ''
        SearchImg = crawlerTool.getXpath1(
            '//div[@class="leftSearchImg"]/a/img/@src', seg)
        SearchImg = 'https://www.trc-canada.com' + SearchImg
        contents = crawlerTool.getXpath('//div[@class="ContentDesc"]', seg)
        for content in contents:
            content = content.replace('\r', '').replace('\n', '')
            # Each ContentDesc div looks like "<label>Key:</label>value";
            # dispatch on the label text.
            if 'Chemical Name:' in content:
                ChemicalName = crawlerTool.getRegex('</label>(.*?)<', content).strip()
            elif 'CAS number:' in content:
                CASNumber = crawlerTool.getRegex('</label>(.*?)<', content).strip()
            elif 'Mol. Formula:' in content:
                MolFormula = crawlerTool.getRegex('</label>(.*?)<', content).strip()
            elif 'Synonyms' in content:
                Synonyms = crawlerTool.getRegex('</label>(.*?)<', content).strip()
        data_obj = Trc_Item()
        data_obj['ChemicalName'] = ChemicalName
        data_obj['CASNumber'] = CASNumber
        data_obj['MolFormula'] = MolFormula
        data_obj['SearchImg'] = SearchImg
        data_obj['Synonyms'] = Synonyms
        data_obj['api_name'] = cat_name
        # NOTE(review): 'url' duplicates the image URL -- no product detail
        # link is extracted from the row; confirm whether one exists.
        data_obj['url'] = SearchImg
        yield data_obj
def parse(self, response):
    """Extract CAS entries from a seekchem listing page, then follow
    every pagination link back into this parser."""
    base_url = get_base_url(response)
    current_url = response.url
    page_body = response.body  # raw bytes: sidesteps charset mis-decoding
    rows = crawlerTool.getXpath('//div[@class="cas_default_list_star "]//ul',
                                page_body)
    # First and last <ul> are header/footer chrome, not data rows.
    for row in rows[1:-1]:
        cells = crawlerTool.getXpath('//li', row)
        item = SeekchemItem()
        item['url'] = crawlerTool.getXpath1('//a/@href', cells[0])
        item['cas'] = crawlerTool.getXpath1('//b/text()', cells[0])
        item['name'] = crawlerTool.getXpath1('//text()', cells[1])
        yield item
    # Queue every page link in the pager; scrapy's dupefilter handles repeats.
    pager_links = crawlerTool.getXpath('//div[@class="pages"]/a/@href', page_body)
    for href in pager_links:
        yield scrapy.Request(urljoin(current_url, href), callback=self.parse)
def parse(self, response):
    """Parse the acccorporation product-list table and yield one item per row.

    Each <tr> of the product table contains a nested key/value table;
    rows are matched on their label text ('Name', 'CAS No', ...).
    """
    response_content = response.body  # raw bytes: sidesteps charset mis-decoding
    segs = crawlerTool.getXpath('//table[@id="product-list"]/tbody/tr',
                                response_content)
    for seg in segs:
        # Reset per-row fields; any label missing from this row stays ''.
        name = cas = MolecularFormula = MolecularWeight = ''
        SearchImg = crawlerTool.getXpath1(
            '//img[@class="dg-picture-zoom acc_img_container acc_zoomer"]/@src',
            seg)
        contents = crawlerTool.getXpath('//table//tr', seg)
        for content in contents:
            content = content.replace('\r', '').replace('\n', '')
            # Order matters: the bare substring 'Name' would also match
            # longer labels, so more specific labels come after it only
            # because they don't contain 'Name'.
            if 'Name' in content:
                name = crawlerTool.getXpath1('//td[2]', content)
                name = crawlerTool.getRegex('>(.*?)<', name).strip()
            elif 'CAS No' in content:
                cas = crawlerTool.getXpath1('//td[2]', content)
                cas = crawlerTool.getRegex('>(.*?)<', cas).strip()
            elif 'Molecular Formula' in content:
                MolecularFormula = crawlerTool.getXpath1('//td[2]', content)
                # Formula cell may contain <sub> markup: strip all tags.
                MolecularFormula = re.sub('<.*?>', '', MolecularFormula).strip()
            elif 'Molecular Weight' in content:
                MolecularWeight = crawlerTool.getXpath1('//td[2]', content)
                MolecularWeight = crawlerTool.getRegex(
                    '>(.*?)<', MolecularWeight).strip()
        data_obj = acccorporation_Item()
        # NOTE(review): 'url' is filled with the product *name* -- no link is
        # extracted from the row; confirm whether a real detail URL exists.
        data_obj['url'] = name
        data_obj['name'] = name
        data_obj['MolecularFormula'] = MolecularFormula
        data_obj['MolecularWeight'] = MolecularWeight
        data_obj['image'] = SearchImg
        data_obj['cas'] = cas
        yield data_obj
def parser_sub(self, response):
    """Parse a ChemicalBook supplier-contact page.

    Yields one ChemicalBook item holding the supplier's contact details
    and product list, then follows the product-list pagination links
    back into this same parser.

    Field names are pinyin abbreviations -- presumably: lxdh = contact
    phone, wz = website, cplb = product list (TODO confirm against the
    item definition).
    """
    base_url = get_base_url(response)
    response_content = response.body  # raw bytes (garbled-encoding workaround)
    url = response.url
    # detail = crawlerTool.getXpath('//div[@id="ContentPlaceHolder1_SupplierContact"]',response_content)[0]  # "About us" section
    # response_content = unicode(response_content, 'gbk')  # fails on e.g. http://www.hxchem.net/companydetaildesenborn.html
    # lxwm = HTMLParser().unescape(lxwm)
    # lxwm = lxwm.encode('utf8')
    data_obj = ChemicalBook()
    data_obj['url'] = url
    # Contact block: fixed positional rows in the second table --
    # row 2 = name, row 3 = phone, row 5 = email, row 6 = website.
    data_obj['name'] = crawlerTool.getXpath(
        '//div[@id="ContentPlaceHolder1_SupplierContact"]/table[2]//tr[2]/td[2]/a/text()',
        response_content)[0]
    data_obj['lxdh'] = crawlerTool.getXpath1(
        '//div[@id="ContentPlaceHolder1_SupplierContact"]/table[2]//tr[3]/td[2]//text()',
        response_content)
    data_obj['email'] = crawlerTool.getXpath1(
        '//div[@id="ContentPlaceHolder1_SupplierContact"]/table[2]//tr[5]/td[2]//text()',
        response_content)
    data_obj['wz'] = crawlerTool.getXpath1(
        '//div[@id="ContentPlaceHolder1_SupplierContact"]/table[2]//tr[6]/td[2]//text()',
        response_content)
    # Product tables: skip the first 3 (layout chrome) and the last (pager).
    cplb_div = crawlerTool.getXpath(
        '//div[@id="ContentPlaceHolder1_ProductSupplier"]//table',
        response_content)[3:-1]
    # Python-2-only debug print: 'string_escape' codec does not exist on py3.
    print data_obj['name'].encode('unicode-escape').decode('string_escape')
    cplb = []
    for cp in cplb_div:
        # td[2] = Chinese product name, td[3] = CAS/other detail; either may be absent.
        chinese_name = crawlerTool.getXpath('//tr/td[2]/text()', cp)
        chinese_name = chinese_name[0] if chinese_name else ''
        cps = crawlerTool.getXpath('//tr/td[3]/text()', cp)
        cps = cps[0] if cps else ''
        cplb.append(' '.join([chinese_name, cps]))
    data_obj['cplb'] = cplb
    # print lxr,dz,yb,dh,sj
    yield data_obj
    # Follow the pagination links of the product list (same parser).
    page_urls = crawlerTool.getXpath(
        '//div[@id="ContentPlaceHolder1_ProductSupplier"]//table[2]//tr[2]/td[2]//a/@href',
        response_content)
    for page_url in page_urls:
        page_url = urljoin(base_url, page_url)
        yield scrapy.Request(url=page_url, callback=self.parser_sub)
def parse(self, response):
    """Scrape a single Chemspace product page into a ChemspaceItem.

    Fills: url, IUPACname, CAS, Molformula, Molweight
    (Chemspaceid exists on the item but is not set here).
    """
    base_url = get_base_url(response)
    page_html = response.body  # raw bytes: sidesteps charset mis-decoding
    data_obj = ChemspaceItem()
    data_obj['url'] = response.url
    data_obj['IUPACname'] = crawlerTool.getXpath1(
        '//div[@class="iupac-name"]//text()', page_html)
    data_obj['CAS'] = crawlerTool.getRegex(
        '<dt>CAS</dt>[^<]?<dd>([\d-]+)</dd>', page_html)
    # Drop <sub> markup first so the formula regex sees plain "C6H12O6"-style text.
    no_sub_markup = page_html.replace('</sub>', '').replace('<sub>', '')
    data_obj['Molformula'] = crawlerTool.getRegex(
        '<dt>Mol formula</dt>[^<]?<dd>([\d\w]+)</dd>', no_sub_markup)
    data_obj['Molweight'] = crawlerTool.getRegex(
        '<dt>Mol weight</dt>[^<]?<dd>([\d\.]+)</dd>', page_html)
    print(data_obj)
    yield data_obj
def parse(self, response):
    """Parse the rovathin product table and yield one RovathinItem per row.

    The deeply nested XPath mirrors the site's table-based layout; each
    data row has >= 4 cells: catalog no, product name, CAS, assay.
    """
    content = response.body  # raw bytes: sidesteps charset mis-decoding
    # Strips HTML tags plus surrounding whitespace; compiled once instead of
    # re-compiling the same pattern four times per row.
    strip_tags = re.compile(r'\s*<.*?>\s*')
    segs = crawlerTool.getXpath(
        "//table//td[2]//td/table[2]//td//table//tr//td//tr", content)
    # segs[0] is the header row.
    for seg in segs[1:]:
        tds = crawlerTool.getXpath("//td", seg)
        if len(tds) < 4:
            continue  # spacer/malformed row
        cat_no, product_name, cas, assay = tds[0], tds[1], tds[2], tds[3]
        rovathin_item = RovathinItem()
        rovathin_item['cat_no'] = strip_tags.sub('', cat_no)
        rovathin_item['product_name'] = strip_tags.sub('', product_name)
        rovathin_item['cas'] = strip_tags.sub('', cas)
        rovathin_item['assay'] = strip_tags.sub('', assay)
        # Detail link lives inside the product-name cell.
        rovathin_item['url'] = crawlerTool.getXpath1("//a/@href", product_name)
        yield rovathin_item