def parse_detail_page(self, response):
    item = response.meta['item']
    html5_response = response_html5parse(response)
    hxs = HtmlXPathSelector(html5_response)
    texts1 = hxs.select('//table[@class="tb"]//td//text()').extract()
    texts2 = hxs.select('//div[@class="t2"]//text()').extract()
    result_doc1 = blur_ana_patent(texts1)
    result_doc2 = blur_ana_patent(texts2)
    patent_name = ''.join(hxs.select('//div[@class="t1"]//text()').extract())
    abstract = ''.join(hxs.select('//div[@class="con2"]//text()').extract())

    doc = item['doc']
    doc.update(result_doc1)
    doc.update(result_doc2)
    doc['patent_name'] = patent_name
    doc['abstract'] = abstract
    doc['application_number'] = doc['application_number'].lstrip('/专利号: ')

    attachments = item['attachments']
    attach1 = {
        'url': response.url,
        'data': response.body_as_unicode(),
        'mime_type': get_mime_type_in_response(response)
    }
    attachments.append(attach1)

    image_urls = get_image_urls(response)
    item['attachment_urls'] += image_urls
    yield self.item_or_request(item)
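# Illustrative sketch only -- response_html5parse is imported from a shared helper module
# that is not shown here. Assuming its job is to re-parse the raw page with html5lib so
# that the sloppy markup on these patent sites still yields a DOM that XPath can walk,
# it might look roughly like this (the html5lib/lxml round-trip is an assumption):
import html5lib
from lxml import etree

def response_html5parse(response):
    # tolerant HTML5 parse into an lxml tree; no XML namespaces, so plain XPath works
    doc = html5lib.parse(response.body_as_unicode(), treebuilder='lxml',
                         namespaceHTMLElements=False)
    body = etree.tostring(doc, encoding='utf-8')
    # hand back a new Response with the cleaned-up body, same URL and meta
    return response.replace(body=body)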
def parse_detail_page(self, response): item = response.meta["item"] html5_response = response_html5parse(response) hxs = HtmlXPathSelector(html5_response) texts = hxs.select('//table[@id="perildical2_dl"]//text()').extract() texts = clean_wanfang_texts(texts) result_doc = blur_ana_patent(texts) abstract = "".join(hxs.select('//div[@class="abstracts"]//text()').extract()) doc = item["doc"] doc.update(result_doc) doc["abstract"] = abstract attachments = item["attachments"] attach1 = { "url": response.url, "data": response.body_as_unicode(), "mime_type": get_mime_type_in_response(response), } attachments.append(attach1) image_urls = get_image_urls(response) item["attachment_urls"] += image_urls # more_url = response.url.replace('_free', '') # next_request = Request(more_url, callback=self.parse_more_page) # item['next_request'] = next_request # hotfix for patent_type patent_type = "".join(hxs.select('//th[contains(.//text(),"专利类型")]/../td//text()').extract()) doc["patent_type"] = patent_type yield self.item_or_request(item)
def parse_detail_page(self, response):
    item = response.meta['item']
    html5_response = response_html5parse(response)
    hxs = HtmlXPathSelector(html5_response)
    texts1 = hxs.select('//span[@class="detailtitle"]//text()').extract()
    texts2 = hxs.select('//table[@class="datainfo"]//text()').extract()
    texts3 = hxs.select('//table[@id="PatentContentTable"]//text()').extract()
    texts1 = clean_soopat_texts(texts1)
    texts2 = clean_soopat_texts(texts2)
    texts3 = clean_soopat_texts(texts3)
    result_doc1 = blur_ana_patent(texts1)
    result_doc2 = blur_ana_patent(texts2)
    result_doc3 = blur_ana_patent(texts3)
    patent_name = ''.join(hxs.select('//span[@class="detailtitle"]/h1//text()').extract())
    abstract = ''.join(hxs.select('//td[@class="sum f14"]//text()').extract())

    doc = item['doc']
    doc.update(result_doc1)
    doc.update(result_doc2)
    doc.update(result_doc3)
    doc['patent_name'] = patent_name
    doc['abstract'] = abstract

    attachments = item['attachments']
    attach1 = {
        'url': response.url,
        'data': response.body_as_unicode(),
        'mime_type': get_mime_type_in_response(response)
    }
    attachments.append(attach1)

    image_urls = get_image_urls(response)
    img_url = ''.join(hxs.select('//a[@class="jqzoom"]/@href').extract())
    image_urls.append(img_url)
    item['attachment_urls'] += image_urls
    yield self.item_or_request(item)
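# Illustrative sketch only -- blur_ana_patent lives in a shared helper module that is not
# shown here. Judging by how its result is merged into doc above, it appears to do fuzzy
# "label: value" extraction from loose text fragments; the label map and colon-splitting
# below are hypothetical and stand in for whatever the real helper does.
FIELD_LABELS = {
    u'申请号': 'application_number',
    u'申请日': 'application_date',
    u'公开号': 'publication_number',
    u'发明人': 'inventor',
    u'专利类型': 'patent_type',
}

def blur_ana_patent(texts):
    doc = {}
    for text in texts:
        for label, key in FIELD_LABELS.items():
            if label in text:
                # keep whatever follows the half- or full-width colon
                value = text.replace(u'：', u':').split(u':', 1)[-1]
                doc[key] = value.strip()
    return doc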
def parse_images(self, response):
    item = response.meta['item']
    image_urls = get_image_urls(response)
    item['attachment_urls'] += image_urls
    yield self.item_or_request(item)
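# Illustrative sketch only -- get_image_urls comes from a helper module that is not shown
# here. Assuming it just collects absolute image URLs from the page (so the pipeline can
# download the patent drawings as attachments), a minimal version could be:
from urlparse import urljoin            # Python 2, matching the Scrapy 0.x API used above
from scrapy.selector import HtmlXPathSelector

def get_image_urls(response):
    hxs = HtmlXPathSelector(response)
    srcs = hxs.select('//img/@src').extract()
    # make every src absolute against the page URL and drop duplicates, keeping order
    seen, urls = set(), []
    for src in srcs:
        url = urljoin(response.url, src)
        if url not in seen:
            seen.add(url)
            urls.append(url)
    return urls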
def parse_detail_page(self, response):
    item = response.meta['item']
    html5_response = response_html5parse(response)
    hxs = HtmlXPathSelector(html5_response)
    texts_hxs = hxs.select('//table[contains(@class,"patent-bibdata")]//tr')
    texts = [''.join(row.select('.//text()').extract()) for row in texts_hxs]
    # texts = clean_google_texts(texts)
    result_doc = blur_ana_patent(texts)

    classification_hxs = hxs.select('//td[text()="國際專利分類號"]/parent::* | '
                                    '//td[text()="国际分类号"]/parent::* | '
                                    '//td[text()="International Classification"]/parent::*')
    patent_state = ''.join(hxs.select('//td[text()="出版類型"]/../td[2]//text()').extract())
    texts1 = [''.join(classification_hxs.select('.//text()').extract())]
    result_doc1 = blur_ana_patent(texts1)

    doc = item['doc']
    doc.update(result_doc)
    doc.update(result_doc1)

    patent_name = ''.join(hxs.select('//span[@class="patent-title"]//text()').extract())
    abstract = ''.join(
        hxs.select('//div[@class="patent-section patent-abstract-section"]'
                   '//div[@class="patent-text"]//text()').extract())
    description = ''.join(
        hxs.select('//div[@class="patent-section patent-description-section"]'
                   '//div[@class="patent-text"]//text()').extract())
    claims = ''.join(
        hxs.select('//div[@class="patent-section patent-claims-section"]'
                   '//div[@class="patent-text"]//text()').extract())

    doc['patent_name'] = patent_name
    doc['abstract'] = abstract
    doc['description'] = description
    doc['claims'] = claims
    doc['patent_state'] = patent_state
    doc['patent_type'] = ''

    attachments = item['attachments']
    attach1 = {
        'url': response.url,
        'data': response.body_as_unicode(),
        'mime_type': get_mime_type_in_response(response)
    }
    attachments.append(attach1)

    image_urls = get_image_urls(response)
    item['attachment_urls'] += image_urls
    yield self.item_or_request(item)

    # If a Chinese version of the patent exists, crawl the Chinese version as well.
    link_ex = LxmlParserLinkExtractor(unique=False)
    links = link_ex.extract_links(response)
    for link in links:
        if link.text in ['Chinese', 'chinese', '中文']:
            request = Request(link.url, callback=self.parse_detail_page)
            doc = {
                'data_source': 'google专利搜索',
                'url': link.url,
            }
            cn_item = PatentItem(doc=doc, next_request=request,
                                 list_url=item['list_url'], query=item['query'],
                                 attachments=[], attachment_urls=[])
            yield self.item_or_request(cn_item)
            break
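# Illustrative sketch only -- item_or_request is a method on the spider base class and is
# not shown here. Given how it is used above (every parse step ends with
# `yield self.item_or_request(item)`, and items sometimes carry a queued `next_request`),
# it presumably either forwards the half-built item to the next request in the chain or,
# when the chain is finished, returns the item itself. A minimal version under that
# assumption:
def item_or_request(self, item):
    next_request = item.get('next_request')
    if next_request is not None:
        item['next_request'] = None          # consume the queued request
        next_request.meta['item'] = item     # carry the item along to the next callback
        return next_request
    return item                              # nothing left to fetch: emit the item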