from bs4 import BeautifulSoup

# MyCrawlerItem is imported from the project's Scrapy items module (not shown here)

def parse_zhongyaofang21nx_item(data, urlReq):
    print("Get resp from:", urlReq)
    soup = BeautifulSoup(data, "html5lib")
    item = MyCrawlerItem()
    # Initialize every field so downstream consumers always see a full item
    item['nameCh'] = ""
    item['namePin'] = ""
    item['alias'] = ""
    item['nameEng'] = ""
    item['source'] = ""
    item['description'] = ""
    item['area'] = ""
    item['gather'] = ""
    item['shape'] = ""
    item['taste'] = ""
    item['effect'] = ""
    item['application'] = ""
    item['pharmacology'] = ""
    item['component'] = ""
    item['tatoo'] = ""
    item['prescription'] = ""
    item['url'] = urlReq
    content_part = soup.find("div", class_="gaishu")
    if content_part is None:
        # Page has no "gaishu" (overview) block; return the empty item
        return item
    # Field extraction is not implemented in this snippet; return the initialized item
    return item
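
# A minimal driver sketch for the parser above. The use of `requests`, the
# timeout, and the helper name are assumptions, not part of the original
# crawler; parse_zhongyaofang21nx_item only needs the raw HTML and the URL.
import requests

def fetch_zhongyaofang21nx(urlReq):
    resp = requests.get(urlReq, timeout=10)
    resp.encoding = resp.apparent_encoding  # pages are Chinese text; let requests guess the charset
    return parse_zhongyaofang21nx_item(resp.text, urlReq)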
def parse_item(self, response):
    # Get the article DOM elements via XPath
    articles = response.xpath('//*[@id="main"]/ul/li')
    for article in articles:
        item = MyCrawlerItem()
        item['title'] = article.xpath(
            'h3[@class="entry-title"]/a/text()').extract()[0]
        item['url'] = article.xpath(
            'h3[@class="entry-title"]/a/@href').extract()[0]
        item['summary'] = article.xpath('div[2]/p/text()').extract()[0]
        yield item
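
# MyCrawlerItem is referenced throughout this file but never defined here.
# A minimal sketch for the blog spider above, assuming a standard Scrapy Item;
# the other parsers in this file would declare their own fields (category,
# reference, body, nameCh, ...) the same way:
import scrapy

class MyCrawlerItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    summary = scrapy.Field()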
def parse_item(self, response):
    item = MyCrawlerItem()
    item['url'] = str(response.url)
    item['category'] = 'environment'
    item['reference'] = 'guardian'
    item['title'] = str(response.xpath(
        '//h1[contains(@class, "content__headline")] | '
        '//h1/span[contains(@class, "content__headline--interview-wrapper")]//text()').extract()[0])
    item['title'] = processText(item['title'], True, True)
    item['subTitle'] = str(response.xpath('//meta[@itemprop="description"]/@content').extract()[0])
    item['subTitle'] = processText(item['subTitle'], True, True)
    # Join the list of text fragments into a single string
    item['body'] = ' '.join([x.strip() for x in response.xpath(
        '//div[contains(@class, "content__article-body") and not(contains(@class, "submeta"))]'
        '//text()[not(ancestor::div/@class="submeta")]').extract()])
    item['body'] = processText(item['body'], True, True, True)
    if item['body'] != "":
        yield item
def parse_item(self, response):
    item = MyCrawlerItem()
    item['url'] = str(response.url)
    item['category'] = 'sport'
    item['reference'] = 'cnbc'
    item['title'] = str(response.xpath('//meta[@name="twitter:title"]/@content').extract()[0])
    item['title'] = processText(item['title'], True, True)
    item['subTitle'] = str(response.xpath('//meta[@itemprop="description"]/@content').extract()[0])
    item['subTitle'] = processText(item['subTitle'], True, True)
    # Join the list of text fragments into a single string
    item['body'] = ' '.join([x.strip() for x in response.xpath('//div[@id="article_body"]//text()').extract()])
    item['body'] = processText(item['body'], True, True, True)
    if item['body'] != "":
        yield item
def parse_item(self, response):
    item = MyCrawlerItem()
    item['url'] = str(response.url)
    item['category'] = 'environment'
    item['reference'] = 'huffingtonPost'
    item['title'] = str(response.xpath('//h1[@class="headline__title"]//text()').extract()[0])
    item['title'] = processText(item['title'], True, True)
    subTitle = response.xpath('//h1[@class="headline__title"]')
    item['subTitle'] = ''
    if subTitle:
        item['subTitle'] = str(subTitle.xpath('text()').extract()[0])
        item['subTitle'] = processText(item['subTitle'], True, True)
    # Join the list of text fragments into a single string
    item['body'] = ' '.join([x.strip() for x in response.xpath(
        '//div[contains(@class, "entry__text") and not(contains(@class, "advertisement"))]'
        '//text()[not(ancestor::div/@class="advertisement repeating_dynamic_display")]').extract()])
    item['body'] = processText(item['body'], True, True, True, True)
    yield item
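
# processText() is used by the three article parsers above but is not defined
# in this file. A hypothetical sketch, assuming the positional booleans toggle
# successive cleanup passes (the real helper and its flag meanings may differ):
import re
import html

def processText(text, unescape=False, collapse=False, dropUrls=False, dropBrackets=False):
    if unescape:
        text = html.unescape(text)                 # decode &amp;, &nbsp;, ...
    if collapse:
        text = re.sub(r'\s+', ' ', text).strip()   # collapse runs of whitespace
    if dropUrls:
        text = re.sub(r'https?://\S+', '', text)   # strip inline links
    if dropBrackets:
        text = re.sub(r'\[[^\]]*\]', '', text)     # strip [bracketed] editorial notes
    return text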
def parse_news(self, response):
    # Grab the date from the header table
    date_selector = response.xpath('//*[@id="conteudo"]/div[2]/table[1]')
    date = date_selector.xpath('//tr/td/text()').get()
    # Collect the text of every <span> nested inside a <span>
    selector = response.xpath('//span')
    texts = []
    for span in selector.xpath('.//span/text()'):
        texts.append(span.get())
    # Default to None so the item can still be built when a label is missing
    positivos = curados = obitos = None
    for text in texts:
        if 'POSITIVOS: ' in text:
            positivos = text
        if 'CURADOS: ' in text:
            curados = text
        if 'ÓBITOS CONFIRMADOS:' in text:
            obitos = text
    inf = MyCrawlerItem(date=date, positivos=positivos, curados=curados, obitos=obitos)
    yield inf
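
# The COVID counter strings captured above keep their labels (e.g. "POSITIVOS: 123").
# A possible post-processing step, assuming the count is the first integer after
# the label; the helper name is illustrative, not part of the original spider:
import re

def extract_count(label_text):
    """Return the first integer in a label string such as 'CURADOS: 98', or None."""
    if label_text is None:
        return None
    match = re.search(r'(\d[\d.,]*)', label_text)
    return int(match.group(1).replace('.', '').replace(',', '')) if match else None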
import unicodedata

def parse_zhongyoo_item(data, urlReq):
    # `pinyin` here is a project-local helper module exposing yinfu2pinyin()/hanzi2pinyin()
    print("Get resp from:", urlReq)
    soup = BeautifulSoup(data, "html5lib")
    item = MyCrawlerItem()
    # Initialize every field so downstream consumers always see a full item
    item['nameCh'] = ""
    item['namePin'] = ""
    item['alias'] = ""
    item['nameEng'] = ""
    item['source'] = ""
    item['description'] = ""
    item['area'] = ""
    item['gather'] = ""
    item['shape'] = ""
    item['taste'] = ""
    item['effect'] = ""
    item['application'] = ""
    item['pharmacology'] = ""
    item['component'] = ""
    item['tatoo'] = ""
    item['prescription'] = ""
    item['url'] = urlReq
    content_part = soup.find("div", class_="gaishu")
    if content_part is None:
        # Page has no "gaishu" (overview) block; return the empty item
        return item
    aliasFlag = False
    description = content_part.find("div", class_="text")
    for content in description.find_all("p"):
        if content is None:
            continue
        key = content.find("strong")
        if key is None:
            continue
        strTmp = content.get_text().strip()
        # Drop the "【...】" label prefix and keep only the value after "】"
        strCont = strTmp[strTmp.find("】") + 1:].strip()
        name = key.string
        # Key is "药名" or "中药名" (drug name). The HTML contains &nbsp; spaces,
        # which unicodedata.normalize("NFKD", ...) turns into plain spaces.
        if name.find("药名") != -1 or name.find("中药名") != -1:
            strCont = strCont.replace(';', '').replace('’', '').replace('\'', '')
            strCont = unicodedata.normalize("NFKD", strCont)
            if strCont.find(' ') != -1:
                item['nameCh'] = strCont[:strCont.find(' ')].strip()
                # Note: characters read as "ye" may get transliterated as "xue"
                item['namePin'] = pinyin.yinfu2pinyin(
                    string=strCont[strCont.find(' ') + 1:].strip().replace(' ', ''))
            else:
                item['nameCh'] = strCont.strip()
                item['namePin'] = ''.join(pinyin.hanzi2pinyin(string=item['nameCh']))
        # Key is "别名" (alias); the first hit is the alias, a second hit is
        # treated as the English name
        elif name.find("别名") != -1:
            if not aliasFlag:
                item['alias'] = strCont
                aliasFlag = True
            else:
                item['nameEng'] = strCont.replace(';', '').replace('’', '').replace('\'', '')
        elif name.find("英文名") != -1:  # English name
            item['nameEng'] = strCont.replace(';', '').replace('’', '').replace('\'', '')
        elif name.find("来源") != -1:  # source
            item['source'] = strCont.replace(';', '').replace('\'', '')
        elif name.find("植物形态") != -1:  # plant morphology
            item['description'] = strCont.replace(';', '').replace('\'', '')
        # Key is "产地分布" (production area) or "生境分布" (habitat distribution)
        elif name.find("产地分布") != -1 or name.find("生境分布") != -1:
            item['area'] = strCont.replace(';', '').replace('\'', '')
        elif name.find("采收加工") != -1:  # harvesting and processing
            item['gather'] = strCont.replace(';', '').replace('\'', '')
        elif name.find("药材性状") != -1:  # physical characteristics
            item['shape'] = strCont.replace(';', '').replace('\'', '')
        elif name.find("性味归经") != -1:  # taste and meridian tropism
            item['taste'] = strCont.replace(';', '').replace('\'', '')
        elif name.find("功效与作用") != -1:  # effects
            item['effect'] = strCont.replace(';', '').replace('\'', '')
        elif name.find("临床应用") != -1:  # clinical application
            item['application'] = strCont.replace(';', '').replace('\'', '')
        elif name.find("药理研究") != -1:  # pharmacological research
            item['pharmacology'] = strCont.replace(';', '').replace('\'', '')
        # Key is "化学成分" or "主要成分" (chemical/main components)
        elif name.find("化学成分") != -1 or name.find("主要成分") != -1:
            item['component'] = strCont.replace(';', '').replace('\'', '') \
                .replace('₁', '1').replace('₃', '3')
        elif name.find("使用禁忌") != -1:  # contraindications
            item['tatoo'] = strCont.replace(';', '').replace('\'', '')
        # Key is "配伍药方" or "相关药方" (related prescriptions): the text spans
        # the following <p> tags until a blank one or the "相关推荐文章"
        # (related articles) footer is reached
        elif name.find("配伍药方") != -1 or name.find("相关药方") != -1:
            for tagP in content.find_next_siblings("p"):
                strTag = tagP.get_text().strip()
                if strTag == "" or strTag.find("相关推荐文章") != -1:
                    break
                strCont = strCont + strTag
            item['prescription'] = strCont.replace(';', '').replace('\'', '')
        else:
            continue
    # end of paragraph loop
    return item