Example #1
from bs4 import BeautifulSoup

# MyCrawlerItem is the project's Item class; its import path is project-specific.
def parse_zhongyaofang21nx_item(data, urlReq):
    print("Get resp from:", urlReq)
    soup = BeautifulSoup(data, "html5lib")
    item = MyCrawlerItem()
    # Initialize every field to an empty string so partial pages still yield
    # a complete item.
    for field in ('nameCh', 'namePin', 'alias', 'nameEng', 'source',
                  'description', 'area', 'gather', 'shape', 'taste', 'effect',
                  'application', 'pharmacology', 'component', 'tatoo',
                  'prescription'):
        item[field] = ""
    item['url'] = urlReq

    # Return the empty item when the page lacks the expected overview block;
    # the field parsing itself is not shown in this example.
    conten_part = soup.find("div", class_="gaishu")
    if conten_part is None:
        return item

    return item
Example #2
    def parse_item(self, response):
        # Select the article nodes via XPath
        articles = response.xpath('//*[@id="main"]/ul/li')
        for article in articles:
            item = MyCrawlerItem()
            item['title'] = article.xpath(
                'h3[@class="entry-title"]/a/text()').extract()[0]
            item['url'] = article.xpath(
                'h3[@class="entry-title"]/a/@href').extract()[0]
            item['summary'] = article.xpath('div[2]/p/text()').extract()[0]
            yield item
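
MyCrawlerItem itself is never defined in these examples. A minimal sketch of what it might look like for the fields this spider sets, assuming the standard Scrapy Item API (the class body is inferred from the call sites, not taken from the original project):

import scrapy

class MyCrawlerItem(scrapy.Item):
    # Hypothetical definition inferred from the spider above; only the
    # field names come from the original code.
    title = scrapy.Field()    # article headline text
    url = scrapy.Field()      # link extracted from the entry title
    summary = scrapy.Field()  # first paragraph of the entry body

Note that the other examples below reuse the same class name with different field sets, so each project would declare its own variant.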
Example #3
    def parse_item(self, response):
        item              = MyCrawlerItem()
        item['url']       = str(response.url)
        item['category']  = 'environment'
        item['reference'] = 'guardian'
        item['title']     = str(response.xpath('//h1[contains(@class, "content__headline")] | //h1/span[contains(@class, "content__headline--interview-wrapper")]//text()').extract()[0])
        item['title']     = processText(item['title'], True, True)
        item['subTitle']  = str(response.xpath('//meta[@itemprop="description"]/@content').extract()[0])
        item['subTitle']  = processText(item['subTitle'], True, True)
        # Join the extracted text fragments into a single string
        item['body']      = ' '.join([x.strip() for x in response.xpath('//div[contains(@class, "content__article-body") and not(contains(@class, "submeta"))]//text()[not(ancestor::div/@class="submeta")]').extract()])
        item['body']      = processText(item['body'], True, True, True)

        if item['body'] != "":
            yield item
Example #4
    def parse_item(self, response):
        item              = MyCrawlerItem()
        item['url']       = str(response.url)
        item['category']  = 'sport'
        item['reference'] = 'cnbc'
        item['title']     = str(response.xpath('//meta[@name="twitter:title"]/@content').extract()[0])
        item['title']     = processText(item['title'], True, True)
        item['subTitle']  = str(response.xpath('//meta[@itemprop="description"]/@content').extract()[0])
        item['subTitle']  = processText(item['subTitle'], True, True)
        # Join the extracted text fragments into a single string
        item['body']      = ' '.join([x.strip() for x in response.xpath('//div[@id="article_body"]//text()').extract()])
        item['body']      = processText(item['body'], True, True, True)

        if item['body'] != "":
            yield item
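
processText is referenced throughout Examples #3 to #5 but never defined here. A hypothetical stub consistent with its call sites (a string followed by two to four positional boolean flags) is sketched below; the flag semantics are assumptions, not the original implementation.

def processText(text, strip_ws=False, collapse_ws=False, *extra_flags):
    # Hypothetical stub inferred from the call sites; the real helper's
    # flag meanings are unknown. Extra flags are accepted but ignored here.
    if strip_ws:
        text = text.strip()
    if collapse_ws:
        text = ' '.join(text.split())  # collapse runs of whitespace
    return text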
Example #5
    def parse_item(self, response):
        item              = MyCrawlerItem()
        item['url']       = str(response.url)
        item['category']  = 'environment'
        item['reference'] = 'huffingtonPost'
        item['title']     = str(response.xpath('//h1[@class="headline__title"]//text()').extract()[0])
        item['title']     = processText(item['title'], True, True)
        # Note: the subtitle is read from the same headline element as the title.
        subTitle          = response.xpath('//h1[@class="headline__title"]')
        item['subTitle']  = ''
        if subTitle:
            item['subTitle'] = str(subTitle.xpath('text()').extract()[0])
            item['subTitle'] = processText(item['subTitle'], True, True)
        # Join the extracted text fragments into a single string
        item['body']      = ' '.join([x.strip() for x in response.xpath('//div[contains(@class, "entry__text") and not(contains(@class, "advertisement"))]//text()[not(ancestor::div/@class="advertisement repeating_dynamic_display")]').extract()])
        item['body']      = processText(item['body'], True, True, True, True)

        yield item
Example #6
    def parse_news(self, response):
        # save the date
        date_selector = response.xpath('//*[@id="conteudo"]/div[2]/table[1]')
        date = date_selector.xpath('//tr/td/text()').get()
        # select all <span> elements in the html
        selector = response.xpath('//span')
        texts = []

        # collect the text of every nested <span>
        for span in selector.xpath('.//span/text()'):
            texts.append(span.get())

        # default to None so a missing label doesn't raise NameError below
        positivos = curados = obitos = None
        for text in texts:
            if 'POSITIVOS: ' in text:
                positivos = text
            if 'CURADOS: ' in text:
                curados = text
            if 'ÓBITOS CONFIRMADOS:' in text:
                obitos = text

        inf = MyCrawlerItem(date=date, positivos=positivos, curados=curados, obitos=obitos)
        yield inf
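
The strings captured above still carry their labels (for example 'POSITIVOS: 123'). A small helper to pull out just the count, assuming that label-plus-number format (this helper is not part of the original spider):

import re

def extract_count(text):
    # Hypothetical helper: return the first integer in a labeled string
    # such as "POSITIVOS: 123", or None when no number is present.
    if text is None:
        return None
    match = re.search(r'(\d[\d.,]*)', text)
    if match is None:
        return None
    return int(match.group(1).replace('.', '').replace(',', ''))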
Example #7
import unicodedata

from bs4 import BeautifulSoup

# MyCrawlerItem and the pinyin helpers (pinyin.yinfu2pinyin, pinyin.hanzi2pinyin)
# are project modules; their import paths are project-specific.
def parse_zhongyoo_item(data, urlReq):
    print("Get resp from:", urlReq)
    soup = BeautifulSoup(data, "html5lib")
    item = MyCrawlerItem()
    # Initialize every field to an empty string so partial pages still yield
    # a complete item.
    for field in ('nameCh', 'namePin', 'alias', 'nameEng', 'source',
                  'description', 'area', 'gather', 'shape', 'taste', 'effect',
                  'application', 'pharmacology', 'component', 'tatoo',
                  'prescription'):
        item[field] = ""
    item['url'] = urlReq

    conten_part = soup.find("div", class_="gaishu")
    if conten_part is None:
        return

    aliasFlag = False
    description = conten_part.find("div", class_="text")
    for content in description.find_all("p"):
        key = content.find("strong")
        if key is None:
            continue

        strTmp = content.get_text().strip()
        # The field value follows the closing "】" of the field label.
        strCont = strTmp[strTmp.find("】") + 1:].strip()
        name = key.string
        if name is None:
            continue
        # Key is "药名" or "中药名" (drug name). The HTML contains spaces
        # written as &nbsp;, which unicodedata.normalize converts to
        # plain spaces.
        if "药名" in name or "中药名" in name:
            strCont = strCont.replace(';', '').replace('’', '').replace('\'', '')
            strCont = unicodedata.normalize("NFKD", strCont)
            if strCont.find(' ') != -1:
                item['nameCh'] = strCont[:strCont.find(' ')].strip()
                # Characters pronounced "ye" get transcoded as "xue" here.
                item['namePin'] = pinyin.yinfu2pinyin(
                    string=strCont[strCont.find(' ') + 1:].strip().replace(' ', ''))
            else:
                item['nameCh'] = strCont.strip()
                res = ""
                for alphat in pinyin.hanzi2pinyin(string=item['nameCh']):
                    res = res + alphat
                item['namePin'] = res
        # The first "别名" (alias) entry is the alias; a second one holds
        # the English name.
        elif "别名" in name:
            if not aliasFlag:
                item['alias'] = strCont
                aliasFlag = True
            else:
                item['nameEng'] = strCont.replace(';', '').replace('’', '').replace('\'', '')
        elif "英文名" in name:  # English name
            item['nameEng'] = strCont.replace(';', '').replace('’', '').replace('\'', '')
        elif "来源" in name:  # source
            item['source'] = strCont.replace(';', '').replace('\'', '')
        elif "植物形态" in name:  # plant morphology
            item['description'] = strCont.replace(';', '').replace('\'', '')
        elif "产地分布" in name or "生境分布" in name:  # growing area / habitat
            item['area'] = strCont.replace(';', '').replace('\'', '')
        elif "采收加工" in name:  # harvesting and processing
            item['gather'] = strCont.replace(';', '').replace('\'', '')
        elif "药材性状" in name:  # physical characteristics
            item['shape'] = strCont.replace(';', '').replace('\'', '')
        elif "性味归经" in name:  # taste and meridian tropism
            item['taste'] = strCont.replace(';', '').replace('\'', '')
        elif "功效与作用" in name:  # effects and functions
            item['effect'] = strCont.replace(';', '').replace('\'', '')
        elif "临床应用" in name:  # clinical application
            item['application'] = strCont.replace(';', '').replace('\'', '')
        elif "化学成分" in name or "主要成分" in name:  # chemical / main components
            item['component'] = strCont.replace(';', '').replace('\'', '')
            item['component'] = item['component'].replace('₁', '1').replace('₃', '3')
        elif "使用禁忌" in name:  # contraindications
            item['tatoo'] = strCont.replace(';', '').replace('\'', '')
        # "配伍药方" / "相关药方" (related prescriptions) continue across the
        # following <p> siblings until a blank line or the "related articles"
        # ("相关推荐文章") block.
        elif "配伍药方" in name or "相关药方" in name:
            for tagP in content.find_next_siblings("p"):
                strTag = tagP.get_text().strip()
                if strTag == "" or "相关推荐文章" in strTag:
                    break
                strCont = strCont + strTag
            item['prescription'] = strCont.replace(';', '').replace('\'', '')
        else:
            continue

    return item
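
Because these parsers take raw HTML plus the request URL, they can also be driven outside a crawler. A minimal sketch of such a driver, assuming the requests library is available (the URL below is a placeholder, not a real page):

import requests

url = "http://www.zhongyoo.com/name/example.html"  # placeholder URL
resp = requests.get(url, timeout=10)
resp.encoding = resp.apparent_encoding  # pages like this are often GBK-encoded
item = parse_zhongyoo_item(resp.text, url)
if item is not None:
    print(item.get('nameCh'), item.get('namePin'))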