Python BaseDataItem示例，covid_19.items.BaseDataItem Python示例

示例#1

0

显示文件

文件： shanxiSpider.py 项目： zsweet/Covid19-News-Crawl

    def detail_parse(self, response):
        """Build a BaseDataItem from a Shanxi detail page.

        Title, publish date, attendee names, summary and body text are read
        from the page with XPath.

        Args:
            response: Page response; ``response.meta["detail_url"]`` must be
                present.

        Yields:
            BaseDataItem: The item populated from this page.
        """
        detail_url = response.meta["detail_url"]
        item = BaseDataItem()
        sel = Selector(response)
        title = sel.xpath(
            '//div[@class="detail-article-title oflow-hd"]/h5//text()'
        ).extract_first()
        raw_time = sel.xpath(
            "//li[@class='article-infos-source left']/span[1]/text()"
        ).extract_first()
        person_arr = sel.xpath("//p/strong/text()").extract()
        summary = sel.xpath(
            '//p[@align="center"]/following-sibling::p[1]//text()'
        ).extract_first()
        content_strs = sel.xpath(
            '//div[@class="TRS_Editor"]/p//text()').extract()
        # One stripped text node per line.
        content = "".join(row.strip() + "\n" for row in content_strs)
        # extract_first() yields None when nothing matches; keep only the part
        # before the first space and fall back to "" instead of raising.
        publish_time = raw_time.split(" ", 1)[0] if raw_time else ""

        item["title"] = title
        item['publish_time'] = publish_time
        item['province'] = "山西"
        item['location'] = ""
        item['attend_persons'] = person_arr
        item['summary'] = summary
        item['content'] = content
        item['detail_url'] = detail_url
        yield item

示例#2

0

显示文件

文件： anhuiSpider.py 项目： zsweet/Covid19-News-Crawl

    def detail_parse(self, response):
        """Yield a BaseDataItem for an Anhui detail page.

        URL, title and publish time come from ``response.meta``; attendees
        and body text are scraped from the page.

        Args:
            response: Page response carrying ``detail_url``, ``title`` and
                ``publish_time`` in ``response.meta``.

        Yields:
            BaseDataItem: The item populated from this page.
        """
        item = BaseDataItem()
        page = Selector(response)
        meta = response.meta
        item["detail_url"] = meta["detail_url"]
        item["title"] = meta["title"]
        item["publish_time"] = meta["publish_time"]
        item["province"] = "安徽"
        item["location"] = ""
        item["summary"] = ""

        image_titles = page.xpath(
            '//div[@class="fty_imglistlb"]/ul/li/a/@data-title').extract()
        # Per the original author's note the list carries one extra trailing
        # "发布会现场" entry, so the last element is dropped. Entries are stored
        # as a list and left unstripped (names may contain inner spaces).
        item["attend_persons"] = image_titles[:-1]

        paragraphs = page.xpath(
            '//div[@class="desc j-fontContent"]/p/text()').extract()
        item["content"] = "".join(text.strip() + "\n" for text in paragraphs)
        yield item

示例#3

0

显示文件

文件： tianjinSpider.py 项目： zsweet/Covid19-News-Crawl

 def detail_parse(self, response):
     """Yield a BaseDataItem for a Tianjin detail page.

     URL and publish time come from ``response.meta``; title, summary,
     attendees and body text are scraped from ``div#content``.

     Args:
         response: Page response carrying ``detail_url`` and
             ``publish_time`` in ``response.meta``.

     Yields:
         BaseDataItem: The item populated from this page.
     """
     def as_lines(query):
         # Strip every matched text node and terminate each with a newline.
         return "".join(
             node.strip() + "\n" for node in page.xpath(query).extract())

     item = BaseDataItem()
     page = Selector(response)
     meta = response.meta
     item["detail_url"] = meta["detail_url"]
     item["publish_time"] = meta["publish_time"]
     item["title"] = as_lines('//div[@id="content"]//p[1]/text()')
     item["summary"] = as_lines('//div[@id="content"]//p[2]//text()')
     item["province"] = "天津"
     item["location"] = "天津"
     item["attend_persons"] = as_lines(
         '//div[@id="content"]//p/span[@style="color: #0033ff"]//text()')
     item["time_stamp"] = ""
     body_nodes = page.xpath('//div[@id="content"]//p//text()').extract()
     # The first text node is left out of the body.
     item["content"] = "".join(
         node.strip() + "\n" for node in body_nodes[1:])
     yield item

示例#4

0

显示文件

文件： ningxiaSpider.py 项目： zsweet/Covid19-News-Crawl

 def detail_parse(self, response):
     """Build a BaseDataItem from a Ningxia detail page.

     Title, publish time, body text and attendees are scraped from the
     page; the URL comes from ``response.meta["detail_url"]``.

     Args:
         response: Page response carrying ``detail_url`` in
             ``response.meta``.

     Yields:
         BaseDataItem: The item populated from this page.
     """
     item = BaseDataItem()
     sel = Selector(response)
     title = sel.xpath('//div[@id="info_title"]/text()').extract_first()
     raw_time = sel.xpath(
         '//span[@id="info_released_dtime"]/text()').extract_first()
     # Drop the trailing 9 characters of the timestamp; extract_first()
     # yields None when the span is missing, so fall back to "".
     publish_time = raw_time[:-9] if raw_time else ""
     content_strs = sel.xpath('//div[@id="ofdneed"]//p/text()').extract()
     content = "".join(text.strip() + "\n" for text in content_strs)
     attend_persons_str = sel.xpath(
         '//div[@id="ofdneed"]//p[last()]/text()').extract_first()
     # The last paragraph is split on the full-width comma into names.
     attend_persons = []
     if attend_persons_str is not None:
         attend_persons = attend_persons_str.split("，")
     item["publish_time"] = publish_time
     item["location"] = ""
     item["province"] = "宁夏"
     item["detail_url"] = response.meta["detail_url"]
     item["attend_persons"] = attend_persons
     item["title"] = title
     item["summary"] = ""
     item["content"] = content
     yield item

示例#5

0

显示文件

    def detail_parse(self, response):
        """Build a BaseDataItem from a Gansu detail page.

        URL and publish time come from ``response.meta``; title, body text,
        attendees and summary are scraped from the page.

        Args:
            response: Page response carrying ``detail_url`` and
                ``publish_time`` in ``response.meta``.

        Yields:
            BaseDataItem: The item populated from this page.
        """
        item = BaseDataItem()
        sel = Selector(response)
        item["detail_url"] = response.meta["detail_url"]
        item["province"] = "甘肃"
        item["location"] = ""
        # NOTE (from the original author): the time found in the article body
        # would be more accurate than this value.
        item["publish_time"] = response.meta["publish_time"]
        item["title"] = sel.xpath(
            '//table[@width="95%"]//td/text()').extract_first()

        content_text = sel.xpath(
            '//td[@class="bt_content"]//p//text()').extract()
        item["content"] = "".join(row.strip() + "\n" for row in content_text)

        attend_persons_text = sel.xpath(
            '//td[@class="bt_content"]//p//span/text()').extract()
        item["attend_persons"] = "".join(
            per.strip() + "\n" for per in attend_persons_text)

        # The description lives in the meta tag's ``content`` attribute, so it
        # must be selected with ``@content`` rather than as a child element.
        summary_text = sel.xpath(
            '//meta[@name="description"]/@content').extract()
        # Store the joined string (not the raw list of matches).
        item["summary"] = "".join(
            part.strip() + "\n" for part in summary_text)
        yield item

示例#6

0

显示文件

    def detail_parse(self, response):
        """Yield a BaseDataItem for a Fujian detail page.

        URL, publish time and title come from ``response.meta``; summary and
        body text are scraped, each with a fallback XPath.

        Args:
            response: Page response carrying ``detail_url``,
                ``publish_time`` and ``title`` in ``response.meta``.

        Yields:
            BaseDataItem: The item populated from this page.
        """
        justified = '//td[@id="new_message_id"]//p[@align="justify"]/text()'
        item = BaseDataItem()
        page = Selector(response)
        meta = response.meta
        item["detail_url"] = meta["detail_url"]
        item["publish_time"] = meta["publish_time"]
        item["title"] = meta["title"]

        # First justified paragraph, else the "big_right" div, else empty.
        lead = page.xpath(justified).extract_first()
        if lead is None:
            lead = page.xpath(
                '//div[@class="big_right"]/text()').extract_first() or ""
        item["summary"] = lead.strip()

        item["province"] = "福建"
        item["location"] = ""
        item["attend_persons"] = ""
        item["time_stamp"] = ""

        # Justified paragraphs when present, otherwise every text node under
        # the direct <p> children of the message cell.
        body_nodes = page.xpath(justified).extract() or page.xpath(
            '//td[@id="new_message_id"]/p//text()').extract()
        item["content"] = "".join(node.strip() + "\n" for node in body_nodes)
        yield item

示例#7

0

显示文件

 def detail_parse(self, response):
     """Build a BaseDataItem from a Hubei detail page.

     All fields except the URL are scraped from the page; the URL comes
     from ``response.meta["detail_url"]``.

     Args:
         response: Page response carrying ``detail_url`` in
             ``response.meta``.

     Yields:
         BaseDataItem: The item populated from this page.
     """
     item = BaseDataItem()
     sel = Selector(response)
     current_url = response.meta["detail_url"]
     cols = sel.xpath('//div[@class="text_record"]/div/p/text()').extract()
     # extract() returns a list (never None), so test for emptiness to make
     # the fallback layout reachable.
     if not cols:
         cols = sel.xpath(
             '//div[@class="text_record"]//p/span/text()').extract()
     content = "".join(col.strip() + "\n" for col in cols)
     item["publish_time"] = sel.xpath(
         "//ul[@class='list-unstyled int_list']/li[1]/text()"
     ).extract_first()
     item["attend_persons"] = sel.xpath(
         "//ul[@class='list-unstyled int_list']/li[2]/text()"
     ).extract_first()
     item["province"] = "湖北"
     item["location"] = ""
     summary = sel.xpath(
         "//ul[@class='list-unstyled int_list']//p/text()").extract_first()
     # extract_first() yields None when nothing matches; avoid None.strip().
     item["summary"] = summary.strip() if summary else ""
     item["title"] = sel.xpath("//h2/text()").extract_first()
     item["content"] = content
     item["detail_url"] = current_url
     yield item

示例#8

0

显示文件

文件： beijingSpider.py 项目： zsweet/Covid19-News-Crawl

 def detail_parse(self, response):
     """Build a BaseDataItem from a Beijing detail page.

     The publish date and title are each looked up in several candidate
     locations; body text is read from the container paragraphs with a
     fallback to the "brief" div. The URL comes from
     ``response.meta["detail_url"]``.

     Args:
         response: Page response carrying ``detail_url`` in
             ``response.meta``.

     Yields:
         BaseDataItem: The item populated from this page.
     """
     item = BaseDataItem()
     sel = Selector(response)
     publish_time = sel.xpath("//span/text()").extract_first()
     # A missing or very short first <span> cannot hold a date; try the
     # other candidate locations.
     if publish_time is None or len(publish_time) < 6:
         # No text() step here: the element's markup is extracted and the
         # regex below picks the date out of it.
         publish_time = sel.xpath('//p[@class="detailmsg"]').extract_first()
         if publish_time is None:
             publish_time = sel.xpath('//h6/text()').extract_first()
     # Keep the first "digits-digits-digits" run; fall back to "" when no
     # candidate exists or none contains a date.
     dates = re.findall(r".*?(\d+-\d+-\d+).*", publish_time or "", re.M)
     publish_time = dates[0] if dates else ""
     title = sel.xpath("//h1/text()").extract_first()
     if title is None:
         title = sel.xpath("//h2/text()").extract_first()
     item["detail_url"] = response.meta["detail_url"]
     item["publish_time"] = publish_time
     item["title"] = title
     item["summary"] = ""
     item["province"] = "北京"
     item["location"] = "北京"
     item["attend_persons"] = ""
     item["time_stamp"] = ""
     content_text = sel.xpath('//div[@class="container"]/p/text()').extract()
     if not content_text:
         content_text = sel.xpath('//div[@class="brief"]/text()').extract()
     item["content"] = "".join(row.strip() + "\n" for row in content_text)
     yield item

示例#9

0

显示文件

文件： sichuanSpider.py 项目： zsweet/Covid19-News-Crawl

    def detail_parse(self, response):
        """Build a BaseDataItem from a Sichuan detail page.

        URL and publish time come from ``response.meta``; title and body
        text are scraped from the page.

        Args:
            response: Page response carrying ``detail_url`` and
                ``publish_time`` in ``response.meta``.

        Yields:
            BaseDataItem: The item populated from this page.
        """
        item = BaseDataItem()
        sel = Selector(response)
        item["detail_url"] = response.meta["detail_url"]
        item["publish_time"] = response.meta["publish_time"]
        item["province"] = "四川"
        item["location"] = ""
        title = sel.xpath(
            '//div[@id="articlecontent"]/h2/ucaptitle/text()').extract_first()
        # extract_first() yields None when the title node is missing; store
        # "" rather than raising on None.strip().
        item["title"] = title.strip() if title else ""

        content_text = sel.xpath(
            '//div[@id="cmsArticleContent"]//p/text()').extract()
        item["content"] = "".join(row.strip() + "\n" for row in content_text)
        item["attend_persons"] = ""
        item["summary"] = ""
        yield item

示例#10

0

显示文件

    def detail_parse(self, response):
        """Yield a BaseDataItem for a Hunan detail page.

        URL, title and publish time come from ``response.meta``; the
        attendee list and body text are scraped from the page.

        Args:
            response: Page response carrying ``detail_url``, ``title`` and
                ``publish_time`` in ``response.meta``.

        Yields:
            BaseDataItem: The item populated from this page.
        """
        page = Selector(response)
        item = BaseDataItem()
        meta = response.meta
        item["detail_url"] = meta["detail_url"]
        item["title"] = meta["title"]
        item["publish_time"] = meta["publish_time"]
        item["province"] = "湖南"
        item["location"] = ""
        # Attendees are stored as the raw list of matched text nodes.
        item["attend_persons"] = page.xpath(
            '//ul[@class="fbh_list"]/li/p/text()').extract()

        body_nodes = page.xpath('//div[@class="ct_txt"]//p//text()').extract()
        item["content"] = "".join(node.strip() + "\n" for node in body_nodes)
        item["summary"] = ""
        yield item

示例#11

0

显示文件

    def detail_parse(self, response):
        """Yield a BaseDataItem for a Zhejiang detail page.

        Only the body text is scraped; URL, title and publish time come
        from ``response.meta``.

        Args:
            response: Page response carrying ``detail_url``, ``title`` and
                ``publish_time`` in ``response.meta``.

        Yields:
            BaseDataItem: The item populated from this page.
        """
        item = BaseDataItem()
        page = Selector(response)
        body_nodes = page.xpath('//div[@id="zoom"]/p//text()').extract()
        body = "".join(node.strip() + "\n" for node in body_nodes)
        meta = response.meta
        item["detail_url"] = meta["detail_url"]
        item["title"] = meta["title"]
        item["province"] = "浙江"
        item["location"] = ""
        item["content"] = body

        # Scraping attendees and publish time from the page's
        # "chat_cont_list" block was disabled by the original author, so
        # attendees stay empty and the time is taken from response.meta.
        item["attend_persons"] = ""
        item["publish_time"] = meta["publish_time"]
        item["summary"] = ""
        yield item

示例#12

0

显示文件

文件： yunnanxdSpider.py 项目： zsweet/Covid19-News-Crawl

 def detail_parse(self, response):
     """Yield a BaseDataItem for a Yunnan detail page.

     Only the body text is scraped; URL, publish time and title come from
     ``response.meta``.

     Args:
         response: Page response carrying ``detail_url``, ``publish_time``
             and ``title`` in ``response.meta``.

     Yields:
         BaseDataItem: The item populated from this page.
     """
     item = BaseDataItem()
     page = Selector(response)
     meta = response.meta
     item["detail_url"] = meta["detail_url"]
     item["publish_time"] = meta["publish_time"]
     item["title"] = meta["title"]
     item["summary"] = ""
     item["province"] = "云南"
     item["location"] = ""
     item["attend_persons"] = ""
     item["time_stamp"] = ""
     body_nodes = page.xpath(
         '//div[@class="view TRS_UEDITOR trs_paper_default trs_web"]//p//text()'
     ).extract()
     # One stripped text node per line.
     item["content"] = "".join(node.strip() + "\n" for node in body_nodes)
     yield item

示例#13

0

显示文件

 def detail_parse(self, response):
     """Yield a BaseDataItem for a Jiangsu detail page.

     Only the body text is scraped; URL, title and publish time come from
     ``response.meta``.

     Args:
         response: Page response carrying ``detail_url``, ``title`` and
             ``publish_time`` in ``response.meta``.

     Yields:
         BaseDataItem: The item populated from this page.
     """
     meta = response.meta
     detail_url = meta["detail_url"]
     title = meta["title"]
     publish_time = meta["publish_time"]
     page = Selector(response)
     # Text nodes are concatenated as-is: no stripping, no separators.
     body = "".join(page.xpath('//div[@id="zoom"]/p//text()').extract())
     item = BaseDataItem()
     item["detail_url"] = detail_url
     item["title"] = title
     item["publish_time"] = publish_time
     item["province"] = "江苏"
     item["location"] = ""
     item["attend_persons"] = ""
     item["summary"] = ""
     item["content"] = body
     yield item

示例#14

0

显示文件

文件： jilinSpider.py 项目： zsweet/Covid19-News-Crawl

    def detail_parse(self, response):
        """Build a BaseDataItem from a Jilin detail page.

        URL, title and publish time come from ``response.meta``; body text
        and attendees are scraped from the page.

        Args:
            response: Page response carrying ``detail_url``, ``title`` and
                ``publish_time`` in ``response.meta``.

        Yields:
            BaseDataItem: The item populated from this page.
        """
        item = BaseDataItem()
        sel = Selector(response)
        uniq_time = response.meta["publish_time"]
        # Replace every non-digit character with "-". The pattern is a raw
        # string so that "\D" is not an invalid string escape.
        item["publish_time"] = re.sub(r"\D", "-", uniq_time)
        item["detail_url"] = response.meta["detail_url"]
        item["title"] = response.meta["title"]
        item["province"] = "吉林"
        item["location"] = ""

        content_text = sel.xpath(
            '//div[@class="TRS_Editor"]//text()').extract()
        content = "".join(col.strip() + "\n" for col in content_text)
        item["summary"] = ""
        item["content"] = content
        # Attendees are stored as the raw list of <font> text nodes.
        item["attend_persons"] = sel.xpath('//font/text()').extract()
        yield item

示例#15

0

显示文件

文件： qinghaiSpider.py 项目： zsweet/Covid19-News-Crawl

 def detail_parse(self, response):
     """Build a BaseDataItem from a Qinghai detail page.

     Publish time, title and body text are scraped from the page; the URL
     comes from ``response.meta["detail_url"]``.

     Args:
         response: Page response carrying ``detail_url`` in
             ``response.meta``.

     Yields:
         BaseDataItem: The item populated from this page.
     """
     item = BaseDataItem()
     sel = Selector(response)
     raw_time_data = sel.xpath(
         "//div[@class='abstract tc']/text()").extract()
     # The timestamp is read from the fourth text node, keeping the part
     # before the first space. Guard the index so a page with fewer nodes
     # yields an empty time instead of raising IndexError.
     if len(raw_time_data) > 3:
         publish_time = raw_time_data[3].strip().split(" ", 1)[0]
     else:
         publish_time = ""
     item["detail_url"] = response.meta["detail_url"]
     item["publish_time"] = publish_time
     item["province"] = "青海"
     item["location"] = ""
     item["title"] = sel.xpath('//h1[@class="blue tc"]/text()').extract_first()
     content_text = sel.xpath(
         '//div[@class="details_content"]/p/text()').extract()
     item["content"] = "".join(row.strip() + "\n" for row in content_text)
     item["attend_persons"] = ""
     item["summary"] = ""
     yield item

示例#16

0

显示文件

文件： nmgSpider.py 项目： zsweet/Covid19-News-Crawl

    def detail_parse(self, response):
        """Yield a BaseDataItem for an Inner Mongolia detail page.

        Only the body text is scraped; URL, title and publish time come
        from ``response.meta``.

        Args:
            response: Page response carrying ``detail_url``, ``title`` and
                ``publish_time`` in ``response.meta``.

        Yields:
            BaseDataItem: The item populated from this page.
        """
        item = BaseDataItem()
        page = Selector(response)
        meta = response.meta
        item["detail_url"] = meta["detail_url"]
        item["title"] = meta["title"]
        item["publish_time"] = meta["publish_time"]
        item["province"] = "内蒙古自治区"
        item["location"] = ""
        body_nodes = page.xpath('//div[@id="zoom"]/p//text()').extract()
        # One stripped text node per line.
        item["content"] = "".join(node.strip() + "\n" for node in body_nodes)
        item["attend_persons"] = ""
        item["summary"] = ""
        yield item

示例#17

0

显示文件

    def detail_parse(self, response):
        """Yield a BaseDataItem for a Tibet detail page.

        URL, publish time and title come from ``response.meta``; the body
        text is scraped from the page. The attendee field is stored empty.

        Args:
            response: Page response carrying ``detail_url``,
                ``publish_time`` and ``title`` in ``response.meta``.

        Yields:
            BaseDataItem: The item populated from this page.
        """
        item = BaseDataItem()
        page = Selector(response)
        meta = response.meta
        item["detail_url"] = meta["detail_url"]
        item["publish_time"] = meta["publish_time"]
        item["title"] = meta["title"]
        item["summary"] = ""
        item["province"] = "西藏自治区"
        item["location"] = ""
        body_nodes = page.xpath(
            '//div[@class="vw-art-list"]//p/text()').extract()
        item["content"] = "".join(node.strip() + "\n" for node in body_nodes)
        span_texts = page.xpath(
            '//div[@class="vw-art-list"]//span/text()').extract()
        # NOTE(review): the attendee string assembled from the span texts
        # (skipping the first) is never stored; the field is set to "".
        # Confirm whether that is intentional or the value should be saved.
        attendees = "".join(name.strip() + "\n" for name in span_texts[1:])
        item["attend_persons"] = ""
        yield item

示例#18

0

显示文件

 def detail_parse(self, response):
     """Yield a BaseDataItem for a Henan detail page.

     URL, publish time and title come from ``response.meta``; summary and
     body text are scraped from the "content" div.

     Args:
         response: Page response carrying ``detail_url``, ``publish_time``
             and ``title`` in ``response.meta``.

     Yields:
         BaseDataItem: The item populated from this page.
     """
     item = BaseDataItem()
     page = Selector(response)
     meta = response.meta
     item["detail_url"] = meta["detail_url"]
     item["publish_time"] = meta["publish_time"]
     item["title"] = meta["title"]
     # The summary is the text of the fourth paragraph position.
     lead_nodes = page.xpath(
         '//div[@class="content"]//p[4]//text()').extract()
     item["summary"] = "".join(node.strip() + "\n" for node in lead_nodes)
     item["province"] = "河南"
     item["location"] = ""
     item["attend_persons"] = ""
     item["time_stamp"] = ""
     body_nodes = page.xpath(
         '//div[@class="content"]//p//text()').extract()
     item["content"] = "".join(node.strip() + "\n" for node in body_nodes)
     yield item

示例#19

0

显示文件

 def detail_parse(self, response):
     """Yield a BaseDataItem for a Shandong detail page.

     URL, publish time and title come from ``response.meta``; attendees
     and body text are scraped from the page.

     Args:
         response: Page response carrying ``detail_url``, ``publish_time``
             and ``title`` in ``response.meta``.

     Yields:
         BaseDataItem: The item populated from this page.
     """
     item = BaseDataItem()
     page = Selector(response)
     meta = response.meta
     item["detail_url"] = meta["detail_url"]
     item["publish_time"] = meta["publish_time"]
     item["title"] = meta["title"]
     item["summary"] = ""
     item["province"] = "山东"
     item["location"] = ""
     # Attendees are taken from the link texts in the "photos" list.
     names = page.xpath(
         '//div[@class="photos"]/ul//li/p/a/text()').extract()
     item["attend_persons"] = "".join(name.strip() + "\n" for name in names)
     item["time_stamp"] = ""
     item["score"] = ""
     body_nodes = page.xpath('//div[@class="info"]//text()').extract()
     item["content"] = "".join(node.strip() + "\n" for node in body_nodes)
     yield item

示例#20

0

显示文件

    def text_parse(self, response):
        """Yield a BaseDataItem for a Liaoning detail page.

        Publish time, URL and title come from ``response.meta``; body text
        and attendees are scraped from the "fbh_wzf" div.

        Args:
            response: Page response carrying ``publish_time``,
                ``detail_url`` and ``title`` in ``response.meta``.

        Yields:
            BaseDataItem: The item populated from this page.
        """
        item = BaseDataItem()
        page = Selector(response)
        meta = response.meta
        item["publish_time"] = meta["publish_time"]
        item["detail_url"] = meta["detail_url"]
        item["title"] = meta["title"]
        item["province"] = "辽宁"
        item["location"] = ""
        body_nodes = page.xpath('//div[@class="fbh_wzf"]//text()').extract()
        item["content"] = "".join(node.strip() + "\n" for node in body_nodes)

        # Attendees come from the nested "fbh_rm" div, one text node per line.
        people_nodes = page.xpath(
            '//div[@class="fbh_wzf"]/div[@class="fbh_rm"]//text()').extract()
        item["attend_persons"] = "".join(
            node.strip() + "\n" for node in people_nodes)
        item["summary"] = ""
        yield item

示例#21

0

显示文件

文件： gansudtSpider.py 项目： zsweet/Covid19-News-Crawl

    def detail_parse(self, response):
        """Yield a BaseDataItem for a Gansu detail page.

        URL and publish time come from ``response.meta``; title and body
        text are scraped from the page. Attendees are stored as an empty
        list.

        Args:
            response: Page response carrying ``detail_url`` and
                ``publish_time`` in ``response.meta``.

        Yields:
            BaseDataItem: The item populated from this page.
        """
        item = BaseDataItem()
        page = Selector(response)
        meta = response.meta
        item["detail_url"] = meta["detail_url"]
        item["publish_time"] = meta["publish_time"]
        item["province"] = "甘肃"
        item["location"] = ""
        item["attend_persons"] = []
        # Title fragments are stripped and joined without any separator.
        title_nodes = page.xpath(
            '//table[@width="95%"]//tr[1]/td/text()').extract()
        item["title"] = "".join(node.strip() for node in title_nodes)

        body_nodes = page.xpath('//div[@id="zoom"]//p//text()').extract()
        item["content"] = "".join(node.strip() + "\n" for node in body_nodes)
        item["summary"] = ""
        yield item