示例#1
0
    def detail_parse(self, response):
        """Parse a Shanxi detail page and yield a single ``BaseDataItem``.

        Args:
            response: Page response; ``response.meta["detail_url"]`` must be
                set by the caller.

        Yields:
            The item with title, publish_time, province, location,
            attend_persons (list of text nodes), summary, content and
            detail_url filled in.
        """
        detail_url = response.meta["detail_url"]
        item = BaseDataItem()
        sel = Selector(response)
        title = sel.xpath(
            '//div[@class="detail-article-title oflow-hd"]/h5//text()'
        ).extract_first()
        raw_time = sel.xpath(
            "//li[@class='article-infos-source left']/span[1]/text()"
        ).extract_first()
        person_arr = sel.xpath("//p/strong/text()").extract()
        summary = sel.xpath(
            '//p[@align="center"]/following-sibling::p[1]//text()'
        ).extract_first()
        content_strs = sel.xpath(
            '//div[@class="TRS_Editor"]/p//text()').extract()
        # One stripped text node per line, each terminated by a newline.
        content = "".join(row.strip() + "\n" for row in content_strs)
        # extract_first() yields None when the node is absent. Keep only the
        # part before the first space and fall back to "" instead of
        # crashing on None.split().
        publish_time = raw_time.split(" ", 1)[0] if raw_time else ""

        item["title"] = title
        item['publish_time'] = publish_time
        item['province'] = "山西"
        item['location'] = ""
        item['attend_persons'] = person_arr
        item['summary'] = summary
        item['content'] = content
        item['detail_url'] = detail_url
        yield item
示例#2
0
    def detail_parse(self, response):
        """Build and yield the item for one Anhui detail page.

        ``detail_url``, ``title`` and ``publish_time`` come from
        ``response.meta``; attendees and content are read from the page.
        """
        page = Selector(response)
        record = BaseDataItem()
        record["detail_url"] = response.meta["detail_url"]
        record["title"] = response.meta["title"]
        record["publish_time"] = response.meta["publish_time"]
        record["province"] = "安徽"
        record["location"] = ""
        record["summary"] = ""

        captions = page.xpath(
            '//div[@class="fty_imglistlb"]/ul/li/a/@data-title').extract()
        # The last caption is an extra "press conference scene" entry rather
        # than a person, so it is dropped. The list is stored as-is: the
        # captions are not stripped because names may contain spaces.
        record["attend_persons"] = captions[:-1]

        paragraphs = page.xpath(
            '//div[@class="desc j-fontContent"]/p/text()').extract()
        record["content"] = "".join(
            [paragraph.strip() + "\n" for paragraph in paragraphs])
        yield record
 def detail_parse(self, response):
     """Build and yield the item for one Tianjin detail page.

     Title, summary, attendees and content are each assembled from text
     nodes under ``div#content``: every node is stripped and terminated
     with a newline. ``detail_url`` and ``publish_time`` are taken from
     ``response.meta``.
     """
     page = Selector(response)

     def as_lines(nodes):
         # Strip each text node and end it with a newline.
         return "".join(node.strip() + "\n" for node in nodes)

     record = BaseDataItem()
     record["detail_url"] = response.meta["detail_url"]
     record["publish_time"] = response.meta["publish_time"]
     record["title"] = as_lines(
         page.xpath('//div[@id="content"]//p[1]/text()').extract())
     record["summary"] = as_lines(
         page.xpath('//div[@id="content"]//p[2]//text()').extract())
     record["province"] = "天津"
     record["location"] = "天津"
     # attend_persons is read from spans carrying this exact inline style.
     record["attend_persons"] = as_lines(
         page.xpath(
             '//div[@id="content"]//p/span[@style="color: #0033ff"]//text()'
         ).extract())
     record["time_stamp"] = ""
     body_nodes = page.xpath('//div[@id="content"]//p//text()').extract()
     # The first text node is left out of the content.
     record["content"] = as_lines(body_nodes[1:])
     yield record
 def detail_parse(self, response):
     """Parse a Ningxia detail page and yield a single ``BaseDataItem``.

     Args:
         response: Page response; ``response.meta["detail_url"]`` must be
             set by the caller.

     Yields:
         The item with publish_time, location, province, detail_url,
         attend_persons (list), title, summary and content filled in.
     """
     item = BaseDataItem()
     sel = Selector(response)
     title = sel.xpath('//div[@id="info_title"]/text()').extract_first()
     raw_time = sel.xpath(
         '//span[@id="info_released_dtime"]/text()').extract_first()
     # Drop the trailing 9 characters (presumably a " HH:MM:SS" suffix --
     # TODO confirm). A missing node yields None, so fall back to ""
     # rather than raising TypeError on None[:-9].
     publish_time = raw_time[:-9] if raw_time else ""
     content_strs = sel.xpath('//div[@id="ofdneed"]//p/text()').extract()
     content = "".join(text.strip() + "\n" for text in content_strs)
     # The last paragraph's first text node is split on ASCII commas to get
     # the attendee list; no such node means no attendees.
     attend_persons_str = sel.xpath(
         '//div[@id="ofdneed"]//p[last()]/text()').extract_first()
     attend_persons = []
     if attend_persons_str is not None:
         attend_persons = attend_persons_str.split(",")
     item["publish_time"] = publish_time
     item["location"] = ""
     item["province"] = "宁夏"
     item["detail_url"] = response.meta["detail_url"]
     item["attend_persons"] = attend_persons
     item["title"] = title
     item["summary"] = ""
     item["content"] = content
     yield item
示例#5
0
    def detail_parse(self, response):
        """Parse a Gansu detail page and yield a single ``BaseDataItem``.

        Args:
            response: Page response; ``response.meta`` must carry
                ``detail_url`` and ``publish_time``.

        Yields:
            The item with detail_url, province, location, publish_time,
            title, content, attend_persons and summary filled in. Content,
            attendees and summary are newline-terminated, stripped strings.
        """
        item = BaseDataItem()
        sel = Selector(response)
        item["detail_url"] = response.meta["detail_url"]
        item["province"] = "甘肃"
        item["location"] = ""
        # NOTE: the date found inside the article body would be more accurate
        # than the one passed in through meta.
        item["publish_time"] = response.meta["publish_time"]
        item["title"] = sel.xpath(
            '//table[@width="95%"]//td/text()').extract_first()

        content_text = sel.xpath(
            '//td[@class="bt_content"]//p//text()').extract()
        item["content"] = "".join(row.strip() + "\n" for row in content_text)

        attend_persons_text = sel.xpath(
            '//td[@class="bt_content"]//p//span/text()').extract()
        item["attend_persons"] = "".join(
            per.strip() + "\n" for per in attend_persons_text)

        # `content` is an attribute of <meta>, not a child element, so it has
        # to be selected with /@content; the previous /content/text() path
        # could never match.
        summary_text = sel.xpath(
            '//meta[@name="description"]/@content').extract()
        # Store the joined string (previously the raw list was assigned and
        # the built string was thrown away).
        item["summary"] = "".join(
            text.strip() + "\n" for text in summary_text)
        yield item
示例#6
0
    def detail_parse(self, response):
        """Build and yield the item for one Fujian detail page.

        Summary and content each try a primary XPath first and fall back to
        a secondary one when the primary yields nothing.
        """
        page = Selector(response)
        justified = '//td[@id="new_message_id"]//p[@align="justify"]/text()'

        # Summary: first justified paragraph, else the "big_right" div text,
        # else an empty string.
        lead = page.xpath(justified).extract_first()
        if lead is None:
            lead = page.xpath('//div[@class="big_right"]/text()').extract_first()
        if lead is None:
            lead = ""

        # Content: justified paragraphs, else every text node under the
        # direct <p> children of the message cell.
        paragraphs = page.xpath(justified).extract()
        if not paragraphs:
            paragraphs = page.xpath(
                '//td[@id="new_message_id"]/p//text()').extract()

        record = BaseDataItem()
        record["detail_url"] = response.meta["detail_url"]
        record["publish_time"] = response.meta["publish_time"]
        record["title"] = response.meta["title"]
        record["summary"] = lead.strip()
        record["province"] = "福建"
        record["location"] = ""
        record["attend_persons"] = ""
        record["time_stamp"] = ""
        record["content"] = "".join(
            [paragraph.strip() + "\n" for paragraph in paragraphs])
        yield record
示例#7
0
 def detail_parse(self, response):
     """Parse a Hubei detail page and yield a single ``BaseDataItem``.

     Args:
         response: Page response; ``response.meta["detail_url"]`` must be
             set by the caller.

     Yields:
         The item with publish_time, attend_persons, province, location,
         summary, title, content and detail_url filled in.
     """
     item = BaseDataItem()
     sel = Selector(response)
     current_url = response.meta["detail_url"]
     cols = sel.xpath('//div[@class="text_record"]/div/p/text()').extract()
     # extract() returns a list, never None, so emptiness is the right test
     # for "primary layout not found" (the old `is None` check never fired).
     if not cols:
         cols = sel.xpath(
             '//div[@class="text_record"]//p/span/text()').extract()
     content = "".join(col.strip() + "\n" for col in cols)
     item["publish_time"] = sel.xpath(
         "//ul[@class='list-unstyled int_list']/li[1]/text()"
     ).extract_first()
     item["attend_persons"] = sel.xpath(
         "//ul[@class='list-unstyled int_list']/li[2]/text()"
     ).extract_first()
     item["province"] = "湖北"
     item["location"] = ""
     summary = sel.xpath(
         "//ul[@class='list-unstyled int_list']//p/text()").extract_first()
     # extract_first() gives None when there is no summary paragraph; store
     # "" in that case instead of raising AttributeError on None.strip().
     item["summary"] = summary.strip() if summary else ""
     item["title"] = sel.xpath("//h2/text()").extract_first()
     item["content"] = content
     item["detail_url"] = current_url
     yield item
 def detail_parse(self, response):
     """Parse a Beijing detail page and yield a single ``BaseDataItem``.

     The publish date is searched for in the first ``<span>`` text, then in
     the ``p.detailmsg`` element, then in the first ``<h6>`` text; the first
     ``digits-digits-digits`` run found in that text is used.

     Args:
         response: Page response; ``response.meta["detail_url"]`` must be
             set by the caller.

     Yields:
         The item with detail_url, publish_time, title, summary, province,
         location, attend_persons, time_stamp and content filled in.
         ``publish_time`` is "" when no date can be found.
     """
     item = BaseDataItem()
     sel = Selector(response)
     publish_time = sel.xpath("//span/text()").extract_first()
     # A missing or very short first <span> cannot hold a full date, so try
     # the fallbacks. None is handled here to avoid len(None).
     if publish_time is None or len(publish_time) < 6:
         # No /text() here: the whole element markup is searched for a date.
         publish_time = sel.xpath('//p[@class="detailmsg"]').extract_first()
         if publish_time is None:
             publish_time = sel.xpath('//h6/text()').extract_first()
     # re.search returns None instead of raising when nothing matches,
     # unlike indexing the result of re.findall.
     date_match = re.search(
         r".*?(\d+-\d+-\d+).*", publish_time or "", re.M)
     publish_time = date_match.group(1) if date_match else ""
     title = sel.xpath("//h1/text()").extract_first()
     if title is None:
         title = sel.xpath("//h2/text()").extract_first()
     item["detail_url"] = response.meta["detail_url"]
     item["publish_time"] = publish_time
     item["title"] = title
     item["summary"] = ""
     item["province"] = "北京"
     item["location"] = "北京"
     item["attend_persons"] = ""
     item["time_stamp"] = ""
     content_text = sel.xpath('//div[@class="container"]/p/text()').extract()
     if not content_text:
         content_text = sel.xpath('//div[@class="brief"]/text()').extract()
     item["content"] = "".join(row.strip() + "\n" for row in content_text)
     yield item
    def detail_parse(self, response):
        """Parse a Sichuan detail page and yield a single ``BaseDataItem``.

        Args:
            response: Page response; ``response.meta`` must carry
                ``detail_url`` and ``publish_time``.

        Yields:
            The item with detail_url, publish_time, province, location,
            title, content, attend_persons and summary filled in. ``title``
            is "" when the title node is missing.
        """
        item = BaseDataItem()
        sel = Selector(response)
        item["detail_url"] = response.meta["detail_url"]
        item["publish_time"] = response.meta["publish_time"]
        item["province"] = "四川"
        item["location"] = ""
        title = sel.xpath(
            '//div[@id="articlecontent"]/h2/ucaptitle/text()').extract_first()
        # extract_first() yields None when the node is absent; avoid
        # AttributeError on None.strip().
        item["title"] = title.strip() if title else ""

        content_text = sel.xpath(
            '//div[@id="cmsArticleContent"]//p/text()').extract()
        item["content"] = "".join(row.strip() + "\n" for row in content_text)
        item["attend_persons"] = ""
        item["summary"] = ""
        yield item
示例#10
0
    def detail_parse(self, response):
        """Build and yield the item for one Hunan detail page.

        ``detail_url``, ``title`` and ``publish_time`` come from
        ``response.meta``; the attendee list and content come from the page.
        """
        page = Selector(response)
        meta = response.meta

        record = BaseDataItem()
        record["detail_url"] = meta["detail_url"]
        record["title"] = meta["title"]
        record["publish_time"] = meta["publish_time"]
        record["province"] = "湖南"
        record["location"] = ""
        # Attendees are stored as the raw list of text nodes.
        record["attend_persons"] = page.xpath(
            '//ul[@class="fbh_list"]/li/p/text()').extract()

        pieces = []
        for node in page.xpath('//div[@class="ct_txt"]//p//text()').extract():
            pieces.append(node.strip())
            pieces.append("\n")
        record["content"] = "".join(pieces)
        record["summary"] = ""
        yield record
示例#11
0
    def detail_parse(self, response):
        """Build and yield the item for one Zhejiang detail page.

        Only the content is read from the page (text under ``div#zoom``
        paragraphs); the remaining fields come from ``response.meta`` or are
        left empty.
        """
        page = Selector(response)
        nodes = page.xpath('//div[@id="zoom"]/p//text()').extract()
        body = "".join(map(lambda node: node.strip() + "\n", nodes))

        record = BaseDataItem()
        record["detail_url"] = response.meta["detail_url"]
        record["title"] = response.meta["title"]
        record["province"] = "浙江"
        record["location"] = ""
        record["content"] = body
        # Reading attendees and publish time from the page's "chat_cont_list"
        # block is disabled: attendees stay empty and the time comes from meta.
        record["attend_persons"] = ""
        record["publish_time"] = response.meta["publish_time"]
        record["summary"] = ""
        yield record
示例#12
0
 def detail_parse(self, response):
     """Build and yield the item for one Yunnan detail page.

     Everything except the content is taken from ``response.meta`` or left
     empty; the content is every text node inside the editor view's
     paragraphs, stripped and newline-terminated.
     """
     meta = response.meta
     record = BaseDataItem()
     record["detail_url"] = meta["detail_url"]
     record["publish_time"] = meta["publish_time"]
     record["title"] = meta["title"]
     record["summary"] = ""
     record["province"] = "云南"
     record["location"] = ""
     record["attend_persons"] = ""
     record["time_stamp"] = ""

     nodes = Selector(response).xpath(
         '//div[@class="view TRS_UEDITOR trs_paper_default trs_web"]//p//text()'
     ).extract()
     record["content"] = "".join([node.strip() + "\n" for node in nodes])
     yield record
示例#13
0
 def detail_parse(self, response):
     """Build and yield the item for one Jiangsu detail page.

     The content is the text nodes under ``div#zoom`` paragraphs joined
     verbatim: no stripping and no separators are applied here.
     """
     meta = response.meta
     page_url = meta["detail_url"]
     headline = meta["title"]
     published = meta["publish_time"]

     fragments = Selector(response).xpath(
         '//div[@id="zoom"]/p//text()').extract()

     record = BaseDataItem()
     record["detail_url"] = page_url
     record["title"] = headline
     record["publish_time"] = published
     record["province"] = "江苏"
     record["location"] = ""
     record["attend_persons"] = ""
     record["summary"] = ""
     record["content"] = "".join(fragments)
     yield record
示例#14
0
    def detail_parse(self, response):
        """Parse a Jilin detail page and yield a single ``BaseDataItem``.

        Args:
            response: Page response; ``response.meta`` must carry
                ``publish_time``, ``detail_url`` and ``title``.

        Yields:
            The item with publish_time, detail_url, title, province,
            location, summary, content and attend_persons (list of
            ``<font>`` text nodes) filled in.
        """
        item = BaseDataItem()
        sel = Selector(response)
        uniq_time = response.meta["publish_time"]
        # Every non-digit character becomes "-". Raw string: "\D" in a plain
        # literal is an invalid escape sequence and warns on modern Python.
        # NOTE(review): a trailing non-digit in the input leaves a trailing
        # "-" in the result -- confirm against the upstream date format.
        item["publish_time"] = re.sub(r"\D", "-", uniq_time)
        item["detail_url"] = response.meta["detail_url"]
        item["title"] = response.meta["title"]
        item["province"] = "吉林"
        item["location"] = ""

        content_text = sel.xpath(
            '//div[@class="TRS_Editor"]//text()').extract()
        content = "".join(col.strip() + "\n" for col in content_text)
        item["summary"] = ""
        item["content"] = content
        item["attend_persons"] = sel.xpath('//font/text()').extract()
        yield item
示例#15
0
 def detail_parse(self, response):
     """Parse a Qinghai detail page and yield a single ``BaseDataItem``.

     Args:
         response: Page response; ``response.meta["detail_url"]`` must be
             set by the caller.

     Yields:
         The item with detail_url, publish_time, province, location, title,
         content, attend_persons and summary filled in. ``publish_time`` is
         "" when the abstract block has fewer than four text nodes.
     """
     item = BaseDataItem()
     sel = Selector(response)
     raw_time_data = sel.xpath(
         "//div[@class='abstract tc']/text()").extract()
     # The timestamp is read from the fourth text node of the abstract
     # block, keeping the part before the first space. Guard the index so a
     # page with a different layout does not raise IndexError.
     if len(raw_time_data) > 3:
         publish_time = raw_time_data[3].strip().split(" ", 1)[0]
     else:
         publish_time = ""
     item["detail_url"] = response.meta["detail_url"]
     item["publish_time"] = publish_time
     item["province"] = "青海"
     item["location"] = ""
     item["title"] = sel.xpath(
         '//h1[@class="blue tc"]/text()').extract_first()
     content_text = sel.xpath(
         '//div[@class="details_content"]/p/text()').extract()
     item["content"] = "".join(row.strip() + "\n" for row in content_text)
     item["attend_persons"] = ""
     item["summary"] = ""
     yield item
示例#16
0
    def detail_parse(self, response):
        """Build and yield the item for one Inner Mongolia detail page.

        The content is the stripped, newline-terminated text nodes under
        ``div#zoom`` paragraphs; the other fields come from
        ``response.meta`` or are left empty.
        """
        meta = response.meta
        record = BaseDataItem()
        record["detail_url"] = meta["detail_url"]
        record["title"] = meta["title"]
        record["publish_time"] = meta["publish_time"]
        record["province"] = "内蒙古自治区"
        record["location"] = ""

        nodes = Selector(response).xpath(
            '//div[@id="zoom"]/p//text()').extract()
        record["content"] = "".join(node.strip() + "\n" for node in nodes)
        record["attend_persons"] = ""
        record["summary"] = ""
        yield record



        
示例#17
0
    def detail_parse(self, response):
        """Parse a Tibet detail page and yield a single ``BaseDataItem``.

        Args:
            response: Page response; ``response.meta`` must carry
                ``detail_url``, ``publish_time`` and ``title``.

        Yields:
            The item with detail_url, publish_time, title, summary,
            province, location, content and attend_persons filled in.
        """
        item = BaseDataItem()
        sel = Selector(response)
        item["detail_url"] = response.meta["detail_url"]
        item["publish_time"] = response.meta["publish_time"]
        item["title"] = response.meta["title"]
        item["summary"] = ""
        item["province"] = "西藏自治区"
        item["location"] = ""
        content_text = sel.xpath(
            '//div[@class="vw-art-list"]//p/text()').extract()
        item["content"] = "".join(row.strip() + "\n" for row in content_text)
        attend_persons_all = sel.xpath(
            '//div[@class="vw-art-list"]//span/text()').extract()
        # The first <span> text node is skipped. The joined string is now
        # stored; previously it was built and then "" was assigned instead.
        item["attend_persons"] = "".join(
            person.strip() + "\n" for person in attend_persons_all[1:])
        yield item

        
示例#18
0
 def detail_parse(self, response):
     """Build and yield the item for one Henan detail page.

     The summary is the text of each fourth ``<p>`` under ``div.content``;
     the content is the text of every ``<p>`` there. Both are stripped and
     newline-terminated node by node.
     """
     page = Selector(response)

     def as_lines(xpath):
         # Evaluate the XPath and join its stripped text nodes, one per line.
         return "".join(
             [node.strip() + "\n" for node in page.xpath(xpath).extract()])

     record = BaseDataItem()
     record["detail_url"] = response.meta["detail_url"]
     record["publish_time"] = response.meta["publish_time"]
     record["title"] = response.meta["title"]
     record["summary"] = as_lines('//div[@class="content"]//p[4]//text()')
     record["province"] = "河南"
     record["location"] = ""
     record["attend_persons"] = ""
     record["time_stamp"] = ""
     record["content"] = as_lines('//div[@class="content"]//p//text()')
     yield record
示例#19
0
 def detail_parse(self, response):
     """Build and yield the item for one Shandong detail page.

     Attendees are the link texts in the ``div.photos`` list and the content
     is every text node under ``div.info``; both are stripped and
     newline-terminated node by node.
     """
     page = Selector(response)
     names = page.xpath('//div[@class="photos"]/ul//li/p/a/text()').extract()
     body_nodes = page.xpath('//div[@class="info"]//text()').extract()

     record = BaseDataItem()
     record["detail_url"] = response.meta["detail_url"]
     record["publish_time"] = response.meta["publish_time"]
     record["title"] = response.meta["title"]
     record["summary"] = ""
     record["province"] = "山东"
     record["location"] = ""
     record["attend_persons"] = "".join(
         name.strip() + "\n" for name in names)
     record["time_stamp"] = ""
     record["score"] = ""
     record["content"] = "".join(
         node.strip() + "\n" for node in body_nodes)
     yield record
示例#20
0
    def text_parse(self, response):
        """Build and yield the item for one Liaoning detail page.

        The content is every text node under ``div.fbh_wzf``; attendees are
        the text nodes of its ``div.fbh_rm`` child. Both are stripped and
        newline-terminated node by node.
        """
        page = Selector(response)

        def as_lines(nodes):
            # Strip each text node and end it with a newline.
            return "".join(node.strip() + "\n" for node in nodes)

        record = BaseDataItem()
        record["publish_time"] = response.meta["publish_time"]
        record["detail_url"] = response.meta["detail_url"]
        record["title"] = response.meta["title"]
        record["province"] = "辽宁"
        record["location"] = ""
        record["content"] = as_lines(
            page.xpath('//div[@class="fbh_wzf"]//text()').extract())
        record["attend_persons"] = as_lines(
            page.xpath(
                '//div[@class="fbh_wzf"]/div[@class="fbh_rm"]//text()'
            ).extract())
        record["summary"] = ""
        yield record
示例#21
0
    def detail_parse(self, response):
        """Build and yield the item for one Gansu detail page.

        The title is the first-row cell text of the 95%-wide table, stripped
        and concatenated without separators; the content is the stripped,
        newline-terminated text under ``div#zoom`` paragraphs.
        """
        page = Selector(response)
        title_nodes = page.xpath(
            '//table[@width="95%"]//tr[1]/td/text()').extract()
        body_nodes = page.xpath('//div[@id="zoom"]//p//text()').extract()

        record = BaseDataItem()
        record["detail_url"] = response.meta["detail_url"]
        record["publish_time"] = response.meta["publish_time"]
        record["province"] = "甘肃"
        record["location"] = ""
        record["attend_persons"] = []
        record["title"] = "".join([node.strip() for node in title_nodes])
        record["content"] = "".join(
            [node.strip() + "\n" for node in body_nodes])
        record["summary"] = ""
        yield record