def detail_parse(self, response):
    """Parse a Shanxi press-conference detail page into a BaseDataItem.

    Title, publish date, attendees, summary and body text are extracted
    from the article markup; the detail URL is carried in ``response.meta``.
    """
    detail_url = response.meta["detail_url"]
    item = BaseDataItem()
    sel = Selector(response)
    title = sel.xpath(
        '//div[@class="detail-article-title oflow-hd"]/h5//text()'
    ).extract_first()
    raw_time = sel.xpath(
        "//li[@class='article-infos-source left']/span[1]/text()"
    ).extract_first()
    # Guard: extract_first() yields None when the date node is missing;
    # the original called raw_time.split(...) unconditionally and raised
    # AttributeError in that case.
    publish_time = raw_time.split(" ", 1)[0] if raw_time else ""
    person_arr = sel.xpath("//p/strong/text()").extract()
    summary = sel.xpath(
        '//p[@align="center"]/following-sibling::p[1]//text()'
    ).extract_first()
    content_strs = sel.xpath('//div[@class="TRS_Editor"]/p//text()').extract()
    # str.join instead of quadratic += concatenation in a loop.
    content = "".join(row.strip() + "\n" for row in content_strs)
    item["title"] = title
    item["publish_time"] = publish_time
    item["province"] = "山西"
    item["location"] = ""
    item["attend_persons"] = person_arr
    item["summary"] = summary
    item["content"] = content
    item["detail_url"] = detail_url
    yield item
def detail_parse(self, response):
    """Parse an Anhui press-conference detail page into a BaseDataItem."""
    item = BaseDataItem()
    sel = Selector(response)
    item["detail_url"] = response.meta["detail_url"]
    item["title"] = response.meta["title"]
    item["publish_time"] = response.meta["publish_time"]
    item["province"] = "安徽"
    item["location"] = ""
    item["summary"] = ""
    # The last @data-title caption is the venue shot ("press-conference
    # scene"), not a person, so it is dropped.  Names may legitimately
    # contain internal spaces (e.g. "省人民政府副省长 章㬢"), so no
    # whitespace normalisation is applied.
    captions = sel.xpath(
        '//div[@class="fty_imglistlb"]/ul/li/a/@data-title').extract()
    item["attend_persons"] = captions[:-1]
    paragraphs = sel.xpath(
        '//div[@class="desc j-fontContent"]/p/text()').extract()
    item["content"] = "".join(p.strip() + "\n" for p in paragraphs)
    yield item
def detail_parse(self, response):
    """Parse a Tianjin press-conference detail page into a BaseDataItem."""
    item = BaseDataItem()
    sel = Selector(response)
    item["detail_url"] = response.meta["detail_url"]
    item["publish_time"] = response.meta["publish_time"]

    def joined(parts):
        # Strip each text node and terminate it with a newline.
        return "".join(part.strip() + "\n" for part in parts)

    item["title"] = joined(
        sel.xpath('//div[@id="content"]//p[1]/text()').extract())
    item["summary"] = joined(
        sel.xpath('//div[@id="content"]//p[2]//text()').extract())
    item["province"] = "天津"
    item["location"] = "天津"
    item["attend_persons"] = joined(
        sel.xpath(
            '//div[@id="content"]//p/span[@style="color: #0033ff"]//text()'
        ).extract())
    item["time_stamp"] = ""
    body = sel.xpath('//div[@id="content"]//p//text()').extract()
    # The first text node duplicates the title, so it is skipped.
    item["content"] = joined(body[1:])
    yield item
def detail_parse(self, response):
    """Parse a Ningxia press-conference detail page into a BaseDataItem.

    The publish time is the released-time string with its trailing
    " HH:MM:SS" (9 characters) removed; attendees come from the last
    paragraph, split on ASCII commas.
    """
    item = BaseDataItem()
    sel = Selector(response)
    title = sel.xpath('//div[@id="info_title"]/text()').extract_first()
    raw_time = sel.xpath(
        '//span[@id="info_released_dtime"]/text()').extract_first()
    # Guard: extract_first() yields None when the node is missing; the
    # original slice raw_time[:-9] raised TypeError in that case.
    publish_time = raw_time[:-9] if raw_time else ""
    content_strs = sel.xpath('//div[@id="ofdneed"]//p/text()').extract()
    content = "".join(s.strip() + "\n" for s in content_strs)
    attend_persons_str = sel.xpath(
        '//div[@id="ofdneed"]//p[last()]/text()').extract_first()
    # PEP 8 idiom: "is not None" rather than "not ... is None".
    if attend_persons_str is not None:
        attend_persons = attend_persons_str.split(",")
    else:
        attend_persons = []
    item["publish_time"] = publish_time
    item["location"] = ""
    item["province"] = "宁夏"
    item["detail_url"] = response.meta["detail_url"]
    item["attend_persons"] = attend_persons
    item["title"] = title
    item["summary"] = ""
    item["content"] = content
    yield item
def detail_parse(self, response):
    """Parse a Gansu press-conference detail page into a BaseDataItem.

    Title comes from the page table, the body from the bt_content cell,
    attendees from highlighted <span> runs, and the summary from the
    page's <meta name="description"> tag.
    """
    item = BaseDataItem()
    sel = Selector(response)
    item["detail_url"] = response.meta["detail_url"]
    item["province"] = "甘肃"
    item["location"] = ""
    # The listing-page time is used; the in-body time is less reliable.
    item["publish_time"] = response.meta["publish_time"]
    item["title"] = sel.xpath(
        '//table[@width="95%"]//td/text()').extract_first()
    content_text = sel.xpath(
        '//td[@class="bt_content"]//p//text()').extract()
    item["content"] = "".join(row.strip() + "\n" for row in content_text)
    persons = sel.xpath(
        '//td[@class="bt_content"]//p//span/text()').extract()
    item["attend_persons"] = "".join(p.strip() + "\n" for p in persons)
    # Bug fixes: the description lives in the meta tag's "content"
    # ATTRIBUTE (@content), not a child element, and the item was being
    # assigned the raw node list (summary_text) instead of the joined
    # summary string built in the loop.
    summary_parts = sel.xpath(
        '//meta[@name="description"]/@content').extract()
    item["summary"] = "".join(s.strip() + "\n" for s in summary_parts)
    yield item
def detail_parse(self, response):
    """Parse a Fujian press-conference detail page into a BaseDataItem."""
    item = BaseDataItem()
    sel = Selector(response)
    item["detail_url"] = response.meta["detail_url"]
    item["publish_time"] = response.meta["publish_time"]
    item["title"] = response.meta["title"]
    # Summary: try the table-based template first, then the div-based one.
    summary_text = sel.xpath(
        '//td[@id="new_message_id"]//p[@align="justify"]/text()'
    ).extract_first()
    if summary_text is None:
        summary_text = sel.xpath(
            '//div[@class="big_right"]/text()').extract_first()
    item["summary"] = (summary_text or "").strip()
    item["province"] = "福建"
    item["location"] = ""
    item["attend_persons"] = ""
    item["time_stamp"] = ""
    # Body: same two-template fallback as the summary.
    paragraphs = sel.xpath(
        '//td[@id="new_message_id"]//p[@align="justify"]/text()').extract()
    if not paragraphs:
        paragraphs = sel.xpath(
            '//td[@id="new_message_id"]/p//text()').extract()
    item["content"] = "".join(p.strip() + "\n" for p in paragraphs)
    yield item
def detail_parse(self, response):
    """Parse a Hubei press-conference detail page into a BaseDataItem."""
    item = BaseDataItem()
    sel = Selector(response)
    current_url = response.meta["detail_url"]
    cols = sel.xpath('//div[@class="text_record"]/div/p/text()').extract()
    # Bug fix: extract() returns a list, never None, so the original
    # "if cols is None" test made this fallback xpath unreachable.
    if not cols:
        cols = sel.xpath(
            '//div[@class="text_record"]//p/span/text()').extract()
    content = "".join(col.strip() + "\n" for col in cols)
    item["publish_time"] = sel.xpath(
        "//ul[@class='list-unstyled int_list']/li[1]/text()"
    ).extract_first()
    item["attend_persons"] = sel.xpath(
        "//ul[@class='list-unstyled int_list']/li[2]/text()"
    ).extract_first()
    item["province"] = "湖北"
    item["location"] = ""
    # Guard: chaining .strip() directly onto extract_first() raised
    # AttributeError when the summary node was absent.
    summary = sel.xpath(
        "//ul[@class='list-unstyled int_list']//p/text()").extract_first()
    item["summary"] = summary.strip() if summary else ""
    item["title"] = sel.xpath("//h2/text()").extract_first()
    item["content"] = content
    item["detail_url"] = current_url
    yield item
def detail_parse(self, response):
    """Parse a Beijing press-conference detail page into a BaseDataItem.

    The publish date may live in several places depending on the page
    template; each candidate is tried in turn and the first
    YYYY-MM-DD-looking substring is used.
    """
    item = BaseDataItem()
    sel = Selector(response)
    publish_time = sel.xpath("//span/text()").extract_first()
    # Guard: the first <span> may be absent (None) or too short to hold
    # a date; the original crashed on len(None).  Fall through the
    # alternative templates instead.
    if publish_time is None or len(publish_time) < 6:
        publish_time = sel.xpath('//p[@class="detailmsg"]').extract_first()
        if publish_time is None:
            publish_time = sel.xpath('//h6/text()').extract_first()
    # Pull the first date token; default to "" instead of raising
    # IndexError when no date is present.  (Stray debug print() and
    # no-op pass statements removed.)
    dates = re.findall(r".*?(\d+-\d+-\d+).*", publish_time or "", re.M)
    publish_time = dates[0] if dates else ""
    title = sel.xpath("//h1/text()").extract_first()
    if title is None:
        title = sel.xpath("//h2/text()").extract_first()
    item["detail_url"] = response.meta["detail_url"]
    item["publish_time"] = publish_time
    item["title"] = title
    item["summary"] = ""
    item["province"] = "北京"
    item["location"] = "北京"
    item["attend_persons"] = ""
    item["time_stamp"] = ""
    content_text = sel.xpath('//div[@class="container"]/p/text()').extract()
    if not content_text:
        content_text = sel.xpath('//div[@class="brief"]/text()').extract()
    item["content"] = "".join(row.strip() + "\n" for row in content_text)
    yield item
def detail_parse(self, response):
    """Parse a Sichuan press-conference detail page into a BaseDataItem."""
    item = BaseDataItem()
    sel = Selector(response)
    item["detail_url"] = response.meta["detail_url"]
    item["publish_time"] = response.meta["publish_time"]
    item["province"] = "四川"
    item["location"] = ""
    title = sel.xpath(
        '//div[@id="articlecontent"]/h2/ucaptitle/text()').extract_first()
    # Guard: extract_first() returns None when the node is missing; the
    # original unconditionally called .strip() and crashed.
    item["title"] = title.strip() if title else ""
    content_text = sel.xpath(
        '//div[@id="cmsArticleContent"]//p/text()').extract()
    item["content"] = "".join(row.strip() + "\n" for row in content_text)
    item["attend_persons"] = ""
    item["summary"] = ""
    yield item
def detail_parse(self, response):
    """Parse a Hunan press-conference detail page into a BaseDataItem."""
    sel = Selector(response)
    item = BaseDataItem()
    meta = response.meta
    item["detail_url"] = meta["detail_url"]
    item["title"] = meta["title"]
    item["publish_time"] = meta["publish_time"]
    item["province"] = "湖南"
    item["location"] = ""
    # Attendees are stored as the raw list of <p> text nodes.
    item["attend_persons"] = sel.xpath(
        '//ul[@class="fbh_list"]/li/p/text()').extract()
    lines = sel.xpath('//div[@class="ct_txt"]//p//text()').extract()
    item["content"] = "".join(line.strip() + "\n" for line in lines)
    item["summary"] = ""
    yield item
def detail_parse(self, response):
    """Parse a Zhejiang press-conference detail page into a BaseDataItem."""
    item = BaseDataItem()
    sel = Selector(response)
    paragraphs = sel.xpath('//div[@id="zoom"]/p//text()').extract()
    item["detail_url"] = response.meta["detail_url"]
    item["title"] = response.meta["title"]
    item["province"] = "浙江"
    item["location"] = ""
    item["content"] = "".join(p.strip() + "\n" for p in paragraphs)
    # Attendees are not parsed from the page body; the publish time is
    # carried over from the listing page.
    item["attend_persons"] = ""
    item["publish_time"] = response.meta["publish_time"]
    item["summary"] = ""
    yield item
def detail_parse(self, response):
    """Parse a Yunnan press-conference detail page into a BaseDataItem."""
    item = BaseDataItem()
    sel = Selector(response)
    meta = response.meta
    item["detail_url"] = meta["detail_url"]
    item["publish_time"] = meta["publish_time"]
    item["title"] = meta["title"]
    item["summary"] = ""
    item["province"] = "云南"
    item["location"] = ""
    item["attend_persons"] = ""
    item["time_stamp"] = ""
    rows = sel.xpath(
        '//div[@class="view TRS_UEDITOR trs_paper_default trs_web"]//p//text()'
    ).extract()
    item["content"] = "".join(r.strip() + "\n" for r in rows)
    yield item
def detail_parse(self, response):
    """Parse a Jiangsu press-conference detail page into a BaseDataItem."""
    sel = Selector(response)
    # Body text nodes are concatenated verbatim (no stripping, no
    # separators) — this mirrors the source page's own line breaks.
    content = "".join(sel.xpath('//div[@id="zoom"]/p//text()').extract())
    item = BaseDataItem()
    item["detail_url"] = response.meta["detail_url"]
    item["title"] = response.meta["title"]
    item["publish_time"] = response.meta["publish_time"]
    item["province"] = "江苏"
    item["location"] = ""
    item["attend_persons"] = ""
    item["summary"] = ""
    item["content"] = content
    yield item
def detail_parse(self, response):
    """Parse a Jilin press-conference detail page into a BaseDataItem.

    The listing-page date string is normalised by replacing every
    non-digit character with "-".
    """
    item = BaseDataItem()
    sel = Selector(response)
    uniq_time = response.meta["publish_time"]
    # Raw string fix: "\D" in a plain literal is an invalid escape
    # sequence (SyntaxWarning on modern Python); regexes must use r"...".
    item["publish_time"] = re.sub(r"\D", "-", uniq_time)
    item["detail_url"] = response.meta["detail_url"]
    item["title"] = response.meta["title"]
    item["province"] = "吉林"
    item["location"] = ""
    content_text = sel.xpath('//div[@class="TRS_Editor"]//text()').extract()
    item["content"] = "".join(col.strip() + "\n" for col in content_text)
    item["summary"] = ""
    # Attendees are the raw <font> text nodes.
    item["attend_persons"] = sel.xpath('//font/text()').extract()
    yield item
def detail_parse(self, response):
    """Parse a Qinghai press-conference detail page into a BaseDataItem.

    The date is the first whitespace-separated token of the fourth text
    node inside the abstract block.
    """
    item = BaseDataItem()
    sel = Selector(response)
    raw_time_data = sel.xpath(
        "//div[@class='abstract tc']/text()").extract()
    # Guard: the abstract block can hold fewer than four text nodes; the
    # original indexed [3] unconditionally and raised IndexError.
    if len(raw_time_data) > 3:
        publish_time = raw_time_data[3].strip().split(" ", 1)[0]
    else:
        publish_time = ""
    item["detail_url"] = response.meta["detail_url"]
    item["publish_time"] = publish_time
    item["province"] = "青海"
    item["location"] = ""
    item["title"] = sel.xpath(
        '//h1[@class="blue tc"]/text()').extract_first()
    content_text = sel.xpath(
        '//div[@class="details_content"]/p/text()').extract()
    item["content"] = "".join(row.strip() + "\n" for row in content_text)
    item["attend_persons"] = ""
    item["summary"] = ""
    yield item
def detail_parse(self, response):
    """Parse an Inner Mongolia press-conference page into a BaseDataItem."""
    item = BaseDataItem()
    sel = Selector(response)
    meta = response.meta
    item["detail_url"] = meta["detail_url"]
    item["title"] = meta["title"]
    item["publish_time"] = meta["publish_time"]
    item["province"] = "内蒙古自治区"
    item["location"] = ""
    paragraphs = sel.xpath('//div[@id="zoom"]/p//text()').extract()
    item["content"] = "".join(p.strip() + "\n" for p in paragraphs)
    item["attend_persons"] = ""
    item["summary"] = ""
    yield item
def detail_parse(self, response):
    """Parse a Tibet press-conference detail page into a BaseDataItem.

    Attendees are the <span> text nodes inside the article body, minus
    the first one, joined one per line.
    """
    item = BaseDataItem()
    sel = Selector(response)
    item["detail_url"] = response.meta["detail_url"]
    item["publish_time"] = response.meta["publish_time"]
    item["title"] = response.meta["title"]
    item["summary"] = ""
    item["province"] = "西藏自治区"
    item["location"] = ""
    content_text = sel.xpath(
        '//div[@class="vw-art-list"]//p/text()').extract()
    item["content"] = "".join(row.strip() + "\n" for row in content_text)
    spans = sel.xpath('//div[@class="vw-art-list"]//span/text()').extract()
    attend_persons = "".join(p.strip() + "\n" for p in spans[1:])
    # Bug fix: the original built this string in a loop and then stored
    # "" — the parsed attendee list was silently thrown away.
    item["attend_persons"] = attend_persons
    yield item
def detail_parse(self, response):
    """Parse a Henan press-conference detail page into a BaseDataItem."""
    item = BaseDataItem()
    sel = Selector(response)
    item["detail_url"] = response.meta["detail_url"]
    item["publish_time"] = response.meta["publish_time"]
    item["title"] = response.meta["title"]
    # The fourth paragraph carries the lead/summary on this template.
    summary_text = sel.xpath(
        '//div[@class="content"]//p[4]//text()').extract()
    # Loop variable renamed from "sum", which shadowed the builtin.
    item["summary"] = "".join(part.strip() + "\n" for part in summary_text)
    item["province"] = "河南"
    item["location"] = ""
    item["attend_persons"] = ""
    item["time_stamp"] = ""
    content_text = sel.xpath(
        '//div[@class="content"]//p//text()').extract()
    item["content"] = "".join(row.strip() + "\n" for row in content_text)
    yield item
def detail_parse(self, response):
    """Parse a Shandong press-conference detail page into a BaseDataItem."""
    item = BaseDataItem()
    sel = Selector(response)
    meta = response.meta
    item["detail_url"] = meta["detail_url"]
    item["publish_time"] = meta["publish_time"]
    item["title"] = meta["title"]
    item["summary"] = ""
    item["province"] = "山东"
    item["location"] = ""
    # Attendee names sit under the photo-list anchors.
    names = sel.xpath('//div[@class="photos"]/ul//li/p/a/text()').extract()
    item["attend_persons"] = "".join(n.strip() + "\n" for n in names)
    item["time_stamp"] = ""
    item["score"] = ""
    lines = sel.xpath('//div[@class="info"]//text()').extract()
    item["content"] = "".join(ln.strip() + "\n" for ln in lines)
    yield item
def text_parse(self, response):
    """Parse a Liaoning press-conference detail page into a BaseDataItem."""
    item = BaseDataItem()
    sel = Selector(response)
    meta = response.meta
    item["publish_time"] = meta["publish_time"]
    item["detail_url"] = meta["detail_url"]
    item["title"] = meta["title"]
    item["province"] = "辽宁"
    item["location"] = ""
    body = sel.xpath('//div[@class="fbh_wzf"]//text()').extract()
    item["content"] = "".join(line.strip() + "\n" for line in body)
    # The fbh_rm sub-block lists the people on the rostrum.
    people = sel.xpath(
        '//div[@class="fbh_wzf"]/div[@class="fbh_rm"]//text()').extract()
    item["attend_persons"] = "".join(p.strip() + "\n" for p in people)
    item["summary"] = ""
    yield item
def detail_parse(self, response):
    """Parse a Gansu press-conference detail page into a BaseDataItem."""
    item = BaseDataItem()
    sel = Selector(response)
    item["detail_url"] = response.meta["detail_url"]
    item["publish_time"] = response.meta["publish_time"]
    item["province"] = "甘肃"
    item["location"] = ""
    item["attend_persons"] = []
    # Title cells are concatenated with no separator.
    cells = sel.xpath('//table[@width="95%"]//tr[1]/td/text()').extract()
    item["title"] = "".join(cell.strip() for cell in cells)
    paragraphs = sel.xpath('//div[@id="zoom"]//p//text()').extract()
    item["content"] = "".join(p.strip() + "\n" for p in paragraphs)
    item["summary"] = ""
    yield item