def parse_item(response):
    """Parse an article page from uyghur.people.com.cn into a NewsItem.

    NOTE(review): the yield at the bottom is commented out, so this callback
    currently produces no items — it only prints the extracted fields.

    :param response: Scrapy Response for a candidate article page.
    """
    sel = Selector(response)
    url = response.request.url
    # Article URLs contain a numeric path segment; skip the 'BIG'
    # (traditional-Chinese) mirror pages.
    if re.match(r'.*?people.com.cn.*?/\d+/.*?', url) and 'BIG' not in url:
        content = response.xpath(
            '//*[@id="p_content"]/span/text() | //*[@class="clearfix"]/p/text()'
        ).extract()
        if content:
            item = NewsItem(
                domainname='http://uyghur.people.com.cn/',
                chinesename='维语人民网',
                url=sel.root.base,
                title=sel.css('div.ej_right > h1::text').extract_first(),
                subtitle=sel.css('div.ej_right > h5::text').extract_first(),
                language='维文',
                encodingtype='utf-8',
                corpustype='网络',
                # Earlier variant extracted the date by regex:
                # timeofpublish=sel.re(
                #     r'\d{4}年\d{2}月\d{2}日\d{2}:\d{2}')[0],
                timeofpublish=sel.css(
                    'div.ej_right #p_publishtime::text').extract_first(),
                content=''.join(content),
                source=sel.css(
                    'div.ej_right #p_origin > a:nth-child(1)::text'
                ).extract_first(),
                # Author is taken from the same node as the publish time —
                # presumably the byline shares that element; verify on site.
                author=sel.css(
                    'div.ej_right #p_publishtime::text').extract_first(),
            )
            # Debug output of the extracted fields.
            print(item.get("title", None))
            print(item.get("timeofpublish", None))
            print(item.get("source", None))
            print(item.get("author", None))
            # item = judge_time_news(item)
            # if item:
            #     yield item
def parse_item(response):
    """Parse a Tibetan-language article from tibet.people.com.cn.

    Builds a NewsItem, runs it through judge_time_news_people, and yields
    it when that filter returns a truthy item.

    :param response: Scrapy Response for a candidate article page.
    """
    sel = Selector(response)
    print(sel)
    url = response.request.url
    if re.match(r'.*?tibet.people.com.cn/.*?', url):
        print('---------------------')
        print(url)
        content = response.xpath(
            '//html/body/div[2]/div[4]/div[1]/div[2]/div[2]/text()'
        ).extract()
        print(content)
        if content:
            # Guard: the original indexed sel.re(...)[0] unconditionally,
            # raising IndexError on pages that carry no recognizable date.
            time_matches = sel.re(r'\d{4}.*?\d{2}.*?\d{2}.*?\d{2}:\d{2}')
            if not time_matches:
                return
            # Map Tibetan date particles onto Chinese year/month/day marks.
            timeofpublish = (time_matches[0]
                             .replace('ལོའི་ཟླ་ ', '年')
                             .replace('ཚེས་', '月')
                             .replace('ཉིན། ', '日'))
            item = NewsItem(
                domainname='http://tibet.people.com.cn',
                chinesename='people',
                url=sel.root.base,
                title=sel.css(
                    '.gq_content > h1:nth-child(2)::text').extract_first(),
                subtitle=sel.css('.sub::text').extract_first(),
                language='tibet',
                encodingtype='utf-8',
                corpustype='网络',
                timeofpublish=timeofpublish,
                content=''.join(content),
            )
            # Debug output of the extracted fields.
            print(item.get("title", None))
            print(item.get("timeofpublish", None))
            print(item.get("source", None))
            print(item.get("author", None))
            item = judge_time_news_people(item)
            if item:
                yield item
def parse_item(response):
    """Parse a Tibetan article from tb.chinatibetnews.com into a NewsItem.

    NOTE(review): both yields at the bottom are commented out, so this
    callback currently produces no items — it only prints the fields.

    :param response: Scrapy Response for a candidate article page.
    """
    sel = Selector(response)
    url = response.request.url
    if re.match(r'.*?tb.chinatibetnews.com/.*?', url):
        print('---------------------')
        print(url)
        content = response.xpath(
            '/html/body/div[4]/div[1]/div[2]/ul[1]/li[2]/div[2]/div[1]//p//text()'
        ).extract()
        print(content)
        # Strip the editor byline out of the body text, if it was scraped in.
        editor = response.xpath(
            '//*[@class="-articleeditor"]/text()').extract_first()
        title = response.xpath(
            '/html/body/div[4]/div[1]/div[2]/ul[1]/li[1]/p[2]//text()'
        ).extract()
        # Guard: list.remove raises ValueError when the editor string is not
        # actually one of the extracted text nodes.
        if editor and editor in content:
            content.remove(editor)
        # Guard: the original indexed sel.re(...)[0] unconditionally,
        # raising IndexError on pages without a YYYY-MM-DD date.
        dates = sel.re(r'\d{4}-\d{2}-\d{2}')
        if not dates:
            return
        # The original also stripped spaces, though this pattern can never
        # contain one; kept as a harmless no-op for safety.
        publish_time = dates[0].replace(' ', '')
        print(publish_time)
        if content:
            item = NewsItem(
                domainname='http://tibet.cpc.people.com.cn/',
                chinesename='tibet3',
                url=sel.root.base,
                title=''.join(title),
                subtitle=sel.css('.sub::text').extract_first(),
                language='藏文',
                encodingtype='utf-8',
                corpustype='网络',
                timeofpublish=publish_time,
                content=''.join(content),
                author=None,
            )
            # Debug output of the extracted fields.
            print(item.get("title", None))
            print(item.get("timeofpublish", None))
            print(item.get("source", None))
            print(item.get("author", None))
            # yield item
            # item = judge_time_news(item)
            # if item:
            #     yield item
def parse_item(response):
    """Parse a Tibetan article from xizang.news.cn and yield a NewsItem.

    Items pass through judge_time_news before being yielded.

    :param response: Scrapy Response for a candidate article page.
    """
    sel = Selector(response)
    url = response.request.url
    # Article URLs carry a /YYYY-MM/DD/ date segment.
    if re.match(r'.*?/\d{4}-\d{2}/\d{2}/.*?', url):
        print('---------------------')
        print(url)
        content = response.xpath(
            '/html/body/div[6]/div/div/div[3]//p//text()').extract()
        print(content)
        # Strip the editor byline out of the body text, if it was scraped in.
        editor = response.xpath(
            '//*[@class="-articleeditor"]/text()').extract_first()
        # Guard: list.remove raises ValueError when the editor string is not
        # actually one of the extracted text nodes.
        if editor and editor in content:
            content.remove(editor)
        # Guard: the original indexed sel.re(...)[0] unconditionally,
        # raising IndexError on pages without a YYYY-MM-DD date.
        dates = sel.re(r'\d{4}-\d{2}-\d{2}')
        if not dates:
            return
        # The original also stripped spaces, though this pattern can never
        # contain one; kept as a harmless no-op for safety.
        publish_time = dates[0].replace(' ', '')
        print(publish_time)
        if content:
            item = NewsItem(
                domainname='http://xizang.news.cn/',
                chinesename='tibetxinhua',
                url=sel.root.base,
                title=sel.css('#ArticleTit::text').extract_first(),
                subtitle=sel.css('.sub::text').extract_first(),
                language='藏文',
                encodingtype='utf-8',
                corpustype='网络',
                timeofpublish=publish_time,
                content=''.join(content),
                source=sel.css(
                    '#Articlely > div.laiyuan > a::text').extract_first(),
                author=sel.css(
                    '#contentK > div.xinxi > span:nth-child(3)::text'
                ).extract_first(),
            )
            # Debug output of the extracted fields.
            print(item.get("title", None))
            print(item.get("timeofpublish", None))
            print(item.get("source", None))
            print(item.get("author", None))
            # yield item
            item = judge_time_news(item)
            if item:
                yield item
def parse_item(response):
    """Parse an article page from uyghur.cntv.com into a NewsItem.

    NOTE(review): the yield at the bottom is commented out, so this callback
    currently produces no items — it only prints the extracted fields.

    :param response: Scrapy Response for a candidate article page.
    """
    sel = Selector(response)
    print(sel)
    url = response.request.url
    # Article URLs carry a /YYYY/MM/DD/ date segment.
    if re.match(r'.*?/\d{4}/\d{2}/\d{2}/.*?', url):
        print('---------------------')
        print(url)
        content = response.xpath(
            '//*[@id="page_body"]/div[2]/div/div[1]/div[1]/div/div[2]/p[2]/text() | //*[@id="page_body"]/div[2]/div/div[1]/div[1]/div/div[2]/p[2]/span/text()'
        ).extract()
        print(content)
        if content:
            # Guard: the original chained
            # re.search(...).group(0) on extract_first(), which raises
            # TypeError when the selector matches nothing and AttributeError
            # when the text holds no DD-MM-YYYY date.
            date_text = sel.css(
                '.title_hd > p:nth-child(2)::text').extract_first()
            date_match = re.search(r'\d{2}-\d{2}-\d{4}', date_text or '')
            if not date_match:
                return
            item = NewsItem(
                domainname='http://uyghur.cntv.com',
                chinesename='CCTV',
                url=sel.root.base,
                title=sel.css(
                    '.title_hd > h3:nth-child(1)::text').extract_first(),
                subtitle=sel.css('.sub::text').extract_first(),
                language='中文',
                encodingtype='utf-8',
                corpustype='网络',
                # Earlier variant: timeofpublish=sel.re(r'\d{4}_\d{2}_\d{2}')[0]
                timeofpublish=date_match.group(0),
                content=''.join(content),
            )
            # Debug output of the extracted fields.
            print(item.get("title", None))
            print(item.get("timeofpublish", None))
            print(item.get("source", None))
            print(item.get("author", None))
            # item = judge_time_news(item)
            # if item:
            #     yield item