def parse_page(self, response):
    """Build a ``CorpusItem`` for a page under news.xinhuanet.com/house/.

    The publication date is taken from the second-to-last ``/``-separated
    segment of the response URL and parsed with ``%Y-%m-%d``.

    Returns the populated item, or ``None`` when the response URL does not
    start with the expected prefix.
    """
    page_url = response.url
    if not page_url.startswith('http://news.xinhuanet.com/house/'):
        return None

    # The date is embedded in the URL path, just before the final segment.
    date_text = page_url.split('/')[-2]

    record = CorpusItem()
    record['url'] = page_url
    record['website'] = self.name
    record['published_at'] = datetime.strptime(date_text, '%Y-%m-%d')
    record['html'] = response.body_as_unicode()
    record['status'] = 'ready'
    return record
def parse_page(self, response):
    """Build a ``CorpusItem`` for a page under qz.zp365.com/News/.

    The publication date is the last whitespace-separated token of the last
    text node matched by the title-bar selector; it is parsed together with
    its literal label prefix by the ``strptime`` format below.

    Returns the populated item, or ``None`` when the response URL does not
    start with the expected prefix.
    """
    page_url = response.url
    if not page_url.startswith('http://qz.zp365.com/News/'):
        return None

    text_nodes = response.css('div.newstitle_down div[style="float:left;"]::text').extract()
    date_text = text_nodes[-1].split()[-1]

    record = CorpusItem()
    record['url'] = page_url
    record['website'] = self.name
    # The token is encoded to UTF-8 before parsing against the non-ASCII
    # format (presumably because the format is a byte string on Python 2 --
    # TODO confirm the target interpreter).
    record['published_at'] = datetime.strptime(date_text.encode('utf-8'), '时间:%Y-%m-%d')
    record['html'] = response.body_as_unicode()
    record['status'] = 'ready'
    return record
def parse_page(self, response):
    """Build a ``CorpusItem`` for a page under news.nn.xkhouse.com/html/.

    The publication timestamp is the first text node matched by
    ``.cont>.times`` and is parsed with ``%Y-%m-%d %H:%M:%S``.

    Returns the populated item, or ``None`` when the response URL does not
    start with the expected prefix.
    """
    page_url = response.url
    if not page_url.startswith('http://news.nn.xkhouse.com/html/'):
        return None

    timestamp_text = response.css('.cont>.times::text').extract_first()

    record = CorpusItem()
    record['url'] = page_url
    record['website'] = self.name
    record['published_at'] = datetime.strptime(timestamp_text, '%Y-%m-%d %H:%M:%S')
    record['html'] = response.body_as_unicode()
    record['status'] = 'ready'
    return record
def parse_page(self, response):
    """Build a ``CorpusItem`` for a page under www.thepaper.cn.

    The publication date is the part before the first space of the first
    text node matched by ``div.news_about > p:nth-child(2)``, parsed with
    ``%Y-%m-%d``.

    Returns the populated item, or ``None`` when the response URL does not
    start with the expected prefix.
    """
    page_url = response.url
    if not page_url.startswith('http://www.thepaper.cn/'):
        return None

    about_text = response.css('div.news_about > p:nth-child(2)::text').extract_first()
    # Split on a single space (not arbitrary whitespace) and keep the head.
    date_text = about_text.split(' ')[0]

    record = CorpusItem()
    record['url'] = page_url
    record['website'] = self.name
    record['published_at'] = datetime.strptime(date_text, '%Y-%m-%d')
    record['html'] = response.body_as_unicode()
    record['status'] = 'ready'
    return record
def parse_page(self, response):
    """Build a ``CorpusItem`` for a page under news.gxfdc.cn/News/.

    The publication date is the final ten characters of the last
    whitespace-separated token in the second text node matched by
    ``div.newstitle_down div``; it is parsed with ``%Y-%m-%d``.

    Returns the populated item, or ``None`` when the response URL does not
    start with the expected prefix.
    """
    page_url = response.url
    if not page_url.startswith('http://news.gxfdc.cn/News/'):
        return None

    text_nodes = response.css('div.newstitle_down div::text').extract()
    last_token = text_nodes[1].split()[-1]
    # Keep only the trailing 10 characters, i.e. the width of YYYY-MM-DD.
    date_text = last_token[-10:]

    record = CorpusItem()
    record['url'] = page_url
    record['website'] = self.name
    record['published_at'] = datetime.strptime(date_text, '%Y-%m-%d')
    record['html'] = response.body_as_unicode()
    record['status'] = 'ready'
    return record
def parse_page(self, response):
    """Build a ``CorpusItem`` for a page under nn.loupan.com/html/news/.

    The publication date is the first whitespace-separated token of the
    first text node matched by ``p.time-form``; it is parsed with the
    year/month/day format below.

    Returns the populated item, or ``None`` when the response URL does not
    start with the expected prefix.
    """
    page_url = response.url
    if not page_url.startswith('http://nn.loupan.com/html/news/'):
        return None

    time_text = response.css('p.time-form::text').extract_first()
    date_text = time_text.split()[0]

    record = CorpusItem()
    record['url'] = page_url
    record['website'] = self.name
    # The token is encoded to UTF-8 before parsing against the non-ASCII
    # format (presumably because the format is a byte string on Python 2 --
    # TODO confirm the target interpreter).
    record['published_at'] = datetime.strptime(date_text.encode('utf-8'), '%Y年%m月%d日')
    record['html'] = response.body_as_unicode()
    record['status'] = 'ready'
    return record
def parse_page(self, response):
    """Build a ``CorpusItem`` for a page under www.fanggx.com/gxi/news/.

    The publication date is the first whitespace-separated token of the
    first text node matched by ``div.article-dfa``, parsed with
    ``%Y-%m-%d``.

    Returns the populated item, or ``None`` when the response URL does not
    start with the expected prefix.
    """
    page_url = response.url
    if not page_url.startswith('http://www.fanggx.com/gxi/news/'):
        return None

    info_text = response.css('div.article-dfa::text').extract_first()
    date_text = info_text.split()[0]

    record = CorpusItem()
    record['url'] = page_url
    record['website'] = self.name
    record['published_at'] = datetime.strptime(date_text, '%Y-%m-%d')
    record['html'] = response.body_as_unicode()
    record['status'] = 'ready'
    return record
def parse_page(self, response):
    """Build a ``CorpusItem`` for a page under gxcic.net/HTMLFile/.

    The publication date is the part before the first space of the first
    text node matched by ``#Labeltitle2``, parsed with ``%Y/%m/%d``.

    Returns the populated item, or ``None`` when the response URL does not
    start with the expected prefix.
    """
    page_url = response.url
    if not page_url.startswith('http://gxcic.net/HTMLFile/'):
        return None

    label_text = response.css('#Labeltitle2::text').extract_first()
    # Split on a single space (not arbitrary whitespace) and keep the head.
    date_text = label_text.split(' ')[0]

    record = CorpusItem()
    record['url'] = page_url
    record['website'] = self.name
    record['published_at'] = datetime.strptime(date_text, '%Y/%m/%d')
    record['html'] = response.body_as_unicode()
    record['status'] = 'ready'
    return record
def parse_page(self, response):
    """Build a ``CorpusItem`` for a page under www.0771555.com/article/.

    The publication date is the first text node matched by
    ``#content_writer td[width="150"]``; it is parsed together with its
    literal label prefix by the ``strptime`` format below.

    Returns the populated item, or ``None`` when the response URL does not
    start with the expected prefix.
    """
    page_url = response.url
    if not page_url.startswith('http://www.0771555.com/article/'):
        return None

    date_text = response.css('#content_writer td[width="150"]::text').extract_first()

    record = CorpusItem()
    record['url'] = page_url
    record['website'] = self.name
    # The text is encoded to UTF-8 before parsing against the non-ASCII
    # format (presumably because the format is a byte string on Python 2 --
    # TODO confirm the target interpreter).
    record['published_at'] = datetime.strptime(date_text.encode('utf-8'), '日期:%Y-%m-%d')
    record['html'] = response.body_as_unicode()
    record['status'] = 'ready'
    return record
def parse_page(self, response):
    """Build a ``CorpusItem`` for a page under www.chinanews.com/house/.

    The publication date is assembled from two URL segments: the
    third-from-last (year) and the second-from-last (month-day). Their
    concatenation is parsed with ``%Y%m-%d``.

    Returns the populated item, or ``None`` when the response URL does not
    start with the expected prefix.
    """
    page_url = response.url
    if not page_url.startswith('http://www.chinanews.com/house/'):
        return None

    segments = page_url.split('/')
    year_part, month_day_part = segments[-3], segments[-2]

    record = CorpusItem()
    record['url'] = page_url
    record['website'] = self.name
    record['published_at'] = datetime.strptime(year_part + month_day_part, '%Y%m-%d')
    record['html'] = response.body_as_unicode()
    record['status'] = 'ready'
    return record
def parse_page(self, response):
    """Build a ``CorpusItem`` for a page under house.gxsky.com.

    The publication timestamp is the second text node matched by
    ``#left_news > div.left_news_nr_title > h2``, stripped of surrounding
    whitespace and parsed with ``%Y-%m-%d %H:%M``.

    Returns the populated item, or ``None`` when the response URL does not
    start with the expected prefix.
    """
    page_url = response.url
    if not page_url.startswith('http://house.gxsky.com/'):
        return None

    heading_texts = response.css('#left_news > div.left_news_nr_title > h2::text').extract()
    timestamp_text = heading_texts[1].strip()

    record = CorpusItem()
    record['url'] = page_url
    record['website'] = self.name
    record['published_at'] = datetime.strptime(timestamp_text, '%Y-%m-%d %H:%M')
    record['html'] = response.body_as_unicode()
    record['status'] = 'ready'
    return record
def parse_page(self, response):
    """Build a ``CorpusItem`` for a page under liuzhou.house.163.com.

    The publication date is assembled from the ``/``-separated URL pieces
    at indexes 3 (two-digit year) and 4 (month and day). Their
    concatenation is parsed with ``%y%m%d``.

    Returns the populated item, or ``None`` when the response URL does not
    start with the expected prefix.
    """
    page_url = response.url
    if not page_url.startswith('http://liuzhou.house.163.com/'):
        return None

    # Indexes 0-2 are the scheme, the empty piece after "//", and the host.
    pieces = page_url.split('/')
    year_part = pieces[3]
    month_day_part = pieces[4]

    record = CorpusItem()
    record['url'] = page_url
    record['website'] = self.name
    record['published_at'] = datetime.strptime(year_part + month_day_part, '%y%m%d')
    record['html'] = response.body_as_unicode()
    record['status'] = 'ready'
    return record