示例#1
0
    def parse_page(self, response):
        """Build a ``CorpusItem`` for a page under ``http://news.xinhuanet.com/house/``.

        The publication date is read from the second-to-last ``/``-separated
        segment of the URL, which must have the form ``YYYY-MM-DD``.

        :param response: response object exposing ``url`` and
            ``body_as_unicode()``.
        :returns: the populated item, or ``None`` when the URL is outside
            the handled prefix.
        """
        page_url = response.url
        if not page_url.startswith('http://news.xinhuanet.com/house/'):
            return None

        # The date is the path segment immediately before the last one.
        segments = page_url.split('/')
        day_text = segments[-2]

        item = CorpusItem()
        item['url'] = page_url
        item['website'] = self.name
        item['published_at'] = datetime.strptime(day_text, '%Y-%m-%d')
        item['html'] = response.body_as_unicode()
        item['status'] = 'ready'
        return item
    def parse_page(self, response):
        """Build a ``CorpusItem`` for a page under ``http://qz.zp365.com/News/``.

        The publication date is the last whitespace-separated token of the
        last text node matched by
        ``div.newstitle_down div[style="float:left;"]`` and must have the
        form ``时间:YYYY-MM-DD``.

        :param response: response object exposing ``url``, ``css()`` and
            ``body_as_unicode()``.
        :returns: the populated item, or ``None`` when the URL is outside
            the handled prefix.
        """
        url = response.url
        if not url.startswith('http://qz.zp365.com/News/'):
            return None

        date = response.css(
            'div.newstitle_down div[style="float:left;"]::text'
        ).extract()[-1].split()[-1]

        # ``strptime`` needs both arguments as the interpreter's native
        # ``str``.  Text that is not already ``str`` (e.g. Python 2
        # ``unicode``) is UTF-8 encoded to match the format literal; text
        # that already is ``str`` is left alone, because passing bytes to
        # ``strptime`` raises TypeError on Python 3.
        if not isinstance(date, str):
            date = date.encode('utf-8')

        item = CorpusItem()
        item['url'] = url
        item['website'] = self.name
        item['published_at'] = datetime.strptime(date, '时间:%Y-%m-%d')
        item['html'] = response.body_as_unicode()
        item['status'] = 'ready'

        return item
    def parse_page(self, response):
        """Build a ``CorpusItem`` for a page under ``http://news.nn.xkhouse.com/html/``.

        The timestamp is the first text node matched by ``.cont>.times`` and
        is parsed as ``YYYY-MM-DD HH:MM:SS``.

        :param response: response object exposing ``url``, ``css()`` and
            ``body_as_unicode()``.
        :returns: the populated item, or ``None`` when the URL is outside
            the handled prefix.
        """
        page_url = response.url
        if not page_url.startswith('http://news.nn.xkhouse.com/html/'):
            return None

        time_nodes = response.css('.cont>.times::text')
        timestamp = time_nodes.extract_first()

        item = CorpusItem()
        item['url'] = page_url
        item['website'] = self.name
        item['published_at'] = datetime.strptime(timestamp,
                                                 '%Y-%m-%d %H:%M:%S')
        item['html'] = response.body_as_unicode()
        item['status'] = 'ready'
        return item
    def parse_page(self, response):
        """Build a ``CorpusItem`` for a page under ``http://www.thepaper.cn/``.

        The publication date is whatever precedes the first space in the
        first text node matched by ``div.news_about > p:nth-child(2)``; it
        is parsed as ``YYYY-MM-DD``.

        :param response: response object exposing ``url``, ``css()`` and
            ``body_as_unicode()``.
        :returns: the populated item, or ``None`` when the URL is outside
            the handled prefix.
        """
        page_url = response.url
        if not page_url.startswith('http://www.thepaper.cn/'):
            return None

        about_text = response.css(
            'div.news_about > p:nth-child(2)::text').extract_first()
        # Keep only the part before the first space.
        day_text = about_text.split(' ')[0]

        item = CorpusItem()
        item['url'] = page_url
        item['website'] = self.name
        item['published_at'] = datetime.strptime(day_text, '%Y-%m-%d')
        item['html'] = response.body_as_unicode()
        item['status'] = 'ready'
        return item
示例#5
0
    def parse_page(self, response):
        """Build a ``CorpusItem`` for a page under ``http://news.gxfdc.cn/News/``.

        The publication date is the final ten characters of the last
        whitespace-separated token in the second text node matched by
        ``div.newstitle_down div``; it is parsed as ``YYYY-MM-DD``.

        :param response: response object exposing ``url``, ``css()`` and
            ``body_as_unicode()``.
        :returns: the populated item, or ``None`` when the URL is outside
            the handled prefix.
        """
        page_url = response.url
        if not page_url.startswith('http://news.gxfdc.cn/News/'):
            return None

        text_nodes = response.css('div.newstitle_down div::text').extract()
        last_token = text_nodes[1].split()[-1]
        # Drop whatever precedes the trailing 10-character date.
        day_text = last_token[-10:]

        item = CorpusItem()
        item['url'] = page_url
        item['website'] = self.name
        item['published_at'] = datetime.strptime(day_text, '%Y-%m-%d')
        item['html'] = response.body_as_unicode()
        item['status'] = 'ready'
        return item
    def parse_page(self, response):
        """Build a ``CorpusItem`` for a page under ``http://nn.loupan.com/html/news/``.

        The publication date is the first whitespace-separated token of the
        first text node matched by ``p.time-form`` and must have the form
        ``YYYY年MM月DD日``.

        :param response: response object exposing ``url``, ``css()`` and
            ``body_as_unicode()``.
        :returns: the populated item, or ``None`` when the URL is outside
            the handled prefix.
        """
        url = response.url
        if not url.startswith('http://nn.loupan.com/html/news/'):
            return None

        date = response.css('p.time-form::text').extract_first().split()[0]

        # ``strptime`` needs both arguments as the interpreter's native
        # ``str``.  Text that is not already ``str`` (e.g. Python 2
        # ``unicode``) is UTF-8 encoded to match the format literal; text
        # that already is ``str`` is left alone, because passing bytes to
        # ``strptime`` raises TypeError on Python 3.
        if not isinstance(date, str):
            date = date.encode('utf-8')

        item = CorpusItem()
        item['url'] = url
        item['website'] = self.name
        item['published_at'] = datetime.strptime(date, '%Y年%m月%d日')
        item['html'] = response.body_as_unicode()
        item['status'] = 'ready'

        return item
    def parse_page(self, response):
        """Build a ``CorpusItem`` for a page under ``http://www.fanggx.com/gxi/news/``.

        The publication date is the first whitespace-separated token of the
        first text node matched by ``div.article-dfa``; it is parsed as
        ``YYYY-MM-DD``.

        :param response: response object exposing ``url``, ``css()`` and
            ``body_as_unicode()``.
        :returns: the populated item, or ``None`` when the URL is outside
            the handled prefix.
        """
        page_url = response.url
        if not page_url.startswith('http://www.fanggx.com/gxi/news/'):
            return None

        info_text = response.css('div.article-dfa::text').extract_first()
        tokens = info_text.split()
        day_text = tokens[0]

        item = CorpusItem()
        item['url'] = page_url
        item['website'] = self.name
        item['published_at'] = datetime.strptime(day_text, '%Y-%m-%d')
        item['html'] = response.body_as_unicode()
        item['status'] = 'ready'
        return item
    def parse_page(self, response):
        """Build a ``CorpusItem`` for a page under ``http://gxcic.net/HTMLFile/``.

        The publication date is whatever precedes the first space in the
        first text node matched by ``#Labeltitle2``; it is parsed as
        ``YYYY/MM/DD``.

        :param response: response object exposing ``url``, ``css()`` and
            ``body_as_unicode()``.
        :returns: the populated item, or ``None`` when the URL is outside
            the handled prefix.
        """
        page_url = response.url
        if not page_url.startswith('http://gxcic.net/HTMLFile/'):
            return None

        label_text = response.css('#Labeltitle2::text').extract_first()
        # Keep only the part before the first space.
        day_text = label_text.split(' ')[0]

        item = CorpusItem()
        item['url'] = page_url
        item['website'] = self.name
        item['published_at'] = datetime.strptime(day_text, '%Y/%m/%d')
        item['html'] = response.body_as_unicode()
        item['status'] = 'ready'
        return item
    def parse_page(self, response):
        """Build a ``CorpusItem`` for a page under ``http://www.0771555.com/article/``.

        The publication date is the first text node matched by
        ``#content_writer td[width="150"]`` and must have the form
        ``日期:YYYY-MM-DD``.

        :param response: response object exposing ``url``, ``css()`` and
            ``body_as_unicode()``.
        :returns: the populated item, or ``None`` when the URL is outside
            the handled prefix.
        """
        url = response.url
        if not url.startswith('http://www.0771555.com/article/'):
            return None

        date = response.css(
            '#content_writer td[width="150"]::text').extract_first()

        # ``strptime`` needs both arguments as the interpreter's native
        # ``str``.  Text that is not already ``str`` (e.g. Python 2
        # ``unicode``) is UTF-8 encoded to match the format literal; text
        # that already is ``str`` is left alone, because passing bytes to
        # ``strptime`` raises TypeError on Python 3.
        if not isinstance(date, str):
            date = date.encode('utf-8')

        item = CorpusItem()
        item['url'] = url
        item['website'] = self.name
        item['published_at'] = datetime.strptime(date, '日期:%Y-%m-%d')
        item['html'] = response.body_as_unicode()
        item['status'] = 'ready'

        return item
示例#10
0
    def parse_page(self, response):
        """Build a ``CorpusItem`` for a page under ``http://www.chinanews.com/house/``.

        The publication date is assembled from two ``/``-separated URL
        segments: the third-from-last (``YYYY``) and the second-from-last
        (``MM-DD``).

        :param response: response object exposing ``url`` and
            ``body_as_unicode()``.
        :returns: the populated item, or ``None`` when the URL is outside
            the handled prefix.
        """
        page_url = response.url
        if not page_url.startswith('http://www.chinanews.com/house/'):
            return None

        segments = page_url.split('/')
        year_part = segments[-3]
        month_day_part = segments[-2]
        # Joined text looks like "YYYYMM-DD", hence the unusual format.
        stamp = year_part + month_day_part

        item = CorpusItem()
        item['url'] = page_url
        item['website'] = self.name
        item['published_at'] = datetime.strptime(stamp, '%Y%m-%d')
        item['html'] = response.body_as_unicode()
        item['status'] = 'ready'
        return item
    def parse_page(self, response):
        """Build a ``CorpusItem`` for a page under ``http://house.gxsky.com/``.

        The timestamp is the second text node matched by
        ``#left_news > div.left_news_nr_title > h2``, stripped of
        surrounding whitespace and parsed as ``YYYY-MM-DD HH:MM``.

        :param response: response object exposing ``url``, ``css()`` and
            ``body_as_unicode()``.
        :returns: the populated item, or ``None`` when the URL is outside
            the handled prefix.
        """
        page_url = response.url
        if not page_url.startswith('http://house.gxsky.com/'):
            return None

        heading_texts = response.css(
            '#left_news > div.left_news_nr_title > h2::text').extract()
        timestamp = heading_texts[1].strip()

        item = CorpusItem()
        item['url'] = page_url
        item['website'] = self.name
        item['published_at'] = datetime.strptime(timestamp, '%Y-%m-%d %H:%M')
        item['html'] = response.body_as_unicode()
        item['status'] = 'ready'
        return item
    def parse_page(self, response):
        """Build a ``CorpusItem`` for a page under ``http://liuzhou.house.163.com/``.

        The publication date is assembled from the URL: after splitting on
        ``/``, element 3 supplies a two-digit year and element 4 supplies
        ``MMDD``.

        :param response: response object exposing ``url`` and
            ``body_as_unicode()``.
        :returns: the populated item, or ``None`` when the URL is outside
            the handled prefix.
        """
        page_url = response.url
        if not page_url.startswith('http://liuzhou.house.163.com/'):
            return None

        # Indices 0-2 are the scheme, the empty slot after "//", and the host.
        segments = page_url.split('/')
        short_year = segments[3]
        month_day = segments[4]
        stamp = short_year + month_day

        item = CorpusItem()
        item['url'] = page_url
        item['website'] = self.name
        item['published_at'] = datetime.strptime(stamp, '%y%m%d')
        item['html'] = response.body_as_unicode()
        item['status'] = 'ready'
        return item