예제 #1
0
 def parse_item(self, response):
     """Build a ``NewsItem`` from an article response and yield it.

     The title, body text, date-time string and source are extracted from the
     page with XPath through ``ChinaLoader``; the URL and the website name are
     added as literal values.

     Args:
         response: The response object for the article page; it must expose
             ``url`` and be usable by ``ChinaLoader`` for XPath extraction.

     Yields:
         The item produced by ``loader.load_item()``.
     """
     loader = ChinaLoader(item=NewsItem(), response=response)
     loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
     loader.add_value('url', response.url)
     loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')
     # Raw string: \d and \s must reach the regex engine untouched; in a plain
     # string literal they are invalid escape sequences.
     loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()',
                      re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
     # Keeps only what follows the "来源:" label in the info line.
     loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()', re='来源:(.*)')
     loader.add_value('website', '中华网')
     yield loader.load_item()
예제 #2
0
파일: china.py 프로젝트: yuebanwanwan/9.27
 def parse_item2(self, response):
     """Fill a ``NewsItem`` field by field from the response and yield it.

     Unlike the loader-based variant, every field is assigned directly from
     selector results.

     Args:
         response: The response object for the article page; it must expose
             ``url`` and ``xpath()``.

     Yields:
         The populated ``NewsItem``.
     """
     # Both 'datetime' and 'source' are parsed out of the same info node,
     # so query it once.
     info_text = response.xpath('//div[@id="chan_newsInfo"]/text()')

     item = NewsItem()
     # default='' prevents AttributeError (None.strip()) when the page has
     # no <title> text node.
     item['title'] = response.xpath(
         '//title/text()').extract_first(default='').strip()
     item['url'] = response.url
     item['text'] = ''.join(
         response.xpath(
             '//div[@id="chan_newsDetail"]//p/text()').extract()).strip()
     # Raw string: \d and \s are regex escapes, not string escapes.
     item['datetime'] = info_text.re_first(r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
     # NOTE(review): this captures the shortest ';...;' span, delimiters
     # included - confirm that is the intended shape of the source field.
     item['source'] = info_text.re_first('(;.*?;)')
     item['website'] = '中华网'
     yield item
예제 #3
0
 def parse_item(self, response):
     """Build a ``NewsItem`` from an article response and yield it.

     The title, body text, date-time string and source are extracted with
     XPath through ``ChinaLoader``; the URL and the website name are added as
     literal values.

     Args:
         response: The response object for the article page; it must expose
             ``url`` and be usable by ``ChinaLoader`` for XPath extraction.

     Yields:
         The item produced by ``loader.load_item()``.
     """
     loader = ChinaLoader(item=NewsItem(), response=response)
     # The leading '//' is required: a bare "h1[...]" only matches an <h1>
     # that is a direct child of the context node, so the title was never
     # found anywhere deeper in the document.
     loader.add_xpath('title', "//h1[@id='chan_newsTitle']/text()")
     loader.add_value('url', response.url)
     loader.add_xpath('text', "//div[@id='chan_newsDetail']//text()")
     loader.add_xpath(
         "datetime",
         "//div[@class='chan_newsInfo_source']/span[@class='time']/text()",
         # Raw string: \d and \s are regex escapes, not string escapes.
         re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
     loader.add_xpath(
         "source",
         "//div[@class='chan_newsInfo_source']/span[@class='source']/text()"
     )
     loader.add_value('website', '中华网')
     yield loader.load_item()
예제 #4
0
 def parse_item(self, response):
     """Build a ``NewsItem`` from an article response and yield it.

     The title, body text, date-time string and source are extracted with
     XPath through ``ChinaLoader``; the URL and the website name are added as
     literal values.

     Args:
         response: The response object for the article page; it must expose
             ``url`` and be usable by ``ChinaLoader`` for XPath extraction.

     Yields:
         The item produced by ``loader.load_item()``.
     """
     loader = ChinaLoader(item=NewsItem(), response=response)
     loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
     loader.add_value('url', response.url)
     loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')
     loader.add_xpath('datetime',
                      '//div[@id="chan_newsInfo"]/text()',
                      # Raw string: \d and \s are regex escapes, not string
                      # escapes.
                      re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
     loader.add_xpath('source',
                      '//div[@id="chan_newsInfo"]/text()',
                      # Keeps only what follows the "来源:" label.
                      re='来源:(.*)')
     loader.add_value('website', '中华网')
     yield loader.load_item()
예제 #5
0
파일: china.py 프로젝트: Fly365/py-learn
 def parse_item(self, response):
     """Build a ``NewsItem`` from an article response and yield it.

     The title, body text, date-time string and source are extracted with
     XPath through ``ChinaLoader``; the URL and the website name are added as
     literal values.

     Args:
         response: The response object for the article page; it must expose
             ``url`` and be usable by ``ChinaLoader`` for XPath extraction.

     Yields:
         The item produced by ``loader.load_item()``.
     """
     loader = ChinaLoader(item=NewsItem(), response=response)
     loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
     # The URL is a literal value, not an XPath expression, so it must go
     # through add_value; add_xpath would try to evaluate it as a query.
     loader.add_value('url', response.url)
     loader.add_xpath('text', '//div[@id="chan_newsDetail"]/text()')
     # Raw string: \d and \s are regex escapes, not string escapes.
     loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()',
                      re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
     # The pattern intentionally keeps its leading space before the label.
     loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()', re=' 来源:(.*)')
     loader.add_value('website', '中华网')
     yield loader.load_item()
예제 #6
0
    def parse_item(self, response):
        """Collect the article fields into a ``NewsItem`` and yield it.

        Title, body text, time and source come from XPath queries run through
        ``ChinaLoader``; the URL and the website name are added as literal
        values.

        Args:
            response: The response object for the article page; it must
                expose ``url`` and be usable by ``ChinaLoader``.

        Yields:
            The item produced by the loader's ``load_item()``.
        """
        title_xpath = '//*[@id="chan_newsTitle"]/text()'
        # Every <p> under the detail node except the last one in its parent.
        body_xpath = '//*[@id="chan_newsDetail"]//p[position() < last()]/text()'
        time_xpath = '//*[@id="js-article-title"]//span[@class="time"]/text()'
        source_xpath = '//*[@id="js-article-title"]//span[@class="source"]/text()'

        news_loader = ChinaLoader(item=NewsItem(), response=response)
        news_loader.add_xpath('title', title_xpath)
        news_loader.add_value('url', response.url)
        news_loader.add_xpath('text', body_xpath)
        # NOTE(review): the field name is spelled 'datatime' here - presumably
        # it matches the NewsItem declaration; confirm before renaming.
        news_loader.add_xpath('datatime', time_xpath)
        # Keeps only what follows the "来源:" label.
        news_loader.add_xpath('source', source_xpath, re='来源:(.*)')
        news_loader.add_value('website', 'tech.china.com')

        yield news_loader.load_item()