示例#1
0
 def parse(self, response):
     """Yield one item per archived Polizei Berlin record in a Django XML dump.

     Original author's note: the input is the dump of a CMS that ran on
     django-nonrel and Google App Engine around 2010; App Engine became too
     expensive to keep running.

     Every ``stag.augmentedarticlescraperpolizeiberlin`` object supplies the
     source URL and the district.  The object whose ``pk`` equals its
     ``article.article`` reference supplies the publication date, headline
     and body.  A record whose publication date cannot be resolved is logged
     and skipped, so one broken record does not abort the whole dump.

     Raises:
         ValueError: if a publication date is present but does not match
             ``%Y-%m-%d %H:%M:%S``.
     """
     logging.log(logging.INFO, "Parsed XML-Input-File %s", response.url)
     for dj_au_obj in response.xpath('/django-objects/object[@model = "stag.augmentedarticlescraperpolizeiberlin"]'):
         # Query the source URL once; the numeric id is extracted from it.
         source_url_sel = dj_au_obj.xpath('field[@name = "source_url"]/text()')
         source_url = source_url_sel.extract_first()
         place = dj_au_obj.xpath('field[@name = "district"]/text()').extract_first()
         # The literal string 'False' is treated as "no district".
         if place == 'False':
             place = None
         ##
         # select the payload article: resolve the referenced id to the
         # object that carries date, headline and body
         article_id = dj_au_obj.xpath('field[@to = "article.article"]/text()').extract_first()
         dj_art_obj = response.xpath('/django-objects/object[@pk = "%s"]' % article_id)
         pub_date = dj_art_obj.xpath('field[@name = "date_published"]/text()').extract_first()
         if pub_date is None:
             # strptime(None, ...) would raise TypeError and end the generator,
             # dropping every record that follows this one.
             logging.log(logging.WARNING,
                         "Skipping %s: no date_published for article %s",
                         source_url, article_id)
             continue
         parts = {
             'place': place,
             'author': 'Polizei Berlin',
             'source_name': 'polizei',
             'source_url': source_url,
             'source_id': source_url_sel.re_first(r'(\d+)/index\.html'),
             'time': datetime.strptime(pub_date, "%Y-%m-%d %H:%M:%S"),
             'headline': dj_art_obj.xpath('field[@name = "name"]/text()').extract_first(),
             'body': dj_art_obj.xpath('field[@name = "content"]/text()').extract_first(),
         }
         item_loader = BerlinItemLoader()
         item_loader.add_value(None, parts)
         yield item_loader.load_item()
示例#2
0
    def parse(self, response):
        """Build and yield a single Polizei Berlin item.

        NOTE(review): ``dj_au_obj``, ``dj_art_obj`` and ``pub_date`` are not
        bound anywhere in this method.  Unless they are provided by an
        enclosing or module scope, the first ``next()`` on this generator
        raises ``NameError``.  Presumably the loop that defines them was
        lost from this excerpt -- confirm against the original spider.
        """
        # Query the source URL once; the numeric id is extracted from it.
        source_url_sel = dj_au_obj.xpath('field[@name = "source_url"]/text()')
        parts = {}
        # Placeholder; replaced by the district value below.
        parts['place'] = None
        parts['author'] = 'Polizei Berlin'
        parts['source_name'] = 'polizei'
        parts['source_url'] = source_url_sel.extract_first()
        # Raw string: '\d' and '\.' are invalid escapes in a normal literal.
        parts['source_id'] = source_url_sel.re_first(r'(\d+)/index\.html')
        parts['place'] = dj_au_obj.xpath('field[@name = "district"]/text()').extract_first()
        parts['time'] = datetime.strptime(pub_date, "%Y-%m-%d %H:%M:%S")

        parts['headline'] = dj_art_obj.xpath('field[@name = "name"]/text()').extract_first()
        parts['body'] = dj_art_obj.xpath('field[@name = "content"]/text()').extract_first()
        item_loader = BerlinItemLoader()
        item_loader.add_value(None, parts)
        yield item_loader.load_item()
示例#3
0
 def parse_item_page(self, response):
     """Assemble the item for a single article page.

     The field dict carried in ``response.meta['parts']`` is completed with
     the place (from ``parse_item_page_place``) and a fixed author, then
     loaded together with the headline and body selected from the
     ``div.article`` element.

     Returns:
         The item produced by ``BerlinItemLoader.load_item()``.
     """
     article = response.css('div.article')
     loader = BerlinItemLoader(selector=article)
     # Take the dict handed over in the request meta and fill in the
     # simple fields in place.
     fields = response.meta['parts']
     fields.update(
         place=self.parse_item_page_place(response),
         author='Polizei Berlin',
     )
     loader.add_value(None, fields)
     # Headline and body are selected relative to the article element.
     loader.add_css('headline', 'h1.title::text')
     loader.add_css('body', 'div.textile')
     return loader.load_item()
示例#4
0
    def parse(self, response):
        """Build and yield a single Polizei Berlin item.

        NOTE(review): ``dj_au_obj``, ``dj_art_obj`` and ``pub_date`` are not
        bound anywhere in this method.  Unless they are provided by an
        enclosing or module scope, the first ``next()`` on this generator
        raises ``NameError``.  Presumably the loop that defines them was
        lost from this excerpt -- confirm against the original spider.
        """
        # Query the source URL once; the numeric id is extracted from it.
        source_url_sel = dj_au_obj.xpath(
            'field[@name = "source_url"]/text()')
        parts = {}
        # Placeholder; replaced by the district value below.
        parts['place'] = None
        parts['author'] = 'Polizei Berlin'
        parts['source_name'] = 'polizei'
        parts['source_url'] = source_url_sel.extract_first()
        # Raw string: '\d' and '\.' are invalid escapes in a normal literal.
        parts['source_id'] = source_url_sel.re_first(r'(\d+)/index\.html')
        parts['place'] = dj_au_obj.xpath(
            'field[@name = "district"]/text()').extract_first()
        parts['time'] = datetime.strptime(pub_date, "%Y-%m-%d %H:%M:%S")

        parts['headline'] = dj_art_obj.xpath(
            'field[@name = "name"]/text()').extract_first()
        parts['body'] = dj_art_obj.xpath(
            'field[@name = "content"]/text()').extract_first()
        item_loader = BerlinItemLoader()
        item_loader.add_value(None, parts)
        yield item_loader.load_item()
示例#5
0
 def parse(self, response):
     """Yield one item per archived Polizei Berlin record in a Django XML dump.

     Original author's note: the input is the dump of a CMS that ran on
     django-nonrel and Google App Engine around 2010; App Engine became too
     expensive to keep running.

     Every ``stag.augmentedarticlescraperpolizeiberlin`` object supplies the
     source URL and the district.  The object whose ``pk`` equals its
     ``article.article`` reference supplies the publication date, headline
     and body.  A record whose publication date cannot be resolved is logged
     and skipped, so one broken record does not abort the whole dump.

     Raises:
         ValueError: if a publication date is present but does not match
             ``%Y-%m-%d %H:%M:%S``.
     """
     logging.log(logging.INFO, "Parsed XML-Input-File %s", response.url)
     for dj_au_obj in response.xpath(
             '/django-objects/object[@model = "stag.augmentedarticlescraperpolizeiberlin"]'
     ):
         # Query the source URL once; the numeric id is extracted from it.
         source_url_sel = dj_au_obj.xpath(
             'field[@name = "source_url"]/text()')
         source_url = source_url_sel.extract_first()
         place = dj_au_obj.xpath(
             'field[@name = "district"]/text()').extract_first()
         # The literal string 'False' is treated as "no district".
         if place == 'False':
             place = None
         ##
         # select the payload article: resolve the referenced id to the
         # object that carries date, headline and body
         article_id = dj_au_obj.xpath(
             'field[@to = "article.article"]/text()').extract_first()
         dj_art_obj = response.xpath('/django-objects/object[@pk = "%s"]' %
                                     article_id)
         pub_date = dj_art_obj.xpath(
             'field[@name = "date_published"]/text()').extract_first()
         if pub_date is None:
             # strptime(None, ...) would raise TypeError and end the generator,
             # dropping every record that follows this one.
             logging.log(logging.WARNING,
                         "Skipping %s: no date_published for article %s",
                         source_url, article_id)
             continue
         parts = {
             'place': place,
             'author': 'Polizei Berlin',
             'source_name': 'polizei',
             'source_url': source_url,
             'source_id': source_url_sel.re_first(r'(\d+)/index\.html'),
             'time': datetime.strptime(pub_date, "%Y-%m-%d %H:%M:%S"),
             'headline': dj_art_obj.xpath(
                 'field[@name = "name"]/text()').extract_first(),
             'body': dj_art_obj.xpath(
                 'field[@name = "content"]/text()').extract_first(),
         }
         item_loader = BerlinItemLoader()
         item_loader.add_value(None, parts)
         yield item_loader.load_item()
示例#6
0
 def parse_item_page(self, response):
     """Turn one article page into a loaded item.

     Completes the dict found in ``response.meta['parts']`` with the place
     (via ``parse_item_page_place``) and a fixed author, then adds headline
     and body taken from the page's ``div.article`` element.

     Returns:
         The item produced by ``BerlinItemLoader.load_item()``.
     """
     article_sel = response.css('div.article')
     loader = BerlinItemLoader(selector=article_sel)
     # The dict from the request meta is completed in place.
     meta_parts = response.meta['parts']
     place = self.parse_item_page_place(response)
     meta_parts['place'] = place
     meta_parts['author'] = 'Polizei Berlin'
     loader.add_value(None, meta_parts)
     # CSS-selected fields, relative to the article element.
     for field_name, css_query in (
             ('headline', 'h1.title::text'),
             ('body', 'div.textile'),
     ):
         loader.add_css(field_name, css_query)
     return loader.load_item()
示例#7
0
文件: bvg.py 项目: sbry/scrapy-berlin
 def parse_item_page(self, response):
     """Load one item from an article detail page.

     Starts from the field dict in ``response.meta['parts']`` and adds
     author and headline from the ``.moment-info dd`` elements, the place
     from ``parse_item_page_place``, and the body from
     ``div.moment-message``.  An empty or missing author falls back to
     ``"-"``.

     Returns:
         The item produced by ``BerlinItemLoader.load_item()``.

     Raises:
         ValueError: if ``.moment-info`` does not contain exactly five
             ``dd`` elements (the unpacking below is strict).
     """
     selector = response.css('div.article__body')
     item_loader = BerlinItemLoader(selector=selector)
     ##
     # the simple parts
     parts = response.meta['parts']
     # The dd elements are consumed by position.  Going by the original
     # variable names the order is place, time, an unwanted entry, author,
     # headline; only the last two are used here.
     _place_from_text, _raw_time, _unwanted, author, headline = [
         x.xpath('string(.)') for x in selector.css('.moment-info dd')
     ]
     # Test the extracted value, not key membership: the key is always
     # present once assigned, so a membership test can never trigger the
     # fallback.
     parts['author'] = author.extract_first() or "-"
     parts['headline'] = headline.extract_first()
     parts['place'] = self.parse_item_page_place(selector)
     ##
     # and load
     item_loader.add_value(None, parts)
     item_loader.add_css('body', 'div.moment-message')
     return item_loader.load_item()