Пример #1
0
 def runTest(self):
     """Check ``find_addresses`` against the positive and negative samples.

     Every string in ``examples`` must produce a truthy result from
     ``japanese_addresses.find_addresses`` and every string in
     ``negative_examples`` a falsy one. The UTF-8 encoded sample is passed
     as the assertion message so a failure shows which string was at fault.
     """
     for s in examples:
         # assertTrue is the supported spelling of the deprecated assert_.
         self.assertTrue(japanese_addresses.find_addresses(s),
                         s.encode('utf-8'))
     for s in negative_examples:
         self.assertFalse(japanese_addresses.find_addresses(s),
                          s.encode('utf-8'))
Пример #2
0
    def parseEvent(self, response):
        """Build a ``WebEvent`` item from an event detail response.

        Text fields are read from elements addressed by fixed CSS ids, the
        event id is taken from the ``/event/<id>/`` part of ``response.url``,
        and the location is filled in by ``jp_spider.setup_location``.

        Yields:
            One populated ``items.WebEvent``.

        Raises:
            AttributeError: if ``response.url`` has no ``/event/<id>/`` part
                (``re.search`` returns ``None``).
        """
        def _get(css_id):
            """Return the extracted text of the element with id ``css_id``."""
            return items.extract_text(response.css('#%s' % css_id))

        item = items.WebEvent()
        item['country'] = 'JP'
        item['namespace'] = self.namespace
        item['namespaced_id'] = re.search(r'/event/(\w+)/',
                                          response.url).group(1)
        item['name'] = _get('u474-4')

        # Pages without the image element get no photo instead of raising
        # IndexError on the empty extract() result.
        image_urls = response.xpath('//img[@id="u469_img"]/@src').extract()
        image_url = None
        if image_urls:
            image_url = self.abs_url(response, image_urls[0])
        # The site's "no image" placeholder is not a real photo.
        if image_url == 'http://et-stage.net/event_image/no_image_big.jpg':
            image_url = None
        item['photo'] = image_url

        # Not collected: _get('u511-7') (cost) and _get('u512-4') (email).

        item['description'] = _get('u468-156')
        item['start_time'], item['end_time'] = parse_times(_get('u506-2'))

        venue = _get('u507-4')
        if not venue:
            # Fall back to whatever venue the description mentions.
            venue = jp_spider.get_venue_from_description(item['description'])
        jp_addresses = japanese_addresses.find_addresses(_get('u509-11'))
        jp_spider.setup_location(venue, jp_addresses, item)

        yield item
    def parseEvent(self, response):
        """Build a ``WebEvent`` item from a post response.

        The event id comes from the ``?p=<digits>`` query in ``response.url``
        and the name from the ``rel="bookmark"`` link text. The description
        is the formatted ``.entry-content`` HTML with the ``.social4i``
        markup removed. Venue and addresses are both derived from that
        description.

        Yields:
            One populated ``items.WebEvent``.

        Raises:
            AttributeError: if ``response.url`` has no ``?p=<digits>`` part.
            IndexError: if the page has no ``.entry-content`` element.
        """
        # Single-argument parenthesised form prints the same on Python 2
        # and is also valid on Python 3.
        print(response.url)

        item = items.WebEvent()
        item['country'] = 'JP'
        item['namespace'] = self.namespace
        item['namespaced_id'] = re.search(r'\?p=(\d+)', response.url).group(1)
        item['name'] = items.extract_text(response.xpath('//a[@rel="bookmark"]/text()'))

        post = response.css('.entry-content')

        photos = post.css('img.size-full').xpath('./@src').extract()
        if photos:
            item['photo'] = self.abs_url(response, photos[0])
        else:
            item['photo'] = None

        post_html = post.extract()[0]
        # The .social4i block is only stripped from the description, so a
        # page without it should not abort the whole event.
        social_blocks = response.css('.social4i').extract()
        if social_blocks:
            post_html = post_html.replace(social_blocks[0], '')

        full_description = items.format_text(post_html)
        item['description'] = full_description

        jp_addresses = japanese_addresses.find_addresses(item['description'])
        venue = jp_spider.get_venue_from_description(item['description'])
        jp_spider.setup_location(venue, jp_addresses, item)

        content_date = ''.join(response.css('.contentdate').xpath('.//text()').extract())
        item['start_time'], item['end_time'] = self.parseDateTimes(content_date, full_description)

        yield item
Пример #4
0
    def parseEvent(self, response):
        """Build a ``WebEvent`` item from an event detail response.

        The event id comes from the ``/<digits>.php`` part of
        ``response.url``. The description is the category label (the ``alt``
        text of the image in ``div.event-detail-koumoku``) followed by the
        formatted text of the first ``<dl>`` on the page. Venue and addresses
        are both derived from that description.

        Yields:
            One populated ``items.WebEvent``.

        Raises:
            AttributeError: if ``response.url`` has no ``/<digits>.php`` part.
            IndexError: if the category image or the ``<dl>`` is missing.
        """
        # Single-argument parenthesised form prints the same on Python 2
        # and is also valid on Python 3.
        print(response.url)

        item = items.WebEvent()
        item['namespace'] = self.namespace
        item['namespaced_id'] = re.search(r'/(\d+)\.php', response.url).group(1)
        item['name'] = items.extract_text(response.css('div.event-detail-name'))

        photos = response.css('div.event-detail-img').xpath('./a/@href').extract()
        if photos:
            item['photo'] = self.abs_url(response, photos[0])
        else:
            item['photo'] = None

        category = response.css('div.event-detail-koumoku').xpath('./img/@alt').extract()[0]
        # Because dt otherwise remains flush up against the end of the previous dd, we insert manual breaks.
        definition_list_html = response.xpath('//dl').extract()[0]
        full_description = items.format_text(definition_list_html.replace('<dt>', '<dt><br><br>'))
        item['description'] = '%s\n\n%s' % (category, full_description)

        jp_addresses = japanese_addresses.find_addresses(item['description'])
        venue = jp_spider.get_venue_from_description(item['description'])
        jp_spider.setup_location(venue, jp_addresses, item)

        item['start_time'], item['end_time'] = self.parseDateTimes(response)

        yield item
    def parseEvent(self, response):
        """Build a ``WebEvent`` item from an event detail response.

        Text fields are read from elements addressed by fixed CSS ids, the
        event id is taken from the ``/event/<id>/`` part of ``response.url``,
        and the location is filled in by ``jp_spider.setup_location``.

        Yields:
            One populated ``items.WebEvent``.

        Raises:
            AttributeError: if ``response.url`` has no ``/event/<id>/`` part
                (``re.search`` returns ``None``).
        """
        def _get(css_id):
            """Return the extracted text of the element with id ``css_id``."""
            return items.extract_text(response.css('#%s' % css_id))

        item = items.WebEvent()
        item['namespace'] = self.namespace
        item['namespaced_id'] = re.search(r'/event/(\w+)/', response.url).group(1)
        item['name'] = _get('u474-4')

        # An empty extract() result means the page has no image element;
        # record no photo rather than failing with IndexError.
        image_sources = response.xpath('//img[@id="u469_img"]/@src').extract()
        if image_sources:
            image_url = self.abs_url(response, image_sources[0])
        else:
            image_url = None
        # The site's "no image" placeholder is not a real photo.
        if image_url == 'http://et-stage.net/event_image/no_image_big.jpg':
            image_url = None
        item['photo'] = image_url

        # Not collected: _get('u511-7') (cost) and _get('u512-4') (email).

        item['description'] = _get('u468-156')
        item['start_time'], item['end_time'] = parse_times(_get('u506-2'))

        venue = _get('u507-4')
        if not venue:
            # Fall back to whatever venue the description mentions.
            venue = jp_spider.get_venue_from_description(item['description'])
        jp_addresses = japanese_addresses.find_addresses(_get('u509-11'))
        jp_spider.setup_location(venue, jp_addresses, item)

        yield item
Пример #6
0
    def parseEvent(self, response):
        """Build a ``WebEvent`` item from an event page carrying itemprop markup.

        Name, image, start date and location are read from elements with the
        matching ``itemprop`` attribute; genre and time come from ``<dd>``
        elements following a ``<dt>`` that contains the given label. The
        event id is the ``/event/<id>.html`` part of ``response.url``.

        Yields:
            One populated ``items.WebEvent``.

        Raises:
            AttributeError: if ``response.url`` has no ``/event/<id>.html``
                part (``re.search`` returns ``None``).
        """
        def _get(propname, attr=None):
            """Return the text of the ``itemprop`` node, or one attribute of it.

            With ``attr`` set, the first value of that attribute is returned,
            or ``None`` when no matching node carries it.
            """
            node = response.xpath('//*[@itemprop="%s"]' % propname)
            if attr:
                values = node.xpath('./@%s' % attr).extract()
                # Missing attribute: report None instead of IndexError.
                return values[0] if values else None
            return items.extract_text(node)

        def _definition(term):
            """Return the text of the ``<dd>`` siblings after a ``<dt>`` containing ``term``."""
            return items.extract_text(
                response.xpath(
                    u'//dt[contains(., "%s")]/following-sibling::dd' % term))

        item = items.WebEvent()
        item['country'] = 'JP'
        item['namespace'] = self.namespace
        item['namespaced_id'] = re.search(r'/event/(\w+)\.html',
                                          response.url).group(1)
        item['name'] = _get('name')

        # Events without image metadata are kept, just without a photo.
        image_url = _get('image', 'content')
        if image_url:
            item['photo'] = self.abs_url(response, image_url)
        else:
            item['photo'] = None

        genre = _definition(u'ジャンル')
        description = self._get_description(response)
        item['description'] = genre + description

        item['start_time'], item['end_time'] = parse_date_times(
            _get('startDate'), _definition(u'時間'))

        venue = _get('location')
        if not venue:
            # Fall back to whatever venue the description mentions.
            venue = jp_spider.get_venue_from_description(item['description'])
        jp_addresses = japanese_addresses.find_addresses(item['description'])
        jp_spider.setup_location(venue, jp_addresses, item)

        yield item
 def runTest(self):
     """Verify ``find_addresses`` on known-good and known-bad strings.

     Strings in ``examples`` must give a truthy result from
     ``japanese_addresses.find_addresses``; strings in
     ``negative_examples`` must give a falsy one. Each assertion carries
     the UTF-8 encoded input as its failure message.
     """
     for s in examples:
         # assert_ is a deprecated alias; assertTrue is the supported name.
         self.assertTrue(japanese_addresses.find_addresses(s), s.encode('utf-8'))
     for s in negative_examples:
         self.assertFalse(japanese_addresses.find_addresses(s), s.encode('utf-8'))