예제 #1
0
    def parseEvent(self, response):
        """Build one WebEvent item from an event page marked up with itemprop attributes.

        The event id is taken from the ``/event/<id>.html`` part of the URL;
        name, image, start date and location come from ``itemprop`` nodes,
        and the genre and time come from ``<dt>``/``<dd>`` definition pairs.

        Args:
            response: the downloaded event detail page.

        Yields:
            The populated ``items.WebEvent``.

        Raises:
            AttributeError: if the URL does not contain ``/event/<id>.html``.
        """
        def _get(propname, attr=None):
            # With attr: first value of that attribute on the itemprop node,
            # or None when the node/attribute is missing.
            # Without attr: the extracted text of the itemprop node.
            node = response.xpath('//*[@itemprop="%s"]' % propname)
            if attr:
                values = node.xpath('./@%s' % attr).extract()
                return values[0] if values else None
            return items.extract_text(node)

        def _definition(term):
            # Text of the <dd> that follows the <dt> containing the given term.
            return items.extract_text(response.xpath(u'//dt[contains(., "%s")]/following-sibling::dd' % term))

        item = items.WebEvent()
        item['country'] = 'JP'
        item['namespace'] = self.namespace
        item['namespaced_id'] = re.search(r'/event/(\w+)\.html', response.url).group(1)
        item['name'] = _get('name')

        # A page without an image should still produce an event; store None
        # for the photo instead of failing on the missing attribute.
        photo = _get('image', 'content')
        item['photo'] = self.abs_url(response, photo) if photo is not None else None

        genre = _definition(u'ジャンル')
        description = self._get_description(response)
        item['description'] = genre + description

        item['start_time'], item['end_time'] = parse_date_times(_get('startDate'), _definition(u'時間'))

        venue = _get('location')
        if not venue:
            # No location markup: fall back to whatever the description yields.
            venue = jp_spider.get_venue_from_description(item['description'])
        jp_addresses = japanese_addresses.find_addresses(item['description'])
        jp_spider.setup_location(venue, jp_addresses, item)

        yield item
    def parseEvent(self, response):
        """Build one WebEvent item from an event page whose details sit in a table.

        The event id is taken from the ``/event/<id>/`` part of the URL and the
        description from ``div.eventdetail``. Name, date/time, venue and
        address are read from the first four cells of
        ``div.visible-xs table.table``.

        Args:
            response: the downloaded event detail page.

        Yields:
            The populated ``items.WebEvent``; nothing is yielded when the
            table has fewer than the four cells this method reads.

        Raises:
            AttributeError: if the URL does not contain ``/event/<id>/``.
        """
        item = items.WebEvent()
        item['country'] = 'JP'
        item['namespace'] = self.namespace
        item['namespaced_id'] = re.search(r'/event/(\w+)/', response.url).group(1)

        item['description'] = items.extract_text(response.css('div.eventdetail'))

        tds = response.css('div.visible-xs table.table td')
        if len(tds) != 6:
            logging.error('Problem with unknown %s tds:\n%s', len(tds), '\n'.join(str(x) for x in tds))
            if len(tds) < 4:
                # Cells 0-3 are indexed below; with fewer cells there is
                # nothing usable to parse, so stop after reporting the problem.
                return

        item['name'] = items.extract_text(tds[0].xpath('.//text()'))
        item['start_time'], item['end_time'] = self.parseDateTimes(items.extract_text(tds[1].xpath('.//text()')))
        venue = items.extract_text(tds[2].xpath('.//text()'))
        address = items.extract_text(tds[3].xpath('.//text()'))
        if not venue:
            # Empty venue cell: fall back to whatever the description yields.
            venue = jp_spider.get_venue_from_description(item['description'])
        jp_addresses = japanese_addresses.find_addresses(address)
        jp_spider.setup_location(venue, jp_addresses, item)

        # The photo is the first image wired to the modal viewer, if any.
        image_elements = response.xpath("//img[@data-target='#image_Modal']/@src").extract()
        if image_elements:
            item['photo'] = self.abs_url(response, image_elements[0])
        else:
            item['photo'] = None

        yield item
예제 #3
0
    def parseEvent(self, response):
        """Build one WebEvent item from an event page served as ``/<digits>.php``.

        The name comes from ``div.event-detail-name``, the photo from the first
        link inside ``div.event-detail-img`` (None when there is none), and the
        description is the category image's alt text followed by the formatted
        first ``<dl>`` of the page.

        Args:
            response: the downloaded event detail page.

        Yields:
            The populated ``items.WebEvent``.

        Raises:
            AttributeError: if the URL does not contain ``/<digits>.php``.
            IndexError: if the category image or the ``<dl>`` is missing.
        """
        # Trace of the page being parsed; the call form prints the same text
        # on Python 2 and is also valid syntax on Python 3.
        print(response.url)

        item = items.WebEvent()
        item['country'] = 'JP'
        item['namespace'] = self.namespace
        item['namespaced_id'] = re.search(r'/(\d+)\.php', response.url).group(1)
        item['name'] = items.extract_text(response.css('div.event-detail-name'))

        photos = response.css('div.event-detail-img').xpath('./a/@href').extract()
        item['photo'] = self.abs_url(response, photos[0]) if photos else None

        category = response.css('div.event-detail-koumoku').xpath('./img/@alt').extract()[0]
        # Because dt otherwise remains flush up against the end of the previous dd, we insert manual breaks.
        dl_html = response.xpath('//dl').extract()[0]
        full_description = items.format_text(dl_html.replace('<dt>', '<dt><br><br>'))
        item['description'] = '%s\n\n%s' % (category, full_description)

        # Both the addresses and the venue are looked up in the description text.
        jp_addresses = japanese_addresses.find_addresses(item['description'])
        venue = jp_spider.get_venue_from_description(item['description'])
        jp_spider.setup_location(venue, jp_addresses, item)

        item['start_time'], item['end_time'] = self.parseDateTimes(response)

        yield item
예제 #4
0
    def parseEvent(self, response):
        """Build one WebEvent item from an event page addressed as ``/event/<digits>``.

        The name comes from the text under ``div.title h2``, the photo from the
        first ``a.cb_photo`` link (None when there is none), the description
        from ``div.tag_preview`` and the date from the first text node of the
        ``tr.event_date`` cell.

        Args:
            response: the downloaded event detail page.

        Yields:
            The populated ``items.WebEvent``.

        Raises:
            AttributeError: if the URL does not contain ``/event/<digits>``.
            IndexError: if the page has no ``tr.event_date`` cell text.
        """
        # Trace of the page being parsed; the call form prints the same text
        # on Python 2 and is also valid syntax on Python 3.
        print(response.url)

        item = items.WebEvent()
        item['country'] = 'JP'
        item['namespace'] = self.namespace
        item['namespaced_id'] = re.search(r'/event/(\d+)',
                                          response.url).group(1)
        item['name'] = items.extract_text(
            response.css('div.title h2').xpath('.//text()'))

        photos = response.css('a.cb_photo').xpath('./@href').extract()
        item['photo'] = self.abs_url(response, photos[0]) if photos else None

        item['description'] = items.extract_text(
            response.css('div.tag_preview'))

        # Both the addresses and the venue are looked up in the description text.
        jp_addresses = japanese_addresses.find_addresses(item['description'])
        venue = jp_spider.get_venue_from_description(item['description'])
        jp_spider.setup_location(venue, jp_addresses, item)

        content_date = response.css('tr.event_date td').xpath(
            './text()').extract()[0]
        # Surrounding whitespace is stripped before the date text is parsed.
        item['start_time'], item['end_time'] = self.parseDateTimes(
            content_date.strip())

        yield item
    def parseEvent(self, response):
        """Turn an ``/event/<digits>`` detail page into a single WebEvent item.

        Reads the title from ``div.title h2``, the photo from the first
        ``a.cb_photo`` href (None if the page has none), the description from
        ``div.tag_preview`` and the date text from ``tr.event_date td``.

        Args:
            response: the downloaded event detail page.

        Yields:
            The populated ``items.WebEvent``.

        Raises:
            AttributeError: if the URL does not contain ``/event/<digits>``.
            IndexError: if the page has no ``tr.event_date`` cell text.
        """
        # Trace of the page being parsed; the call form prints the same text
        # on Python 2 and is also valid syntax on Python 3.
        print(response.url)

        item = items.WebEvent()
        item['country'] = 'JP'
        item['namespace'] = self.namespace
        item['namespaced_id'] = re.search(r'/event/(\d+)', response.url).group(1)
        item['name'] = items.extract_text(response.css('div.title h2').xpath('.//text()'))

        photos = response.css('a.cb_photo').xpath('./@href').extract()
        item['photo'] = self.abs_url(response, photos[0]) if photos else None

        item['description'] = items.extract_text(response.css('div.tag_preview'))

        # The description text is the source for both addresses and venue.
        jp_addresses = japanese_addresses.find_addresses(item['description'])
        venue = jp_spider.get_venue_from_description(item['description'])
        jp_spider.setup_location(venue, jp_addresses, item)

        # Only the first text node of the date cell is used, stripped of
        # surrounding whitespace.
        content_date = response.css('tr.event_date td').xpath('./text()').extract()[0]
        item['start_time'], item['end_time'] = self.parseDateTimes(content_date.strip())

        yield item
예제 #6
0
 def runTest(self):
     """Check find_addresses against the positive and negative sample strings.

     Every entry of ``examples`` must give a truthy result and every entry of
     ``negative_examples`` a falsy one. The UTF-8 encoded sample is passed as
     the failure message so the offending string is reported.
     """
     for s in examples:
         # assertTrue replaces the deprecated assert_ alias; same check.
         self.assertTrue(japanese_addresses.find_addresses(s), s.encode('utf-8'))
     for s in negative_examples:
         self.assertFalse(japanese_addresses.find_addresses(s), s.encode('utf-8'))