def parseEvent(self, response):
    def _get(propname, attr=None):
        node = response.xpath('//*[@itemprop="%s"]' % propname)
        if attr:
            return node.xpath('./@%s' % attr).extract()[0]
        else:
            return items.extract_text(node)

    def _definition(term):
        return items.extract_text(response.xpath(u'//dt[contains(., "%s")]/following-sibling::dd' % term))

    item = items.WebEvent()
    item['country'] = 'JP'
    item['namespace'] = self.namespace
    item['namespaced_id'] = re.search(r'/event/(\w+)\.html', response.url).group(1)
    item['name'] = _get('name')
    item['photo'] = self.abs_url(response, _get('image', 'content'))

    genre = _definition(u'ジャンル')  # 'ジャンル' means 'genre'
    description = self._get_description(response)
    item['description'] = genre + description

    item['start_time'], item['end_time'] = parse_date_times(_get('startDate'), _definition(u'時間'))  # '時間' means 'time'

    venue = _get('location')
    if not venue:
        venue = jp_spider.get_venue_from_description(item['description'])
    jp_addresses = japanese_addresses.find_addresses(item['description'])
    jp_spider.setup_location(venue, jp_addresses, item)

    yield item
def parseEvent(self, response):
    def _get(css_id):
        return items.extract_text(response.css('#%s' % css_id))

    item = items.WebEvent()
    item['country'] = 'JP'
    item['namespace'] = self.namespace
    item['namespaced_id'] = re.search(r'/event/(\w+)/', response.url).group(1)
    item['description'] = items.extract_text(response.css('div.eventdetail'))

    tds = response.css('div.visible-xs table.table td')
    if len(tds) != 6:
        logging.error('Unexpected number of tds (%s):\n%s', len(tds), '\n'.join(str(x) for x in tds))
    item['name'] = items.extract_text(tds[0].xpath('.//text()'))
    item['start_time'], item['end_time'] = self.parseDateTimes(items.extract_text(tds[1].xpath('.//text()')))

    venue = items.extract_text(tds[2].xpath('.//text()'))
    address = items.extract_text(tds[3].xpath('.//text()'))
    if not venue:
        venue = jp_spider.get_venue_from_description(item['description'])
    jp_addresses = japanese_addresses.find_addresses(address)
    jp_spider.setup_location(venue, jp_addresses, item)

    image_elements = response.xpath("//img[@data-target='#image_Modal']/@src").extract()
    if image_elements:
        image_url = self.abs_url(response, image_elements[0])
    else:
        image_url = None
    item['photo'] = image_url

    yield item
def parseEvent(self, response):
    print response.url
    item = items.WebEvent()
    item['country'] = 'JP'
    item['namespace'] = self.namespace
    item['namespaced_id'] = re.search(r'/(\d+)\.php', response.url).group(1)
    item['name'] = items.extract_text(response.css('div.event-detail-name'))

    photos = response.css('div.event-detail-img').xpath('./a/@href').extract()
    if photos:
        item['photo'] = self.abs_url(response, photos[0])
    else:
        item['photo'] = None

    category = response.css('div.event-detail-koumoku').xpath('./img/@alt').extract()[0]
    # Because dt otherwise remains flush up against the end of the previous dd, we insert manual breaks.
    full_description = items.format_text(response.xpath('//dl').extract()[0].replace('<dt>', '<dt><br><br>'))
    item['description'] = '%s\n\n%s' % (category, full_description)

    jp_addresses = japanese_addresses.find_addresses(item['description'])
    venue = jp_spider.get_venue_from_description(item['description'])
    jp_spider.setup_location(venue, jp_addresses, item)

    item['start_time'], item['end_time'] = self.parseDateTimes(response)

    yield item
def parseEvent(self, response):
    print response.url
    item = items.WebEvent()
    item['country'] = 'JP'
    item['namespace'] = self.namespace
    item['namespaced_id'] = re.search(r'/event/(\d+)', response.url).group(1)
    item['name'] = items.extract_text(response.css('div.title h2').xpath('.//text()'))

    photos = response.css('a.cb_photo').xpath('./@href').extract()
    if photos:
        item['photo'] = self.abs_url(response, photos[0])
    else:
        item['photo'] = None

    item['description'] = items.extract_text(response.css('div.tag_preview'))

    jp_addresses = japanese_addresses.find_addresses(item['description'])
    venue = jp_spider.get_venue_from_description(item['description'])
    jp_spider.setup_location(venue, jp_addresses, item)

    content_date = response.css('tr.event_date td').xpath('./text()').extract()[0]
    item['start_time'], item['end_time'] = self.parseDateTimes(content_date.strip())

    yield item
def runTest(self):
    # Positive examples must yield at least one address; negative examples must yield none.
    for s in examples:
        self.assertTrue(japanese_addresses.find_addresses(s), s.encode('utf-8'))
    for s in negative_examples:
        self.assertFalse(japanese_addresses.find_addresses(s), s.encode('utf-8'))
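# Hypothetical illustration only (not from the source): a minimal sketch of the shape of
# data the test above expects. The real `examples` and `negative_examples` fixtures are
# defined elsewhere in the test module; these made-up strings merely show that positive
# samples contain a recognizable Japanese address while negative samples do not.
examples = [
    u'東京都渋谷区道玄坂2-10-12',  # made-up street address (Dogenzaka, Shibuya, Tokyo)
]
negative_examples = [
    u'今夜のイベントは最高でした',  # ordinary sentence, no address ("tonight's event was great")
]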