def runTest(self):
    """Exercise the address detector against both sample lists.

    Every string in ``examples`` must yield at least one address and every
    string in ``negative_examples`` must yield none.  The utf-8-encoded
    sample is passed as the assertion message so a failure identifies the
    offending input.
    """
    for sample in examples:
        # assertTrue replaces the deprecated assert_ alias (gone in Python 3).
        self.assertTrue(
            japanese_addresses.find_addresses(sample),
            sample.encode('utf-8'))
    for sample in negative_examples:
        self.assertFalse(
            japanese_addresses.find_addresses(sample),
            sample.encode('utf-8'))
def parseEvent(self, response):
    """Build a WebEvent item from one event detail page and yield it."""
    def _text_by_id(element_id):
        # Each field on this site lives under a stable auto-generated id.
        return items.extract_text(response.css('#%s' % element_id))
    item = items.WebEvent()
    item['country'] = 'JP'
    item['namespace'] = self.namespace
    item['namespaced_id'] = re.search(r'/event/(\w+)/', response.url).group(1)
    item['name'] = _text_by_id('u474-4')
    item['description'] = _text_by_id('u468-156')
    photo_url = self.abs_url(
        response, response.xpath('//img[@id="u469_img"]/@src').extract()[0])
    # The site serves a shared placeholder image when no photo was uploaded.
    if photo_url == 'http://et-stage.net/event_image/no_image_big.jpg':
        photo_url = None
    item['photo'] = photo_url
    # cost = _text_by_id('u511-7')
    # email = _text_by_id('u512-4')
    item['start_time'], item['end_time'] = parse_times(_text_by_id('u506-2'))
    # Fall back to scraping the venue out of the free-text description when
    # the dedicated venue field is empty.
    venue_name = _text_by_id('u507-4') or jp_spider.get_venue_from_description(
        item['description'])
    found_addresses = japanese_addresses.find_addresses(_text_by_id('u509-11'))
    jp_spider.setup_location(venue_name, found_addresses, item)
    yield item
def parseEvent(self, response): print response.url item = items.WebEvent() item['country'] = 'JP' item['namespace'] = self.namespace item['namespaced_id'] = re.search(r'\?p=(\d+)', response.url).group(1) item['name'] = items.extract_text(response.xpath('//a[@rel="bookmark"]/text()')) post = response.css('.entry-content') photos = post.css('img.size-full').xpath('./@src').extract() if photos: item['photo'] = self.abs_url(response, photos[0]) else: item['photo'] = None post_html = post.extract()[0] post_top = response.css('.social4i').extract()[0] post_html = post_html.replace(post_top, '') full_description = items.format_text(post_html) item['description'] = full_description jp_addresses = japanese_addresses.find_addresses(item['description']) venue = jp_spider.get_venue_from_description(item['description']) jp_spider.setup_location(venue, jp_addresses, item) content_date = ''.join(response.css('.contentdate').xpath('.//text()').extract()) item['start_time'], item['end_time'] = self.parseDateTimes(content_date, full_description) yield item
def parseEvent(self, response): print response.url item = items.WebEvent() item['namespace'] = self.namespace item['namespaced_id'] = re.search(r'/(\d+)\.php', response.url).group(1) item['name'] = items.extract_text(response.css('div.event-detail-name')) photos = response.css('div.event-detail-img').xpath('./a/@href').extract() if photos: item['photo'] = self.abs_url(response, photos[0]) else: item['photo'] = None category = response.css('div.event-detail-koumoku').xpath('./img/@alt').extract()[0] # Because dt otherwise remains flush up against the end of the previous dd, we insert manual breaks. full_description = items.format_text(response.xpath('//dl').extract()[0].replace('<dt>', '<dt><br><br>')) item['description'] = '%s\n\n%s' % (category, full_description) jp_addresses = japanese_addresses.find_addresses(item['description']) venue = jp_spider.get_venue_from_description(item['description']) jp_spider.setup_location(venue, jp_addresses, item) item['start_time'], item['end_time'] = self.parseDateTimes(response) yield item
def parseEvent(self, response):
    """Build a WebEvent item from one event detail page and yield it.

    Pulls each field out of the page's fixed auto-generated element ids.
    """
    def _get(css_id):
        return items.extract_text(response.css('#%s' % css_id))
    item = items.WebEvent()
    # Fix: this parse is a near-identical copy of the sibling et-stage parse
    # in this file but had dropped the country tag; restore it so items are
    # marked consistently.
    item['country'] = 'JP'
    item['namespace'] = self.namespace
    item['namespaced_id'] = re.search(r'/event/(\w+)/', response.url).group(1)
    item['name'] = _get('u474-4')
    image_url = response.xpath('//img[@id="u469_img"]/@src').extract()[0]
    image_url = self.abs_url(response, image_url)
    # The site serves a shared placeholder image when no photo was uploaded.
    if image_url == 'http://et-stage.net/event_image/no_image_big.jpg':
        image_url = None
    item['photo'] = image_url
    # cost = _get('u511-7')
    # email = _get('u512-4')
    item['description'] = _get('u468-156')
    item['start_time'], item['end_time'] = parse_times(_get('u506-2'))
    venue = _get('u507-4')
    if not venue:
        # Fall back to scraping the venue out of the description text.
        venue = jp_spider.get_venue_from_description(item['description'])
    jp_addresses = japanese_addresses.find_addresses(_get('u509-11'))
    jp_spider.setup_location(venue, jp_addresses, item)
    yield item
def parseEvent(self, response):
    """Assemble a WebEvent item from a microdata-annotated event page."""
    def _itemprop(prop_name, attr=None):
        # Node carrying the given schema.org itemprop; return one of its
        # attributes when asked, otherwise its extracted text.
        selector = response.xpath('//*[@itemprop="%s"]' % prop_name)
        if not attr:
            return items.extract_text(selector)
        return selector.xpath('./@%s' % attr).extract()[0]
    def _dd_after(term):
        # Text of the <dd> following the <dt> whose label contains term.
        return items.extract_text(response.xpath(
            u'//dt[contains(., "%s")]/following-sibling::dd' % term))
    item = items.WebEvent()
    item['country'] = 'JP'
    item['namespace'] = self.namespace
    item['namespaced_id'] = re.search(
        r'/event/(\w+)\.html', response.url).group(1)
    item['name'] = _itemprop('name')
    item['photo'] = self.abs_url(response, _itemprop('image', 'content'))
    # Prefix the description with the genre pulled from the definition list.
    item['description'] = _dd_after(u'ジャンル') + self._get_description(response)
    item['start_time'], item['end_time'] = parse_date_times(
        _itemprop('startDate'), _dd_after(u'時間'))
    # Fall back to scraping the venue out of the description when the
    # location itemprop is empty.
    venue = _itemprop('location') or jp_spider.get_venue_from_description(
        item['description'])
    found_addresses = japanese_addresses.find_addresses(item['description'])
    jp_spider.setup_location(venue, found_addresses, item)
    yield item