def _parse_show(self, link):
    """Fetch the event page at ``link`` and build a ``Show`` from it.

    Performers are read from ``.headliners`` elements (``headliner=True``)
    followed by ``.supports`` elements (``headliner=False``). ``link`` is
    used both as the merge key and as the show URL.
    """
    event_doc = html_util.fetch_and_parse(link, parse_500=True)
    event_detail = html_util.get_first_element(event_doc, ".event-detail")
    artist_info = html_util.get_first_element(event_doc, ".artist-boxes")
    date_txt = html_util.get_first_element(event_detail, ".dates").text_content()

    # Headliners are collected first, then supporting acts, so list order
    # mirrors billing order.
    performers = []
    for selector, is_headliner in (('.headliners', True), ('.supports', False)):
        for el in html_util.get_elements(event_doc, selector):
            for name in lang_util.parse_performers(el.text_content()):
                performers.append(Performer(name, headliner=is_headliner))

    show = Show()
    show.merge_key = link
    show.venue = self.venue()
    show.performers = performers

    time_txt = html_util.get_first_element(event_detail, ".times").text_content()
    show.show_time = date_util.parse_date_and_time(date_txt, time_txt)

    resources = show.resources
    resources.show_url = link
    resources.resource_uris = self.resource_extractor.extract_resources(
        event_detail, artist_info)

    # The image is optional; image_url is left untouched when none is found.
    image = html_util.get_first_element(event_detail, "img", optional=True)
    if image is not None:
        resources.image_url = image.get('src')

    return show
def _parse_performers(self, h):
    """Return one ``Performer`` per name parsed from the text of ``h``.

    Each performer gets ``headliner=True`` when ``h`` is an ``h1`` or ``h2``
    element and ``headliner=False`` for any other tag.
    """
    names = lang_util.parse_performers(h.text_content())
    return [Performer(name, headliner=h.tag in ('h1', 'h2')) for name in names]
def _parse_show(self, event_detail):
    """Build a ``Show`` from an event element, or return ``None``.

    ``event_detail`` must contain an ``h2`` holding the date text and a
    ``.caption`` element holding the performer text. Elements without an
    ``h2`` are not shows and yield ``None``.

    Date text starting with "every" (case-insensitive) is not parsed, so
    such shows are returned without a date.
    """
    # Look the heading up once; it both decides whether this is a show and
    # supplies the date text.
    date_el = html_util.get_first_element(event_detail, 'h2', optional=True)
    if date_el is None:
        return None

    date_txt = date_el.text_content()
    performers_txt = html_util.get_first_element(event_detail, '.caption').text_content()

    show = Show()
    show.venue = self.venue()
    show.performers = [Performer(p) for p in lang_util.parse_performers(performers_txt)]

    if not date_txt.lower().startswith('every'):
        # NOTE(review): other parsers in this file store the parsed time on
        # show.show_time; confirm that show.date is the intended attribute.
        show.date = date_util.parse_date_and_time(date_txt, None)

    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    # Only the first <img> in document order is used.
    for img_tag in event_detail.iter(tag='img'):
        show.resources.image_url = img_tag.get('src')
        break

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _parse_show(self, el):
    """Build a ``Show`` from a listing element ``el``.

    The date comes from the first ``strong`` inside ``.event-details``, and
    the full text of ``.event-details`` is handed to the show-time and
    door-time parsers. Performers are parsed from ``.event-name``.
    """
    details = html_util.get_first_element(el, '.event-details')
    date_txt = html_util.get_first_element(details, 'strong').text
    time_txt = details.text_content()

    show = Show()
    show.venue = self.venue()

    title_txt = html_util.get_first_element(details, '.event-name').text_content()
    names = lang_util.parse_performers(title_txt)
    show.performers = [Performer(name) for name in names]

    show.show_time = date_util.parse_show_time(date_txt, time_txt)
    show.door_time = date_util.parse_door_time(date_txt, time_txt)

    resources = show.resources
    resources.resource_uris = self.resource_extractor.extract_resources(details)

    # The image lives outside .event-details, so search from el; it is optional.
    image = html_util.get_first_element(el, ".event-image img", optional=True)
    if image is not None:
        resources.image_url = image.get('src')

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _trans_show(self, show_data):
    """Translate one event record ``show_data`` into a ``Show``.

    Returns ``None`` for records that are skipped: those whose
    ``MajorGenre`` does not contain "Music", those with a truthy
    ``Canceled`` value, and those whose ``EventName`` contains
    "VIP Packages". The first performer parsed from ``EventName`` is
    flagged as the headliner.
    """
    event_name = show_data['EventName']
    LOG.debug("Checking event: %s" % event_name)

    if "Music" not in show_data['MajorGenre']:
        LOG.debug("Skipping non music show")
        return None
    if show_data.get('Canceled'):
        LOG.debug("Skipping cancelled show")
        return None
    if 'VIP Packages' in show_data['EventName']:
        LOG.debug("Skipping VIP package")
        return None

    show = Show()
    names = lang_util.parse_performers(show_data['EventName'])
    performers = [
        Performer(name, headliner=(position == 0))
        for position, name in enumerate(names)
    ]

    show.merge_key = show_data['EventId']
    show.venue = self.venue()
    show.performers = performers
    # show_data['Timezone'] is not applied; localization of show_time was
    # previously disabled here:
    #if show.show_time:
    #    show.show_time = timezone(show_data['Timezone']).localize(show.show_time)
    show.show_time = date_util.parse_date_time(show_data['EventDate'])

    images = show_data['AttractionImage']
    if images:
        show.resources.image_url = self._image_url(show_data, images[0])

    return show
def _parse_performers(self, el):
    """Return the performers described by ``el`` based on its CSS class.

    * ``headliners``: the element's whole text becomes a single headlining
      ``Performer`` (it is not split into names).
    * ``supports``: the text is split with ``lang_util.parse_performers``
      and each name becomes a ``Performer`` with default arguments.
    * neither class: an empty list.

    The ``headliners`` class wins when both classes are present.
    """
    is_headliner = html_util.has_class(el, "headliners")
    is_support = html_util.has_class(el, "supports")

    if is_headliner:
        return [Performer(el.text_content(), headliner=True)]

    if is_support:
        names = lang_util.parse_performers(el.text_content())
        return [Performer(name) for name in names]

    return []
def _parse_show(self, link):
    """Fetch the event page at ``link`` and build a ``Show`` from it.

    The title is assembled from the leading run of ``h1``/``h2`` children of
    ``#eventDetail`` joined with "/". Date and time text are read from
    ``#timeDetail``. ``link`` is used as the merge key and show URL.
    """
    event_doc = html_util.fetch_and_parse(link)
    event_detail = event_doc.get_element_by_id("eventDetail")

    # Start collecting when we find the first h* element.
    # Stop if we found an h* element, but then encounter anything else.
    title_txt = []
    found_h_el = False
    for el in event_detail.getchildren():
        if el.tag in ("h1", "h2"):
            found_h_el = True
            heading_txt = el.text_content()
            if heading_txt:
                title_txt.append(heading_txt)
        elif found_h_el:
            break

    # Example of the markup being parsed:
    #
    #   <span id="timeDetail">
    #     Apr 24, 2010<br />
    #     upstairs<br />
    #     Doors @ 7 PM<br/>
    #     $15.00 Adv. / $20 at the Door<br />
    #     <a href="http://www.deadcellentertainment.tix.com/Schedule.asp?OrganizationNumber=2690" target="_blank">
    #       <img src="/images/buyticket.png" alt="Purchase Tickets" />
    #     </a>
    #   </span>
    #
    # .text is only the text before the first child element (the date line);
    # .text_content() is the text of the whole span.
    time_el = event_detail.get_element_by_id("timeDetail")
    date_txt = time_el.text
    time_txt = time_el.text_content()

    show = Show()
    show.merge_key = link
    show.venue = self.venue()
    show.performers = [Performer(p) for p in lang_util.parse_performers("/".join(title_txt))]

    # Each field is filled by its matching parser. These two assignments
    # were previously crossed (door_time <- parse_show_time and vice versa),
    # unlike the other parsers in this file.
    show.show_time = date_util.parse_show_time(date_txt, time_txt)
    show.door_time = date_util.parse_door_time(date_txt, time_txt)

    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    # optional=True matches the None check below; presumably a non-optional
    # lookup fails when no element matches -- TODO confirm against html_util.
    img = html_util.get_first_element(event_detail, "img", optional=True)
    if img is not None:
        show.resources.image_url = img.get("src")

    return show
def _parse_show(self, show_txt):
    """Build a ``Show`` from one delimited text record.

    ``show_txt`` is split on ``self.SHOW_PART_SEP``: the first field is the
    date, the second the time, and the last the performer list.
    """
    fields = show_txt.split(self.SHOW_PART_SEP)
    date_txt = fields[0]
    time_txt = fields[1]
    performers_txt = fields[-1]

    show = Show()
    show.show_time = date_util.parse_date_and_time(date_txt, time_txt)
    show.venue = self.venue()

    names = lang_util.parse_performers(performers_txt)
    show.performers = [Performer(name) for name in names]

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _process_entry(self, entry):
    """Turn a calendar ``entry`` into a ``Show``, or return ``None``.

    Only entries whose title matches ``self.BACK_ROOM_RE`` are kept; the
    matched text is stripped from the title before performers are parsed.
    The show time is the start time of the entry's first ``when`` item.
    """
    title = entry.title.text
    logger.debug("Processing entry: %s, starting on: %s" % (title, entry.when[0].start_time))

    if not self.BACK_ROOM_RE.match(title):
        return None

    title_txt = self.BACK_ROOM_RE.sub('', entry.title.text)
    names = lang_util.parse_performers(title_txt)

    show = Show()
    show.venue = self.venue()
    show.performers = [Performer(name) for name in names]
    show.show_time = date_util.parse_date_time(entry.when[0].start_time)
    return show
def _parse_show(self, link):
    """Fetch the event page at ``link`` and build a ``Show`` from it.

    The event id is the text matched by ``self.EVENT_ID`` in ``link`` and is
    used as the merge key. The page's ``#content h1`` header is matched
    against ``self.HEADER_PARSE``, whose ``date`` and ``title`` groups give
    the date and performer text. A date starting with "tonight"
    (case-insensitive) is replaced by today's date.

    Raises:
        Exception: if no event id is found in ``link`` or the header cannot
            be parsed.
    """
    # NOTE(review): the result of raw_url() was never used; the call is kept
    # in case it validates the link -- confirm and remove if it is pure.
    self.raw_url(link)

    match = self.EVENT_ID.search(link)
    if not match:
        raise Exception("Unable to locate event id in: %s" % link)
    event_id = match.group(0)

    logging.debug('Fetching show info: %s', link)
    event_doc = html_util.fetch_and_parse(link)
    show_el = html_util.get_first_element(event_doc, '#content')
    header_el = html_util.get_first_element(show_el, 'h1')

    header_txt = header_el.text_content()
    header_match = self.HEADER_PARSE.search(header_txt)
    if not header_match:
        raise Exception("Unable to parse header: %s" % header_txt)

    date_txt = header_match.group('date').strip()
    title = header_match.group('title').strip()

    if date_txt.lower().startswith('tonight'):
        # isoformat() yields YYYY-MM-DD on every platform; the '%F' strftime
        # code used previously is a C-library extension that is not
        # available everywhere.
        date_txt = datetime.today().date().isoformat()

    img = html_util.get_first_element(show_el, 'img', optional=True)

    show = Show()
    show.performers = [Performer(p) for p in lang_util.parse_performers(title)]
    show.show_time = date_util.parse_date_and_time(date_txt, None)
    show.merge_key = event_id
    show.venue = self.venue()
    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(show_el)

    if img is not None:
        show.resources.image_url = img.get('src')

    return show
def _parse_show(self, api, event):
    """Build a ``Show`` from an API ``event`` mapping.

    Reads the ``id``, ``name``, ``start_time`` and (optional) ``description``
    keys. The description is HTML-escaped and wrapped in a minimal document
    so the resource extractor can be run over it. ``api`` is not used here.
    """
    event_id = event["id"]
    LOG.debug("Parsing event: %s" % event_id)

    show = Show()
    show.merge_key = event["id"]
    show.venue = self.venue()

    names = lang_util.parse_performers(event["name"])
    show.performers = [Performer(name) for name in names]
    show.show_time = date_util.parse_date_time(event["start_time"])

    # Escape first so the description is treated as text, not markup.
    escaped_description = cgi.escape(event.get("description", ""))
    html_doc = u"<html><body>%s</body></html>" % escaped_description
    doc = lxml.html.document_fromstring(html_doc)

    resources = show.resources
    resources.show_url = self.EVENT_URL % event["id"]
    resources.image_url = self.PICTURE_URL % event["id"]
    resources.resource_uris = self.resource_extractor.extract_resources(doc)

    return show
def _parse_show(self, url, section):
    """Build a ``Show`` from a listing ``section`` and its detail page.

    The detail page at ``url`` supplies the date (the ``datetime`` attribute
    of ``time.dtstart`` inside ``#detailPage``); the performer text comes
    from the ``h4`` of the listing ``section``. Resources are extracted from
    both the section and the detail element.
    """
    detail_doc = html_util.fetch_and_parse(url)
    show_el = html_util.get_first_element(detail_doc, '#detailPage')

    time_el = html_util.get_first_element(show_el, 'time.dtstart')
    date_txt = time_el.get('datetime')

    title = html_util.get_first_element(section, 'h4').text_content()

    show = Show()
    show.merge_key = url
    show.venue = self.venue()

    names = lang_util.parse_performers(title)
    show.performers = [Performer(name) for name in names]
    show.show_time = date_util.parse_date_time(date_txt)

    resources = show.resources
    resources.show_url = url
    resources.resource_uris = self.resource_extractor.extract_resources(section, show_el)

    return show
def _parse_show(self, link):
    """Fetch the event page at ``link`` and build a ``Show`` from it.

    The numeric ``event_id`` group of ``self.IS_EVENT`` matched against
    ``link`` selects the ``.tfly-event-id-<id>`` element holding the event.
    Each ``.headliners`` element becomes one headlining performer (its text
    is not split); ``.supports`` text is split into individual performers.
    ``link`` is used as the merge key and show URL.
    """
    event_doc = html_util.fetch_and_parse(link)

    match = self.IS_EVENT.match(link)
    event_id = int(match.group("event_id"))
    event_detail = html_util.get_first_element(event_doc, ".tfly-event-id-%d" % event_id)

    date_txt = html_util.get_first_element(event_doc, ".dates").text_content()
    time_txt = html_util.get_first_element(event_doc, ".times").text_content()

    # optional=True matches the None check below; presumably a non-optional
    # lookup fails when no element matches -- TODO confirm against html_util.
    img = html_util.get_first_element(event_detail, "img", optional=True)

    performers = []
    for p in html_util.get_elements(event_detail, ".headliners"):
        performers.append(Performer(p.text_content(), headliner=True))
    for p in html_util.get_elements(event_detail, ".supports"):
        for pi in lang_util.parse_performers(p.text_content()):
            performers.append(Performer(pi, headliner=False))

    show = Show()
    show.merge_key = link
    show.venue = self.venue()
    show.performers = performers
    show.show_time = date_util.parse_show_time(date_txt, time_txt)
    show.door_time = date_util.parse_door_time(date_txt, time_txt)
    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    if img is not None:
        show.resources.image_url = img.get("src")

    return show
def _parse_shows(self, entry):
    """Extract every show listed in the HTML body of a feed ``entry``.

    The entry's ``text/html`` content is stripped of inline formatting tags,
    run through ``self.REPLACEMENTS`` and split on ``self.SHOW_DIVIDER_RE``.
    In each chunk the first line is the header (date first, with a time in
    one of the later header fields) and the remaining lines are the body
    listing the performers.

    Returns a list of ``Show`` objects. Chunks without a recognisable time,
    or whose date/time cannot be parsed, are logged and skipped. Entries
    published outside the current year or the last two months of the
    previous year yield an empty list, as do entries with no HTML content.
    """
    content = None
    shows = []
    today = datetime.now()
    entry_date = date_util.parse_date_time(entry.published)

    # Only parse shows for the current year, or at the tail end of last year.
    # (The previous test, `year != today.year or (...)`, rejected last year's
    # entries before the tail-end clause could ever apply.)
    is_current_year = entry_date.year == today.year
    is_tail_of_last_year = (entry_date.year == today.year - 1
                            and entry_date.month > 10)
    if not (is_current_year or is_tail_of_last_year):
        return []

    # When several text/html items exist, the last one wins.
    for item in entry.content:
        if item.type in ('text/html',):
            content = item.value

    if not content:
        logging.error('Unable to extract content from entry: %s' % entry.id)
        return []

    # This next part is technically pretty evil
    entry_doc = lxml.html.fromstring(content)
    tags = ['span', 'b', 'i', 'strong', 'em']
    cleaner = Cleaner(remove_tags=tags, links=False)
    clean_content = cleaner.clean_html(entry_doc)

    # FIXME patch lxml to handle this while calling text_content()
    # http://codespeak.net/pipermail/lxml-dev/2008-August/004009.html
    content_str = lxml.html.tostring(clean_content)

    for regexp, replacement in self.REPLACEMENTS:
        content_str = regexp.sub(replacement, content_str)

    for part in self.SHOW_DIVIDER_RE.split(content_str):
        part = part.strip(' \t\n')
        parts = part.split('\n')
        header = parts.pop(0)
        body = '\n'.join(parts)

        header_parts = self.HEADER_SEP_RE.split(header)
        date_txt = header_parts.pop(0)

        # The first header field that looks like a time is used.
        time_txt = None
        for header_part in header_parts:
            if date_util.STRICT_TIME_RE.search(header_part):
                time_txt = date_util.sanitize_time(header_part)
                break

        if not time_txt:
            logging.error('Unable to find time in header: %s' % header)
            continue

        # For a range, keep only the part before the first '-'.
        if '-' in time_txt:
            time_txt = time_txt.split('-')[0].strip()

        # A time with no am/pm suffix is treated as pm.
        if not (time_txt.endswith('am') or time_txt.endswith('pm')):
            time_txt = time_txt + 'pm'

        show_doc = lxml.html.fromstring(body)
        use_all = False
        performer_parts = []
        all_parts = []

        # Walk the body up to the first image. Link texts are collected as
        # performer names; if any text appears outside links, or a link is
        # followed by something other than a plain separator, fall back to
        # parsing all collected text instead.
        for el in show_doc.iter():
            if self._is_img(el):
                break

            text = el.text or ''
            tail = el.tail or ''

            for regexp in self.BODY_SKIP:
                if regexp.search(text):
                    text = ''
                if regexp.search(tail):
                    tail = ''

            for p in (text, tail):
                if p:
                    all_parts.append(p)

            if text and el.tag != 'a':
                use_all = True

            if el.tag == 'a' and tail.strip() not in (',', '&', 'w/', ''):
                use_all = True

            if el.tag == 'a':
                performer_parts.append(text)

        # First image that actually carries a src attribute.
        img_url = None
        for img in show_doc.iter(tag='img'):
            if img.get('src'):
                img_url = img.get('src')
                break

        show = Show()
        show.venue = self.venue()

        if use_all:
            # NOTE(review): the second replace() is a no-op as written; it
            # presumably targeted a double or non-breaking space -- confirm
            # against the original file before changing it.
            performers_str = ' '.join(all_parts).replace(' ,', ',').replace(' ', ' ')
            show.performers = [Performer(name) for name in lang_util.parse_performers(performers_str)]
        else:
            show.performers = [Performer(name) for name in performer_parts if name]

        try:
            show.show_time = date_util.parse_date_and_time(date_txt, time_txt)
        except Exception:
            # Skip this show but keep going; Exception (rather than a bare
            # except) lets KeyboardInterrupt and SystemExit propagate.
            logging.exception('Unable to parse: %s - %s' % (date_txt, time_txt))
            continue

        show.resources.image_url = img_url
        show.resources.resource_uris = self.resource_extractor.extract_resources(show_doc)

        date_util.adjust_fuzzy_years(show, entry_date)
        shows.append(show)

    return shows