def _parse_show(self, el):
    """Build a ``Show`` from a single event listing element.

    The date text comes from the first ``strong`` inside the
    ``.event-details`` container, while the container's complete text is
    handed to the date helpers as the source for both show and door times.
    The performer list is parsed out of the ``.event-name`` text.

    :param el: element wrapping one event (details plus optional image).
    :returns: the populated ``Show``.
    """
    details = html_util.get_first_element(el, '.event-details')
    date_node = html_util.get_first_element(details, 'strong')
    date_txt = date_node.text
    # The whole details text is passed along; the date helpers pick the
    # show/door times out of it.
    time_txt = details.text_content()

    show = Show()
    show.venue = self.venue()

    name_node = html_util.get_first_element(details, '.event-name')
    billed_names = lang_util.parse_performers(name_node.text_content())
    show.performers = [Performer(name) for name in billed_names]

    show.show_time = date_util.parse_show_time(date_txt, time_txt)
    show.door_time = date_util.parse_door_time(date_txt, time_txt)

    show.resources.resource_uris = self.resource_extractor.extract_resources(details)

    # The image is optional; leave image_url untouched when there is none.
    image_node = html_util.get_first_element(el, ".event-image img", optional=True)
    if image_node is not None:
        show.resources.image_url = image_node.get('src')

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _parse_show(self, event_detail):
    """Build a ``Show`` from an event block, or return ``None`` to skip it.

    A block without an ``h2`` heading is not treated as a show. The heading
    text is used as the date text and the ``.caption`` text as the performer
    listing. Headings starting with "every" (recurring events) get no date
    assigned, but the show is still returned.

    :param event_detail: element holding one event's markup.
    :returns: the populated ``Show``, or ``None`` when there is no ``h2``.
    """
    # Look the heading up once and reuse it, instead of probing for it and
    # then fetching it a second time.
    heading = html_util.get_first_element(event_detail, 'h2', optional=True)
    if heading is None:
        return None

    show = Show()
    date_txt = heading.text_content()
    performers_txt = html_util.get_first_element(event_detail, '.caption').text_content()

    show.venue = self.venue()
    show.performers = [Performer(p) for p in lang_util.parse_performers(performers_txt)]

    # Recurring listings ("every ...") carry no concrete date to parse.
    if not date_txt.lower().startswith('every'):
        show.date = date_util.parse_date_and_time(date_txt, None)

    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    # Only the first image in the block is used.
    for img_tag in event_detail.iter(tag='img'):
        show.resources.image_url = img_tag.get('src')
        break

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _parse_show(self, show_txt):
    """Build a ``Show`` from one delimited text record.

    The record is split on ``self.SHOW_PART_SEP``: the first field is the
    date, the second the time, and the last field the performer listing.

    :param show_txt: raw text describing a single show.
    :returns: the populated ``Show``.
    :raises IndexError: when the record has fewer than two fields.
    """
    fields = show_txt.split(self.SHOW_PART_SEP)
    date_txt = fields[0]
    time_txt = fields[1]
    # Whatever sits between the time and the final field is ignored.
    lineup_txt = fields[-1]

    show = Show()
    show.show_time = date_util.parse_date_and_time(date_txt, time_txt)
    show.venue = self.venue()

    names = lang_util.parse_performers(lineup_txt)
    show.performers = [Performer(name) for name in names]

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _parse_show(self, show_url, event_detail, today):
    """Build a ``Show`` from a Union Hall event detail element.

    Performers are read from the ``#unionhall_performer`` nodes (skipping the
    anchor duplicates), the first one being flagged as headliner. The ticket
    link, when present, becomes the show's merge key. The show time is set
    only when both the date and the time can be matched in the
    ``unionhall_date`` element.

    :param show_url: accepted for the caller's benefit; not read here.
    :param event_detail: element holding the event's markup.
    :param today: accepted for the caller's benefit; not read here (the
        current date is taken from ``datetime.now()``).
    :returns: the populated ``Show``.
    """
    show = Show()

    # Union hall will have duplicate instances of #unionhall_performer
    # some may or may not have links, but those that do have links are tagged
    # with the same id again ie:
    #   <div id="unionhall_performer"><a href="#" id="#unionhall_performer"> ...
    performers = [
        Performer(p.text_content())
        for p in event_detail.cssselect("#unionhall_performer")
        if p.tag != 'a'
    ]
    # A listing without any performer node must not blow up on the
    # headliner flag; it simply yields an empty performer list.
    if performers:
        performers[0].headliner = True

    ticket_link = html_util.get_first_element(event_detail, '#ticket_link a', optional=True)

    show.venue = self.venue()
    show.performers = performers

    if ticket_link is not None:
        show.merge_key = ticket_link.get('href')

    # Format: THU 3/25: 6pm / $15
    date_tag = event_detail.get_element_by_id("unionhall_date")
    date_tag_txt = date_tag.text_content()
    date_match = self.DATE_RE.match(date_tag_txt)
    time_match = self.TIME_RE.search(date_tag_txt)

    if date_match and time_match:
        month = int(date_match.group('month'))
        day = int(date_match.group('day'))
        # The listing has no year, so start from the current date and
        # swap in the month/day; adjust_fuzzy_years below gets the final say.
        show_date = datetime.now().replace(month=month, day=day)
        # '%Y-%m-%d' is the portable spelling of the platform-specific '%F'.
        show.show_time = date_util.parse_date_and_time(
            show_date.strftime('%Y-%m-%d'), time_match.group('time'))

    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    # Only the first image in the block is used.
    for img_tag in event_detail.iter(tag='img'):
        show.resources.image_url = img_tag.get('src')
        break

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _parse_show(self, el):
    """Build a ``Show`` from a calendar entry, or ``None`` for recurring ones.

    ``span`` elements with class ``small`` are stripped from the
    ``.calendardates`` element before its text is read. The remaining text is
    expected to be ``"<date>,<time>"``. Performer names are the
    ``/``-separated pieces of the ``.calendar`` element's text.

    :param el: element holding one calendar entry.
    :returns: the populated ``Show``, or ``None`` when the date text contains
        "every" (recurring event).
    :raises ValueError: when the date text does not contain exactly one comma.
    """
    date_el = html_util.get_first_element(el, '.calendardates')

    # Take a snapshot before removing anything: mutating the tree while the
    # live iterator is walking it has undefined results.
    for span in list(date_el.iter(tag='span')):
        if span.get('class') == 'small':
            span.getparent().remove(span)

    date_txt = date_el.text_content().lower()

    # Skip recurring events
    if 'every' in date_txt:
        return None

    date_txt, time_txt = date_txt.split(',')

    title_el = html_util.get_first_element(el, '.calendar')
    performers = [Performer(name) for name in title_el.text_content().split('/')]

    show = Show()
    show.venue = self.venue()
    show.performers = performers
    show.show_time = date_util.parse_date_and_time(date_txt, time_txt)
    show.resources.resource_uris = self.resource_extractor.extract_resources(el)

    # No early exit: when several images match, the last one wins.
    for img in el.iter(tag='img'):
        image_match = self.IMAGE_RE.search(img.get('src', ''))
        logging.debug('image: %s - %s', img.get('src'), image_match)
        if image_match:
            show.resources.image_url = img.get('src')

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _parse_show(self, date_txt, info_el):
    """Build a ``Show`` for one date from its info element.

    Every displayed text line matching ``self.PERFORMER_RE`` contributes a
    performer with its own start time. The time of the last matching line is
    the one used for the show itself.

    :param date_txt: date text the listing belongs to.
    :param info_el: element holding the listing's lines.
    :returns: the populated ``Show``, or ``None`` when no line matched.
    """
    logger.debug('Parsing show in %s' % date_txt)

    lines = html_util.get_displayed_text_content(info_el).split('\n')

    lineup = []
    show_time_txt = None
    for raw_line in lines:
        found = self.PERFORMER_RE.match(raw_line)
        if not found:
            continue
        slot_time = found.group('time')
        act_name = found.group('performer')
        # Overwritten on every match, so the final matching line decides.
        show_time_txt = slot_time
        lineup.append(Performer(act_name, start_time=slot_time))

    if not lineup:
        return None

    show = Show()
    show.venue = self.venue()
    show.performers = lineup
    show.show_time = date_util.parse_date_and_time(date_txt, show_time_txt)
    show.resources.resource_uris = self.resource_extractor.extract_resources(info_el)

    # Fontanas's stores the large image in an anchor tag
    for anchor in info_el.iter(tag='a'):
        target = anchor.get('href', '')
        if self.IMAGE_RE.search(target):
            show.resources.image_url = anchor.get('href')

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _parse_show(self, event_detail):
    """Build a ``Show`` from the displayed text lines of an event element.

    The first non-empty line is taken as the date text. Each later non-empty
    line is classified, in this order, as:

    * a line containing a time (sets the show time; any remaining text is a
      performer),
    * a numbered line (``self.NUM_RE``; any remaining text is a performer),
    * a bare performer name, accepted only right after a line that left no
      name behind,
    * anything else, which is logged as an error and ignored.

    :param event_detail: element holding one event's markup.
    :returns: the populated ``Show``.
    """
    show = Show()
    lineup = []
    content = html_util.get_displayed_text_content(event_detail).strip()
    date_txt = None

    # This flag is set up and down to allow either of the following to be
    # processed:
    #   1st: Ava Luna
    # or
    #   1st:
    #   Ava Luna
    awaiting_name = True

    logger.debug("Parsing show content: %s" % content)

    for line in content.split('\n'):
        if not line:
            continue

        time_match = date_util.STRICT_TIME_RE.search(line)

        if not date_txt:
            date_txt = line
            continue

        if time_match:
            show.show_time = date_util.parse_date_and_time(date_txt, time_match.group('time'))
            leftover = date_util.STRICT_TIME_RE.sub('', line).strip(': ')
        elif self.NUM_RE.match(line):
            leftover = self.NUM_RE.sub('', line).strip()
        elif awaiting_name:
            leftover = line
        else:
            logger.error('Unknown line format: %s' % line)
            continue

        # A line that yielded a name closes the slot; one that did not
        # leaves the slot open for the following line.
        if leftover:
            lineup.append(Performer(leftover))
        awaiting_name = not leftover

    show.venue = self.venue()
    show.performers = lineup
    show.date = date_util.parse_date_and_time(date_txt, None)
    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    # Only the first image in the block is used.
    for img_tag in event_detail.iter(tag='img'):
        show.resources.image_url = img_tag.get('src')
        break

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _parse_shows(self, entry):
    """Extract every show announced in one feed entry.

    The entry's HTML content is cleaned of inline formatting tags, run
    through ``self.REPLACEMENTS`` and split into per-show chunks with
    ``self.SHOW_DIVIDER_RE``. The first line of each chunk is the header
    (date followed by further fields, one of which must hold a time); the
    rest is the body listing the performers.

    Chunks without a recognisable time, or whose date/time cannot be parsed,
    are logged and skipped.

    :param entry: feed entry exposing ``published``, ``content`` and ``id``.
    :returns: list of ``Show`` objects; empty when the entry is too old or
        has no HTML content.
    """
    content = None
    shows = []
    today = datetime.now()
    entry_date = date_util.parse_date_time(entry.published)

    # Only parse shows for the current year, or at the tail end of last year
    is_current_year = entry_date.year == today.year
    is_late_last_year = (entry_date.year == today.year - 1
                         and entry_date.month > 10)
    if not (is_current_year or is_late_last_year):
        return []

    # When several HTML items are present, the last one wins.
    for item in entry.content:
        if item.type in ('text/html',):
            content = item.value

    if not content:
        logging.error('Unable to extract content from entry: %s' % entry.id)
        return []

    # This next part is technically pretty evil
    entry_doc = lxml.html.fromstring(content)
    tags = ['span', 'b', 'i', 'strong', 'em']
    cleaner = Cleaner(remove_tags=tags, links=False)
    clean_content = cleaner.clean_html(entry_doc)

    # FIXME patch lxml to handle this while calling text_content()
    # http://codespeak.net/pipermail/lxml-dev/2008-August/004009.html
    content_str = lxml.html.tostring(clean_content)

    for regexp, replacement in self.REPLACEMENTS:
        content_str = regexp.sub(replacement, content_str)

    for part in self.SHOW_DIVIDER_RE.split(content_str):
        part = part.strip(' \t\n')
        parts = part.split('\n')
        header = parts.pop(0)
        body = '\n'.join(parts)

        header_parts = self.HEADER_SEP_RE.split(header)
        date_txt = header_parts.pop(0)

        # The first header field that looks like a time is the show time.
        time_txt = None
        for header_part in header_parts:
            if date_util.STRICT_TIME_RE.search(header_part):
                time_txt = date_util.sanitize_time(header_part)
                break

        if not time_txt:
            logging.error('Unable to find time in header: %s' % header)
            continue

        # For a range keep only the start; if the am/pm marker was attached
        # to the end of the range, assume an evening start.
        if '-' in time_txt:
            time_txt = time_txt.split('-')[0].strip()
            if not (time_txt.endswith('am') or time_txt.endswith('pm')):
                time_txt = time_txt + 'pm'

        show_doc = lxml.html.fromstring(body)

        use_all = False
        performer_parts = []
        all_parts = []

        # Collect text up to the first image. Linked names alone are used
        # as performers unless free text shows up outside of the links.
        for el in show_doc.iter():
            if self._is_img(el):
                break

            text = el.text or ''
            tail = el.tail or ''

            for regexp in self.BODY_SKIP:
                if regexp.search(text):
                    text = ''
                if regexp.search(tail):
                    tail = ''

            for piece in (text, tail):
                if piece:
                    all_parts.append(piece)

            if text and el.tag != 'a':
                use_all = True

            # Anything but a plain separator after a link means there is
            # meaningful text outside of the links.
            if el.tag == 'a' and tail.strip() not in (',', '&', 'w/', ''):
                use_all = True

            if el.tag == 'a':
                performer_parts.append(text)

        # First image carrying a src attribute, if any.
        img_url = None
        for img in show_doc.iter(tag='img'):
            if img.get('src'):
                img_url = img.get('src')
                break

        show = Show()
        show.venue = self.venue()

        if use_all:
            # Joining with spaces leaves " ," and doubled spaces behind;
            # tidy both before handing the text to the performer parser.
            performers_str = ' '.join(all_parts).replace(' ,', ',').replace('  ', ' ')
            show.performers = [Performer(name) for name in lang_util.parse_performers(performers_str)]
        else:
            show.performers = [Performer(name) for name in performer_parts if name]

        # Best effort: one unparseable show must not lose the whole entry,
        # but do not swallow KeyboardInterrupt/SystemExit.
        try:
            show.show_time = date_util.parse_date_and_time(date_txt, time_txt)
        except Exception:
            logging.exception('Unable to parse: %s - %s' % (date_txt, time_txt))
            continue

        show.resources.image_url = img_url
        show.resources.resource_uris = self.resource_extractor.extract_resources(show_doc)

        date_util.adjust_fuzzy_years(show, entry_date)

        shows.append(show)

    return shows