def _get_parser(self):
    """Yield a show for every listing line that starts a show entry.

    Fetches and parses ``self.BASE_URL``, takes the first element matching
    the ``.defaultText`` selector, and scans its displayed text line by
    line. Lines matching ``self.SHOW_START_RE`` are handed to
    ``self._parse_show``; falsy results are dropped.
    """
    page = html_util.fetch_and_parse(self.BASE_URL)
    listing_el = html_util.get_first_element(page, '.defaultText')
    listing_txt = html_util.get_displayed_text_content(listing_el).strip()

    for row in listing_txt.split('\n'):
        # Only lines that look like the start of a show are parsed.
        if not self.SHOW_START_RE.match(row):
            continue

        parsed = self._parse_show(row)
        if parsed:
            yield parsed
def _parse_show(self, date_txt, info_el):
    """Build a Show from a date string and the element describing the event.

    Each line of the element's displayed text is matched against
    ``self.PERFORMER_RE``; every match contributes one Performer built from
    the ``performer`` and ``time`` groups of the match.

    date_txt -- date text for the show, combined with the show time.
    info_el  -- element holding the performer lines, links and image anchor.

    Returns the populated Show, or None when no line names a performer.
    """
    # Let the logger do the interpolation so it only happens when DEBUG is on.
    logger.debug('Parsing show in %s', date_txt)

    info_txt = html_util.get_displayed_text_content(info_el)

    performers = []
    show_time_txt = None

    for line in info_txt.split('\n'):
        match = self.PERFORMER_RE.match(line)
        if not match:
            continue

        time_txt = match.group('time')
        # The show time follows the most recently matched performer line.
        show_time_txt = time_txt
        performers.append(Performer(match.group('performer'), start_time=time_txt))

    if not performers:
        return None

    show = Show()
    show.venue = self.venue()
    show.performers = performers
    show.show_time = date_util.parse_date_and_time(date_txt, show_time_txt)
    show.resources.resource_uris = self.resource_extractor.extract_resources(info_el)

    # Fontana's stores the large image in an anchor tag
    for a in info_el.iter(tag='a'):
        href = a.get('href', '')
        # No break: when several anchors match, the last one wins.
        if self.IMAGE_RE.search(href):
            show.resources.image_url = href

    date_util.adjust_fuzzy_years(show, self._parse_started)

    return show
def _parse_show(self, event_detail):
    """Build a Show from the displayed text of a single event element.

    The first non-empty line of the text is taken as the date. Every later
    non-empty line is classified as a line containing a time, a numbered
    line, or a bare performer name; anything else is logged as an error.

    event_detail -- element holding the event text, links and images.

    Returns the populated Show.
    """
    show = Show()
    performers = []

    content = html_util.get_displayed_text_content(event_detail).strip()
    date_txt = None

    # had_num says whether a bare line may be accepted as a performer name.
    # It is raised when a time/number line carries no name of its own and
    # lowered once a performer is recorded, so that either of the following
    # can be processed:
    #   1st: Ava Luna
    # or
    #   1st:
    #   Ava Luna
    had_num = True

    # Let the logger do the interpolation so it only happens when needed.
    logger.debug("Parsing show content: %s", content)

    for line in content.split('\n'):
        if not line:
            continue

        if not date_txt:
            date_txt = line
            continue

        time_match = date_util.STRICT_TIME_RE.search(line)
        if time_match:
            show.show_time = date_util.parse_date_and_time(date_txt, time_match.group('time'))
            # Whatever is left once the time is removed is the performer.
            name = date_util.STRICT_TIME_RE.sub('', line).strip(': ')
        elif self.NUM_RE.match(line):
            name = self.NUM_RE.sub('', line).strip()
        elif had_num:
            name = line
        else:
            logger.error('Unknown line format: %s', line)
            continue

        if name:
            performers.append(Performer(name))
            had_num = False
        else:
            # Prefix with no name: the name is expected on the next line.
            had_num = True

    show.venue = self.venue()
    show.performers = performers
    # NOTE(review): the date goes to show.date while the time match above
    # goes to show.show_time, and date_txt is still None when the content is
    # empty -- confirm both are intended.
    show.date = date_util.parse_date_and_time(date_txt, None)
    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    # Only the first <img> in the element is used.
    for img_tag in event_detail.iter(tag='img'):
        show.resources.image_url = img_tag.get('src')
        break

    date_util.adjust_fuzzy_years(show, self._parse_started)

    return show
def TextMatcher(node, match_exp):
    """Yield each match of ``match_exp`` in the displayed text of ``node``.

    node      -- element whose displayed text is searched.
    match_exp -- pattern object providing ``finditer``.

    The text is only extracted once the generator is first advanced.
    """
    displayed = html_util.get_displayed_text_content(node)
    found = match_exp.finditer(displayed)
    for hit in found:
        yield hit