def _process_entry_group(self, start_date, entries):
    """Build one show out of a group of calendar entries.

    The entries are sorted in place, latest start time first.  Entries
    whose start time has no 'T' (logged as all-day events) and entries
    whose title contains "pub side" are skipped; every other entry adds a
    performer, and the show time ends up as the earliest start time seen.
    ``start_date`` is not used by this method.

    Returns a single-element list holding the show.
    """
    show = Show()
    show.venue = self.venue()
    show.performers = []

    entries.sort(key = lambda item: item.when[0].start_time, reverse = True)

    for entry in entries:
        title = entry.title.text
        raw_start = entry.when[0].start_time
        logger.debug("Processing entry: %s, starting on: %s" % (title, raw_start))

        # Full day events usually denote a title which we currently will simply skip
        if 'T' not in raw_start:
            logger.debug('Entry "%s" is an all day event, skipping' % title)
            continue

        if 'pub side' in title.lower():
            logger.debug('Entry "%s" is on the Pub Side of Spike Hill, skipping' % title)
            continue

        start_time = date_util.parse_date_time(raw_start)
        earliest_so_far = show.show_time or start_time
        show.show_time = min(start_time, earliest_so_far)
        show.performers.append(Performer(title))

    return [show]
def _parse_show(self, link):
    """Fetch the event page at ``link`` and turn it into a Show.

    Headliners come from the ``.headliners`` elements and supporting acts
    from ``.supports``; the link doubles as the merge key and show URL.
    """
    event_doc = html_util.fetch_and_parse(link, parse_500 = True)
    event_detail = html_util.get_first_element(event_doc, ".event-detail")
    artist_info = html_util.get_first_element(event_doc, ".artist-boxes")
    date_txt = html_util.get_first_element(event_detail, ".dates").text_content()

    # Headliners are collected before supporting acts.
    performers = []
    for selector, is_headliner in (('.headliners', True), ('.supports', False)):
        for el in html_util.get_elements(event_doc, selector):
            performers.extend(
                Performer(name, headliner = is_headliner)
                for name in lang_util.parse_performers(el.text_content())
            )

    show = Show()
    show.merge_key = link
    show.venue = self.venue()
    show.performers = performers

    time_txt = html_util.get_first_element(event_detail, ".times").text_content()
    show.show_time = date_util.parse_date_and_time(date_txt, time_txt)

    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail, artist_info)

    img = html_util.get_first_element(event_detail, "img", optional = True)
    if img is not None:
        show.resources.image_url = img.get('src')

    return show
def _trans_show(self, show_data):
    """Translate one event record (a dict) into a Show.

    Returns None for records that are not music, are flagged as canceled,
    or are VIP packages.  The first parsed performer is the headliner.
    """
    event_name = show_data['EventName']
    LOG.debug("Checking event: %s" % event_name)

    if "Music" not in show_data['MajorGenre']:
        LOG.debug("Skipping non music show")
        return None
    if show_data.get('Canceled'):
        LOG.debug("Skipping cancelled show")
        return None
    if 'VIP Packages' in event_name:
        LOG.debug("Skipping VIP package")
        return None

    show = Show()
    names = lang_util.parse_performers(event_name)

    show.merge_key = show_data['EventId']
    show.venue = self.venue()
    show.performers = [Performer(name, headliner = (idx == 0)) for idx, name in enumerate(names)]
    show.show_time = date_util.parse_date_time(show_data['EventDate'])

    # Timezone localization is currently disabled:
    #if show.show_time:
    #  show.show_time = timezone(show_data['Timezone']).localize(show.show_time)

    images = show_data['AttractionImage']
    if images:
        show.resources.image_url = self._image_url(show_data, images[0])

    return show
def _parse_show(self, el):
    """Build a Show from one listing element.

    The date is the text of the first ``strong`` inside ``.event-details``;
    the whole text of that container is handed to the show/door time
    parsers.  Years are adjusted relative to ``self._parse_started``.
    """
    details = html_util.get_first_element(el, '.event-details')
    date_txt = html_util.get_first_element(details, 'strong').text
    time_txt = details.text_content()

    show = Show()
    show.venue = self.venue()

    title_txt = html_util.get_first_element(details, '.event-name').text_content()
    names = lang_util.parse_performers(title_txt)
    show.performers = [Performer(name) for name in names]

    show.show_time = date_util.parse_show_time(date_txt, time_txt)
    show.door_time = date_util.parse_door_time(date_txt, time_txt)
    show.resources.resource_uris = self.resource_extractor.extract_resources(details)

    image = html_util.get_first_element(el, ".event-image img", optional = True)
    if image is not None:
        show.resources.image_url = image.get('src')

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _parse_show(self, event_detail):
    """Build a Show from an event element, or return None if it has no h2.

    The h2 text is the date and the ``.caption`` text lists the performers.
    Dates starting with "every" are left unparsed, so ``show.date`` is not
    set for them.
    """
    heading = html_util.get_first_element(event_detail, 'h2', optional = True)
    if heading is None:
        return None

    show = Show()
    date_txt = heading.text_content()
    performers_txt = html_util.get_first_element(event_detail, '.caption').text_content()

    show.venue = self.venue()
    show.performers = [Performer(name) for name in lang_util.parse_performers(performers_txt)]

    is_recurring = date_txt.lower().startswith('every')
    if not is_recurring:
        show.date = date_util.parse_date_and_time(date_txt, None)

    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    # Only the first image in the element is used.
    first_img = next(iter(event_detail.iter(tag = 'img')), None)
    if first_img is not None:
        show.resources.image_url = first_img.get('src')

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _parse_show(self, link):
    """Fetch the event page at ``link`` and build a Show from it.

    Every non-empty ``h1`` inside ``#mainColumn`` becomes a performer.  The
    merge key is the ``page_id`` group of ``self.EVENT_URL`` matched against
    the link.  The show time is the parsed ``.date`` text with the hour
    forced to 21.

    :param link: URL of the event page.
    :returns: the populated Show.
    """
    LOG.debug("Fetching show: %s" % link)
    event_doc = html_util.fetch_and_parse(link)
    event_detail = event_doc.get_element_by_id("mainColumn")

    show = Show()
    for performer in html_util.get_elements(event_detail, 'h1'):
        name = performer.text_content().strip(' \n\r\t')
        if name:
            show.performers.append(Performer(name))

    date_txt = html_util.get_first_element(event_detail, '.date').text_content()
    event_match = self.EVENT_URL.match(link)

    show.merge_key = event_match.group('page_id')
    show.venue = self.venue()
    show.show_time = date_util.parse_date_time(date_txt).replace(hour = 21)

    # NOTE(review): this logs show.date, which this method never assigns --
    # confirm whether show.show_time was intended.
    LOG.debug('Date: %s' % show.date)

    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    for img_tag in event_detail.iter(tag = 'img'):
        # An <img> without a src attribute yields None; treat it as empty so
        # the substring test cannot raise TypeError.
        src = img_tag.get('src') or ''
        if 'main' in src:
            show.resources.image_url = src
            break

    return show
def _parse_show(self, link):
    """Fetch the show page at ``link`` and build a Show from it.

    Each ``.showpage-details`` section contributes one performer, named by
    the last heading (h1-h6) among its direct children; an h1 heading marks
    a headliner.  The show time uses the time of the last section whose
    ``.time`` text matched ``date_util.TIME_RE``.

    :param link: URL of the show page; also used as merge key and show URL.
    :returns: the populated Show.
    :raises Exception: if no date can be found in the ``.date`` element.
    """
    logging.debug('Parsing show from: %s' % link)

    event_doc = html_util.fetch_and_parse(link)
    event = html_util.get_first_element(event_doc, '.biglisting')
    img = html_util.get_first_element(event, '.tonightinfo img', optional = True)

    date_el = html_util.get_first_element(event, '.date')
    date_match = self.DATE_RE.search(date_el.text_content())
    if not date_match:
        raise Exception('Unable to determine show date from: %s' % date_el.text_content())
    date_txt = date_match.group(0)

    performers = []
    first_time = None

    for det in event.cssselect('.showpage-details'):
        header = None
        for child in det.getchildren():
            if child.tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
                header = child

        if header is None:
            logger.error('Unable to determine performer')
            continue

        time_txt = html_util.get_first_element(det, '.time').text_content()
        time_match = date_util.TIME_RE.search(time_txt)
        if time_match:
            first_time = time_txt = time_match.group('time')
        else:
            time_txt = None

        # Compare with == : the original `in ('h1')` was a substring test
        # against the string 'h1', not membership in a tuple.
        is_headliner = header.tag == 'h1'
        performers.append(Performer(header.text_content(), start_time = time_txt, headliner = is_headliner))

    show = Show()
    show.merge_key = link
    show.venue = self.venue()
    show.performers = performers
    show.show_time = date_util.parse_date_and_time(date_txt, first_time)
    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event)

    if img is not None:
        show.resources.image_url = img.get('src')

    return show
def _parse_show(self, base_date, links):
    """Build a Show for ``base_date`` from the calendar links of that day.

    Links without text are ignored.  Each remaining link yields a performer
    via ``self._parse_artist``; the show time is the earliest parsed start
    time.  Performer and resource order is reversed before use.
    """
    performers = []
    resource_els = []
    earliest = None

    for anchor in links:
        # Every other link on the calendar seems to have no text
        if not anchor.text_content():
            continue

        name, start_time_txt, artist_el = self._parse_artist(anchor)
        if artist_el is not None:
            resource_els.append(artist_el)

        if start_time_txt:
            start_time = date_util.parse_date_and_time(base_date, start_time_txt)
            if not earliest or start_time < earliest:
                earliest = start_time

        performers.append(Performer(name, start_time = start_time_txt))

    # Performers are listed from first to last
    performers.reverse()
    resource_els.reverse()

    show = Show()
    show.venue = self.venue()
    show.performers = performers
    show.date = base_date
    show.show_time = earliest
    show.resources.resource_uris = self.resource_extractor.extract_resources(*resource_els)

    # Take the first image of each element in turn until one has a usable src.
    image_url = None
    for el in resource_els:
        first_img = next(iter(el.iter(tag = 'img')), None)
        if first_img is not None:
            image_url = first_img.get('src')
        if image_url:
            break
    show.resources.image_url = image_url

    return show
def _parse_show(self, show_txt):
    """Build a Show from one line of text split on ``self.SHOW_PART_SEP``.

    The first field is the date, the second the time, and the last the
    performer list.  Years are adjusted relative to ``self._parse_started``.
    """
    fields = show_txt.split(self.SHOW_PART_SEP)
    date_txt = fields[0]
    time_txt = fields[1]
    performers_txt = fields[-1]

    show = Show()
    show.show_time = date_util.parse_date_and_time(date_txt, time_txt)
    show.venue = self.venue()

    names = lang_util.parse_performers(performers_txt)
    show.performers = [Performer(name) for name in names]

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _process_entry(self, entry):
    """Turn a calendar entry into a Show, or None if it is not a match.

    Only entries whose title matches ``self.BACK_ROOM_RE`` are kept; the
    matched text is stripped from the title before performers are parsed.
    """
    title = entry.title.text
    starts_at = entry.when[0].start_time
    logger.debug("Processing entry: %s, starting on: %s" % (title, starts_at))

    if self.BACK_ROOM_RE.match(title) is None:
        return None

    performers_txt = self.BACK_ROOM_RE.sub('', title)

    show = Show()
    show.venue = self.venue()
    show.performers = [Performer(name) for name in lang_util.parse_performers(performers_txt)]
    show.show_time = date_util.parse_date_time(starts_at)
    return show
def _get_parser(self):
    """Yield one Show per calendar date found on ``self.CALENDAR_URL``.

    Every table row whose first cell matches ``self.DATE_RE`` is parsed by
    ``self._parse_show`` into a ``(time, name, resources)`` tuple.  Tuples
    are grouped by the date of their time; each group becomes one Show
    whose show time is the earliest time in the group.

    Rows without any ``<td>`` cell are skipped rather than raising
    IndexError.
    """
    doc = html_util.fetch_and_parse(self.CALENDAR_URL)

    by_date = {}
    for tr in doc.iter(tag = 'tr'):
        tds = list(tr.iter(tag = 'td'))
        if not tds:
            # Nothing to inspect in a row with no <td> cells.
            continue

        date_txt = tds[0].text_content().strip()
        logger.debug('Checking if td el is a show: %s' % date_txt)

        if self.DATE_RE.match(date_txt):
            slot = self._parse_show(tds)
            by_date.setdefault(slot[0].date(), []).append(slot)

    for slots in by_date.values():
        # Latest first; a key-based sort works on both Python 2 and 3.
        slots.sort(key = lambda slot: slot[0], reverse = True)

        show = Show()
        show.venue = self.venue()
        show.performers = []

        all_resources = []
        for start, name, resources in slots:
            all_resources.extend(resources)

            if not show.show_time:
                show.show_time = start
            else:
                show.show_time = min(show.show_time, start)

            show.performers.append(Performer(name, start_time = start.strftime('%I:%M')))

        # De-duplicate resources collected across performers.
        show.resources.resource_uris = list(set(all_resources))

        yield show
def _parse_show(self, link):
    """Fetch the event page at ``link`` and build a Show from it.

    The merge key is the text matched by ``self.EVENT_ID`` in the link.  The
    page's ``#content h1`` header is split by ``self.HEADER_PARSE`` into a
    date and a title; a date starting with "tonight" is replaced by today's
    date in YYYY-MM-DD form.

    :param link: URL of the event page.
    :returns: the populated Show.
    :raises Exception: if the link has no event id or the header cannot be
        parsed.
    """
    match = self.EVENT_ID.search(link)
    if not match:
        raise Exception("Unable to locate event id in: %s" % link)
    event_id = match.group(0)

    logging.debug('Fetching show info: %s' % link)
    event_doc = html_util.fetch_and_parse(link)

    show_el = html_util.get_first_element(event_doc, '#content')
    header_el = html_util.get_first_element(show_el, 'h1')

    header_match = self.HEADER_PARSE.search(header_el.text_content())
    if not header_match:
        raise Exception("Unable to parse header: %s" % header_el.text_content())

    date_txt = header_match.group('date').strip()
    title = header_match.group('title').strip()

    if date_txt.lower().startswith('tonight'):
        # isoformat() gives YYYY-MM-DD on every platform, whereas the '%F'
        # strftime directive is not supported everywhere.
        date_txt = datetime.today().date().isoformat()

    img = html_util.get_first_element(show_el, 'img', optional = True)

    show = Show()
    show.performers = [Performer(p) for p in lang_util.parse_performers(title)]
    show.show_time = date_util.parse_date_and_time(date_txt, None)
    show.merge_key = event_id
    show.venue = self.venue()
    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(show_el)

    if img is not None:
        show.resources.image_url = img.get('src')

    return show
def _parse_show(self, api, event):
    """Build a Show from an event dict.

    The event's description (empty if the key is absent) is escaped and
    wrapped in a minimal HTML document so the resource extractor can scan
    it.  ``api`` is not used by this method.
    """
    event_id = event["id"]
    LOG.debug("Parsing event: %s" % event_id)

    show = Show()
    show.merge_key = event_id
    show.venue = self.venue()

    names = lang_util.parse_performers(event["name"])
    show.performers = [Performer(name) for name in names]
    show.show_time = date_util.parse_date_time(event["start_time"])

    description = cgi.escape(event.get("description", ""))
    doc = lxml.html.document_fromstring(u"<html><body>%s</body></html>" % description)

    resources = show.resources
    resources.show_url = self.EVENT_URL % event_id
    resources.image_url = self.PICTURE_URL % event_id
    resources.resource_uris = self.resource_extractor.extract_resources(doc)

    return show
def _parse_show(self, url, section):
    """Build a Show from a listing ``section`` and its detail page at ``url``.

    The date comes from the ``datetime`` attribute of ``time.dtstart`` on
    the detail page; the performers come from the section's ``h4`` text.
    """
    detail_doc = html_util.fetch_and_parse(url)
    show_el = html_util.get_first_element(detail_doc, '#detailPage')

    start_el = html_util.get_first_element(show_el, 'time.dtstart')
    date_txt = start_el.get('datetime')
    title = html_util.get_first_element(section, 'h4').text_content()

    show = Show()
    show.merge_key = url
    show.venue = self.venue()
    show.performers = [Performer(name) for name in lang_util.parse_performers(title)]
    show.show_time = date_util.parse_date_time(date_txt)
    show.resources.show_url = url
    show.resources.resource_uris = self.resource_extractor.extract_resources(section, show_el)
    return show
def _parse_show(self, show_url, event_detail, today):
    """Build a Show from one event element.

    Performers are the non-anchor ``#unionhall_performer`` elements; the
    first one, if any, is the headliner.  The ticket link's href, when
    present, is the merge key.  The show time is set only when both
    ``self.DATE_RE`` and ``self.TIME_RE`` match the ``#unionhall_date``
    text.  ``show_url`` and ``today`` are not used by this method.

    :returns: the populated Show.
    """
    show = Show()

    # Union hall will have duplicate instances of #unionhall_performer
    # some may or may not have links, but those that do have links are tagged
    # with the same id again ie: <div id="unionhall_performer"><a href="#" id="#unionhall_performer"> ...
    performers = [Performer(p.text_content()) for p in event_detail.cssselect("#unionhall_performer") if p.tag != 'a']
    if performers:
        # Guarded so that an event listing no performers does not raise IndexError.
        performers[0].headliner = True

    ticket_link = html_util.get_first_element(event_detail, '#ticket_link a', optional = True)

    show.venue = self.venue()
    show.performers = performers

    if ticket_link is not None:
        show.merge_key = ticket_link.get('href')

    # Format: THU 3/25: 6pm / $15
    date_tag = event_detail.get_element_by_id("unionhall_date")
    date_match = self.DATE_RE.match(date_tag.text_content())
    time_match = self.TIME_RE.search(date_tag.text_content())

    if date_match and time_match:
        month = int(date_match.group('month'))
        day = int(date_match.group('day'))

        # The listing carries no year, so start from the current one.
        # isoformat() yields YYYY-MM-DD portably (the '%F' directive is
        # platform specific).
        show_date = datetime.now().date().replace(month = month, day = day)
        show.show_time = date_util.parse_date_and_time(show_date.isoformat(), time_match.group('time'))

    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    # Only the first image in the element is used.
    for img_tag in event_detail.iter(tag = 'img'):
        show.resources.image_url = img_tag.get('src')
        break

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _parse_show(self, show_date, show_detail, show_time):
    """Build a Show from a detail element and its time element.

    Comma-separated fragments of the time element's text that match
    ``self.PRICE_OR_AGE`` are dropped before the show and door times are
    parsed.  If neither time is found, the first ``self.TIME_RE`` match is
    used as the door time.

    :param show_date: date passed through to the date_util parsers.
    :param show_detail: element holding performers and images.
    :param show_time: element whose text holds the time information.
    :returns: the populated Show.
    """
    show = Show()

    time_txt = ','.join([p for p in show_time.text_content().split(',') if not self.PRICE_OR_AGE.search(p)])
    logger.debug('Show: %s - %s' % (time_txt, show_time.text_content().strip(' \n')))

    show.venue = self.venue()
    show.performers = self._parse_performers(show_detail)
    show.show_time = date_util.parse_show_time(show_date, time_txt)
    show.door_time = date_util.parse_door_time(show_date, time_txt)

    # TODO right now the below parsing doesn't work, so just skip these shows for now
    if not show.show_time and not show.door_time:
        time_match = self.TIME_RE.search(time_txt)
        if time_match:
            show.door_time = date_util.parse_date_and_time(show_date, time_match.group('time'))

    show.resources.resource_uris = self.resource_extractor.extract_resources(show_detail, show_time)

    # TODO work could be done here to find larger images (sometimes the img's are enclosed in an anchor tag)
    for img_tag in show_detail.iter(tag = 'img'):
        src = img_tag.get('src')
        if not src:
            # No usable src: skip it so the substring tests below cannot
            # raise TypeError on None.
            continue

        # Skip the images that show the early shows, later shows, and the 5 years logo
        if not ('early' in src or 'later' in src or '5years' in src):
            show.resources.image_url = src
            break

    return show
def _parse_show(self, el):
    """Build a Show from one calendar element, or None for recurring events.

    ``span`` elements with class ``small`` are removed from the
    ``.calendardates`` element before its text is read.  The remaining
    text is expected to be "<date>,<time>".  Performer names are the
    '/'-separated pieces of the ``.calendar`` text.

    :returns: the populated Show, or None if the date text contains "every".
    :raises ValueError: if the date text does not contain exactly one comma.
    """
    date_el = html_util.get_first_element(el, '.calendardates')

    # Snapshot the spans first: removing nodes while iterating the live
    # tree can make the iterator skip elements.
    for span in list(date_el.iter(tag = 'span')):
        if span.get('class') == 'small':
            span.getparent().remove(span)

    date_txt = date_el.text_content().lower()

    # Skip recurring events
    if 'every' in date_txt:
        return None

    date_txt, time_txt = date_txt.split(',')

    title_el = html_util.get_first_element(el, '.calendar')
    performers = [Performer(name) for name in title_el.text_content().split('/')]

    show = Show()
    show.venue = self.venue()
    show.performers = performers
    show.show_time = date_util.parse_date_and_time(date_txt, time_txt)
    show.resources.resource_uris = self.resource_extractor.extract_resources(el)

    # No break: the last image whose src matches IMAGE_RE wins.
    for img in el.iter(tag = 'img'):
        logging.debug('image: %s - %s' % (img.get('src'), self.IMAGE_RE.search(img.get('src', ''))))

        if self.IMAGE_RE.search(img.get('src', '')):
            show.resources.image_url = img.get('src')

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _parse_show(self, link):
    """Fetch the event page at ``link`` and build a titled Show from it.

    The first four ``strong`` elements inside ``#detail`` are read
    positionally as date, title, a blank, and description.  The time is
    searched for (``self.TIME_RE``) in the tail text of the element that
    follows the description tag; if there is none, the time is None.

    :param link: URL of the event page; also the merge key and show URL.
    :returns: the populated Show (title set, no performers).
    :raises StopIteration: if the page has fewer than four ``strong`` tags.
    """
    event_doc = html_util.fetch_and_parse(link)
    event_detail = event_doc.get_element_by_id("detail")

    show = Show()

    # The builtin next() works on both Python 2.6+ and Python 3.
    strong_iter = event_detail.iter(tag = 'strong')
    date_tag = next(strong_iter)
    title_tag = next(strong_iter)
    next(strong_iter)  # blank tag, intentionally discarded
    desc_tag = next(strong_iter)

    date_txt = date_tag.text_content()

    # getnext() returns None when the description tag is the last sibling.
    after_desc = desc_tag.getnext()
    tail = after_desc.tail if after_desc is not None else None
    time_match = self.TIME_RE.search(tail) if tail else None
    time_txt = time_match.group('time') if time_match else None

    show.merge_key = link
    show.venue = self.venue()
    show.title = title_tag.text_content()
    show.show_time = date_util.parse_date_and_time(date_txt, time_txt)
    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    for img_tag in event_detail.iter(tag = 'img'):
        # Treat a missing src as empty so the substring test cannot raise.
        src = img_tag.get('src') or ''
        if 'main' in src:
            show.resources.image_url = src
            break

    return show
def _trans_show(self, event):
    """Translate an event object into a Show.

    The first artist is the headliner.  The image URL is the cover image of
    the last artist that has one (later artists overwrite earlier ones).

    :param event: object exposing get_title/get_artists/get_id/
        get_start_date/get_url.
    :returns: the populated Show.
    """
    LOG.debug("Transforming show: %s" % event.get_title())

    show = Show()
    performers = []

    for i, artist in enumerate(event.get_artists()):
        performers.append(Performer(artist.get_name(), headliner = i == 0))

        # Look the cover up once per artist instead of twice.
        cover_url = artist.get_cover_image(size = pylast.COVER_MEGA)
        if cover_url:
            show.resources.image_url = cover_url

    show.merge_key = event.get_id()
    show.venue = self.venue()
    show.performers = performers
    show.show_time = date_util.parse_date_time(event.get_start_date())
    show.resources.show_url = event.get_url()

    return show
def _parse_show(self, link, show_section):
    """Build a Show from a listing section and its detail page at ``link``.

    Performers come from the anchors inside ``.performers``.  Anchors whose
    href matches ``self.IS_ARTIST_URL_RE`` are additionally fetched via
    ``self.fetch_performer_content`` and fed to the resource extractor.
    """
    show_doc = html_util.fetch_and_parse(link)
    show_detail = show_doc.get_element_by_id("content")

    # The title is read but not used further below.
    title = html_util.get_first_element(show_detail, '.title').text
    date_txt = html_util.get_first_element(show_section, '.date').text
    image_url = html_util.get_first_element(show_detail, '.left-view-header img').get('src')

    performer_detail = html_util.get_first_element(show_detail, '.performers')
    performers = []
    artist_urls = []
    for anchor in performer_detail.iter(tag = 'a'):
        performers.extend(self._parse_performers(anchor))
        href = anchor.get('href')
        if self.IS_ARTIST_URL_RE.match(href):
            artist_urls.append(href)

    resource_sections = [show_section, show_detail]
    for artist_url in artist_urls:
        resource_sections.extend(self.fetch_performer_content(artist_url))

    show = Show()
    show.merge_key = link
    show.venue = self.venue()
    show.performers = performers
    show.show_time = date_util.parse_date_time(date_txt)

    resources = show.resources
    resources.show_url = link
    resources.image_url = image_url
    resources.resource_uris = self.resource_extractor.extract_resources(*resource_sections)

    return show
def _parse_show(self, date_txt, info_el):
    """Build a Show for ``date_txt`` from an info element, or return None.

    Every displayed line matching ``self.PERFORMER_RE`` yields a performer
    with its own start time; the time of the last matching line is used as
    the show time.  Returns None when no line matches.
    """
    logger.debug('Parsing show in %s' % date_txt)

    performers = []
    show_time_txt = None
    for line in html_util.get_displayed_text_content(info_el).split('\n'):
        found = self.PERFORMER_RE.match(line)
        if not found:
            continue
        show_time_txt = found.group('time')
        performers.append(Performer(found.group('performer'), start_time = show_time_txt))

    if not performers:
        return None

    show = Show()
    show.venue = self.venue()
    show.performers = performers
    show.show_time = date_util.parse_date_and_time(date_txt, show_time_txt)
    show.resources.resource_uris = self.resource_extractor.extract_resources(info_el)

    # Fontana's stores the large image in an anchor tag
    # (no break: the last matching anchor wins)
    for anchor in info_el.iter(tag = 'a'):
        if self.IMAGE_RE.search(anchor.get('href', '')):
            show.resources.image_url = anchor.get('href')

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _parse_show(self, link):
    """Fetch the event page at ``link`` and build a Show from it.

    The title is assembled from the first consecutive run of non-empty
    h1/h2 children of ``#eventDetail``, joined with '/', and then parsed
    into performers.  Date and time text come from ``#timeDetail``.

    :param link: URL of the event page; also the merge key and show URL.
    :returns: the populated Show.
    """
    event_doc = html_util.fetch_and_parse(link)
    event_detail = event_doc.get_element_by_id("eventDetail")

    title_txt = []
    found_h_el = False

    # Start parsing when we find the first h* element
    # Stop parsing if we found an h* element, but then encounter anything else
    for el in event_detail.getchildren():
        if el.tag in ("h1", "h2"):
            found_h_el = True
            if el.text_content():
                title_txt.append(el.text_content())
        elif found_h_el:
            break

    # Sample markup:
    #
    #   <span id="timeDetail">
    #     Apr 24, 2010<br />
    #     upstairs<br />
    #     Doors @ 7 PM<br/>
    #     $15.00 Adv. / $20 at the Door<br />
    #     <a href="http://www.deadcellentertainment.tix.com/Schedule.asp?OrganizationNumber=2690" target="_blank">
    #       <img src="/images/buyticket.png" alt="Purchase Tickets" />
    #     </a>
    #   </span>
    time_el = event_detail.get_element_by_id("timeDetail")
    date_txt = time_el.text
    time_txt = time_el.text_content()

    show = Show()
    show.merge_key = link
    show.venue = self.venue()
    show.performers = [Performer(p) for p in lang_util.parse_performers("/".join(title_txt))]

    # NOTE(review): the show/door parsers are crossed here, unlike the
    # sibling parsers in this file.  The sample markup only lists a door
    # time, so this may be deliberate (to populate show_time) -- confirm
    # before changing.
    show.door_time = date_util.parse_show_time(date_txt, time_txt)
    show.show_time = date_util.parse_door_time(date_txt, time_txt)

    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    # optional = True matches the None check below; presumably the
    # non-optional lookup fails when no <img> is present -- confirm against
    # html_util.
    img = html_util.get_first_element(event_detail, "img", optional = True)
    if img is not None:
        show.resources.image_url = img.get("src")

    return show
def _trans_record(self, record):
    """Translate a flat record (dict of strings) into a Show.

    Recognized keys: venue-name, venue-url, title, merge-key, performers
    and tags (comma separated), show-date (required), show-time, door-time,
    show-url, image-url.

    :param record: mapping supporting ``get`` and item access.
    :returns: the populated Show.
    :raises Exception: if ``show-date`` is missing or empty.
    """
    show = Show()
    show.venue = Venue(record.get('venue-name'), record.get('venue-url'))
    show.title = record.get('title')
    show.merge_key = record.get('merge-key')

    performers = []
    if record.get('performers'):
        for performer in record['performers'].split(','):
            performers.append(Performer(performer.strip()))

    if record.get('tags'):
        show.tags = [t.strip() for t in record['tags'].split(',')]

    date_txt = record.get('show-date')
    if not date_txt:
        raise Exception('Show Date is required')
    show.date = date_util.parse_date_time(date_txt)

    # show.performers is only assigned when at least one performer was given.
    if performers:
        show.performers = performers

    if record.get('show-time'):
        show.show_time = date_util.parse_date_and_time(date_txt, record.get('show-time'))

    if record.get('door-time'):
        # Store the door time in door_time; assigning it to show_time would
        # overwrite the show time parsed just above.
        show.door_time = date_util.parse_date_and_time(date_txt, record.get('door-time'))

    show.resources.show_url = record.get('show-url')
    show.resources.image_url = record.get('image-url')
    show.resources.resource_uris = self.resource_extractor.extract_resources(self._create_resource_doc(record))

    return show
def _parse_show(self, link, show_section):
    """Fetch the detail page at ``link`` and build a Show from it.

    Performers are parsed from every non-empty h1, then h2, then h3 inside
    ``#content .event-detail``.  The show is flagged sold out when a
    ``.sold-out`` element exists.  ``show_section`` is not used here.
    """
    show_doc = html_util.fetch_and_parse(link)
    detail = html_util.get_first_element(show_doc, "#content .event-detail")

    date_txt = html_util.get_first_element(detail, ".dates").text_content()
    time_txt = html_util.get_first_element(detail, ".times").text_content()
    sold_out_el = html_util.get_first_element(detail, '.sold-out', optional = True)
    image_el = html_util.get_first_element(detail, 'img', optional = True)

    # The image we want is generally the first one, but if the layout changes this may break
    image_url = image_el.get('src') if image_el is not None else None

    performers = []
    for heading_tag in ('h1', 'h2', 'h3'):
        for heading in detail.iter(tag = heading_tag):
            if not heading.text_content():
                continue
            performers.extend(self._parse_performers(heading))

    show = Show()
    show.merge_key = link
    show.venue = self.venue()
    show.performers = performers
    show.door_time = date_util.parse_door_time(date_txt, time_txt)
    show.show_time = date_util.parse_show_time(date_txt, time_txt)
    show.soldout = sold_out_el is not None

    resources = show.resources
    resources.show_url = link
    resources.image_url = image_url
    resources.resource_uris = self.resource_extractor.extract_resources(detail)

    return show
def _parse_show(self, link):
    """Fetch the event page at ``link`` and build a Show from it.

    Inside ``#details``, the ``.info`` element's leading text is the date
    and the tail of its first child is the time line.  Performers are
    parsed from every h1, then h2, h3 and h4 inside ``#details``.

    :param link: URL of the event page; also the merge key and show URL.
    :returns: the populated Show.
    """
    event_doc = html_util.fetch_and_parse(link)
    event = html_util.get_first_element(event_doc, '#tfly-center-column-wide')
    event_detail = html_util.get_first_element(event, '#details')

    # Sample markup:
    #
    #   <div class="info">
    #     Sat, May 22, 2010<br />
    #     Doors: 6:00 PM / Show: 7:00 PM <br />
    #     $5.00<br />
    #   </div>
    event_info = html_util.get_first_element(event_detail, ".info")
    date_txt = event_info.text
    # The text after the first child (the first <br/> in the sample above).
    time_txt = event_info.getchildren()[0].tail

    # optional = True matches the None check at the end; presumably the
    # non-optional lookup fails when no <img> is present -- confirm against
    # html_util.
    img = html_util.get_first_element(event_detail, "img", optional = True)

    performers = []
    for tag in ('h1', 'h2', 'h3', 'h4'):
        for h in event_detail.iter(tag = tag):
            performers.extend(self._parse_performers(h))

    show = Show()
    show.merge_key = link
    show.venue = self.venue()
    show.performers = performers
    show.show_time = date_util.parse_show_time(date_txt, time_txt)
    show.door_time = date_util.parse_door_time(date_txt, time_txt)
    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event)

    if img is not None:
        show.resources.image_url = img.get('src')

    return show
def _parse_show(self, link):
    """Fetch the event page at ``link`` and build a Show from it.

    The numeric ``event_id`` group of ``self.IS_EVENT`` selects the
    ``.tfly-event-id-<id>`` element.  Each ``.headliners`` element becomes
    one headlining performer from its whole text; ``.supports`` text is
    split into individual supporting performers.

    :param link: URL of the event page; also the merge key and show URL.
    :returns: the populated Show.
    """
    event_doc = html_util.fetch_and_parse(link)

    match = self.IS_EVENT.match(link)
    event_id = int(match.group("event_id"))

    event_detail = html_util.get_first_element(event_doc, ".tfly-event-id-%d" % event_id)
    date_txt = html_util.get_first_element(event_doc, ".dates").text_content()
    time_txt = html_util.get_first_element(event_doc, ".times").text_content()

    # optional = True matches the None check at the end; presumably the
    # non-optional lookup fails when no <img> is present -- confirm against
    # html_util.
    img = html_util.get_first_element(event_detail, "img", optional = True)

    performers = []
    for p in html_util.get_elements(event_detail, ".headliners"):
        performers.append(Performer(p.text_content(), headliner=True))

    for p in html_util.get_elements(event_detail, ".supports"):
        for pi in lang_util.parse_performers(p.text_content()):
            performers.append(Performer(pi, headliner=False))

    show = Show()
    show.merge_key = link
    show.venue = self.venue()
    show.performers = performers
    show.show_time = date_util.parse_show_time(date_txt, time_txt)
    show.door_time = date_util.parse_door_time(date_txt, time_txt)
    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    if img is not None:
        show.resources.image_url = img.get("src")

    return show
def _parse_show(self, event_detail):
    """Build a Show from the displayed text of an event element.

    The first non-empty line is the date.  Each later line is treated, in
    this order of precedence, as: a line containing a time (sets the show
    time; any remaining text is a performer), a line matching
    ``self.NUM_RE`` (remaining text is a performer), or a bare performer
    name following a line that left no name behind.  Anything else is
    logged as an error.
    """
    show = Show()
    performers = []
    content = html_util.get_displayed_text_content(event_detail).strip()
    date_txt = None

    # This flag is set up and down to allow either of the following to be processed:
    #   1st: Ava Luna
    # or
    #   1st:
    #   Ava Luna
    had_num = True

    logger.debug("Parsing show content: %s" % content)

    for raw_line in content.split('\n'):
        if not raw_line:
            continue

        time_match = date_util.STRICT_TIME_RE.search(raw_line)

        if not date_txt:
            date_txt = raw_line
            continue

        if time_match:
            show.show_time = date_util.parse_date_and_time(date_txt, time_match.group('time'))
            name = date_util.STRICT_TIME_RE.sub('', raw_line).strip(': ')
        elif self.NUM_RE.match(raw_line):
            name = self.NUM_RE.sub('', raw_line).strip()
        elif had_num:
            name = raw_line
        else:
            logger.error('Unknown line format: %s' % raw_line)
            continue

        # An empty remainder means the name is expected on the next line.
        if name:
            performers.append(Performer(name))
            had_num = False
        else:
            had_num = True

    show.venue = self.venue()
    show.performers = performers
    show.date = date_util.parse_date_and_time(date_txt, None)
    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    # Only the first image in the element is used.
    first_img = next(iter(event_detail.iter(tag = 'img')), None)
    if first_img is not None:
        show.resources.image_url = first_img.get('src')

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _parse_shows(self, entry):
    """Parse every show announced in a feed entry.

    The entry's ``text/html`` content is cleaned of inline formatting tags,
    run through ``self.REPLACEMENTS`` and split on ``self.SHOW_DIVIDER_RE``.
    The first line of each part is a header holding the date and a time;
    the rest is the body holding performers and an optional image.

    :param entry: feed entry exposing ``published``, ``content`` and ``id``.
    :returns: list of Shows; empty when the entry is too old, has no HTML
        content, or nothing could be parsed.
    """
    content = None
    shows = []
    today = datetime.now()
    entry_date = date_util.parse_date_time(entry.published)

    # Only parse shows for the current year, or at the tail end of last year.
    # (The original test returned early for *every* entry outside the current
    # year, so the "tail end of last year" clause could never let one through.)
    is_current_year = entry_date.year == today.year
    is_tail_of_last_year = entry_date.year == today.year - 1 and entry_date.month > 10
    if not (is_current_year or is_tail_of_last_year):
        return []

    # If several text/html items exist, the last one wins.
    for item in entry.content:
        if item.type in ('text/html',):
            content = item.value

    if not content:
        logging.error('Unable to extract content from entry: %s' % entry.id)
        return []

    # This next part is technically pretty evil
    entry_doc = lxml.html.fromstring(content)
    tags = ['span', 'b', 'i', 'strong', 'em']
    cleaner = Cleaner(remove_tags = tags, links = False)
    clean_content = cleaner.clean_html(entry_doc)

    # FIXME patch lxml to handle this while calling text_content()
    # http://codespeak.net/pipermail/lxml-dev/2008-August/004009.html
    content_str = lxml.html.tostring(clean_content)

    for regexp, replacement in self.REPLACEMENTS:
        content_str = regexp.sub(replacement, content_str)

    for part in self.SHOW_DIVIDER_RE.split(content_str):
        part = part.strip(' \t\n')
        parts = part.split('\n')
        header = parts.pop(0)
        body = '\n'.join(parts)

        header_parts = self.HEADER_SEP_RE.split(header)
        date_txt = header_parts.pop(0)

        # The first header fragment that contains a time is used.
        time_txt = None
        for header_part in header_parts:
            if date_util.STRICT_TIME_RE.search(header_part):
                time_txt = date_util.sanitize_time(header_part)
                break

        if not time_txt:
            logging.error('Unable to find time in header: %s' % header)
            continue

        # For a range such as "8-10pm" keep the start only.
        if '-' in time_txt:
            time_txt = time_txt.split('-')[0].strip()

        # Times without an am/pm marker are taken to be in the evening.
        if not (time_txt.endswith('am') or time_txt.endswith('pm')):
            time_txt = time_txt + 'pm'

        show_doc = lxml.html.fromstring(body)

        # Two candidate sources for performer names: the anchor texts alone,
        # or all text.  All text is used as soon as anything meaningful is
        # found outside the anchors.
        use_all = False
        performer_parts = []
        all_parts = []

        for el in show_doc.iter():
            if self._is_img(el):
                break

            text = el.text or ''
            tail = el.tail or ''

            for regexp in self.BODY_SKIP:
                if regexp.search(text):
                    text = ''
                if regexp.search(tail):
                    tail = ''

            for p in (text, tail):
                if p:
                    all_parts.append(p)

            if text and el.tag != 'a':
                use_all = True

            if el.tag == 'a' and tail.strip() not in (',', '&', 'w/', ''):
                use_all = True

            if el.tag == 'a':
                performer_parts.append(text)

        img_url = None
        for img in show_doc.iter(tag = 'img'):
            if img.get('src'):
                img_url = img.get('src')
                break

        show = Show()
        show.venue = self.venue()

        if use_all:
            # NOTE(review): SOURCE showed .replace(' ', ' '), a no-op;
            # collapsing the double spaces left by the join is presumably
            # what was meant -- confirm.
            performers_str = ' '.join(all_parts).replace(' ,', ',').replace('  ', ' ')
            show.performers = [Performer(name) for name in lang_util.parse_performers(performers_str)]
        else:
            show.performers = [Performer(name) for name in performer_parts if name]

        try:
            show.show_time = date_util.parse_date_and_time(date_txt, time_txt)
        except Exception:
            # Best effort: log and skip this show, but do not swallow
            # KeyboardInterrupt/SystemExit as a bare except would.
            logging.exception('Unable to parse: %s - %s' % (date_txt, time_txt))
            continue

        show.resources.image_url = img_url
        show.resources.resource_uris = self.resource_extractor.extract_resources(show_doc)

        date_util.adjust_fuzzy_years(show, entry_date)
        shows.append(show)

    return shows