def _parse_show(self, el):
    """Build a Show from a single listing element.

    The date text is the ``.text`` of the first <strong> inside the
    '.event-details' node; the complete text of that node is passed along
    as the time text for both the show-time and door-time parsers.

    Returns:
        The populated Show.
    """
    details = html_util.get_first_element(el, '.event-details')
    date_txt = html_util.get_first_element(details, 'strong').text
    time_txt = details.text_content()

    show = Show()
    show.venue = self.venue()

    title_txt = html_util.get_first_element(details, '.event-name').text_content()
    performer_names = lang_util.parse_performers(title_txt)
    show.performers = [Performer(name) for name in performer_names]

    show.show_time = date_util.parse_show_time(date_txt, time_txt)
    show.door_time = date_util.parse_door_time(date_txt, time_txt)

    show.resources.resource_uris = self.resource_extractor.extract_resources(details)

    # The image is optional; leave image_url untouched when there is none.
    thumbnail = html_util.get_first_element(el, ".event-image img", optional=True)
    if thumbnail is not None:
        show.resources.image_url = thumbnail.get('src')

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _parse_show(self, event_detail):
    """Build a Show from an event block.

    The <h2> holds the date text and the '.caption' element holds the
    performer text. Blocks whose date text starts with 'every' get no
    ``show.date``.

    Returns:
        The populated Show, or None when the block has no <h2>.
    """
    # Look the heading up once and reuse it for both the presence check
    # and the date text.
    heading = html_util.get_first_element(event_detail, 'h2', optional=True)
    if heading is None:
        return None

    show = Show()
    date_txt = heading.text_content()
    performers_txt = html_util.get_first_element(event_detail, '.caption').text_content()

    show.venue = self.venue()
    show.performers = [Performer(p) for p in lang_util.parse_performers(performers_txt)]

    if not date_txt.lower().startswith('every'):
        show.date = date_util.parse_date_and_time(date_txt, None)

    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    # Only the first <img> in the block is used.
    for img_tag in event_detail.iter(tag='img'):
        show.resources.image_url = img_tag.get('src')
        break

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _parse_show(self, link):
    """Fetch the event page at ``link`` and build a Show from it.

    Performers come from the '.headliners' elements (flagged as headliners)
    followed by the '.supports' elements (flagged as non-headliners).

    Returns:
        The populated Show, with ``link`` as merge key and show URL.
    """
    event_doc = html_util.fetch_and_parse(link, parse_500=True)
    detail = html_util.get_first_element(event_doc, ".event-detail")
    artist_boxes = html_util.get_first_element(event_doc, ".artist-boxes")
    date_txt = html_util.get_first_element(detail, ".dates").text_content()

    # (css selector, headliner flag) pairs, in the order they are appended.
    billing = (('.headliners', True), ('.supports', False))
    lineup = []
    for selector, is_headliner in billing:
        for node in html_util.get_elements(event_doc, selector):
            lineup.extend(
                Performer(name, headliner=is_headliner)
                for name in lang_util.parse_performers(node.text_content())
            )

    show = Show()
    show.merge_key = link
    show.venue = self.venue()
    show.performers = lineup

    time_txt = html_util.get_first_element(detail, ".times").text_content()
    show.show_time = date_util.parse_date_and_time(date_txt, time_txt)

    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(detail, artist_boxes)

    picture = html_util.get_first_element(detail, "img", optional=True)
    if picture is not None:
        show.resources.image_url = picture.get('src')

    return show
def _parse_show(self, link):
    """Fetch the show page at ``link`` and build a Show from it.

    One Performer is created for every '.showpage-details' block that has
    a heading child (h1-h6). A performer whose heading is an <h1> is
    flagged as headliner, and each performer's start time is taken from
    the block's '.time' element when TIME_RE matches it.

    Returns:
        The populated Show, with ``link`` as merge key and show URL.

    Raises:
        Exception: if DATE_RE does not match the '.date' element's text.
    """
    logging.debug('Parsing show from: %s' % link)
    event_doc = html_util.fetch_and_parse(link)
    event = html_util.get_first_element(event_doc, '.biglisting')
    img = html_util.get_first_element(event, '.tonightinfo img', optional=True)

    date_el = html_util.get_first_element(event, '.date')
    date_match = self.DATE_RE.search(date_el.text_content())
    if not date_match:
        raise Exception('Unable to determine show date from: %s' % date_el.text_content())
    date_txt = date_match.group(0)

    heading_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    performers = []
    # NOTE(review): despite the name, this is overwritten by every block
    # with a parsable time, so it ends up holding the last one seen --
    # confirm that matches the page ordering this was written for.
    first_time = None
    for det in event.cssselect('.showpage-details'):
        # When a block has several heading children, the last one wins.
        header = None
        for child in det.getchildren():
            if child.tag in heading_tags:
                header = child
        if header is None:
            logger.error('Unable to determine performer')
            continue

        time_txt = html_util.get_first_element(det, '.time').text_content()
        time_match = date_util.TIME_RE.search(time_txt)
        if time_match:
            first_time = time_txt = time_match.group('time')
        else:
            time_txt = None

        # Was `header.tag in ('h1')`: ('h1') is a plain string, so that was
        # a substring test rather than tuple membership.
        performers.append(Performer(header.text_content(),
                                    start_time=time_txt,
                                    headliner=header.tag == 'h1'))

    show = Show()
    show.merge_key = link
    show.venue = self.venue()
    show.performers = performers
    show.show_time = date_util.parse_date_and_time(date_txt, first_time)
    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event)
    if img is not None:
        show.resources.image_url = img.get('src')
    return show
def _parse_show(self, link):
    """Fetch the show page at ``link`` and build a Show from it.

    Every non-blank <h1> inside #mainColumn becomes a Performer. The show
    time is the parsed '.date' text with its hour replaced by 21. The
    merge key is the 'page_id' group of EVENT_URL matched against ``link``.

    Returns:
        The populated Show.

    Raises:
        Exception: if ``link`` does not match EVENT_URL.
    """
    LOG.debug("Fetching show: %s" % link)

    # Validate the link up front so a bad URL fails with a clear message
    # (previously an AttributeError on None) and without a wasted fetch.
    event_match = self.EVENT_URL.match(link)
    if not event_match:
        raise Exception("Unable to locate page id in: %s" % link)

    event_doc = html_util.fetch_and_parse(link)
    event_detail = event_doc.get_element_by_id("mainColumn")

    show = Show()
    for performer in html_util.get_elements(event_detail, 'h1'):
        name = performer.text_content().strip(' \n\r\t')
        if name:
            show.performers.append(Performer(name))

    date_txt = html_util.get_first_element(event_detail, '.date').text_content()

    show.merge_key = event_match.group('page_id')
    show.venue = self.venue()
    show.show_time = date_util.parse_date_time(date_txt).replace(hour=21)
    LOG.debug('Date: %s' % show.date)

    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    # Use the first image whose src contains 'main'. An <img> without a
    # src attribute is skipped instead of raising TypeError.
    for img_tag in event_detail.iter(tag='img'):
        src = img_tag.get('src')
        if src and 'main' in src:
            show.resources.image_url = src
            break

    return show
def _fetch_profile(self, profile_id):
    """Fetch a MySpace profile page and work out which layout version it is.

    The version is decided from the class attribute of <body>:
    'layout_0_1' means version 1, 'layout_0_2' means version 2. For a
    version 1 profile the friend id is extracted from the page HTML with
    FRIEND_ID_RE and the '/classic' page for that id is fetched and
    returned instead.

    Returns:
        A ``(version, html, doc)`` tuple.

    Raises:
        Exception: if the friend id of a v1 profile cannot be found, or if
            neither layout marker is present on <body>.
    """
    profile_link = 'http://www.myspace.com/' + profile_id
    logger.debug('Fetching profile page: %s' % profile_link)
    html, doc = self._fetch_and_parse(profile_link)

    body = parsing.get_first_element(doc, 'body')
    # A <body> without a class attribute yields None; treat that as "no
    # layout marker" so we reach the explicit error below instead of a
    # TypeError from the `in` test.
    body_class = body.get('class') or ''

    if 'layout_0_1' in body_class:
        logger.debug('%s is v1 profile' % profile_id)
        friend_id = FRIEND_ID_RE.search(html)
        if not friend_id:
            raise Exception("Unable to determine friend id for v1 myspace profile: %s" % profile_id)
        classic_profile_link = 'http://www.myspace.com/%s/classic' % friend_id.group(1)
        logger.debug('Fetching classic profile page: %s' % classic_profile_link)
        new_html, new_doc = self._fetch_and_parse(classic_profile_link)
        return (1, new_html, new_doc)

    if 'layout_0_2' in body_class:
        # This log line used to sit after the if/elif/else, where it was
        # unreachable and referenced an undefined name (`profile`).
        logger.debug('%s is v2 profile' % profile_id)
        return (2, html, doc)

    raise Exception('Unable to determine myspace profile version: %s' % profile_id)
def _parse_show(self, link):
    """Fetch the show page at ``link`` and build a Show from it.

    The event id (the full EVENT_ID match in ``link``) is the merge key.
    The <h1> inside #content is split by HEADER_PARSE into its 'date' and
    'title' groups; a date starting with 'tonight' is replaced by today's
    date.

    Returns:
        The populated Show.

    Raises:
        Exception: if EVENT_ID does not match ``link`` or HEADER_PARSE does
            not match the header text.
    """
    # NOTE(review): the result is never used -- confirm whether the fetch
    # below was meant to use it. Call kept in case raw_url has side effects.
    raw_url = self.raw_url(link)

    match = self.EVENT_ID.search(link)
    if not match:
        raise Exception("Unable to locate event id in: %s" % link)
    event_id = match.group(0)

    logging.debug('Fetching show info: %s' % link)
    event_doc = html_util.fetch_and_parse(link)
    show_el = html_util.get_first_element(event_doc, '#content')
    header_el = html_util.get_first_element(show_el, 'h1')

    header_match = self.HEADER_PARSE.search(header_el.text_content())
    if not header_match:
        raise Exception("Unable to parse header: %s" % header_el.text_content())
    date_txt = header_match.group('date').strip()
    title = header_match.group('title').strip()

    if date_txt.lower().startswith('tonight'):
        # '%F' is a platform-specific strftime directive; '%Y-%m-%d' is the
        # portable spelling of the same format.
        date_txt = datetime.today().date().strftime('%Y-%m-%d')

    img = html_util.get_first_element(show_el, 'img', optional=True)

    show = Show()
    show.performers = [Performer(p) for p in lang_util.parse_performers(title)]
    show.show_time = date_util.parse_date_and_time(date_txt, None)
    show.merge_key = event_id
    show.venue = self.venue()
    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(show_el)
    if img is not None:
        show.resources.image_url = img.get('src')
    return show
def _parse_v2(self, doc):
    """Extract resources from the HTML box modules of a v2 profile page.

    Every '.htmlBoxModule' element under '.content.contentMid' is handed
    to the resource extractor.

    Returns:
        An ArtistProfileParserResult wrapping the extracted resources.
    """
    mid_column = parsing.get_first_element(doc, '.content.contentMid')
    boxes = [box for box in parsing.get_elements(mid_column, '.htmlBoxModule')]
    extracted = self.resource_extractor.extract_resources(*boxes)
    return ArtistProfileParserResult(extracted)
def _month_parser(self, request_date):
    """Yield the shows found on the calendar page for ``request_date``'s month.

    The page URL is built from BASE_URL and the lower-cased '%b%y' form of
    ``request_date``. Shows without a door or show time, or without a title
    or performers, are logged and skipped.

    Yields:
        Show objects returned by ``self._parse_show``.
    """
    month_url = '%scalendar/%s.html' % (self.BASE_URL, request_date.strftime('%b%y').lower())
    logger.debug('Parsing: %s' % month_url)
    doc = html_util.fetch_and_parse(month_url)
    main_table = html_util.get_first_element(doc, 'body > table')

    # Drop the "header" (first) and "footer" (last) rows. Slicing, unlike
    # the previous pop() calls, does not raise on a table with fewer than
    # two rows.
    trs = main_table.getchildren()[1:-1]

    # The remaining rows look as follows
    # (the Show Details/Time pairing might be repeated):
    #
    #   <tr><td>Monday 4th</td></tr>
    #   <tr><td>
    #     <center>
    #       <p>Show Details</p>
    #       <p>Show Time</p>
    #     </center>
    #   </td></tr>
    #
    # Walk the rows two at a time (date row, show row). A trailing unpaired
    # row is ignored; it used to raise IndexError from the second pop(0).
    for row_idx in range(0, len(trs) - 1, 2):
        date_row = trs[row_idx].getchildren()
        show_row = trs[row_idx + 1].getchildren()

        # At the end of the month, the html doesn't always contain a
        # corresponding table cell in the show_row for each date in the
        # date_row; zip() stops at the shorter of the two.
        for date_td, show_td in zip(date_row, show_row):
            date_match = self.DATE_RE.match(date_td.text_content().strip())
            if not date_match:
                continue
            show_date = request_date.replace(day=int(date_match.group('day')))

            # Paragraphs come in (details, time) pairs; cells with an odd
            # number of paragraphs are skipped.
            p_list = show_td.cssselect('center > p')
            if len(p_list) % 2 != 0:
                continue

            for p_idx in range(0, len(p_list), 2):
                show_detail, show_time = p_list[p_idx], p_list[p_idx + 1]
                show = self._parse_show(show_date, show_detail, show_time)
                if not (show.door_time or show.show_time):
                    logger.warning('Unable to determine door or show time for show on %s, discarding' % show_date)
                elif not (show.title or len(show.performers) > 0):
                    logger.warning('Unable to determine title or performers for show on %s, discarding' % show_date)
                else:
                    yield show
def _parse_show(self, url, section):
    """Fetch the detail page at ``url`` and build a Show.

    The title is read from the <h4> of the listing ``section``; the date
    comes from the 'datetime' attribute of 'time.dtstart' on the fetched
    detail page. Resources are extracted from both the section and the
    detail page.

    Returns:
        The populated Show, with ``url`` as merge key and show URL.
    """
    detail_doc = html_util.fetch_and_parse(url)
    detail_el = html_util.get_first_element(detail_doc, '#detailPage')

    start_el = html_util.get_first_element(detail_el, 'time.dtstart')
    date_txt = start_el.get('datetime')

    heading = html_util.get_first_element(section, 'h4')
    title = heading.text_content()

    show = Show()
    show.merge_key = url
    show.venue = self.venue()
    show.performers = [Performer(name) for name in lang_util.parse_performers(title)]
    show.show_time = date_util.parse_date_time(date_txt)

    resources = show.resources
    resources.show_url = url
    resources.resource_uris = self.resource_extractor.extract_resources(section, detail_el)
    return show
def _parse_show(self, link):
    """Fetch the event page at ``link`` and build a Show from it.

    The title is assembled from the leading run of <h1>/<h2> children of
    #eventDetail, joined with "/", and then split into performers. Date
    and time text come from the #timeDetail element.

    Returns:
        The populated Show, with ``link`` as merge key and show URL.
    """
    event_doc = html_util.fetch_and_parse(link)
    event_detail = event_doc.get_element_by_id("eventDetail")

    title_txt = []
    found_h_el = False
    # Start parsing when we find the first h* element.
    # Stop parsing if we found an h* element, but then encounter anything else.
    for el in event_detail.getchildren():
        if el.tag in ("h1", "h2"):
            found_h_el = True
            if el.text_content():
                title_txt.append(el.text_content())
        elif found_h_el:
            break

    # Sample of the element parsed below:
    #
    #   <span id="timeDetail">
    #     Apr 24, 2010<br />
    #     upstairs<br />
    #     Doors @ 7 PM<br/>
    #     $15.00 Adv. / $20 at the Door<br />
    #     <a href="http://www.deadcellentertainment.tix.com/Schedule.asp?OrganizationNumber=2690" target="_blank">
    #       <img src="/images/buyticket.png" alt="Purchase Tickets" />
    #     </a>
    #   </span>
    time_el = event_detail.get_element_by_id("timeDetail")
    date_txt = time_el.text            # text before the first <br />
    time_txt = time_el.text_content()  # the whole span, searched for times

    show = Show()
    show.merge_key = link
    show.venue = self.venue()
    show.performers = [Performer(p) for p in lang_util.parse_performers("/".join(title_txt))]

    # These two were previously crossed (door_time <- parse_show_time and
    # show_time <- parse_door_time), unlike the other parsers in this file.
    show.door_time = date_util.parse_door_time(date_txt, time_txt)
    show.show_time = date_util.parse_show_time(date_txt, time_txt)

    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    # The None check below only makes sense for an optional lookup, so
    # request one explicitly.
    img = html_util.get_first_element(event_detail, "img", optional=True)
    if img is not None:
        show.resources.image_url = img.get("src")

    return show
def _month_parser(self, request_date):
    """Yield every show on the calendar page for ``request_date``'s month.

    The page URL is BASE_URL + 'calendar/<year>-<month>'. Each
    'td.has-events' cell of the '.month-view table' is handed to
    ``self._parse_shows`` and its results are yielded one by one.
    """
    year, month = request_date.year, request_date.month
    month_url = '%scalendar/%d-%d' % (self.BASE_URL, year, month)
    logger.debug('Parsing: %s' % month_url)

    calendar_doc = html_util.fetch_and_parse(month_url)
    month_table = html_util.get_first_element(calendar_doc, '.month-view table')

    for day_cell in html_util.get_elements(month_table, 'td.has-events'):
        day_shows = self._parse_shows(request_date, day_cell)
        for parsed_show in day_shows:
            yield parsed_show
def _get_parser(self):
    """Yield the shows parsed from the text of the BASE_URL page.

    The displayed text of the '.defaultText' element is split into lines;
    each line matching SHOW_START_RE is passed to ``self._parse_show`` and
    every truthy result is yielded.
    """
    page = html_util.fetch_and_parse(self.BASE_URL)
    listing = html_util.get_first_element(page, '.defaultText')
    listing_text = html_util.get_displayed_text_content(listing).strip()

    # Lazy filter, so matching and parsing stay interleaved line by line.
    show_lines = (ln for ln in listing_text.split('\n') if self.SHOW_START_RE.match(ln))
    for ln in show_lines:
        parsed = self._parse_show(ln)
        if parsed:
            yield parsed
def _parse_show(self, link):
    """Fetch the event page at ``link`` and build a Show from it.

    Performers are collected from every h1-h4 inside #details via
    ``self._parse_performers``, grouped by heading level (all h1s first,
    then h2s, and so on).

    Returns:
        The populated Show, with ``link`` as merge key and show URL.
    """
    event_doc = html_util.fetch_and_parse(link)
    event = html_util.get_first_element(event_doc, '#tfly-center-column-wide')
    event_detail = html_util.get_first_element(event, '#details')

    # Sample of the element parsed below:
    #
    #   <div class="info">
    #     Sat, May 22, 2010<br />
    #     Doors: 6:00 PM / Show: 7:00 PM <br />
    #     $5.00<br />
    #   </div>
    event_info = html_util.get_first_element(event_detail, ".info")
    date_txt = event_info.text                    # text before the first child
    time_txt = event_info.getchildren()[0].tail   # text right after the first child

    # The None check further down only makes sense for an optional lookup,
    # so request one explicitly.
    img = html_util.get_first_element(event_detail, "img", optional=True)

    performers = []
    for tag in ('h1', 'h2', 'h3', 'h4'):
        for h in event_detail.iter(tag=tag):
            performers.extend(self._parse_performers(h))

    show = Show()
    show.merge_key = link
    show.venue = self.venue()
    show.performers = performers
    show.show_time = date_util.parse_show_time(date_txt, time_txt)
    show.door_time = date_util.parse_door_time(date_txt, time_txt)
    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event)
    if img is not None:
        show.resources.image_url = img.get('src')
    return show
def _parse_show(self, el):
    """Build a Show from a calendar entry element.

    <span class="small"> nodes are removed from the '.calendardates'
    element before its text is read. That text is lower-cased and split
    on ',' into a date part and a time part. Performer names are the
    '/'-separated pieces of the '.calendar' element's text.

    Returns:
        The populated Show, or None for recurring events (date text
        containing 'every').
    """
    date_el = html_util.get_first_element(el, '.calendardates')

    # Snapshot the spans before removing any of them: mutating the tree
    # while iterating over it can skip elements.
    for span in list(date_el.iter(tag='span')):
        if span.get('class') == 'small':
            span.getparent().remove(span)

    date_txt = date_el.text_content().lower()

    # Skip recurring events
    if 'every' in date_txt:
        return None

    date_txt, time_txt = date_txt.split(',')

    title_el = html_util.get_first_element(el, '.calendar')
    performers = [Performer(name) for name in title_el.text_content().split('/')]

    show = Show()
    show.venue = self.venue()
    show.performers = performers
    show.show_time = date_util.parse_date_and_time(date_txt, time_txt)
    show.resources.resource_uris = self.resource_extractor.extract_resources(el)

    # No break here: the last image whose src matches IMAGE_RE wins.
    for img in el.iter(tag='img'):
        src = img.get('src')
        # Search once and reuse the result for the log line and the test.
        image_match = self.IMAGE_RE.search(img.get('src', ''))
        logging.debug('image: %s - %s' % (src, image_match))
        if image_match:
            show.resources.image_url = src

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _parse_show(self, link, show_section):
    """Fetch the show page at ``link`` and build a Show from it.

    The date is read from the '.date' element of the listing
    ``show_section``; everything else comes from #content on the fetched
    page. Each anchor under '.performers' contributes performers via
    ``self._parse_performers``. Anchors whose href matches
    IS_ARTIST_URL_RE are also passed to ``self.fetch_performer_content``
    to collect extra sections for resource extraction.

    Returns:
        The populated Show, with ``link`` as merge key and show URL.
    """
    show_doc = html_util.fetch_and_parse(link)
    show_detail = show_doc.get_element_by_id("content")
    date_txt = html_util.get_first_element(show_section, '.date').text
    image_url = html_util.get_first_element(show_detail, '.left-view-header img').get('src')

    performers = []
    performer_urls = []
    performer_detail = html_util.get_first_element(show_detail, '.performers')
    for anchor in performer_detail.iter(tag='a'):
        performers.extend(self._parse_performers(anchor))
        # An <a> without an href yields None, which would make the regex
        # match raise TypeError; such anchors have no artist page to follow.
        href = anchor.get('href')
        if href and self.IS_ARTIST_URL_RE.match(href):
            performer_urls.append(href)

    resource_sections = [show_section, show_detail]
    for url in performer_urls:
        resource_sections.extend(self.fetch_performer_content(url))

    show = Show()
    show.merge_key = link
    show.venue = self.venue()
    show.performers = performers
    show.show_time = date_util.parse_date_time(date_txt)
    show.resources.show_url = link
    show.resources.image_url = image_url
    show.resources.resource_uris = self.resource_extractor.extract_resources(*resource_sections)
    return show
def _parse_show(self, link, show_section):
    """Fetch the show page at ``link`` and build a Show from it.

    All data is read from '#content .event-detail' on the fetched page;
    ``show_section`` is not used. The show is marked sold out when a
    '.sold-out' element is present. Performers come from the non-empty
    h1, h2 and h3 elements (in that tag order) via
    ``self._parse_performers``.

    Returns:
        The populated Show, with ``link`` as merge key and show URL.
    """
    page = html_util.fetch_and_parse(link)
    detail = html_util.get_first_element(page, "#content .event-detail")
    date_txt = html_util.get_first_element(detail, ".dates").text_content()
    time_txt = html_util.get_first_element(detail, ".times").text_content()
    sold_out_marker = html_util.get_first_element(detail, '.sold-out', optional=True)

    # The image we want is generally the first one, but if the layout
    # changes this may break
    first_img = html_util.get_first_element(detail, 'img', optional=True)
    image_url = first_img.get('src') if first_img is not None else None

    lineup = []
    for heading_tag in ('h1', 'h2', 'h3'):
        for heading in detail.iter(tag=heading_tag):
            if not heading.text_content():
                continue
            lineup.extend(self._parse_performers(heading))

    show = Show()
    show.merge_key = link
    show.venue = self.venue()
    show.performers = lineup
    show.door_time = date_util.parse_door_time(date_txt, time_txt)
    show.show_time = date_util.parse_show_time(date_txt, time_txt)
    show.soldout = sold_out_marker is not None

    resources = show.resources
    resources.show_url = link
    resources.image_url = image_url
    resources.resource_uris = self.resource_extractor.extract_resources(detail)
    return show
def _parse_show(self, link):
    """Fetch the event page at ``link`` and build a Show from it.

    The numeric 'event_id' group of IS_EVENT matched against ``link``
    selects the '.tfly-event-id-<id>' element that holds the event. Each
    '.headliners' element becomes one headlining Performer (its text is
    used as-is); the text of each '.supports' element is split with
    ``lang_util.parse_performers`` into non-headlining Performers.

    Returns:
        The populated Show, with ``link`` as merge key and show URL.

    Raises:
        Exception: if ``link`` does not match IS_EVENT.
    """
    # Validate the link up front so a bad URL fails with a clear message
    # (previously an AttributeError on None) and without a wasted fetch.
    match = self.IS_EVENT.match(link)
    if not match:
        raise Exception("Unable to locate event id in: %s" % link)
    event_id = int(match.group("event_id"))

    event_doc = html_util.fetch_and_parse(link)
    event_detail = html_util.get_first_element(event_doc, ".tfly-event-id-%d" % event_id)
    date_txt = html_util.get_first_element(event_doc, ".dates").text_content()
    time_txt = html_util.get_first_element(event_doc, ".times").text_content()

    # The None check further down only makes sense for an optional lookup,
    # so request one explicitly.
    img = html_util.get_first_element(event_detail, "img", optional=True)

    performers = []
    for p in html_util.get_elements(event_detail, ".headliners"):
        performers.append(Performer(p.text_content(), headliner=True))
    for p in html_util.get_elements(event_detail, ".supports"):
        for pi in lang_util.parse_performers(p.text_content()):
            performers.append(Performer(pi, headliner=False))

    show = Show()
    show.merge_key = link
    show.venue = self.venue()
    show.performers = performers
    show.show_time = date_util.parse_show_time(date_txt, time_txt)
    show.door_time = date_util.parse_door_time(date_txt, time_txt)
    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)
    if img is not None:
        show.resources.image_url = img.get("src")
    return show
def parse(self, artist, profile):
    """Parse a MySpace profile with the parser matching its layout version.

    ``artist`` is accepted but not used by this method. Offsite links in
    the profile document are resolved before the version is checked.

    Returns:
        Whatever ``self._parse_v1`` or ``self._parse_v2`` returns for the
        profile document.

    Raises:
        Exception: if the profile version is neither 1 nor 2.
    """
    msp_profile = api.MySpaceProfile(profile.profile_id)
    doc = msp_profile.get_profile_doc()
    self._resolve_offsite_links(doc)

    # Ask for the version once instead of once per branch.
    version = msp_profile.get_profile_version()
    if version == 1:
        logger.debug('%s is v1 profile' % profile.profile_id)
        return self._parse_v1(doc)
    if version == 2:
        logger.debug('%s is v2 profile' % profile.profile_id)
        return self._parse_v2(doc)
    raise Exception('Unable to determine myspace profile version')
def _parse_shows(self, base_date, td):
    """Parse the shows listed in one calendar day cell.

    The day number is read from the cell's '.day' element and applied to
    ``base_date``. The anchors under '.lr_color' and under '.googie_color'
    are each parsed into at most one Show by ``self._parse_show``.

    Returns:
        A list holding zero, one or two Shows ('.lr_color' first).
    """
    day = int(html_util.get_first_element(td, '.day').text_content())
    show_date = base_date.replace(day=day)
    # '%F' is a platform-specific strftime directive; '%Y-%m-%d' is the
    # portable spelling of the same format.
    logger.debug('Parsing shows on %s' % show_date.strftime('%Y-%m-%d'))

    lr_shows = html_util.get_elements(td, '.lr_color a')
    googie_shows = html_util.get_elements(td, '.googie_color a')

    # NOTE(review): the truthiness checks assume get_elements returns a
    # sequence (empty when nothing matches) rather than a lazy iterator --
    # confirm against html_util.
    shows = []
    if lr_shows:
        shows.append(self._parse_show(show_date, lr_shows))
    if googie_shows:
        shows.append(self._parse_show(show_date, googie_shows))
    return shows
def _parse_show(self, show_url, event_detail, today):
    """Build a Show from one event block.

    ``show_url`` and ``today`` are accepted but not used by this method.
    The first performer found is flagged as headliner. The ticket link's
    href, when present, is the merge key. The show time is set only when
    both DATE_RE and TIME_RE match the text of #unionhall_date; the month
    and day are applied to the current date/time.

    Returns:
        The populated Show.
    """
    show = Show()

    # Union hall will have duplicate instances of #unionhall_performer;
    # some may or may not have links, but those that do have links are
    # tagged with the same id again, ie:
    #   <div id="unionhall_performer"><a href="#" id="#unionhall_performer"> ...
    # so anchors are filtered out to avoid counting a performer twice.
    performers = [Performer(p.text_content())
                  for p in event_detail.cssselect("#unionhall_performer")
                  if p.tag != 'a']
    # A block without performer nodes used to raise IndexError here.
    if performers:
        performers[0].headliner = True

    ticket_link = html_util.get_first_element(event_detail, '#ticket_link a', optional=True)

    show.venue = self.venue()
    show.performers = performers
    if ticket_link is not None:
        show.merge_key = ticket_link.get('href')

    # Format: THU 3/25: 6pm / $15
    date_tag = event_detail.get_element_by_id("unionhall_date")
    date_text = date_tag.text_content()
    date_match = self.DATE_RE.match(date_text)
    time_match = self.TIME_RE.search(date_text)
    if date_match and time_match:
        month = int(date_match.group('month'))
        day = int(date_match.group('day'))
        show_date = datetime.now().replace(month=month, day=day)
        # '%F' is a platform-specific strftime directive; '%Y-%m-%d' is
        # the portable spelling of the same format.
        show.show_time = date_util.parse_date_and_time(show_date.strftime('%Y-%m-%d'),
                                                       time_match.group('time'))

    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    # Only the first <img> in the block is used.
    for img_tag in event_detail.iter(tag='img'):
        show.resources.image_url = img_tag.get('src')
        break

    date_util.adjust_fuzzy_years(show, self._parse_started)
    return show
def _parse_artist(self, a):
    """Split an artist anchor into name, start time and artist-page element.

    When TIME_RE matches at the start of the anchor text, its 'time' group
    is the start time and the name is the anchor text with every TIME_RE
    match removed; otherwise the name is the anchor text and the start
    time is None. When the anchor has an href, that page is fetched and
    its '#bleh' element is returned as the third item.

    Returns:
        A ``(name, start_time, artist_el)`` tuple; ``artist_el`` is None
        when the anchor has no href.
    """
    anchor_text = a.text_content()

    start_time, name = None, anchor_text
    leading_time = self.TIME_RE.match(anchor_text)
    if leading_time:
        start_time = leading_time.group('time')
        name = self.TIME_RE.sub('', anchor_text)

    artist_el = None
    href = a.get('href')
    if href:
        artist_page = html_util.fetch_and_parse(href)
        artist_el = html_util.get_first_element(artist_page, '#bleh')

    logging.debug('Artist (%s) name: %s from (%s)' % (start_time, name, anchor_text))
    return (name, start_time, artist_el)