def _festival(self, rows):
    """Resolve (creating if needed) the Festival and FestivalSeason described by *rows*.

    Args:
        rows: name/value rows convertible via DictForNameValue.

    Returns:
        A ``(festival, season)`` tuple, or None when *rows* is empty.

    Raises:
        Exception: when any required field is missing.
    """
    if not rows:
        return None
    info = DictForNameValue(rows)
    required = ('name', 'merge-key', 'season-merge-key',
                'season-start-date', 'season-end-date')
    for field in required:
        # `is None` (not `== None`): a present-but-empty string is accepted.
        if info.get(field) is None:
            raise Exception("Festival lacks required field: %s" % field)
    festival, _ = Festival.objects.get_or_create(
        Q(merge_key = info['merge-key']),
        defaults = {'name': info['name'], 'merge_key': info['merge-key']})
    season, _ = FestivalSeason.objects.get_or_create(
        Q(festival_id = festival.id, merge_key = info['season-merge-key']),
        defaults = {
            'festival_id': festival.id,
            'merge_key': info['season-merge-key'],
            'start_date': date_util.parse_date_time(info['season-start-date']),
            'end_date': date_util.parse_date_time(info['season-end-date']),
        })
    return festival, season
def _process_entry_group(self, start_date, entries):
    """Collapse one date's calendar entries into a single Show.

    Entries are ordered latest-first before processing; all-day events
    and "Pub Side" entries are skipped. The show time is the earliest
    start time among the surviving entries.
    """
    show = Show()
    show.venue = self.venue()
    show.performers = []
    entries.sort(key = lambda entry: entry.when[0].start_time, reverse = True)
    for entry in entries:
        title = entry.title.text
        when_txt = entry.when[0].start_time
        logger.debug("Processing entry: %s, starting on: %s" % (title, when_txt))
        # Full day events usually denote a title which we currently will simply skip
        if 'T' not in when_txt:
            logger.debug('Entry "%s" is an all day event, skipping' % title)
            continue
        if 'pub side' in title.lower():
            logger.debug('Entry "%s" is on the Pub Side of Spike Hill, skipping' % title)
            continue
        parsed = date_util.parse_date_time(when_txt)
        # Track the earliest start time seen so far.
        earliest = show.show_time or parsed
        show.show_time = min(parsed, earliest)
        show.performers.append(Performer(title))
    return [show]
def _trans_show(self, show_data):
    """Translate a ticketing-API event dict into a Show.

    Returns None for non-music, cancelled, or VIP-package events.
    """
    LOG.debug("Checking event: %s" % show_data['EventName'])
    if "Music" not in show_data['MajorGenre']:
        LOG.debug("Skipping non music show")
        return None
    if show_data.get('Canceled'):
        LOG.debug("Skipping cancelled show")
        return None
    if 'VIP Packages' in show_data['EventName']:
        LOG.debug("Skipping VIP package")
        return None
    show = Show()
    names = lang_util.parse_performers(show_data['EventName'])
    # The first parsed name is treated as the headliner.
    show.performers = [Performer(name, headliner = idx == 0)
                       for idx, name in enumerate(names)]
    show.merge_key = show_data['EventId']
    show.venue = self.venue()
    show.show_time = date_util.parse_date_time(show_data['EventDate'])
    # NOTE(review): timezone localization was previously disabled here:
    #   show.show_time = timezone(show_data['Timezone']).localize(show.show_time)
    attraction_images = show_data['AttractionImage']
    if attraction_images:
        show.resources.image_url = self._image_url(show_data, attraction_images[0])
    return show
def _parse_show(self, link):
    """Fetch and parse a single event page into a Show.

    Args:
        link: absolute URL of the event page; must match self.EVENT_URL.

    Returns:
        A populated Show.
    """
    LOG.debug("Fetching show: %s" % link)
    event_doc = html_util.fetch_and_parse(link)
    event_detail = event_doc.get_element_by_id("mainColumn")
    show = Show()
    for performer in html_util.get_elements(event_detail, 'h1'):
        name = performer.text_content().strip(' \n\r\t')
        if name:
            show.performers.append(Performer(name))
    date_txt = html_util.get_first_element(event_detail, '.date').text_content()
    event_match = self.EVENT_URL.match(link)
    show.merge_key = event_match.group('page_id')
    show.venue = self.venue()
    # Listings carry no time of day; assume a 9pm start.
    show.show_time = date_util.parse_date_time(date_txt).replace(hour = 21)
    LOG.debug('Date: %s' % show.date)
    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)
    for img_tag in event_detail.iter(tag = 'img'):
        # Bug fix: get('src') may return None for a malformed tag, and
        # `'main' in None` raises TypeError — guard before the substring test.
        src = img_tag.get('src')
        if src and 'main' in src:
            show.resources.image_url = src
            break
    return show
def _process_entry(self, entry):
    """Turn one calendar entry into a Show.

    Returns None when the entry title does not match the Back Room pattern.
    """
    title = entry.title.text
    logger.debug("Processing entry: %s, starting on: %s" % (title, entry.when[0].start_time))
    if not self.BACK_ROOM_RE.match(title):
        return None
    stripped_title = self.BACK_ROOM_RE.sub('', title)
    show = Show()
    show.venue = self.venue()
    show.performers = [Performer(name)
                       for name in lang_util.parse_performers(stripped_title)]
    show.show_time = date_util.parse_date_time(entry.when[0].start_time)
    return show
def _get_parser(self):
    """Yield Shows parsed from the Google Calendar feed.

    Queries events from yesterday through roughly three months out, then
    routes recurring entries, date-grouped entries, and single entries
    through their respective processors.
    """
    calendar_service = CalendarService()
    yesterday = datetime.today() - timedelta(days = 1)
    three_months = yesterday + timedelta(days = 90)
    query = CalendarEventQuery(self.calendar_id(), 'public', 'full')
    query.start_min = yesterday.strftime('%F')
    query.start_max = three_months.strftime('%F')
    query['max-results'] = '500'
    feed = calendar_service.CalendarQuery(query)
    start_date = lambda e: date_util.parse_date_time(e.when[0].start_time).date()
    single = []
    recurring = []
    for e in feed.entry:
        # Entries carrying more than one 'when' are recurring events.
        if len(e.when) > 1:
            recurring.append(e)
        else:
            single.append(e)
    for show in self._process_recurring_entries(recurring):
        if show:
            yield show
    if self.group_by_date():
        # groupby requires its input pre-sorted on the grouping key.
        single.sort(key = start_date)
        for batch_date, date_entries in groupby(single, start_date):
            for show in self._process_entry_group(batch_date, list(date_entries)):
                # Bug fix: previously `if show: pass` guarded nothing and
                # falsy shows were yielded unconditionally; skip them here,
                # matching the other two branches.
                if show:
                    yield show
    else:
        for entry in single:
            show = self._process_entry(entry)
            if show:
                yield show
def _parse_show(self, api, event):
    """Build a Show from a Facebook Graph API event object."""
    LOG.debug("Parsing event: %s" % event["id"])
    event_id = event["id"]
    show = Show()
    show.merge_key = event_id
    show.venue = self.venue()
    names = lang_util.parse_performers(event["name"])
    show.performers = [Performer(name) for name in names]
    show.show_time = date_util.parse_date_time(event["start_time"])
    # Wrap the escaped description so lxml parses it as a full document.
    escaped = cgi.escape(event.get("description", ""))
    doc = lxml.html.document_fromstring(u"<html><body>%s</body></html>" % escaped)
    show.resources.show_url = self.EVENT_URL % event_id
    show.resources.image_url = self.PICTURE_URL % event_id
    show.resources.resource_uris = self.resource_extractor.extract_resources(doc)
    return show
def _get_parser(self):
    """Yield Shows for upcoming Facebook events on this profile."""
    api = GraphAPI(self.settings["facebook_access_token"])
    events = api.get_connections(self.profile_id(), "events")
    today = datetime.today()
    # Keep only events whose start time is not already in the past.
    upcoming_ids = [info["id"] for info in events["data"]
                    if date_util.parse_date_time(info["start_time"]) >= today]
    if not upcoming_ids:
        return
    for event in api.get_objects(upcoming_ids).values():
        yield self._parse_show(api, event)
def _parse_show(self, url, section):
    """Parse a show's detail page plus its listing *section* into a Show."""
    detail_doc = html_util.fetch_and_parse(url)
    detail_el = html_util.get_first_element(detail_doc, '#detailPage')
    # Machine-readable start time lives in the hCalendar dtstart attribute.
    when_txt = html_util.get_first_element(detail_el, 'time.dtstart').get('datetime')
    heading = html_util.get_first_element(section, 'h4').text_content()
    show = Show()
    show.merge_key = url
    show.venue = self.venue()
    show.performers = [Performer(name)
                       for name in lang_util.parse_performers(heading)]
    show.show_time = date_util.parse_date_time(when_txt)
    show.resources.show_url = url
    show.resources.resource_uris = self.resource_extractor.extract_resources(section, detail_el)
    return show
def _trans_show(self, event):
    """Translate a last.fm (pylast) event into a Show.

    The first listed artist is the headliner; the image of the last
    artist that has cover art wins (matching the original behavior).
    """
    LOG.debug("Transforming show: %s" % event.get_title())
    show = Show()
    performers = []
    for i, artist in enumerate(event.get_artists()):
        performers.append(Performer(artist.get_name(), headliner = i == 0))
        # Hoisted: get_cover_image was previously called twice per artist
        # (once to test, once to assign) — each call may hit the network.
        cover = artist.get_cover_image(size = pylast.COVER_MEGA)
        if cover:
            show.resources.image_url = cover
    show.merge_key = event.get_id()
    show.venue = self.venue()
    show.performers = performers
    show.show_time = date_util.parse_date_time(event.get_start_date())
    show.resources.show_url = event.get_url()
    return show
def _parse_show(self, link, show_section):
    """Parse a show page (and its linked artist pages) into a Show."""
    show_doc = html_util.fetch_and_parse(link)
    show_detail = show_doc.get_element_by_id("content")
    # NOTE(review): `title` is unused below, but get_first_element may
    # validate the page structure — kept to preserve behavior.
    title = html_util.get_first_element(show_detail, '.title').text
    date_txt = html_util.get_first_element(show_section, '.date').text
    image_url = html_util.get_first_element(show_detail, '.left-view-header img').get('src')
    performer_detail = html_util.get_first_element(show_detail, '.performers')
    performers = []
    artist_pages = []
    for anchor in performer_detail.iter(tag = 'a'):
        performers.extend(self._parse_performers(anchor))
        href = anchor.get('href')
        if self.IS_ARTIST_URL_RE.match(href):
            artist_pages.append(href)
    # Resources come from the listing section, the detail page, and
    # every linked artist page.
    sections = [show_section, show_detail]
    for page_url in artist_pages:
        sections.extend(self.fetch_performer_content(page_url))
    show = Show()
    show.merge_key = link
    show.venue = self.venue()
    show.performers = performers
    show.show_time = date_util.parse_date_time(date_txt)
    show.resources.show_url = link
    show.resources.image_url = image_url
    show.resources.resource_uris = self.resource_extractor.extract_resources(*sections)
    return show
def _trans_record(self, record):
    """Build a Show from a flat record dict (spreadsheet-style fields).

    Raises:
        Exception: when 'show-date' is missing.
    """
    show = Show()
    show.venue = Venue(record.get('venue-name'), record.get('venue-url'))
    show.title = record.get('title')
    show.merge_key = record.get('merge-key')
    raw_performers = record.get('performers')
    performers = []
    if raw_performers:
        performers = [Performer(name.strip())
                      for name in raw_performers.split(',')]
    raw_tags = record.get('tags')
    if raw_tags:
        show.tags = [tag.strip() for tag in raw_tags.split(',')]
    date_txt = record.get('show-date')
    if not date_txt:
        raise Exception('Show Date is required')
    show.date = date_util.parse_date_time(date_txt)
    if performers:
        show.performers = performers
    if record.get('show-time'):
        show.show_time = date_util.parse_date_and_time(date_txt, record.get('show-time'))
    # NOTE(review): when both fields are present, door-time overwrites
    # show-time — confirm that precedence is intended.
    if record.get('door-time'):
        show.show_time = date_util.parse_date_and_time(date_txt, record.get('door-time'))
    show.resources.show_url = record.get('show-url')
    show.resources.image_url = record.get('image-url')
    show.resources.resource_uris = self.resource_extractor.extract_resources(self._create_resource_doc(record))
    return show
def _parse_shows(self, entry):
    """Parse a blog feed entry's freeform HTML into a list of Shows.

    Shows are separated by SHOW_DIVIDER_RE; each chunk has a
    "date <sep> time" header line followed by performer names/links and
    an optional image.

    Returns:
        A list of Show objects (possibly empty).
    """
    content = None
    shows = []
    today = datetime.now()
    entry_date = date_util.parse_date_time(entry.published)
    # Only parse shows for the current year, or at the tail end (Nov/Dec)
    # of last year. Bug fix: the old condition
    #   entry_date.year != today.year or (year == today.year - 1 and month > 10)
    # returned [] for ALL non-current-year entries, so the last-year
    # clause could never admit anything.
    current_year = entry_date.year == today.year
    late_last_year = entry_date.year == today.year - 1 and entry_date.month > 10
    if not (current_year or late_last_year):
        return []
    for item in entry.content:
        if item.type in ('text/html',):
            content = item.value
    if not content:
        logging.error('Unable to extract content from entry: %s' % entry.id)
        return []
    # This next part is technically pretty evil
    entry_doc = lxml.html.fromstring(content)
    tags = ['span', 'b', 'i', 'strong', 'em']
    cleaner = Cleaner(remove_tags = tags, links = False)
    clean_content = cleaner.clean_html(entry_doc)
    # FIXME patch lxml to handle this while calling text_content()
    # http://codespeak.net/pipermail/lxml-dev/2008-August/004009.html
    content_str = lxml.html.tostring(clean_content)
    for regexp, replacement in self.REPLACEMENTS:
        content_str = regexp.sub(replacement, content_str)
    for part in self.SHOW_DIVIDER_RE.split(content_str):
        part = part.strip(' \t\n')
        parts = part.split('\n')
        header = parts.pop(0)
        body = '\n'.join(parts)
        header_parts = self.HEADER_SEP_RE.split(header)
        date_txt = header_parts.pop(0)
        time_txt = None
        for piece in header_parts:
            if date_util.STRICT_TIME_RE.search(piece):
                time_txt = date_util.sanitize_time(piece)
                break
        if not time_txt:
            logging.error('Unable to find time in header: %s' % header)
            continue
        # "8-11pm" style ranges: keep only the start time.
        if '-' in time_txt:
            time_txt = time_txt.split('-')[0].strip()
        # Bare hours default to pm (shows are at night).
        if not (time_txt.endswith('am') or time_txt.endswith('pm')):
            time_txt = time_txt + 'pm'
        show_doc = lxml.html.fromstring(body)
        use_all = False
        performer_parts = []
        all_parts = []
        for el in show_doc.iter():
            # Stop at the first image: text after it is not lineup info.
            if self._is_img(el):
                break
            text = el.text or ''
            tail = el.tail or ''
            for regexp in self.BODY_SKIP:
                if regexp.search(text):
                    text = ''
                if regexp.search(tail):
                    tail = ''
            for p in (text, tail):
                if p:
                    all_parts.append(p)
            # Plain text outside anchors, or meaningful text trailing an
            # anchor, means the lineup is not purely links: use all text.
            if text and el.tag != 'a':
                use_all = True
            if el.tag == 'a' and tail.strip() not in (',', '&', 'w/', ''):
                use_all = True
            if el.tag == 'a':
                performer_parts.append(text)
        img_url = None
        for img in show_doc.iter(tag = 'img'):
            if img.get('src'):
                img_url = img.get('src')
                break
        show = Show()
        show.venue = self.venue()
        if use_all:
            # Collapse double spaces (the original no-op replace(' ', ' ')
            # presumably intended this).
            performers_str = ' '.join(all_parts).replace(' ,', ',').replace('  ', ' ')
            show.performers = [Performer(name) for name in lang_util.parse_performers(performers_str)]
        else:
            show.performers = [Performer(name) for name in performer_parts if name]
        try:
            show.show_time = date_util.parse_date_and_time(date_txt, time_txt)
        except Exception:
            # Was a bare except: narrowed so KeyboardInterrupt/SystemExit
            # are not swallowed.
            logging.exception('Unable to parse: %s - %s' % (date_txt, time_txt))
            continue
        show.resources.image_url = img_url
        show.resources.resource_uris = self.resource_extractor.extract_resources(show_doc)
        date_util.adjust_fuzzy_years(show, entry_date)
        shows.append(show)
    return shows