示例#1
0
  def _parse_show(self, event_detail):
    """Build a Show from a single event element.

    Returns None when the element contains no <h2>. The <h2> text is used as
    the date text and the '.caption' text as the performer list. The date is
    only parsed when the <h2> text does not start with 'every'.
    """
    heading = html_util.get_first_element(event_detail, 'h2', optional = True)

    if heading is None:
      return None

    show = Show()

    when_txt = heading.text_content()
    who_txt  = html_util.get_first_element(event_detail, '.caption').text_content()

    show.venue      = self.venue()
    show.performers = [Performer(name) for name in lang_util.parse_performers(who_txt)]

    is_recurring = when_txt.lower().startswith('every')

    if not is_recurring:
      show.date = date_util.parse_date_and_time(when_txt, None)

    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    # Only the first <img> in document order is considered
    first_img = next(iter(event_detail.iter(tag = 'img')), None)

    if first_img is not None:
      show.resources.image_url = first_img.get('src')

    date_util.adjust_fuzzy_years(show, self._parse_started)

    return show
示例#2
0
  def _parse_show(self, show_date, show_detail, show_time):
    """Build a Show from a date, a detail element and a time element.

    Args:
      show_date:   date value handed to the date_util parsers.
      show_detail: element holding performers, resources and images.
      show_time:   element whose text holds the times (and price/age notes).

    Returns:
      The populated Show.
    """
    show = Show()

    # Keep only the comma-separated fragments that PRICE_OR_AGE does not match
    time_txt = ','.join([p for p in show_time.text_content().split(',') if not self.PRICE_OR_AGE.search(p)])

    logger.debug('Show: %s - %s' % (time_txt, show_time.text_content().strip(' \n')))
    show.venue      = self.venue()
    show.performers = self._parse_performers(show_detail)
    show.show_time  = date_util.parse_show_time(show_date, time_txt)
    show.door_time  = date_util.parse_door_time(show_date, time_txt)

    # TODO right now the below parsing doesn't work, so just skip these shows for now
    # NOTE(review): nothing is actually skipped here; when neither time parsed,
    # the first TIME_RE match is used as the door time.
    if not show.show_time and not show.door_time:
      time_match = self.TIME_RE.search(time_txt)

      if time_match:
        show.door_time = date_util.parse_date_and_time(show_date, time_match.group('time'))

    show.resources.resource_uris = self.resource_extractor.extract_resources(show_detail, show_time)

    # TODO work could be done here to find larger images (sometimes the img's are enclosed in an anchor tag)
    for img_tag in show_detail.iter(tag = 'img'):
      src = img_tag.get('src')

      # An <img> without a usable src would make the substring checks below
      # raise TypeError on None, so move on to the next image instead
      if not src:
        continue

      # Skip the images that show the early shows, later shows, and the 5 years logo
      if not ('early' in src or 'later' in src or '5years' in src):
        show.resources.image_url = src

        break

    return show
示例#3
0
  def _parse_show(self, link):
    """Fetch the event page at `link` and build a Show from it.

    Performers come from the '.headliners' elements (flagged as headliners)
    followed by the '.supports' elements (flagged as non-headliners). The
    show time is parsed from the '.dates' and '.times' text of the
    '.event-detail' element. `link` doubles as the merge key and show URL.
    """
    event_doc = html_util.fetch_and_parse(link, parse_500 = True)

    event_detail = html_util.get_first_element(event_doc, ".event-detail")
    artist_info  = html_util.get_first_element(event_doc, ".artist-boxes")

    date_txt = html_util.get_first_element(event_detail, ".dates").text_content()

    # Headliners first, then supporting acts, each in document order
    performers = []

    for selector, is_headliner in (('.headliners', True), ('.supports', False)):
      for el in html_util.get_elements(event_doc, selector):
        for name in lang_util.parse_performers(el.text_content()):
          performers.append(Performer(name, headliner = is_headliner))

    show = Show()

    show.merge_key  = link
    show.venue      = self.venue()
    show.performers = performers

    time_txt       = html_util.get_first_element(event_detail, ".times").text_content()
    show.show_time = date_util.parse_date_and_time(date_txt, time_txt)

    show.resources.show_url      = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail, artist_info)

    picture = html_util.get_first_element(event_detail, "img", optional = True)

    if picture is not None:
      show.resources.image_url = picture.get('src')

    return show
示例#4
0
  def _parse_show(self, link):
    """Fetch the show page at `link` and build a Show from its '.biglisting' element.

    Each '.showpage-details' element contributes one Performer, named by the
    last heading (h1-h6) among its direct children and timed by the text of
    its '.time' element. Only performers named by an <h1> are headliners.
    Detail elements without a heading are logged and skipped.

    Raises:
      Exception: when DATE_RE does not match the text of the '.date' element.
    """
    logging.debug('Parsing show from: %s' % link)

    event_doc = html_util.fetch_and_parse(link)

    event = html_util.get_first_element(event_doc, '.biglisting')
    img   = html_util.get_first_element(event, '.tonightinfo img', optional = True)

    date_el    = html_util.get_first_element(event, '.date')
    date_match = self.DATE_RE.search(date_el.text_content())

    if not date_match:
      raise Exception('Unable to determine show date from: %s' % date_el.text_content())

    date_txt = date_match.group(0)

    heading_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    performers = []
    first_time = None

    for det in event.cssselect('.showpage-details'):
      header = None

      # The last heading among the direct children wins
      for child in det.getchildren():
        if child.tag in heading_tags:
          header = child

      if header is None:
        logger.error('Unable to determine performer')
        continue

      time_txt   = html_util.get_first_element(det, '.time').text_content()
      time_match = date_util.TIME_RE.search(time_txt)

      if time_match:
        # NOTE(review): despite its name, first_time is overwritten on every
        # match, so it ends up holding the LAST matched time - confirm intent
        first_time = time_txt = time_match.group('time')
      else:
        time_txt = None

      # Compare with == : the previous `in ('h1')` tested against a plain
      # string (substring match), not a one-element tuple
      is_headliner = header.tag == 'h1'

      performers.append(Performer(header.text_content(), start_time = time_txt, headliner = is_headliner))

    show = Show()

    show.merge_key  = link
    show.venue      = self.venue()
    show.performers = performers
    show.show_time  = date_util.parse_date_and_time(date_txt, first_time)

    show.resources.show_url      = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event)

    if img is not None:
      show.resources.image_url  = img.get('src')

    return show
示例#5
0
  def _parse_show(self, base_date, links):
    """Build a Show for `base_date` from the calendar links of that day.

    Links without text are ignored. Every remaining link yields a Performer;
    the show time is the earliest start time parsed from any of them. The
    image URL is the src of the first <img> of the first artist element that
    provides a non-empty one.
    """
    performers   = []
    resource_els = []
    earliest     = None

    for a in links:
      # Every other link on the calendar seems to have no text
      if not a.text_content():
        continue

      name, start_time_txt, artist_el = self._parse_artist(a)

      if artist_el is not None:
        resource_els.append(artist_el)

      if start_time_txt:
        start_time = date_util.parse_date_and_time(base_date, start_time_txt)

        if not earliest or start_time < earliest:
          earliest = start_time

      performers.append(Performer(name, start_time = start_time_txt))

    # Performers are listed from first to last, so flip both lists
    performers   = performers[::-1]
    resource_els = resource_els[::-1]

    show = Show()

    show.venue      = self.venue()
    show.performers = performers
    show.date       = base_date
    show.show_time  = earliest

    show.resources.resource_uris = self.resource_extractor.extract_resources(*resource_els)

    image_url = None

    for el in resource_els:
      # Only the first <img> of each element is looked at
      first_img = next(iter(el.iter(tag = 'img')), None)

      if first_img is not None:
        image_url = first_img.get('src')

      if image_url:
        break

    show.resources.image_url = image_url

    return show
示例#6
0
  def _trans_record(self, record):
    """Translate a flat record (mapping of string keys) into a Show.

    Keys read: 'venue-name', 'venue-url', 'title', 'merge-key', 'performers'
    and 'tags' (both comma separated), 'show-date' (required), 'show-time',
    'door-time', 'show-url' and 'image-url'.

    Raises:
      Exception: when 'show-date' is missing or empty.
    """
    show = Show()

    show.venue     = Venue(record.get('venue-name'), record.get('venue-url'))
    show.title     = record.get('title')
    show.merge_key = record.get('merge-key')

    performers = []

    if record.get('performers'):
      performers = [Performer(name.strip()) for name in record['performers'].split(',')]

    if record.get('tags'):
      show.tags = [t.strip() for t in record['tags'].split(',')]

    date_txt = record.get('show-date')

    if not date_txt:
      raise Exception('Show Date is required')

    show.date = date_util.parse_date_time(date_txt)

    # Leave show.performers untouched when the record lists none
    if performers:
      show.performers = performers

    if record.get('show-time'):
      show.show_time = date_util.parse_date_and_time(date_txt, record.get('show-time'))

    # The door time belongs in door_time; it used to overwrite show_time
    if record.get('door-time'):
      show.door_time = date_util.parse_date_and_time(date_txt, record.get('door-time'))

    show.resources.show_url      = record.get('show-url')
    show.resources.image_url     = record.get('image-url')
    show.resources.resource_uris = self.resource_extractor.extract_resources(self._create_resource_doc(record))

    return show
示例#7
0
  def _parse_show(self, show_txt):
    """Build a Show from one line of text split on SHOW_PART_SEP.

    The first field is the date, the second the time and the last one the
    performer list. Raises IndexError when fewer than two fields are present.
    """
    fields = show_txt.split(self.SHOW_PART_SEP)

    date_txt       = fields[0]
    time_txt       = fields[1]
    performers_txt = fields[-1]

    show = Show()

    show.show_time  = date_util.parse_date_and_time(date_txt, time_txt)
    show.venue      = self.venue()
    show.performers = [Performer(name) for name in lang_util.parse_performers(performers_txt)]

    date_util.adjust_fuzzy_years(show, self._parse_started)

    return show
示例#8
0
  def _parse_show(self, link):
    """Fetch the event page at `link` and build a Show from its '#content' element.

    The <h1> text is split by HEADER_PARSE into a date and a title; the title
    is parsed into performers. A date starting with 'tonight' is replaced by
    today's date. The EVENT_ID match in `link` becomes the merge key.

    Raises:
      Exception: when EVENT_ID does not match `link`, or HEADER_PARSE does
        not match the header text.
    """
    # NOTE(review): raw_url is never used below; the call is kept in case
    # self.raw_url has side effects - confirm and remove if it does not
    raw_url   = self.raw_url(link)

    match     = self.EVENT_ID.search(link)

    if not match:
      raise Exception("Unable to locate event id in: %s" % link)

    event_id = match.group(0)

    logging.debug('Fetching show info: %s' % link)

    event_doc = html_util.fetch_and_parse(link)

    show_el   = html_util.get_first_element(event_doc, '#content')

    header_el = html_util.get_first_element(show_el, 'h1')

    header_match = self.HEADER_PARSE.search(header_el.text_content())

    if not header_match:
      raise Exception("Unable to parse header: %s" % header_el.text_content())

    date_txt = header_match.group('date').strip()
    title    = header_match.group('title').strip()

    if date_txt.lower().startswith('tonight'):
      # isoformat() yields YYYY-MM-DD on every platform; strftime('%F') is a
      # C-library extension that is not available everywhere
      date_txt = datetime.today().date().isoformat()

    img   = html_util.get_first_element(show_el, 'img', optional = True)

    show = Show()

    show.performers = [Performer(p) for p in lang_util.parse_performers(title)]
    show.show_time  = date_util.parse_date_and_time(date_txt, None)

    show.merge_key = event_id
    show.venue     = self.venue()

    show.resources.show_url      = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(show_el)

    if img is not None:
      show.resources.image_url = img.get('src')

    return show
示例#9
0
  def _parse_show(self, show_url, event_detail, today):
    """Build a Show from one Union Hall event element.

    `show_url` and `today` are accepted but not read by this method. The
    first performer found is marked as the headliner. The show time is only
    set when both DATE_RE and TIME_RE match the '#unionhall_date' text.

    Raises:
      IndexError: when no non-anchor '#unionhall_performer' element exists.
    """
    show = Show()

    # Union hall will have duplicate instances of #unionhall_performer
    # some may or may not have links, but those that do have links are tagged
    # with the same id again ie: <div id="unionhall_performer"><a href="#" id="#unionhall_performer"> ...
    performers = [Performer(p.text_content()) for p in event_detail.cssselect("#unionhall_performer") if p.tag != 'a']

    performers[0].headliner = True

    ticket_link = html_util.get_first_element(event_detail, '#ticket_link a', optional = True)

    show.venue      = self.venue()
    show.performers = performers

    if ticket_link is not None:
      show.merge_key = ticket_link.get('href')

    # Format: THU 3/25: 6pm / $15
    date_tag = event_detail.get_element_by_id("unionhall_date")
    date_raw = date_tag.text_content()

    date_match = self.DATE_RE.match(date_raw)
    time_match = self.TIME_RE.search(date_raw)

    if date_match and time_match:
      month = int(date_match.group('month'))
      day   = int(date_match.group('day'))

      # The listing carries no year, so start from the current one
      show_date = datetime.now().replace(month = month, day = day)

      # '%Y-%m-%d' is the portable spelling of the platform-specific '%F'
      show.show_time = date_util.parse_date_and_time(show_date.strftime('%Y-%m-%d'), time_match.group('time'))

    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    # Only the first <img> in document order is used
    for img_tag in event_detail.iter(tag = 'img'):
      show.resources.image_url = img_tag.get('src')

      break

    date_util.adjust_fuzzy_years(show, self._parse_started)

    return show
示例#10
0
  def _parse_show(self, tds):
    """Extract the show time, performer text and resource URIs from a table row.

    `tds` must hold at least three cells: date, info and time, in that order.

    Returns:
      A (show_time, performers, resource_uris) tuple, where performers is the
      text of every <a> in the info cell joined by single spaces.

    Raises:
      Exception: when STRICT_TIME_RE does not match the time cell's text.
    """
    date_td, info_td, time_td = tds[0], tds[1], tds[2]

    date_txt   = date_td.text_content()
    time_match = date_util.STRICT_TIME_RE.search(time_td.text_content())

    if not time_match:
      raise Exception('Unable to determine time for show: %s - %s - %s' % (date_txt, time_td.text_content(), info_td.text_content()))

    time_txt = time_match.group('time')

    names = [anchor.text_content() for anchor in info_td.iter(tag = 'a')]

    show_time     = date_util.parse_date_and_time(date_txt, time_txt)
    resource_uris = self.resource_extractor.extract_resources(*tds)

    return (show_time, ' '.join(names), resource_uris)
示例#11
0
  def _parse_show(self, el):
    """Build a Show from one calendar element, or return None for recurring events.

    The '.calendardates' text (with its <span class="small"> children removed
    from the tree first) must have the form '<date>,<time>'. Performer names
    are the '/'-separated pieces of the '.calendar' text. The last <img>
    whose src matches IMAGE_RE becomes the image URL.

    Raises:
      ValueError: when the date text does not contain exactly one comma.
    """
    date_el = html_util.get_first_element(el, '.calendardates')

    # Snapshot the spans before removing any: mutating the tree while it is
    # being walked can make the walk skip or revisit nodes
    for span in list(date_el.iter(tag = 'span')):
      if span.get('class') == 'small':
        span.getparent().remove(span)

    date_txt = date_el.text_content().lower()

    # Skip recurring events
    if 'every' in date_txt:
      return None

    date_txt, time_txt = date_txt.split(',')

    title_el   = html_util.get_first_element(el, '.calendar')
    performers = [Performer(name) for name in title_el.text_content().split('/')]

    show = Show()

    show.venue      = self.venue()
    show.performers = performers
    show.show_time  = date_util.parse_date_and_time(date_txt, time_txt)

    show.resources.resource_uris = self.resource_extractor.extract_resources(el)

    for img in el.iter(tag = 'img'):
      src         = img.get('src')
      image_match = self.IMAGE_RE.search(src or '')

      logging.debug('image: %s - %s' % (src, image_match))

      # No break: a later matching image overrides an earlier one
      if image_match:
        show.resources.image_url = src

    date_util.adjust_fuzzy_years(show, self._parse_started)

    return show
示例#12
0
  def _parse_show(self, link):
    """Fetch the event page at `link` and build a Show from its '#detail' element.

    The first four <strong> elements are read in order as date, title, an
    unused one, and description. The time is searched with TIME_RE in the
    tail text of the element following the description <strong>; when that
    element or its tail is missing the time is left as None.

    Raises:
      StopIteration: when '#detail' holds fewer than four <strong> elements.
    """
    event_doc    = html_util.fetch_and_parse(link)

    event_detail = event_doc.get_element_by_id("detail")

    show = Show()

    strong_iter = event_detail.iter(tag = 'strong')

    # next() works on Python 2.6+ and 3.x, unlike the iterator's .next()
    date_tag  = next(strong_iter)
    title_tag = next(strong_iter)
    next(strong_iter)  # third <strong> is not used
    desc_tag  = next(strong_iter)

    date_txt = date_tag.text_content()

    # getnext() returns None when the description has no following sibling
    after_desc = desc_tag.getnext()
    tail_txt   = after_desc.tail if after_desc is not None else None

    time_match = self.TIME_RE.search(tail_txt) if tail_txt else None
    time_txt   = time_match.group('time') if time_match else None

    show.merge_key = link
    show.venue     = self.venue()
    show.title     = title_tag.text_content()
    show.show_time = date_util.parse_date_and_time(date_txt, time_txt)

    show.resources.show_url      = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    for img_tag in event_detail.iter(tag = 'img'):
      src = img_tag.get('src')

      # An <img> without src yields None, which cannot be substring-tested
      if src and 'main' in src:
        show.resources.image_url = src

        break

    return show
示例#13
0
  def _parse_show(self, date_txt, info_el):
    """Build a Show for `date_txt` from the displayed text of `info_el`.

    Every text line matching PERFORMER_RE yields a Performer with its own
    start time; the show time uses the time of the last matching line.
    Returns None when no line matches.
    """
    logger.debug('Parsing show in %s' % date_txt)

    performers    = []
    last_time_txt = None

    for line in html_util.get_displayed_text_content(info_el).split('\n'):
      found = self.PERFORMER_RE.match(line)

      if not found:
        continue

      last_time_txt = found.group('time')

      performers.append(Performer(found.group('performer'), start_time = last_time_txt))

    if not performers:
      return None

    show = Show()

    show.venue      = self.venue()
    show.performers = performers
    show.show_time  = date_util.parse_date_and_time(date_txt, last_time_txt)

    show.resources.resource_uris = self.resource_extractor.extract_resources(info_el)

    # Fontanas's stores the large image in an anchor tag; the last anchor
    # whose href matches IMAGE_RE is the one that ends up being used
    image_hrefs = [a.get('href') for a in info_el.iter(tag = 'a') if self.IMAGE_RE.search(a.get('href', ''))]

    if image_hrefs:
      show.resources.image_url = image_hrefs[-1]

    date_util.adjust_fuzzy_years(show, self._parse_started)

    return show
示例#14
0
  def _parse_show(self, event_detail):
    """Build a Show by classifying the displayed text of `event_detail` line by line.

    The first non-empty line is taken as the date text. Each later non-empty
    line is either a time line (STRICT_TIME_RE), a numbered line (NUM_RE), a
    bare performer name following a number/time without a name, or is logged
    as an unknown format. Always returns the Show.
    """
    show = Show()
    
    performers = []
    
    content  = html_util.get_displayed_text_content(event_detail).strip()
    date_txt = None

    # This flag is set up and down to allow either of the following to be processed:
    # 1st: Ava Luna
    # or
    # 1st:
    # Ava Luna
    had_num  = True
    
    logger.debug("Parsing show content: %s" % content)
    
    for line in content.split('\n'):
      if line:
        time_match = date_util.STRICT_TIME_RE.search(line)

        if not date_txt:
          # First non-empty line: remember it as the date text
          date_txt = line
        elif time_match:
          # A line holding a time sets the show time (a later one overrides it)
          show.show_time = date_util.parse_date_and_time(date_txt, time_match.group('time'))
          
          # Whatever is left after removing the time is treated as a performer name
          line = date_util.STRICT_TIME_RE.sub('', line).strip(': ')
          
          if line:
            performers.append(Performer(line))

            had_num = False
          else:
            # Nothing but the time on this line: the name may follow on the next line
            had_num = True
        elif self.NUM_RE.match(line):
          # Strip the NUM_RE prefix; the remainder, if any, is the performer name
          line = self.NUM_RE.sub('', line).strip()
          
          if line:
            performers.append(Performer(line))

            had_num = False
          else:
            # Prefix only: the name may follow on the next line
            had_num = True
        elif had_num:
          # Bare line right after a prefix-only (or time-only) line: performer name
          performers.append(Performer(line))
          had_num = False
        else:
          logger.error('Unknown line format: %s' % line)
            
    show.venue      = self.venue()
    show.performers = performers
    # NOTE(review): date_txt is still None here when content had no non-empty
    # line - confirm parse_date_and_time accepts that
    show.date       = date_util.parse_date_and_time(date_txt, None)
    
    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    # Only the first <img> in document order is used
    for img_tag in event_detail.iter(tag = 'img'):
      show.resources.image_url = img_tag.get('src')
      
      break

    date_util.adjust_fuzzy_years(show, self._parse_started)

    return show
示例#15
0
  def _parse_shows(self, entry):
    """Parse every show contained in a single feed entry.

    Entries published outside the current year are ignored, except those
    from November or December of the previous year. The entry's last
    'text/html' content item is cleaned, run through REPLACEMENTS, and split
    on SHOW_DIVIDER_RE; each piece is a header line (date plus time,
    separated by HEADER_SEP_RE) followed by a body listing the performers.

    Pieces without a recognisable time, or whose date/time cannot be parsed,
    are logged and skipped.

    Returns:
      A list of Show objects (possibly empty).
    """
    content = None
    shows   = []
    today   = datetime.now()

    entry_date = date_util.parse_date_time(entry.published)

    # Only parse shows for the current year, or at the tail end of last year.
    # The previous test (`year != today.year or ...`) rejected every entry
    # from last year, so the tail-end clause could never apply.
    is_current_year   = entry_date.year == today.year
    is_late_last_year = entry_date.year == today.year - 1 and entry_date.month > 10

    if not (is_current_year or is_late_last_year):
      return []

    # When several text/html items exist, the last one wins
    for item in entry.content:
      if item.type in ('text/html',):
        content = item.value

    if not content:
      logging.error('Unable to extract content from entry: %s' % entry.id)

      return []

    # This next part is technically pretty evil
    entry_doc = lxml.html.fromstring(content)

    tags = ['span', 'b', 'i', 'strong', 'em']

    cleaner = Cleaner(remove_tags = tags, links = False)

    clean_content = cleaner.clean_html(entry_doc)

    # FIXME patch lxml to handle this while calling text_content()
    # http://codespeak.net/pipermail/lxml-dev/2008-August/004009.html
    content_str = lxml.html.tostring(clean_content)

    for regexp, replacement in self.REPLACEMENTS:
      content_str = regexp.sub(replacement, content_str)

    for part in self.SHOW_DIVIDER_RE.split(content_str):
      part = part.strip(' \t\n')

      # First line is the header, the remainder is the body
      parts = part.split('\n')

      header = parts.pop(0)
      body   = '\n'.join(parts)

      header_parts = self.HEADER_SEP_RE.split(header)

      date_txt = header_parts.pop(0)
      time_txt = None

      # The first header fragment that contains a time is used
      for header_part in header_parts:
        if date_util.STRICT_TIME_RE.search(header_part):
          time_txt = date_util.sanitize_time(header_part)

          break

      if not time_txt:
        logging.error('Unable to find time in header: %s' % header)

        continue

      # For a range such as '8-10pm' keep only the start
      if '-' in time_txt:
        time_txt = time_txt.split('-')[0].strip()

      # Times without an am/pm suffix are taken to be pm
      if not(time_txt.endswith('am') or time_txt.endswith('pm')):
        time_txt = time_txt + 'pm'

      show_doc = lxml.html.fromstring(body)

      use_all         = False
      performer_parts = []
      all_parts       = []

      # Collect text up to the first image element
      for el in show_doc.iter():
        if self._is_img(el):
          break

        text = el.text or ''
        tail = el.tail or ''

        # Blank out any text or tail matched by a BODY_SKIP pattern
        for regexp in self.BODY_SKIP:
          if regexp.search(text):
            text = ''

          if regexp.search(tail):
            tail = ''

        for p in (text, tail):
          if p:
            all_parts.append(p)

        # Text outside of links, or anything but a plain separator after a
        # link, means the link texts alone do not describe the line-up
        if text and el.tag != 'a':
          use_all = True

        if el.tag == 'a' and tail.strip() not in(',', '&', 'w/', ''):
          use_all = True

        if el.tag == 'a':
          performer_parts.append(text)

      img_url = None

      # First <img> with a non-empty src
      for img in show_doc.iter(tag = 'img'):
        if img.get('src'):
          img_url = img.get('src')

          break

      show = Show()

      show.venue = self.venue()

      if use_all:
        performers_str  = ' '.join(all_parts).replace(' ,', ',').replace('  ', ' ')

        show.performers = [Performer(name) for name in lang_util.parse_performers(performers_str)]
      else:
        show.performers = [Performer(name) for name in performer_parts if name]

      try:
        show.show_time = date_util.parse_date_and_time(date_txt, time_txt)
      except Exception:
        # Best effort: log and skip this show, but let KeyboardInterrupt and
        # SystemExit propagate (a bare except would swallow them)
        logging.exception('Unable to parse: %s - %s' % (date_txt, time_txt))
        continue

      show.resources.image_url     = img_url
      show.resources.resource_uris = self.resource_extractor.extract_resources(show_doc)

      date_util.adjust_fuzzy_years(show, entry_date)

      shows.append(show)

    return shows