예제 #1
0
  def find_meeting(self, start_date=None, end_date=None):
    """
    Find meetings within a given time frame and add them to the meeting queue.

    start_date and end_date are formatted with strftime("%d.%m.%Y") into
    the overview URL, so both must be date/datetime-like objects. Despite
    the None defaults both are required.

    For every entry of the overview a Meeting is built from the entry's
    child elements, stored via self.db.save_meeting() and its numeric ID
    is added to self.meeting_queue.

    Returns None. Nothing is stored if the overview cannot be fetched or
    does not contain a meeting list.

    Raises ValueError if start_date or end_date is missing. Raises KeyError
    if an entry lacks one of the other expected fields.
    """
    if start_date is None or end_date is None:
      raise ValueError('find_meeting() requires both start_date and end_date')

    meeting_url = "%ssi010.asp?selfaction=ws&template=xyz&kaldatvon=%s&kaldatbis=%s" % (
      self.config.BASE_URL,
      start_date.strftime("%d.%m.%Y"),
      end_date.strftime("%d.%m.%Y"))
    logging.info("Getting meeting overview from %s", meeting_url)

    r = self.get_url(meeting_url)
    if not r:
      return

    # Non-ASCII characters become numeric character references so that the
    # parser does not depend on the encoding declared by the server.
    xml = r.text.encode('ascii', 'xmlcharrefreplace')
    # recover=True lets lxml continue on malformed markup
    parser = etree.XMLParser(recover=True)
    root = etree.fromstring(xml, parser=parser)

    # The meeting entries are the children of the second child of the root.
    if root is None or len(root) < 2:
      logging.warning("Meeting overview at %s does not contain a meeting list", meeting_url)
      return

    for item in root[1].iterchildren():
      # flatten the entry into a tag -> text mapping
      raw_meeting = {e.tag: e.text for e in item.iterchildren()}
      try:
        numeric_id = int(raw_meeting['silfdnr'])
      except (KeyError, TypeError, ValueError):
        # one malformed entry must not abort the import of all others
        logging.warning("Skipping meeting entry without numeric 'silfdnr' at %s", meeting_url)
        continue

      # The numeric ID serves as provisional identifier for the constructor;
      # it is replaced by the entry's name right below.
      meeting = Meeting(numeric_id=numeric_id, identifier=numeric_id)
      meeting.date_start = self.parse_date(raw_meeting['sisbvcs'])
      meeting.date_end = self.parse_date(raw_meeting['sisevcs'])
      meeting.identifier = raw_meeting['siname']
      meeting.original_url = "%sto010.asp?SILFDNR=%s&options=4" % (self.config.BASE_URL, raw_meeting['silfdnr'])
      meeting.title = raw_meeting['sitext']
      meeting.committee_name = raw_meeting['grname']
      meeting.description = raw_meeting['sitext']
      self.db.save_meeting(meeting)
      self.meeting_queue.add(meeting.numeric_id)
예제 #2
0
  def get_meeting(self, meeting_url=None, meeting_id=None):
    """
    Load meeting details for the given detail page URL or numeric ID.

    One of the two arguments is required; the other one is derived from it
    via the configured URL patterns. If both are given, meeting_id wins and
    the URL is rebuilt from it.

    The page is fetched and checked for known error pages. Title, detail
    fields, agenda items and attached documents are then collected on a
    Meeting, which is stored via self.db.save_meeting().

    Returns None. If the page cannot be fetched or is a known error page,
    this is logged and nothing is stored.

    Raises ValueError if neither argument is given, if the ID cannot be
    extracted from meeting_url, or if an agenda detail row has an unknown
    label. Raises TemplateError if expected page elements are not found.
    """
    # Derive whichever of meeting_id / meeting_url was not given
    if meeting_id is not None:
      meeting_url = self.urls['SESSION_DETAIL_PRINT_PATTERN'] % meeting_id
    elif meeting_url is not None:
      parsed = parse.search(self.urls['SESSION_DETAIL_PARSE_PATTERN'], meeting_url)
      if parsed is None:
        raise ValueError('Cannot extract meeting ID from URL %s' % meeting_url)
      meeting_id = parsed['meeting_id']
    else:
      raise ValueError('get_meeting() requires meeting_url or meeting_id')

    # %s instead of %d: the ID is not guaranteed to be an int here
    logging.info("Getting meeting (session) %s from %s", meeting_id, meeting_url)

    time.sleep(self.config.WAIT_TIME)
    response = self.get_url(meeting_url)
    if not response:
      return

    # forms for later document download
    mechanize_forms = mechanize.ParseResponse(response, backwards_compat=False)
    # seek(0) is necessary to reset response pointer.
    response.seek(0)
    html = response.read()
    # Replace non-breaking-space entities before parsing so that cell texts
    # compare equal to plain labels such as 'Zeit:'.
    # NOTE(review): the previous literal was an invisible whitespace
    # character; '&nbsp;' is assumed to be the intended needle - confirm.
    html = html.replace('&nbsp;', ' ')
    dom = etree.parse(StringIO(html), etree.HTMLParser())

    # check for page errors
    problem = self._meeting_page_problem(dom)
    if problem is not None:
      logging.info(problem, meeting_url)
      return

    meeting = Meeting(numeric_id=meeting_id)
    meeting.original_url = meeting_url

    # Session title
    try:
      meeting.title = dom.xpath(self.xpath['SESSION_DETAIL_TITLE'])[0].text
    except Exception:
      # any failure to locate the title is reported as a template problem
      logging.critical('Cannot find session title element using XPath SESSION_DETAIL_TITLE')
      raise TemplateError('Cannot find session title element using XPath SESSION_DETAIL_TITLE')

    # Committee link scraping (SESSION_DETAIL_COMMITTEE_LINK) is disabled.

    # Meeting identifier, date, address etc
    tds = dom.xpath(self.xpath['SESSION_DETAIL_IDENTIFIER_TD'])
    if len(tds) == 0:
      logging.critical('Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH at session ' + meeting_url)
      raise TemplateError('Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH at session ' + meeting_url)
    self._apply_meeting_detail_fields(meeting, tds)
    # Checked once all cells were seen, so the position of the 'Sitzung:'
    # cell within the table does not matter.
    if not hasattr(meeting, 'identifier'):
      logging.critical('Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD')
      raise TemplateError('Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD')

    # Agendaitems
    # IDs of documents seen so far, so that no document is recorded twice
    found_documents = set()
    rows = dom.xpath(self.xpath['SESSION_DETAIL_AGENDA_ROWS'])
    if len(rows) == 0:
      logging.critical('Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
      raise TemplateError('Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
    meeting.agendaitem = self._parse_agendaitems(rows, found_documents)

    # meeting-related documents
    documents = self._parse_meeting_documents(dom, mechanize_forms, found_documents)
    if documents:
      meeting.document = documents

    oid = self.db.save_meeting(meeting)
    logging.info("Meeting %s stored with _id %s", meeting_id, oid)

  def _meeting_page_problem(self, dom):
    """
    Return a log message template (with one %s for the URL) if the parsed
    page is a known error page, otherwise None.
    """
    headings = dom.xpath('//h1')
    page_title = headings[0].text if headings else None
    if page_title:
      if 'Fehlermeldung' in page_title:
        return "Page %s cannot be accessed due to server error"
      if 'Berechtigungsfehler' in page_title:
        return "Page %s cannot be accessed due to permissions"
    notices = dom.xpath('//h3[@class="smc_h3"]')
    notice = notices[0].text if notices else None
    if notice:
      notice = notice.strip()
      if 'Keine Daten gefunden' in notice:
        return "Page %s does not contain any agenda items"
      if 'Fehlercode: 1104' in notice:
        return "Page %s cannot be accessed due to permissions"
    return None

  def _apply_meeting_detail_fields(self, meeting, tds):
    """
    Copy the label/value pairs found in consecutive table cells onto the
    meeting (identifier, start, end, address, description).
    """
    cell_count = len(tds)
    for n in range(cell_count):
      try:
        tdcontent = tds[n].text.strip()
        nextcontent = tds[n + 1].text.strip()
      except (AttributeError, IndexError):
        # empty cell or last cell: not a label followed by a value
        continue
      if tdcontent == 'Sitzung:':
        meeting.identifier = nextcontent
      # 'Gremium:' is not read here; the committee name is presumably
      # scraped from the committee detail page instead.
      elif tdcontent == 'Datum:':
        # start/end are only set when a 'Zeit:' label follows the date
        if n + 2 < cell_count and tds[n + 2].text == 'Zeit:':
          start = nextcontent
          end = nextcontent
          time_text = tds[n + 3].text if n + 3 < cell_count else None
          if time_text is not None:
            times = time_text.replace(' Uhr', '').split('-')
            start = start + ' ' + times[0]
            if len(times) > 1:
              end = end + ' ' + times[1]
            else:
              # no end time given: the meeting ends when it starts
              end = start
          meeting.start = start
          meeting.end = end
      elif tdcontent == 'Raum:':
        meeting.address = " ".join(tds[n + 1].xpath('./text()'))
      elif tdcontent == 'Bezeichnung:':
        meeting.description = nextcontent

  def _parse_agendaitems(self, rows, found_documents):
    """
    Build the list of Agendaitem objects from the agenda table rows.

    IDs of documents linked from agenda rows are added to the
    found_documents set. Paper IDs are added to self.paper_queue if the
    scraper has one.
    """
    agendaitems = []
    agendaitem = None
    public = True
    for row in rows:
      fields = row.xpath('td')
      if not fields:
        # rows without data cells carry no agenda information
        continue
      row_id = row.get('id')
      # rows may lack a class attribute altogether
      row_classes = (row.get('class') or '').split(' ')
      if row_id is not None:
        # Agendaitem main row
        # first: save agendaitem from before
        if agendaitem:
          agendaitems.append(agendaitem)
        # create new agendaitem
        agendaitem = Agendaitem(numeric_id=int(row_id.rsplit('_', 1)[1]))
        # when there is an "updated" notice the number sits in an extra span
        number = fields[0].xpath('./text()') or fields[0].xpath('.//span/text()')
        if number:
          agendaitem.sequence_number = number[0]
        # in some ris this is a link, sometimes not. test both.
        title_parts = fields[1].xpath('./a/text()') or fields[1].xpath('./text()')
        if title_parts:
          agendaitem.title = "; ".join(title_parts)
        # ignore no agendaitem information
        if getattr(agendaitem, 'title', None) == 'keine Tagesordnungspunkte':
          agendaitem = None
          continue
        agendaitem.public = public
        # paper links
        papers = []
        for link in row.xpath(self.xpath['SESSION_DETAIL_AGENDA_ROWS_SUBMISSION_LINK']):
          href = link.get('href')
          if href is None:
            continue
          parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
          if parsed is not None:
            paper = Paper(numeric_id=int(parsed['paper_id']), identifier=link.text)
            papers.append(paper)
            # Add paper to paper queue
            if hasattr(self, 'paper_queue'):
              self.paper_queue.add(int(parsed['paper_id']))
        if papers:
          agendaitem.paper = papers
        # Note: we don't scrape agendaitem-related documents for now,
        # based on the assumption that they are all found via paper
        # detail pages. All we do here is collect their document IDs
        # in found_documents.
        # find links
        for link in row.xpath('.//a[contains(@href,"getfile.")]'):
          if link.xpath('.//img'):
            # ignore additional icon links
            continue
          file_link = self.config.BASE_URL + link.get('href')
          if 'id=' not in file_link:
            continue
          found_documents.add(file_link.split('id=')[1].split('&')[0])
        # find forms
        for form in row.xpath('.//form'):
          for hidden_field in form.xpath('input'):
            if hidden_field.get('name') == 'DT':
              found_documents.add(hidden_field.get('value'))
      # Alternative to smc_tophz because of version 4.3.5 bi (layout 3)
      elif ('smc_tophz' in row_classes) or (row.get('valign') == 'top' and row.get('debug') == '3'):
        # additional (optional) row for the current agendaitem
        if len(fields) < 3:
          continue
        label = fields[1].text
        value = fields[2].text
        if label is None or value is None:
          continue
        label = label.strip()
        value = value.strip()
        if label in ['Ergebnis:', 'Beschluss:', 'Beratungsergebnis:']:
          if value not in self.config.RESULT_STRINGS:
            logging.warning("String '%s' not found in configured RESULT_STRINGS", value)
          # NOTE(review): the raw string is stored even when RESULT_STRINGS
          # has a mapping for it (unchanged behaviour) - confirm intent.
          if agendaitem is not None:
            agendaitem.result = value
        elif label in ['Bemerkung:', 'Abstimmung:']:
          if agendaitem is not None:
            agendaitem.result_details = value
        else:
          logging.critical("Agendaitem info label '%s' is unknown", label)
          raise ValueError('Agendaitem info label "%s" is unknown' % label)
      elif 'smcrowh' in row_classes:
        # Subheading (public / nonpublic part)
        heading = fields[0].text
        if heading is not None and u"Nicht \u00f6ffentlich" in heading:
          public = False
    # the item collected last has no following main row that would save it
    if agendaitem:
      agendaitems.append(agendaitem)
    return agendaitems

  def _parse_meeting_documents(self, dom, mechanize_forms, found_documents):
    """
    Collect the documents attached to the meeting itself.

    Returns a list of {'relation': ..., 'document': ...} dicts. Documents
    whose ID is already in found_documents are skipped; new IDs are added.
    """
    documents = []
    for container in dom.xpath(self.xpath['SESSION_DETAIL_ATTACHMENTS']):
      classes = container.get('class')
      if classes is None:
        continue
      if self.xpath['SESSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] not in classes.split(' '):
        continue
      for row in container.xpath('.//tr'):
        forms = row.xpath('.//form')
        if not forms:
          # documents offered as plain download links
          for link in row.xpath('.//a'):
            # ignore additional pdf icon links
            if link.xpath('.//img'):
              continue
            href = link.get('href')
            if href is None:
              continue
            title = ' '.join(link.xpath('./text()')).strip()
            file_link = self.config.BASE_URL + href
            if 'id=' not in file_link:
              continue
            document_id = file_link.split('id=')[1].split('&')[0]
            if document_id in found_documents:
              continue
            document = Document(
              identifier=document_id,
              numeric_id=document_id,
              title=title,
              original_url=file_link)
            document = self.get_document_file(document=document, link=file_link)
            documents.append({'relation': self._document_relation(title), 'document': document})
            found_documents.add(document_id)
        else:
          # documents offered via form submit
          title = " ".join(row.xpath('./td/text()')).strip()
          for form in forms:
            for hidden_field in form.xpath('input'):
              if hidden_field.get('name') != 'DT':
                continue
              document_id = hidden_field.get('value')
              # make sure to add only those which aren't agendaitem-related
              if document_id in found_documents:
                continue
              document = Document(
                identifier=document_id,
                numeric_id=document_id,
                title=title
              )
              # Traversing the whole mechanize response to submit this form
              for mform in mechanize_forms:
                for control in mform.controls:
                  if control.name == 'DT' and control.value == document_id:
                    document = self.get_document_file(document, mform)
              documents.append({'relation': self._document_relation(title), 'document': document})
              found_documents.add(document_id)
    return documents

  def _document_relation(self, title):
    """
    Classify a meeting document by keywords in its title.
    """
    if 'Einladung' in title:
      return 'invitation'
    if 'Niederschrift' in title:
      return 'results_protocol'
    return 'misc'