Пример #1
0
 def find_person(self):
   """
   Load the person overview page and save every person listed on it.

   Each row matched by self.xpath['PERSONLIST_LINES'] that contains a link
   to a person detail page yields a Person whose numeric_id is taken from
   that link. The text of the first child element of the first cell becomes
   the person's title; the text of the second cell, if any, is mapped
   through self.config.PARTY_ALIAS and stored as a party membership.
   Every person is saved via self.db.save_person(); when the scraper has a
   person_queue attribute, the numeric ID is added to it as well.

   Returns None. Returns early if the overview page cannot be fetched.
   """
   user_overview_url = self.urls['PERSON_OVERVIEW_PRINT_PATTERN']
   logging.info("Getting user overview from %s", user_overview_url)

   # Be polite to the server: wait the configured time before every request.
   time.sleep(self.config.WAIT_TIME)
   response = self.get_url(user_overview_url)
   if not response:
     return

   # seek(0) is necessary to reset response pointer.
   response.seek(0)
   html = response.read()
   # Turn non-breaking-space entities into plain spaces before parsing so
   # that cell texts can be stripped and compared reliably.
   html = html.replace('&nbsp;', ' ')
   parser = etree.HTMLParser()
   dom = etree.parse(StringIO(html), parser)

   for tr in dom.xpath(self.xpath['PERSONLIST_LINES']):
     links = tr.xpath('.//a')
     if not len(links):
       continue
     href = links[0].get('href')
     if not href:
       # An anchor without target cannot identify a person.
       continue
     parsed = parse.search(self.urls['PERSON_DETAIL_PARSE_PATTERN'], href)
     if not parsed:
       parsed = parse.search(self.urls['PERSON_DETAIL_PARSE_PATTERN_ALT'], href)
     if not parsed:
       continue

     current_person = Person(numeric_id=parsed['person_id'])
     tds = tr.xpath('.//td')

     # Name: text of the first child element of the first cell.
     # .text is None for empty elements, hence the "or ''" fallback.
     if len(tds) and len(tds[0]):
       person_name = (tds[0][0].text or '').strip()
       if person_name:
         current_person.title = person_name

     # Party: plain text of the second cell, normalised through the aliases.
     if len(tds) > 1:
       person_party = (tds[1].text or '').strip()
       if person_party:
         if person_party in self.config.PARTY_ALIAS:
           person_party = self.config.PARTY_ALIAS[person_party]
         current_person.committee = [{'committee': Committee(identifier=person_party, title=person_party, type='party')}]

     if hasattr(self, 'person_queue'):
       self.person_queue.add(current_person.numeric_id)
     self.db.save_person(current_person)
   return
Пример #2
0
  def get_person_committee(self, person_id=None, committee_url=None):
    """
    Load the membership table of one person and store it with the person.

    The person detail page for person_id is fetched and the second table
    below the element with id "rismain_raw" is walked row by row. Header
    rows (first cell is a <th>) switch the current membership type; all
    other rows describe one membership which is collected as a dict with
    the keys 'committee' and, when available, 'start' and 'end'.
    The resulting list is assigned to Person.committee and the person is
    saved via self.db.save_person().

    committee_url is accepted for interface compatibility but not used.

    Returns None. Returns early if the page cannot be fetched or does not
    contain the membership table.
    """
    url = "%skp020.asp?KPLFDNR=%s&history=true" % (self.config.BASE_URL, person_id)
    response = self.get_url(url)
    # Check the response, not the URL: the URL string is always truthy.
    if not response:
      return
    tree = html.fromstring(response.text)

    committees = []
    person = Person(numeric_id=person_id)
    # maps name of type to form name and membership type
    type_map = {
      u'Rat der Stadt' : {'mtype' : 'parliament', 'field' : 'PALFDNR'},
      u'Fraktion' : {'mtype' : 'organisation', 'field' : 'FRLFDNR'},
      u'Ausschüsse' : {'mtype' : 'committee', 'field' : 'AULFDNR'},
      'Stadtbezirk': {'mtype' : 'parliament', 'field' : 'PALFDNR'},
      'BVV': {'mtype' : 'parliament', 'field' : 'PALFDNR'}
    }

    # obtain the table with the membership list via a simple state machine
    mtype = "parliament"
    field = 'PALFDNR'
    old_group_id = None         # last group id seen in a form, for logging
    old_group_name = None       # group name belonging to old_group_id
    tables = tree.xpath('//*[@id="rismain_raw"]/table[2]')
    if not tables:
      logging.error("No membership table found at person detail page %s", url)
      return

    for line in tables[0].findall("tr"):
      if not len(line):
        # row without cells, nothing to read
        continue
      if line[0].tag == "th":
        # header row: switch the state for all following rows
        what = (line[0].text or '').strip()
        if what not in type_map:
          logging.error("Unknown committee type %s at person detail page %s", what, person_id)
          continue
        mtype = type_map[what]['mtype']
        field = type_map[what]['field']
        continue

      if "Keine Information" in line.text_content():
        # skip because no content is available
        continue

      membership = {}

      # first get the name of group
      group_name = line[1].text_content()
      committee = Committee(identifier=group_name)
      committee.type = mtype

      # The first column may hold a form carrying the numeric group id.
      form = line[0].find("form")
      if form is not None:
        group_id = int(form.find("input[@name='%s']" % field).get("value"))
        committee.numeric_id = group_id
        old_group_id = group_id # remember it for next loop
        old_group_name = group_name # remember it for next loop
      elif old_group_name != group_name:
        # No form in this row and the name changed as well: the committee
        # is stored without numeric id, leave a trace for debugging.
        logging.debug("Group name differs but we didn't get a form with new group id: group name=%s, old group name=%s, group_id=%s at url %s", group_name, old_group_name, old_group_id, url)

      # TODO: create a list of functions (third column) so we can index them somehow
      raw_date = line[3].text_content()

      # parse the date information
      if "seit" in raw_date:
        # "seit <date>" is German for "since <date>": the membership is
        # still running, so the date is its start and there is no end.
        membership['start'] = raw_date.split()[-1]
      elif "Keine" not in raw_date:
        # "<start> ... <end>"; an empty cell carries no date at all
        dparts = raw_date.split()
        if dparts:
          membership['start'] = dparts[0]
          membership['end'] = dparts[-1]

      membership['committee'] = committee
      committees.append(membership)

    person.committee = committees
    self.db.save_person(person)
Пример #3
0
 def find_person(self):
   """
   Fetch the XML person list and save every person contained in it.

   The search URL below BASE_URL is requested with all filters open. The
   second child of the XML root is expected to hold one node per person;
   the child tags of such a node are copied onto a Person object (name,
   address, contact data, party). Every person is saved via
   self.db.save_person(); persons that have a 'link_kp' value are also
   added to self.person_queue when the scraper has one.

   Returns None. Returns early if the list cannot be fetched or parsed.
   """
   find_person_url = self.config.BASE_URL + 'kp041.asp?template=xyz&selfaction=ws&showAll=true&PALFDNRM=1&kpdatfil=&filtdatum=filter&kpname=&kpsonst=&kpampa=99999999&kpfr=99999999&kpamfr=99999999&kpau=99999999&kpamau=99999999&searchForm=true&search=Suchen'

   r = self.get_url(find_person_url)
   if not r:
     return
   # Non-ASCII characters are turned into character references so the
   # recovering parser receives plain ASCII bytes.
   xml = r.text.encode('ascii','xmlcharrefreplace')
   parser = etree.XMLParser(recover=True)
   tree = etree.fromstring(xml, parser=parser)

   # element 0 is the special block
   # element 1 is the list of persons
   if tree is None or len(tree) < 2:
     logging.error("Unexpected person list structure at %s", find_person_url)
     return

   # XML tag -> Person attribute, copied whenever the tag has a value
   simple_fields = (
     ('link_kp', 'original_url'),
     # personal information
     ('adtit', 'title'),
     ('advname', 'firstname'),
     ('adname', 'lastname'),
     # address
     ('adstr', 'address'),
     ('adhnr', 'house_number'),
     ('adplz', 'postalcode'),
     # contact
     ('adtel', 'phone'),
     ('adtel2', 'mobile'),
     ('adfax', 'fax'),
     ('ademail', 'email'),
     ('adwww1', 'website'),
   )
   # salutation -> value stored in Person.sex
   sex_by_salutation = {'Frau': 1, 'Herr': 2}

   for node in tree[1].iterchildren():
     elem = {}
     for e in node.iterchildren():
       elem[e.tag] = e.text

     if not elem.get('kplfdnr'):
       # without the numeric id the person cannot be identified
       logging.error("Skipping person entry without kplfdnr at %s", find_person_url)
       continue

     # now retrieve person details such as committee memberships etc.
     # we also get the age (but only that, no date of birth)
     person = Person(numeric_id=int(elem['kplfdnr']), identifier=elem['kplfdnr'])

     # .get() keeps records usable that lack some of the optional tags
     for tag, attribute in simple_fields:
       value = elem.get(tag)
       if value:
         setattr(person, attribute, value)

     salutation = elem.get('antext1')
     if salutation in sex_by_salutation:
       person.sex = sex_by_salutation[salutation]

     person_party = elem.get('kppartei')
     if person_party:
       if person_party in self.config.PARTY_ALIAS:
         person_party = self.config.PARTY_ALIAS[person_party]
       person.committee = [{'committee': Committee(identifier=person_party, title=person_party, type='party')}]

     if elem.get('link_kp') is not None:
       if hasattr(self, 'person_queue'):
         self.person_queue.add(person.numeric_id)
     else:
       logging.info("Person %s %s has no link", person.firstname, person.lastname)
     self.db.save_person(person)
Пример #4
0
 def get_person_committee(self, person_committee_url=None, person_id=None):
   """
   Load the committee memberships of one person and store them.

   Either person_id or person_committee_url has to be given; the missing
   one is derived from the other with
   self.urls['PERSON_COMMITTEE_PRINT_PATTERN']. If both are given,
   person_id wins and the URL is rebuilt from it.

   Rows matched by self.xpath['PERSON_COMMITTEE_LINES'] with two or five
   cells are evaluated. The first cell names the committee (optionally as
   a link carrying its numeric ID); rows with five cells additionally
   provide position, start and end in cells three to five. The memberships
   are assigned to Person.committee and the person is saved via
   self.db.save_person().

   Returns None. Returns early if no person can be determined or the page
   cannot be fetched.
   """
   # Derive the URL from the ID or the ID from the URL
   if person_id is not None:
     person_committee_url = self.urls['PERSON_COMMITTEE_PRINT_PATTERN'] % person_id
   elif person_committee_url is not None:
     parsed = parse.search(self.urls['PERSON_COMMITTEE_PRINT_PATTERN'], person_committee_url)
     if parsed is None:
       logging.error("Could not extract person id from %s", person_committee_url)
       return
     person_id = parsed['person_id']
   else:
     logging.error("get_person_committee needs a person_id or a person_committee_url")
     return

   # %s instead of %d: an ID parsed out of the URL may be a string
   logging.info("Getting meeting (committee) %s from %s", person_id, person_committee_url)

   person = Person(numeric_id=person_id)

   # Be polite to the server: wait the configured time before every request.
   time.sleep(self.config.WAIT_TIME)
   response = self.get_url(person_committee_url)
   if not response:
     return

   # seek(0) is necessary to reset response pointer.
   response.seek(0)
   html = response.read()
   # Turn non-breaking-space entities into plain spaces before parsing so
   # that cell texts can be compared reliably.
   html = html.replace('&nbsp;', ' ')
   parser = etree.HTMLParser()
   dom = etree.parse(StringIO(html), parser)

   committees = []
   for tr in dom.xpath(self.xpath['PERSON_COMMITTEE_LINES']):
     tds = tr.xpath('.//td')
     if len(tds) not in (2, 5):
       continue
     # five cells: committee, ?, position, start, end
     long_info = len(tds) == 5

     new_committee = None
     anchors = tds[0].xpath('.//a')
     if anchors:
       anchor = anchors[0]
       href = anchor.get('href') or ''
       href_tmp = href.split('&')
       # delete __cgrname when it's there
       if len(href_tmp) == 2 and href_tmp[1].startswith('__cgrname='):
         href = href_tmp[0]
       parsed = parse.search(self.urls['COMMITTEE_DETAIL_PARSE_PATTERN'], href)
       if not parsed:
         parsed = parse.search(self.urls['COMMITTEE_DETAIL_PARSE_PATTERN_FULL'], href)
       if parsed is not None:
         committee = Committee(numeric_id=int(parsed['committee_id']))
         committee.identifier = anchor.text
         committee.title = anchor.text
         new_committee = {'committee': committee}
     else:
       # no link: only the committee name is known
       new_committee = {'committee': Committee(identifier=tds[0].text)}

     if not new_committee:
       # a link was present but did not match any known committee URL
       logging.error("Bad Table Structure in %s", person_committee_url)
       continue

     if long_info:
       new_committee['position'] = tds[2].text
       if tds[3].text:
         new_committee['start'] = tds[3].text
       if tds[4].text:
         new_committee['end'] = tds[4].text
     committees.append(new_committee)

   if committees:
     person.committee = committees
   oid = self.db.save_person(person)
   logging.info("Person %s stored with _id %s", person_id, oid)
   return