def find_person(self):
    """
    Scrape the person overview page and store every person found on it.

    The page at ``self.urls['PERSON_OVERVIEW_PRINT_PATTERN']`` is fetched and
    the rows selected by ``self.xpath['PERSONLIST_LINES']`` are walked. A row
    yields a ``Person`` only if the href of its first link matches
    ``PERSON_DETAIL_PARSE_PATTERN`` or ``PERSON_DETAIL_PARSE_PATTERN_ALT``.
    The person's title is read from the first child element of the first
    cell, the party (mapped through ``self.config.PARTY_ALIAS`` when listed
    there) from the text of the second cell. Each person is saved through
    ``self.db.save_person``; when the scraper has a ``person_queue``
    attribute the numeric ID is added to it first.

    Returns:
        None. Nothing is stored when the overview page cannot be fetched.
    """
    user_overview_url = self.urls['PERSON_OVERVIEW_PRINT_PATTERN']
    logging.info("Getting user overview from %s", user_overview_url)

    time.sleep(self.config.WAIT_TIME)
    response = self.get_url(user_overview_url)
    if not response:
        return

    # seek(0) is necessary to reset response pointer.
    response.seek(0)
    markup = response.read()
    # NOTE(review): as visible here this swaps one whitespace character for a
    # plain space; the first argument is presumably a non-breaking space (or
    # was an HTML entity before this file got reformatted) -- confirm against
    # the repository copy.
    markup = markup.replace(' ', ' ')
    dom = etree.parse(StringIO(markup), etree.HTMLParser())

    for tr in dom.xpath(self.xpath['PERSONLIST_LINES']):
        links = tr.xpath('.//a')
        if not links:
            continue
        href = links[0].get('href')
        if not href:
            # An anchor without href cannot identify a person.
            continue

        parsed = parse.search(self.urls['PERSON_DETAIL_PARSE_PATTERN'], href)
        if not parsed:
            parsed = parse.search(
                self.urls['PERSON_DETAIL_PARSE_PATTERN_ALT'], href)
        if not parsed:
            continue

        current_person = Person(numeric_id=parsed['person_id'])

        tds = tr.xpath('.//td')
        if len(tds) and len(tds[0]):
            # .text is None for an element without leading text, so fall
            # back to '' instead of crashing on None.strip().
            person_name = (tds[0][0].text or '').strip()
            if person_name:
                current_person.title = person_name
        if len(tds) > 1:
            person_party = (tds[1].text or '').strip()
            if person_party:
                if person_party in self.config.PARTY_ALIAS:
                    person_party = self.config.PARTY_ALIAS[person_party]
                current_person.committee = [{
                    'committee': Committee(identifier=person_party,
                                           title=person_party,
                                           type='party'),
                }]

        if hasattr(self, 'person_queue'):
            self.person_queue.add(current_person.numeric_id)
        self.db.save_person(current_person)
def get_person_committee(self, person_id=None, committee_url=None):
    """
    Scrape the membership history of one person and store it.

    Fetches ``kp020.asp`` for ``person_id`` below ``self.config.BASE_URL``,
    reads the second table inside the element with id ``rismain_raw`` and
    turns every data row into a membership dict with the key ``committee``
    and, when the date column provides them, ``start`` and ``end``. The
    resulting list is attached to a fresh ``Person`` which is saved through
    ``self.db.save_person``.

    Args:
        person_id: Numeric ID of the person, inserted into the request URL.
        committee_url: Not used by this implementation; kept so existing
            callers keep working.

    Returns:
        None. Nothing is stored when the page cannot be fetched or the
        membership table is missing.
    """
    url = "%skp020.asp?KPLFDNR=%s&history=true" % (self.config.BASE_URL, person_id)
    response = self.get_url(url)
    # The URL string is always truthy, so the failure check has to look at
    # the response itself.
    if not response:
        return

    tree = html.fromstring(response.text)

    # maps name of type to form name and membership type
    type_map = {
        u'Rat der Stadt' : {'mtype' : 'parliament', 'field' : 'PALFDNR'},
        u'Fraktion' : {'mtype' : 'organisation', 'field' : 'FRLFDNR'},
        u'Ausschüsse' : {'mtype' : 'committee', 'field' : 'AULFDNR'},
        'Stadtbezirk': {'mtype' : 'parliament', 'field' : 'PALFDNR'},
        'BVV': {'mtype' : 'parliament', 'field' : 'PALFDNR'}
    }

    tables = tree.xpath('//*[@id="rismain_raw"]/table[2]')
    if not tables:
        logging.error("No membership table found at %s", url)
        return

    # Simple state machine: a header row (th) switches the current
    # membership type, every following data row belongs to that type.
    mtype = "parliament"
    field = 'PALFDNR'
    old_group_id = None    # id taken from the most recent form
    old_group_name = None  # group name that went with that form
    committees = []

    for line in tables[0].findall("tr"):
        if not len(line):
            # a row without cells carries no information
            continue

        if line[0].tag == "th":
            what = (line[0].text or '').strip()
            if what not in type_map:
                logging.error("Unknown committee type %s at person detail page %s", what, person_id)
                continue
            mtype = type_map[what]['mtype']
            field = type_map[what]['field']
            continue

        if "Keine Information" in line.text_content():
            # skip because no content is available
            continue

        # first get the name of group
        group_name = line[1].text_content()
        committee = Committee(identifier=group_name)
        committee.type = mtype

        # The first column may hold a form carrying the group id. Rows
        # without a form are assumed to continue the previous group, but
        # the name is still compared so a mismatch shows up in the log.
        form = line[0].find("form")
        if form is not None:
            old_group_id = int(form.find("input[@name='%s']" % field).get("value"))
            committee.numeric_id = old_group_id
            old_group_name = group_name
        elif old_group_name != group_name:
            logging.debug("Group name differs but we didn't get a form with new group id: group name=%s, old group name=%s, group_id=%s at url %s", group_name, old_group_name, old_group_id, url)

        # TODO: create a list of functions (third column) so we can index
        # them somehow

        # parse the date information
        membership = {}
        raw_date = line[3].text_content()
        dparts = raw_date.split()
        if "seit" in raw_date:
            # "seit <date>" is German for "since <date>": the membership
            # is still running, so the date is its start, not its end.
            membership['start'] = dparts[-1]
        elif "Keine" not in raw_date and dparts:
            membership['start'] = dparts[0]
            membership['end'] = dparts[-1]
        # otherwise no date information is available

        membership['committee'] = committee
        committees.append(membership)

    person = Person(numeric_id=person_id)
    person.committee = committees
    self.db.save_person(person)
def find_person(self):
    """
    Fetch the XML person list and store every person it contains.

    Requests ``kp041.asp`` below ``self.config.BASE_URL``, parses the answer
    as XML and iterates over the children of the second top-level element.
    Each child is flattened into a tag -> text dict from which a ``Person``
    is filled (name, address, contact data, sex, party) and saved through
    ``self.db.save_person``. When the entry has a ``link_kp`` value and the
    scraper has a ``person_queue`` attribute, the numeric ID is queued.

    Returns:
        None. Nothing is stored when the list cannot be fetched or does not
        have the expected two top-level elements.
    """
    find_person_url = self.config.BASE_URL + 'kp041.asp?template=xyz&selfaction=ws&showAll=true&PALFDNRM=1&kpdatfil=&filtdatum=filter&kpname=&kpsonst=&kpampa=99999999&kpfr=99999999&kpamfr=99999999&kpau=99999999&kpamau=99999999&searchForm=true&search=Suchen'
    r = self.get_url(find_person_url)
    if not r:
        return

    # Non-ASCII characters are turned into XML character references so the
    # parser is handed plain ASCII.
    xml = r.text.encode('ascii', 'xmlcharrefreplace')
    parser = etree.XMLParser(recover=True)
    tree = etree.fromstring(xml, parser=parser)

    # element 0 is the special block
    # element 1 is the list of persons
    if tree is None or len(tree) < 2:
        logging.error("Unexpected person list structure at %s", find_person_url)
        return

    # (XML tag, Person attribute) pairs that are copied verbatim whenever
    # the tag is present and has text.
    field_map = (
        ('link_kp', 'original_url'),
        # personal information
        ('adtit', 'title'),
        ('advname', 'firstname'),
        ('adname', 'lastname'),
        # address
        ('adstr', 'address'),
        ('adhnr', 'house_number'),
        ('adplz', 'postalcode'),
        # contact
        ('adtel', 'phone'),
        ('adtel2', 'mobile'),
        ('adfax', 'fax'),
        ('ademail', 'email'),
        ('adwww1', 'website'),
    )
    sex_codes = {'Frau': 1, 'Herr': 2}

    for node in tree[1].iterchildren():
        elem = dict((e.tag, e.text) for e in node.iterchildren())

        person = Person(numeric_id=int(elem['kplfdnr']), identifier=elem['kplfdnr'])

        # .get() keeps one entry with a missing optional tag from aborting
        # the whole import with a KeyError.
        for tag, attribute in field_map:
            value = elem.get(tag)
            if value:
                setattr(person, attribute, value)

        salutation = elem.get('antext1')
        if salutation in sex_codes:
            person.sex = sex_codes[salutation]

        person_party = elem.get('kppartei')
        if person_party:
            if person_party in self.config.PARTY_ALIAS:
                person_party = self.config.PARTY_ALIAS[person_party]
            person.committee = [{
                'committee': Committee(identifier=person_party,
                                       title=person_party,
                                       type='party'),
            }]

        if elem.get('link_kp') is not None:
            if hasattr(self, 'person_queue'):
                self.person_queue.add(person.numeric_id)
        else:
            # Log from the parsed data: the name attributes are only set on
            # the person when the corresponding tags had text.
            logging.info("Person %s %s has no link", elem.get('advname'), elem.get('adname'))

        self.db.save_person(person)
def get_person_committee(self, person_committee_url=None, person_id=None):
    """
    Load the committee memberships of one person and store them.

    Either argument identifies the person; the other one is derived from it
    with ``self.urls['PERSON_COMMITTEE_PRINT_PATTERN']``. The page is
    fetched, the rows selected by ``self.xpath['PERSON_COMMITTEE_LINES']``
    are turned into membership dicts (``committee`` plus, for five-column
    rows, ``position`` and optional ``start`` / ``end``) and the person is
    saved through ``self.db.save_person``.

    Args:
        person_committee_url: URL of the person's committee page.
        person_id: Numeric ID of the person; takes precedence over the URL.

    Returns:
        None. Nothing is stored when neither argument is given, the ID
        cannot be extracted from the URL, or the page cannot be fetched.
    """
    # Read either person_id or person_committee_url from the opposite
    if person_id is not None:
        person_committee_url = self.urls['PERSON_COMMITTEE_PRINT_PATTERN'] % person_id
    elif person_committee_url is not None:
        parsed = parse.search(self.urls['PERSON_COMMITTEE_PRINT_PATTERN'], person_committee_url)
        if parsed is None:
            logging.error("Cannot extract person id from %s", person_committee_url)
            return
        person_id = parsed['person_id']
    else:
        logging.error("get_person_committee needs person_id or person_committee_url")
        return

    # %s instead of %d: an ID extracted from a URL may be a string.
    logging.info("Getting meeting (committee) %s from %s", person_id, person_committee_url)

    person = Person(numeric_id=person_id)

    time.sleep(self.config.WAIT_TIME)
    response = self.get_url(person_committee_url)
    if not response:
        return

    # seek(0) is necessary to reset response pointer.
    response.seek(0)
    markup = response.read()
    # NOTE(review): as visible here this swaps one whitespace character for a
    # plain space; the first argument is presumably a non-breaking space (or
    # was an HTML entity before this file got reformatted) -- confirm against
    # the repository copy.
    markup = markup.replace(' ', ' ')
    dom = etree.parse(StringIO(markup), etree.HTMLParser())

    committees = []
    for tr in dom.xpath(self.xpath['PERSON_COMMITTEE_LINES']):
        new_committee = None
        tds = tr.xpath('.//td')
        long_info = len(tds) == 5

        if long_info or len(tds) == 2:
            first_cell = tds[0]
            if first_cell.xpath('.//a'):
                href = first_cell[0].get('href') or ''
                # delete __cgrname when it's there
                href_parts = href.split('&')
                if len(href_parts) == 2 and href_parts[1].startswith('__cgrname='):
                    href = href_parts[0]
                parsed = parse.search(self.urls['COMMITTEE_DETAIL_PARSE_PATTERN'], href)
                if not parsed:
                    parsed = parse.search(self.urls['COMMITTEE_DETAIL_PARSE_PATTERN_FULL'], href)
                if parsed is not None:
                    committee = Committee(numeric_id=int(parsed['committee_id']))
                    committee.identifier = first_cell[0].text
                    committee.title = first_cell[0].text
                    new_committee = {'committee': committee}
            else:
                # no link in the cell: only its text identifies the committee
                new_committee = {'committee': Committee(identifier=first_cell.text)}

            if new_committee and long_info:
                new_committee['position'] = tds[2].text
                if tds[3].text:
                    new_committee['start'] = tds[3].text
                if tds[4].text:
                    new_committee['end'] = tds[4].text

        if not new_committee:
            # Row had an unexpected number of cells or its link could not
            # be parsed.
            logging.error("Bad Table Structure in %s", person_committee_url)
            continue
        committees.append(new_committee)

    if committees:
        person.committee = committees
    oid = self.db.save_person(person)
    logging.info("Person %s stored with _id %s", person_id, oid)