def find_person(self):
    """Fetch the person overview and save every person it lists.

    Requests the ``kp041.asp`` XML export below the configured base URL,
    builds one ``Person`` per entry of the person list (second child of
    the document root) and hands it to ``self.db.save_person``. When the
    entry names a party, a single party ``Membership`` is attached. If the
    scraper has a ``person_queue`` attribute, the ids of persons that have
    a detail link are added to it.

    Returns:
        None. Returns early when ``self.get_url`` yields a falsy response
        or when the response does not contain a person list.

    Raises:
        KeyError: if an entry has no ``kplfdnr`` element.
        ValueError: if ``kplfdnr`` is empty or not an integer.
    """
    find_person_url = (
        self.config['scraper']['base_url'] +
        'kp041.asp?template=xyz&selfaction=ws&showAll=true&'
        'PALFDNRM=1&kpdatfil=&filtdatum=filter&kpname=&'
        'kpsonst=&kpampa=99999999&kpfr=99999999&'
        'kpamfr=99999999&kpau=99999999&kpamau=99999999&'
        'searchForm=true&search=Suchen')
    logging.info("Getting person overview from %s", find_person_url)

    r = self.get_url(find_person_url)
    if not r:
        return

    # recover=True makes lxml keep going on malformed markup; non-ASCII
    # characters are turned into numeric character references first.
    parser = etree.XMLParser(recover=True)
    xml = r.text.encode('ascii', 'xmlcharrefreplace')
    tree = etree.fromstring(xml, parser=parser)

    # element 0 is the special block, element 1 is the list of persons.
    # A recovering parser may hand back nothing usable, so check first.
    if tree is None or len(tree) < 2:
        logging.warning("Person overview at %s contains no person list",
                        find_person_url)
        return

    # XML tag -> Person attribute; copied only when the tag has a value.
    simple_fields = (
        ('link_kp', 'originalUrl'),
        # personal information
        ('adtit', 'title'),
        ('advname', 'firstname'),
        ('adname', 'lastname'),
        # address
        ('adstr', 'address'),
        ('adhnr', 'house_number'),
        ('adplz', 'postalcode'),
        # contact
        ('adtel', 'phone'),
        ('adtel2', 'mobile'),
        ('adfax', 'fax'),
        ('ademail', 'email'),
        ('adwww1', 'website'),
    )
    # Salutation text -> value stored in Person.sex.
    sex_by_salutation = {'Frau': 1, 'Herr': 2}

    h = HTMLParser.HTMLParser()
    for node in tree[1].iterchildren():
        # Flatten the entry into tag -> unescaped text ('' when empty).
        elem = {}
        for e in node.iterchildren():
            elem[e.tag] = h.unescape(e.text) if e.text else ''

        person = Person(originalId=int(elem['kplfdnr']))
        for tag, attribute in simple_fields:
            value = elem.get(tag)
            if value:
                setattr(person, attribute, value)

        sex = sex_by_salutation.get(elem.get('antext1'))
        if sex is not None:
            person.sex = sex

        person_party = elem.get('kppartei')
        if person_party:
            party_alias = self.config['scraper']['party_alias']
            if person_party in party_alias:
                person_party = party_alias[person_party]
            new_organization = Organization(originalId=person_party,
                                            name=person_party,
                                            classification='party')
            original_id = unicode(person.originalId) + '-' + person_party
            person.membership = [
                Membership(originalId=original_id,
                           organization=new_organization)
            ]

        # Empty text was stored as '' above, so the link has to be tested
        # for truthiness; an `is not None` test would always succeed.
        if elem.get('link_kp'):
            if hasattr(self, 'person_queue'):
                self.person_queue.add(person.originalId)
        else:
            logging.info("Person %s %s has no link",
                         getattr(person, 'firstname', None),
                         getattr(person, 'lastname', None))
        self.db.save_person(person)
def find_person(self):
    """Fetch the person overview and save every person it lists.

    Requests the ``kp041.asp`` XML export below ``self.config.BASE_URL``,
    builds one ``Person`` per entry of the person list (second child of
    the document root) and hands it to ``self.db.save_person``. When the
    entry names a party, a single party ``Committee`` is attached. If the
    scraper has a ``person_queue`` attribute, the ids of persons that have
    a detail link are added to it.

    Returns:
        None. Returns early when ``self.get_url`` yields a falsy response
        or when the response does not contain a person list.

    Raises:
        KeyError: if an entry has no ``kplfdnr`` element.
        TypeError, ValueError: if ``kplfdnr`` is empty or not an integer.
    """
    find_person_url = (
        self.config.BASE_URL +
        'kp041.asp?template=xyz&selfaction=ws&showAll=true&'
        'PALFDNRM=1&kpdatfil=&filtdatum=filter&kpname=&'
        'kpsonst=&kpampa=99999999&kpfr=99999999&'
        'kpamfr=99999999&kpau=99999999&kpamau=99999999&'
        'searchForm=true&search=Suchen')

    r = self.get_url(find_person_url)
    if not r:
        return

    # recover=True makes lxml keep going on malformed markup; non-ASCII
    # characters are turned into numeric character references first.
    parser = etree.XMLParser(recover=True)
    xml = r.text.encode('ascii', 'xmlcharrefreplace')
    tree = etree.fromstring(xml, parser=parser)

    # element 0 is the special block, element 1 is the list of persons.
    # A recovering parser may hand back nothing usable, so check first.
    if tree is None or len(tree) < 2:
        logging.warning("Person overview at %s contains no person list",
                        find_person_url)
        return

    # XML tag -> Person attribute; copied only when the tag has a value.
    simple_fields = (
        ('link_kp', 'original_url'),
        # personal information
        ('adtit', 'title'),
        ('advname', 'firstname'),
        ('adname', 'lastname'),
        # address
        ('adstr', 'address'),
        ('adhnr', 'house_number'),
        ('adplz', 'postalcode'),
        # contact
        ('adtel', 'phone'),
        ('adtel2', 'mobile'),
        ('adfax', 'fax'),
        ('ademail', 'email'),
        ('adwww1', 'website'),
    )
    # Salutation text -> value stored in Person.sex.
    sex_by_salutation = {'Frau': 1, 'Herr': 2}

    for node in tree[1].iterchildren():
        # Flatten the entry into tag -> raw text (None when empty).
        elem = {}
        for e in node.iterchildren():
            elem[e.tag] = e.text

        person = Person(numeric_id=int(elem['kplfdnr']),
                        identifier=elem['kplfdnr'])
        for tag, attribute in simple_fields:
            value = elem.get(tag)
            if value:
                setattr(person, attribute, value)

        sex = sex_by_salutation.get(elem.get('antext1'))
        if sex is not None:
            person.sex = sex

        person_party = elem.get('kppartei')
        if person_party:
            if person_party in self.config.PARTY_ALIAS:
                person_party = self.config.PARTY_ALIAS[person_party]
            party = Committee(identifier=person_party,
                              title=person_party,
                              type='party')
            person.committee = [{'committee': party}]

        # Text is stored unmodified here, so a missing link shows up as None.
        if elem.get('link_kp') is not None:
            if hasattr(self, 'person_queue'):
                self.person_queue.add(person.numeric_id)
        else:
            logging.info("Person %s %s has no link",
                         person.firstname, person.lastname)
        self.db.save_person(person)
def find_person(self):
    """Download the person overview and persist each person found in it.

    The ``kp041.asp`` XML export is requested relative to the configured
    scraper base URL. Every child of the document's second top-level
    element becomes a ``Person`` that is passed to
    ``self.db.save_person``. A party name, if present, is translated
    through the configured party aliases and attached as one
    ``Membership`` of a party ``Organization``. Persons with a detail
    link are added to ``self.person_queue`` when that attribute exists.

    Returns:
        None. Returns early when ``self.get_url`` yields a falsy response
        or when the response does not contain a person list.

    Raises:
        KeyError: if an entry has no ``kplfdnr`` element.
        ValueError: if ``kplfdnr`` is empty or not an integer.
    """
    find_person_url = (
        self.config['scraper']['base_url'] +
        'kp041.asp?template=xyz&selfaction=ws&showAll=true&'
        'PALFDNRM=1&kpdatfil=&filtdatum=filter&kpname=&'
        'kpsonst=&kpampa=99999999&kpfr=99999999&'
        'kpamfr=99999999&kpau=99999999&kpamau=99999999&'
        'searchForm=true&search=Suchen')
    logging.info("Getting person overview from %s", find_person_url)

    r = self.get_url(find_person_url)
    if not r:
        return

    # The parser runs in recovery mode so malformed markup does not abort
    # the run; non-ASCII text is sent through as character references.
    parser = etree.XMLParser(recover=True)
    xml = r.text.encode('ascii', 'xmlcharrefreplace')
    tree = etree.fromstring(xml, parser=parser)

    # element 0 is the special block, element 1 is the list of persons.
    # Guard against a response that did not yield both elements.
    if tree is None or len(tree) < 2:
        logging.warning("Person overview at %s contains no person list",
                        find_person_url)
        return

    # (XML tag, Person attribute) pairs copied whenever the tag is set.
    simple_fields = (
        ('link_kp', 'originalUrl'),
        # personal information
        ('adtit', 'title'),
        ('advname', 'firstname'),
        ('adname', 'lastname'),
        # address
        ('adstr', 'address'),
        ('adhnr', 'house_number'),
        ('adplz', 'postalcode'),
        # contact
        ('adtel', 'phone'),
        ('adtel2', 'mobile'),
        ('adfax', 'fax'),
        ('ademail', 'email'),
        ('adwww1', 'website'),
    )
    # Salutation text -> value stored in Person.sex.
    sex_by_salutation = {'Frau': 1, 'Herr': 2}

    h = HTMLParser.HTMLParser()
    for node in tree[1].iterchildren():
        # Collect the entry as tag -> unescaped text ('' when empty).
        elem = {}
        for e in node.iterchildren():
            elem[e.tag] = h.unescape(e.text) if e.text else ''

        person = Person(originalId=int(elem['kplfdnr']))
        for tag, attribute in simple_fields:
            value = elem.get(tag)
            if value:
                setattr(person, attribute, value)

        sex = sex_by_salutation.get(elem.get('antext1'))
        if sex is not None:
            person.sex = sex

        person_party = elem.get('kppartei')
        if person_party:
            party_alias = self.config['scraper']['party_alias']
            if person_party in party_alias:
                person_party = party_alias[person_party]
            new_organization = Organization(originalId=person_party,
                                            name=person_party,
                                            classification='party')
            original_id = unicode(person.originalId) + '-' + person_party
            person.membership = [
                Membership(originalId=original_id,
                           organization=new_organization)
            ]

        # Values are never None here (empty text becomes ''), so only a
        # truthiness test can tell a missing link from a present one.
        if elem.get('link_kp'):
            if hasattr(self, 'person_queue'):
                self.person_queue.add(person.originalId)
        else:
            logging.info("Person %s %s has no link",
                         getattr(person, 'firstname', None),
                         getattr(person, 'lastname', None))
        self.db.save_person(person)