Пример #1
0
    def find_person(self):
        """Fetch the person overview and store one ``Person`` per entry.

        The overview is requested from the ``kp041.asp`` endpoint below
        ``self.config['scraper']['base_url']`` and parsed as XML in
        recovery mode.  Every child of the second top-level element is
        turned into a ``Person`` (plus a party ``Membership`` when a party
        is given) and handed to ``self.db.save_person``.  Persons that
        carry a ``link_kp`` value are also added to ``self.person_queue``
        if the instance has such an attribute.

        Returns:
            None.  The method returns early when the request yields no
            response or the response contains no person list.
        """
        find_person_url = (self.config['scraper']['base_url'] +
                           'kp041.asp?template=xyz&selfaction=ws&showAll=true&'
                           'PALFDNRM=1&kpdatfil=&filtdatum=filter&kpname=&'
                           'kpsonst=&kpampa=99999999&kpfr=99999999&'
                           'kpamfr=99999999&kpau=99999999&kpamau=99999999&'
                           'searchForm=true&search=Suchen')
        logging.info("Getting person overview from %s", find_person_url)

        r = self.get_url(find_person_url)
        if not r:
            return

        # Non-ASCII characters are turned into numeric character references
        # so that the parser only ever sees plain ASCII bytes.
        xml = r.text.encode('ascii', 'xmlcharrefreplace')
        parser = etree.XMLParser(recover=True)
        tree = etree.fromstring(xml, parser=parser)

        # A recovering parser may produce no root element (or a truncated
        # one) for badly broken input; indexing tree[1] would then fail.
        if tree is None or len(tree) < 2:
            logging.warning("No person list found at %s", find_person_url)
            return

        h = HTMLParser.HTMLParser()

        # (XML tag, Person attribute) pairs that are copied verbatim
        # whenever the tag carries a non-empty value.
        simple_fields = (
            ('link_kp', 'originalUrl'),
            # personal information
            ('adtit', 'title'),
            ('advname', 'firstname'),
            ('adname', 'lastname'),
            # address
            ('adstr', 'address'),
            ('adhnr', 'house_number'),
            ('adplz', 'postalcode'),
            # contact
            ('adtel', 'phone'),
            ('adtel2', 'mobile'),
            ('adfax', 'fax'),
            ('ademail', 'email'),
            ('adwww1', 'website'),
        )

        # element 0 is the special block
        # element 1 is the list of persons
        for node in tree[1].iterchildren():
            # Flatten the entry into a tag -> text mapping.  Missing text is
            # normalised to '' so every check below is a truthiness test.
            elem = {}
            for e in node.iterchildren():
                elem[e.tag] = h.unescape(e.text) if e.text else ''

            person = Person(originalId=int(elem['kplfdnr']))
            for tag, attr in simple_fields:
                if elem[tag]:
                    setattr(person, attr, elem[tag])

            # The form of address is mapped onto a numeric sex code.
            if elem['antext1'] == 'Frau':
                person.sex = 1
            elif elem['antext1'] == 'Herr':
                person.sex = 2

            person_party = elem['kppartei']
            if person_party:
                # Replace the party name by its configured alias, if any.
                if person_party in self.config['scraper']['party_alias']:
                    person_party = self.config['scraper']['party_alias'][
                        person_party]
                new_organization = Organization(originalId=person_party,
                                                name=person_party,
                                                classification='party')
                original_id = unicode(person.originalId) + '-' + person_party
                person.membership = [
                    Membership(originalId=original_id,
                               organization=new_organization)
                ]

            # elem values are never None (see normalisation above), so the
            # link has to be tested for emptiness rather than identity.
            if elem['link_kp']:
                if hasattr(self, 'person_queue'):
                    self.person_queue.add(person.originalId)
            else:
                # Names are taken from elem because the Person attributes
                # are only assigned when the corresponding tag is non-empty.
                logging.info("Person %s %s has no link", elem['advname'],
                             elem['adname'])
            self.db.save_person(person)
Пример #2
0
 def find_person(self):
   """Fetch the person overview and store one ``Person`` per entry.

   The overview is requested from the ``kp041.asp`` endpoint below
   ``self.config.BASE_URL`` and parsed as XML in recovery mode.  Every
   child of the second top-level element is turned into a ``Person``
   (with a party ``Committee`` when a party is given) and handed to
   ``self.db.save_person``.  Persons whose ``link_kp`` element has text
   are also added to ``self.person_queue`` if the instance has such an
   attribute.

   Returns:
     None.  The method returns early when the request yields no response
     or the response contains no person list.
   """
   find_person_url = (self.config.BASE_URL +
                      'kp041.asp?template=xyz&selfaction=ws&showAll=true&'
                      'PALFDNRM=1&kpdatfil=&filtdatum=filter&kpname=&'
                      'kpsonst=&kpampa=99999999&kpfr=99999999&'
                      'kpamfr=99999999&kpau=99999999&kpamau=99999999&'
                      'searchForm=true&search=Suchen')

   r = self.get_url(find_person_url)
   if not r:
     return

   # Non-ASCII characters are turned into numeric character references
   # so that the parser only ever sees plain ASCII bytes.
   xml = r.text.encode('ascii', 'xmlcharrefreplace')
   parser = etree.XMLParser(recover=True)
   tree = etree.fromstring(xml, parser=parser)

   # A recovering parser may produce no root element (or a truncated one)
   # for badly broken input; indexing tree[1] would then fail.
   if tree is None or len(tree) < 2:
     logging.warning("No person list found at %s", find_person_url)
     return

   # (XML tag, Person attribute) pairs that are copied verbatim whenever
   # the tag carries a non-empty value.
   simple_fields = (
     ('link_kp', 'original_url'),
     # personal information
     ('adtit', 'title'),
     ('advname', 'firstname'),
     ('adname', 'lastname'),
     # address
     ('adstr', 'address'),
     ('adhnr', 'house_number'),
     ('adplz', 'postalcode'),
     # contact
     ('adtel', 'phone'),
     ('adtel2', 'mobile'),
     ('adfax', 'fax'),
     ('ademail', 'email'),
     ('adwww1', 'website'),
   )

   # element 0 is the special block
   # element 1 is the list of persons
   for node in tree[1].iterchildren():
     # Flatten the entry into a tag -> text mapping; the text is stored
     # as delivered by the parser and may therefore be None.
     elem = {}
     for e in node.iterchildren():
       elem[e.tag] = e.text

     person = Person(numeric_id=int(elem['kplfdnr']),
                     identifier=elem['kplfdnr'])
     for tag, attr in simple_fields:
       if elem[tag]:
         setattr(person, attr, elem[tag])

     # The form of address is mapped onto a numeric sex code.
     if elem['antext1'] == 'Frau':
       person.sex = 1
     elif elem['antext1'] == 'Herr':
       person.sex = 2

     person_party = elem['kppartei']
     if person_party:
       # Replace the party name by its configured alias, if any.
       if person_party in self.config.PARTY_ALIAS:
         person_party = self.config.PARTY_ALIAS[person_party]
       party = Committee(identifier=person_party, title=person_party,
                         type='party')
       person.committee = [{'committee': party}]

     if elem['link_kp'] is not None:
       if hasattr(self, 'person_queue'):
         self.person_queue.add(person.numeric_id)
     else:
       logging.info("Person %s %s has no link", person.firstname,
                    person.lastname)
     self.db.save_person(person)
    def find_person(self):
        """Fetch the person overview and store one ``Person`` per entry.

        The overview is requested from the ``kp041.asp`` endpoint below
        ``self.config['scraper']['base_url']`` and parsed as XML in
        recovery mode.  Every child of the second top-level element is
        turned into a ``Person`` (plus a party ``Membership`` when a party
        is given) and handed to ``self.db.save_person``.  Persons that
        carry a ``link_kp`` value are also added to ``self.person_queue``
        if the instance has such an attribute.

        Returns:
            None.  The method returns early when the request yields no
            response or the response contains no person list.
        """
        find_person_url = (self.config['scraper']['base_url'] +
                           'kp041.asp?template=xyz&selfaction=ws&showAll=true&'
                           'PALFDNRM=1&kpdatfil=&filtdatum=filter&kpname=&'
                           'kpsonst=&kpampa=99999999&kpfr=99999999&'
                           'kpamfr=99999999&kpau=99999999&kpamau=99999999&'
                           'searchForm=true&search=Suchen')
        logging.info("Getting person overview from %s", find_person_url)

        r = self.get_url(find_person_url)
        if not r:
            return

        # Non-ASCII characters are turned into numeric character references
        # so that the parser only ever sees plain ASCII bytes.
        xml = r.text.encode('ascii', 'xmlcharrefreplace')
        parser = etree.XMLParser(recover=True)
        tree = etree.fromstring(xml, parser=parser)

        # A recovering parser may produce no root element (or a truncated
        # one) for badly broken input; indexing tree[1] would then fail.
        if tree is None or len(tree) < 2:
            logging.warning("No person list found at %s", find_person_url)
            return

        h = HTMLParser.HTMLParser()

        # (XML tag, Person attribute) pairs that are copied verbatim
        # whenever the tag carries a non-empty value.
        simple_fields = (
            ('link_kp', 'originalUrl'),
            # personal information
            ('adtit', 'title'),
            ('advname', 'firstname'),
            ('adname', 'lastname'),
            # address
            ('adstr', 'address'),
            ('adhnr', 'house_number'),
            ('adplz', 'postalcode'),
            # contact
            ('adtel', 'phone'),
            ('adtel2', 'mobile'),
            ('adfax', 'fax'),
            ('ademail', 'email'),
            ('adwww1', 'website'),
        )

        # element 0 is the special block
        # element 1 is the list of persons
        for node in tree[1].iterchildren():
            # Flatten the entry into a tag -> text mapping.  Missing text is
            # normalised to '' so every check below is a truthiness test.
            elem = {}
            for e in node.iterchildren():
                elem[e.tag] = h.unescape(e.text) if e.text else ''

            person = Person(originalId=int(elem['kplfdnr']))
            for tag, attr in simple_fields:
                if elem[tag]:
                    setattr(person, attr, elem[tag])

            # The form of address is mapped onto a numeric sex code.
            if elem['antext1'] == 'Frau':
                person.sex = 1
            elif elem['antext1'] == 'Herr':
                person.sex = 2

            person_party = elem['kppartei']
            if person_party:
                # Replace the party name by its configured alias, if any.
                if person_party in self.config['scraper']['party_alias']:
                    person_party = self.config['scraper']['party_alias'][
                        person_party]
                new_organization = Organization(originalId=person_party,
                                                name=person_party,
                                                classification='party')
                original_id = unicode(person.originalId) + '-' + person_party
                person.membership = [
                    Membership(originalId=original_id,
                               organization=new_organization)
                ]

            # elem values are never None (see normalisation above), so the
            # link has to be tested for emptiness rather than identity.
            if elem['link_kp']:
                if hasattr(self, 'person_queue'):
                    self.person_queue.add(person.originalId)
            else:
                # Names are taken from elem because the Person attributes
                # are only assigned when the corresponding tag is non-empty.
                logging.info("Person %s %s has no link", elem['advname'],
                             elem['adname'])
            self.db.save_person(person)