Exemplo n.º 1
0
    def process(self):
        """Write one BioDes document per person record in the input data.

        The string returned by ``self._get_input_data()`` is evaluated into a
        sequence of records.  Field 1 of a record is read as the latin-1
        encoded name and field 2 as the latin-1 encoded text.  Records whose
        sanitized name was already processed are skipped.
        """
        # NOTE(security): eval() executes the input as Python code; this is
        # only acceptable as long as the input is a trusted dump.
        data = eval(self._get_input_data())
        self.total = len(data)
        for position, person in enumerate(data):
            index = position + 1  # progress and file numbering are 1-based
            self.print_progress(index)
            name = sanitize_name(person[1].decode('latin1'))
            if self.name_already_processed(name):
                self.skip("duplicate name")
                continue

            # The name is embedded in an href that is itself percent-encoded
            # inside the URL fragment, hence the double quoting.
            encoded_name = urllib.quote(urllib.quote(name.encode('utf8')))
            # NOTE(review): the query string hard-codes "zoekveld=abdul";
            # this looks like a leftover from a manual test -- confirm.
            biourl = "http://www.inghist.nl/retroboeken/nib/?zoekveld=abdul"
            biourl += "&soort=persoon#accessor=cumulatieveindex&accessor_href=CumulatieveIndex%2FPersonenIndex%3Fzoekveld%3D"
            # Use the encoded name: appending the raw unicode name produced a
            # URL with unescaped spaces and non-ASCII characters.
            biourl += encoded_name
            biourl += "%26soort%3Dpersoon"

            text = person[2].decode('latin1')

            bdes = biodes.BioDesDoc()
            bdes.from_args(
                naam=name,
                naam_publisher="Instituut voor Nederlandse Geschiedenis",
                url_biografie=biourl,
                url_publisher="http://www.inghist.nl/",
                text=text,
            )
            self.write_file(bdes, index)
Exemplo n.º 2
0
    def process(self):
        """Write one BioDes document per ``//item`` element of the INPUT file.

        The text of the first ``name`` child of each item is split at its
        first '.': the part before it is the person's name, the stripped
        remainder is passed on as the text snippet.  Items with an empty name
        or an already processed name are skipped.
        """
        tree = etree.parse(INPUT)
        entries = tree.xpath("//item")
        self.total = len(entries)
        for position, person in enumerate(entries):
            index = position + 1  # progress and file numbering are 1-based
            self.print_progress(index)
            # .text is None for an element without text; treat that as an
            # empty name instead of failing on None.find().
            text = person.xpath("name")[0].text or ""
            # partition() keeps the whole text as the name when there is no
            # '.', where slicing with find() == -1 dropped the last character
            # of the name and duplicated the full text into the snippet.
            name, separator, snippet = text.partition('.')
            snippet = snippet.strip()

            # skip
            if not name:
                self.skip("empty name")
                continue
            if self.name_already_processed(name):
                self.skip("duplicate name")
                continue

            # Double quoting: the name ends up inside an href that is itself
            # percent-encoded within the URL fragment.
            encoded_name = urllib.quote(urllib.quote(name.encode('utf8')))
            biourl = "http://www.inghist.nl/retroboeken/heinsius/#accessor=accessor_index&accessor_href=accessor_index%3FSearchSource%253Autf-8%253Austring%3D" + encoded_name

            bdes = biodes.BioDesDoc()
            bdes.from_args(
                naam=name,
                naam_publisher="Instituut voor Nederlandse Geschiedenis",
                url_biografie=biourl,
                url_publisher="http://www.inghist.nl/",
                text=snippet,
            )
            self.write_file(bdes, index)
Exemplo n.º 3
0
    def write_data(self, persons):
        """Write one BioDes document for every usable entry of *persons*.

        Each entry is parsed with ``self.get_name_date_and_descr``.  Entries
        containing ' zie ' are passed over without being counted; entries that
        do not parse, or whose name was already processed, increment
        ``self.skipped``.  Files are numbered by the 1-based position of the
        entry in *persons*.
        """
        # base_dev is not used below; presumably kept so the target can be
        # switched by hand -- confirm before removing.
        base_dev = "http://dev.inghist.nl/retrotest2010/thorbecke/"
        base_production = "http://www.inghist.nl/retroboeken/thorbecke/"
        index_href = base_production + "#accessor=accessor_index&accessor_href=accessor_index%3FSearchSource%253Autf-8%253Austring%3D"

        self.total = len(persons)
        for position, text in enumerate(persons):
            parsed = self.get_name_date_and_descr(text)
            if ' zie ' in text:
                continue

            if not parsed:
                self.skipped += 1
                continue

            name, born, dead, descr = parsed
            if self.name_already_processed(name):
                self.skipped += 1
                continue

            # Quoted twice: the name sits inside an already encoded href.
            utf8_name = name.encode('utf8')
            url = index_href + urllib.quote(urllib.quote(utf8_name))

            document = biodes.BioDesDoc()
            document.from_args(
                naam=name,
                naam_publisher="Instituut voor Nederlandse Geschiedenis",
                url_biografie=url,
                url_publisher="http://www.inghist.nl/",
                birth_date=born,
                death_date=dead,
                text=descr,
            )
            self.write_file(document, position + 1)
Exemplo n.º 4
0
 def process(self):
     """Write one BioDes document per (id, first name, last name, place) row.

     The rows are obtained by evaluating the contents of the INPUT file.
     A row is skipped when the combination of full name and place was
     already processed.
     """
     with open(INPUT, 'r') as f:
         data = f.read()
     # NOTE(security): eval() runs the file contents as Python code.
     people = eval(data)
     self.total = len(people)
     for index, (journey_id, firstn, lastn, place) in enumerate(people, 1):
         name = "%s %s" % (firstn, lastn)
         if self.name_already_processed(name + str(place)):
             self.skip("dupe name")
             continue

         # No snippet at all when the place is missing or empty.
         if place:
             snippet = "kapitein komende uit %s" % place.capitalize()
         else:
             snippet = None
         url = "http://www.inghist.nl/Onderzoek/Projecten/Elbing/captain/journeys?id=%s" % journey_id

         document = biodes.BioDesDoc()
         document.from_args(
             naam=name,
             naam_publisher="Instituut voor Nederlandse Geschiedenis",
             url_biografie=url,
             url_publisher="http://www.inghist.nl/",
             text=snippet,
         )
         self.write_file(document, index)
Exemplo n.º 5
0
    def process(self):
        """Write one BioDes document per person record in the input data.

        The name is assembled from fields 1, 2 and 0 of the record (in that
        order, ignoring None values).  Records with an empty name or an
        already processed name are skipped.
        """
        # NOTE(security): eval() runs the input as Python code.
        data = eval(self._get_input_data())
        self.total = len(data)
        for position, person in enumerate(data):
            index = position + 1  # files are numbered from 1
            parts = (person[1], person[2], person[0])
            name = " ".join([part for part in parts if part is not None])
            name = name.strip()
            if not name:
                self.skip("empty name")
                continue
            name = sanitize_name(name)
            if self.name_already_processed(name):
                self.skip("duplicate name")
                continue

            # The URL is built from the still undecoded name.
            biourl = "http://www.inghist.nl/Onderzoek/Projecten/WVO/brieven?af_naam_vol=" + urllib.quote(name)
            decoded_name = name.decode('latin1')

            document = biodes.BioDesDoc()
            document.from_args(
                naam=decoded_name,
                naam_publisher="Instituut voor Nederlandse Geschiedenis",
                url_biografie=biourl,
                url_publisher="http://www.inghist.nl/",
            )
            self.write_file(document, index)
Exemplo n.º 6
0
    def process(self):
        """Write one BioDes document per person record in the input data.

        Field 0 is the id used in the biography URL; the name is assembled
        from fields 3, 4 and 1 (ignoring None values).  Birth place, birth
        date, death date and text are read from fields 9, 10, 13 and 15.
        Records without id, without name, or with an already processed name
        are skipped.
        """
        # NOTE(security): eval() runs the input as Python code.
        data = eval(self._get_input_data())
        self.total = len(data)
        for position, person in enumerate(data):
            index = position + 1  # files are numbered from 1

            person_id = person[0]
            if not person_id:
                self.skip("no id")
                continue

            parts = (person[3], person[4], person[1])
            name = " ".join([part for part in parts if part is not None])
            name = name.strip()
            if not name:
                self.skip("empty name")
                continue
            name = sanitize_name(name)

            if self.name_already_processed(name):
                self.skip("duplicate name")
                continue
            name = name.decode('latin1')
            # Echo the repr of the decoded name to stdout.
            print(repr(name))

            # Optional fields: falsy values are passed on as None.
            bplace = person[9].decode('latin1') if person[9] else None
            bdate = str(person[10]) if person[10] else None
            ddate = str(person[13]) if person[13] else None
            text = person[15]
            if text is not None:
                # Drop NUL bytes before decoding.
                text = text.replace("\x00", "").decode('latin1')

            biourl = "http://www.inghist.nl/Onderzoek/Projecten/Egodocumenten/persoon_detail/%s" % person_id

            document = biodes.BioDesDoc()
            document.from_args(
                naam=name,
                naam_publisher="Instituut voor Nederlandse Geschiedenis",
                url_biografie=biourl,
                url_publisher="http://www.inghist.nl/",
                birth_place=bplace,
                birth_date=bdate,
                death_date=ddate,
                text=text,
            )
            self.write_file(document, index)
Exemplo n.º 7
0
    def process(self):
        """Write one BioDes document per person record in the input data.

        The name is assembled from fields 4, 2, 3 and 1 (ignoring None
        values); field 5 is mapped to a sex code ('m' -> 1, 'v' -> 2,
        anything else -> None).  The text is the joined non-empty values of
        fields 16-18, falling back to field 14 when that is empty.
        """
        # NOTE(security): eval() runs the input as Python code.
        data = eval(self._get_input_data())
        self.total = len(data)
        for position, person in enumerate(data):
            index = position + 1  # files are numbered from 1
            person_id = person[0]

            # Every part is followed by a space, including the last one.
            parts = (person[4], person[2], person[3], person[1])
            name = "".join([part + " " for part in parts if part is not None])
            name = sanitize_name(name.decode('latin1'))
            if self.name_already_processed(name):
                self.skip("duplicate name")
                continue

            biourl = "http://www.inghist.nl/Onderzoek/Projecten/KPP/PersoonDetail?Id=%s" % person_id

            sex = person[5]
            sex = 1 if sex == 'm' else (2 if sex == 'v' else None)

            fragments = (person[16], person[17], person[18])
            text = ' '.join([piece for piece in fragments if piece])
            text = text.strip().decode('latin1')
            if not text:
                # Fall back to field 14 when fields 16-18 yield nothing.
                text = person[14]
                if text:
                    text = text.decode('latin1')

            document = biodes.BioDesDoc()
            document.from_args(
                naam=name,
                naam_publisher="Instituut voor Nederlandse Geschiedenis",
                url_biografie=biourl,
                url_publisher="http://www.inghist.nl/",
                sex=sex,
                text=text,
            )
            self.write_file(document, index)
Exemplo n.º 8
0
    def process(self):
        """Write one BioDes document per person record in the input data.

        Field 0 is the id, field 1 the latin-1 encoded name and field 7 the
        optional latin-1 encoded text.  Records without id, without name,
        with an already processed name, or matching one hard-coded
        problematic name are skipped.
        """
        # eval() looks names up in this frame, so this local must keep its
        # name; presumably the input contains DateTime(...) calls, which
        # then evaluate to None -- verify against the dump.
        DateTime = lambda x: None
        # NOTE(security): eval() runs the input as Python code.
        data = eval(self._get_input_data())
        self.total = len(data)
        for position, person in enumerate(data):
            index = position + 1  # progress and file numbering are 1-based
            self.print_progress(index)
            person_id = person[0]
            raw_name = person[1]
            if not person_id:
                self.skip("id is None")
                continue
            if raw_name is None:
                self.skip("name is None")
                continue
            name = sanitize_name(raw_name.decode('latin1'))
            if self.name_already_processed(name):
                self.skip("duplicate name: %s" % name)
                continue

            utf8_name = name.encode('utf8')
            biourl = "http://www.inghist.nl/Onderzoek/Projecten/RapportenCentraleInlichtingendienst1919-1940/data/GeavanceerdResult.html?batch_size=15&persoon=" + urllib.quote(utf8_name)

            text = person[7]
            if text is not None:
                text = text.strip().decode('latin1')

            # This particular entry is excluded by name.
            if "Berger, L.M., zie Morisset" in name:
                self.skip("name causing unknwon encoding error: %s" % name)
                continue

            document = biodes.BioDesDoc()
            document.from_args(
                naam=name,
                naam_publisher="Instituut voor Nederlandse Geschiedenis",
                url_biografie=biourl,
                url_publisher="http://www.inghist.nl/",
                text=text,
            )

            self.write_file(document, index)
Exemplo n.º 9
0
 def process(self):
     """Write one BioDes document per (url, name) pair found in INPUT.

     The file contents are evaluated after removing all newlines.  Pairs
     whose name was already processed are skipped.
     """
     with open(INPUT, 'r') as f:
         data = f.read()
     # NOTE(security): eval() runs the file contents as Python code.
     people = eval(data.replace("\n", ""))
     self.total = len(people)
     for index, (url, name) in enumerate(people, 1):
         if self.name_already_processed(name):
             self.skip("dupe name")
             continue

         document = biodes.BioDesDoc()
         document.from_args(
             naam=name,
             naam_publisher="Instituut voor Nederlandse Geschiedenis",
             url_biografie=url,
             url_publisher="http://www.inghist.nl/",
         )
         self.write_file(document, index)
Exemplo n.º 10
0
    def process(self):
        """Write one BioDes document per ``//item`` element of the INPUT file.

        The name is the sanitized text of the item's first ``name`` child
        with trailing '(' characters removed.  Items without name text are
        passed over silently; items whose name ends up empty or was already
        processed are skipped, as are items for which writing raises
        ``etree.XMLSyntaxError``.
        """
        tree = etree.parse(INPUT)
        entries = tree.xpath("//item")
        self.total = len(entries)
        for position, person in enumerate(entries):
            index = position + 1  # progress and file numbering are 1-based
            self.print_progress(index)
            raw_name = person.xpath("name")[0].text
            if not raw_name:
                continue
            # Remove any run of trailing opening parentheses.
            name = sanitize_name(raw_name).rstrip('(')

            # Quoted twice: the name sits inside an already encoded href.
            utf8_name = name.encode('utf8')
            encoded_name = urllib.quote(urllib.quote(utf8_name))
            biourl = "http://www.inghist.nl/retroboeken/schutte/#accessor=accessor_index&accessor_href=accessor_index%3FSearchSource%253Autf-8%253Austring%3D" + encoded_name

            if not name:
                self.skip("empty name")
                continue
            if self.name_already_processed(name):
                self.skip("duplicate name")
                continue

            document = biodes.BioDesDoc()
            document.from_args(
                naam=name,
                naam_publisher="Instituut voor Nederlandse Geschiedenis",
                url_biografie=biourl,
                url_publisher="http://www.inghist.nl/",
            )
            try:
                self.write_file(document, index)
            except etree.XMLSyntaxError as err:
                self.skip(str(err))