def process(self):
    """Write one BioDes document per record of the input data.

    The string returned by ``self._get_input_data()`` is evaluated as a
    Python expression.  For every record, item 1 is used as the name and
    item 2 as the biography text; both are decoded from latin-1.  A record
    whose sanitized name was already processed is skipped as
    "duplicate name".
    """
    # NOTE(review): eval() executes whatever the input contains; this is
    # only acceptable while the input is a trusted, locally produced dump.
    data = eval(self._get_input_data())
    self.total = len(data)
    for index, person in enumerate(data, 1):
        self.print_progress(index)

        name = sanitize_name(person[1].decode('latin1'))
        if self.name_already_processed(name):
            self.skip("duplicate name")
            continue

        # The name is embedded in the value of ``accessor_href``, which is
        # itself percent-encoded, so it has to be quoted twice.  The
        # original computed this value but then appended the raw name.
        encoded_name = urllib.quote(urllib.quote(name.encode('utf8')))
        # NOTE(review): the outer query string carries a fixed
        # ``zoekveld=abdul``; left untouched -- confirm it is intended.
        biourl = (
            "http://www.inghist.nl/retroboeken/nib/?zoekveld=abdul"
            "&soort=persoon#accessor=cumulatieveindex&accessor_href=CumulatieveIndex%2FPersonenIndex%3Fzoekveld%3D"
            + encoded_name
            + "%26soort%3Dpersoon"
        )

        text = person[2].decode('latin1')

        bdes = biodes.BioDesDoc()
        bdes.from_args(
            naam=name,
            naam_publisher="Instituut voor Nederlandse Geschiedenis",
            url_biografie=biourl,
            url_publisher="http://www.inghist.nl/",
            text=text,
        )
        self.write_file(bdes, index)
def process(self):
    """Write one BioDes document per ``<item>`` element of ``INPUT``.

    The text of the item's first ``name`` child is split at its first
    period: the part before it is the person's name, the stripped
    remainder is stored as the document text.  Items with an empty name
    or an already processed name are skipped.
    """
    tree = etree.parse(INPUT)
    entries = tree.xpath("//item")
    self.total = len(entries)
    for index, person in enumerate(entries, 1):
        self.print_progress(index)

        # An element without text yields None; treat it as an empty name.
        text = person.xpath("name")[0].text or ""
        # partition() keeps the complete text as the name when there is no
        # period.  Slicing with find()'s -1 result used to cut off the last
        # character of the name and copy the whole text into the snippet.
        name, _sep, snippet = text.partition('.')
        snippet = snippet.strip()

        if not name:
            self.skip("empty name")
            continue
        if self.name_already_processed(name):
            self.skip("duplicate name")
            continue

        # Quoted twice because the name is appended to an already
        # percent-encoded ``accessor_href`` value.
        encoded_name = urllib.quote(urllib.quote(name.encode('utf8')))
        biourl = "http://www.inghist.nl/retroboeken/heinsius/#accessor=accessor_index&accessor_href=accessor_index%3FSearchSource%253Autf-8%253Austring%3D" + encoded_name

        bdes = biodes.BioDesDoc()
        bdes.from_args(
            naam=name,
            naam_publisher="Instituut voor Nederlandse Geschiedenis",
            url_biografie=biourl,
            url_publisher="http://www.inghist.nl/",
            text=snippet,
        )
        self.write_file(bdes, index)
def write_data(self, persons):
    """Write one BioDes document for each usable entry in ``persons``.

    Every entry is parsed with ``self.get_name_date_and_descr``.  Entries
    containing ``' zie '`` are dropped without being counted; entries that
    cannot be parsed or whose name was already processed increase
    ``self.skipped``.  Documents are numbered by the 1-based position of
    the entry in ``persons``.
    """
    self.total = len(persons)

    base_production = "http://www.inghist.nl/retroboeken/thorbecke/"
    url_prefix = base_production + "#accessor=accessor_index&accessor_href=accessor_index%3FSearchSource%253Autf-8%253Austring%3D"

    for position, entry in enumerate(persons, 1):
        parsed = self.get_name_date_and_descr(entry)
        if ' zie ' in entry:
            continue
        if not parsed:
            self.skipped += 1
            continue

        name, born, dead, descr = parsed
        if self.name_already_processed(name):
            self.skipped += 1
            continue

        # The name is quoted twice before it is appended to the fragment.
        quoted_name = urllib.quote(urllib.quote(name.encode('utf8')))

        bdes = biodes.BioDesDoc()
        bdes.from_args(
            naam=name,
            naam_publisher="Instituut voor Nederlandse Geschiedenis",
            url_biografie=url_prefix + quoted_name,
            url_publisher="http://www.inghist.nl/",
            birth_date=born,
            death_date=dead,
            text=descr,
        )
        self.write_file(bdes, position)
def process(self):
    """Write one BioDes document per captain record read from ``INPUT``.

    The file content is evaluated as a Python expression that yields
    ``(id, first name, last name, place)`` tuples.  A record is skipped as
    "dupe name" when the combination of full name and place was already
    processed.
    """
    with open(INPUT, 'r') as f:
        raw = f.read()
    # NOTE(review): eval() runs arbitrary code found in INPUT; acceptable
    # only while that file is a trusted, locally produced dump.
    people = eval(raw)
    self.total = len(people)

    for index, (captain_id, firstn, lastn, place) in enumerate(people, 1):
        name = "%s %s" % (firstn, lastn)
        if self.name_already_processed(name + str(place)):
            self.skip("dupe name")
            continue

        # Only records that carry a place get a descriptive text.
        if place:
            snippet = "kapitein komende uit %s" % place.capitalize()
        else:
            snippet = None
        url = "http://www.inghist.nl/Onderzoek/Projecten/Elbing/captain/journeys?id=%s" % captain_id

        bdes = biodes.BioDesDoc()
        bdes.from_args(
            naam=name,
            naam_publisher="Instituut voor Nederlandse Geschiedenis",
            url_biografie=url,
            url_publisher="http://www.inghist.nl/",
            text=snippet,
        )
        self.write_file(bdes, index)
def process(self):
    """Write one BioDes document per record of the input data.

    The name is assembled from items 1, 2 and 0 of a record (in that
    order, ``None`` items left out).  Records with an empty or already
    processed name are skipped.  The URL is built from the quoted name
    before the name is decoded from latin-1.
    """
    # NOTE(review): eval() executes the input; trusted dumps only.
    records = eval(self._get_input_data())
    self.total = len(records)

    for index, person in enumerate(records, 1):
        parts = (person[1], person[2], person[0])
        name = " ".join(part for part in parts if part is not None).strip()
        if not name:
            self.skip("empty name")
            continue

        name = sanitize_name(name)
        if self.name_already_processed(name):
            self.skip("duplicate name")
            continue

        biourl = "http://www.inghist.nl/Onderzoek/Projecten/WVO/brieven?af_naam_vol=" + urllib.quote(name)

        bdes = biodes.BioDesDoc()
        bdes.from_args(
            naam=name.decode('latin1'),
            naam_publisher="Instituut voor Nederlandse Geschiedenis",
            url_biografie=biourl,
            url_publisher="http://www.inghist.nl/",
        )
        self.write_file(bdes, index)
def process(self):
    """Write one BioDes document per record of the input data.

    Item 0 of a record is its id and becomes part of the biography URL.
    The name is assembled from items 3, 4 and 1; birth place, birth date,
    death date and text come from items 9, 10, 13 and 15.  Records
    without id, with an empty name or with an already processed name are
    skipped.
    """
    # NOTE(review): eval() executes the input; trusted dumps only.
    records = eval(self._get_input_data())
    self.total = len(records)

    for index, person in enumerate(records, 1):
        person_id = person[0]
        if not person_id:
            self.skip("no id")
            continue

        parts = (person[3], person[4], person[1])
        name = " ".join(part for part in parts if part is not None).strip()
        if not name:
            self.skip("empty name")
            continue
        name = sanitize_name(name)
        if self.name_already_processed(name):
            self.skip("duplicate name")
            continue
        name = name.decode('latin1')
        print(repr(name))

        # Falsy source values are passed on as None.
        bplace = person[9].decode('latin1') if person[9] else None
        bdate = str(person[10]) if person[10] else None
        ddate = str(person[13]) if person[13] else None

        text = person[15]
        if text is not None:
            # NUL bytes are removed before the text is decoded.
            text = text.replace("\x00", "").decode('latin1')

        biourl = "http://www.inghist.nl/Onderzoek/Projecten/Egodocumenten/persoon_detail/%s" % person_id

        bdes = biodes.BioDesDoc()
        bdes.from_args(
            naam=name,
            naam_publisher="Instituut voor Nederlandse Geschiedenis",
            url_biografie=biourl,
            url_publisher="http://www.inghist.nl/",
            birth_place=bplace,
            birth_date=bdate,
            death_date=ddate,
            text=text,
        )
        self.write_file(bdes, index)
def process(self):
    """Write one BioDes document per record of the input data.

    The name is assembled from items 4, 2, 3 and 1 of a record (``None``
    items left out) and decoded from latin-1.  Item 5 is mapped to the
    sex code (``'m'`` -> 1, ``'v'`` -> 2, anything else -> None).  The
    text is the joined, non-empty items 16-18, falling back to item 14
    when that is empty.  Records with an empty or already processed name
    are skipped.
    """
    # NOTE(review): eval() executes the input; trusted dumps only.
    data = eval(self._get_input_data())
    self.total = len(data)
    for index, person in enumerate(data, 1):
        # self.print_progress(index)
        person_id = person[0]

        parts = (person[4], person[2], person[3], person[1])
        name = " ".join(part for part in parts if part is not None).strip()
        # A record without any name part used to be written out with an
        # empty name; skip it instead.
        if not name:
            self.skip("empty name")
            continue
        name = sanitize_name(name.decode('latin1'))
        if self.name_already_processed(name):
            self.skip("duplicate name")
            continue

        biourl = "http://www.inghist.nl/Onderzoek/Projecten/KPP/PersoonDetail?Id=%s" % person_id

        sex = {'m': 1, 'v': 2}.get(person[5])

        text = ' '.join([x for x in (person[16], person[17], person[18]) if x])
        text = text.strip().decode('latin1')
        if not text:
            # Fall back to item 14 when items 16-18 hold no text.
            text = person[14]
            if text:
                text = text.decode('latin1')

        bdes = biodes.BioDesDoc()
        bdes.from_args(
            naam=name,
            naam_publisher="Instituut voor Nederlandse Geschiedenis",
            url_biografie=biourl,
            url_publisher="http://www.inghist.nl/",
            sex=sex,
            text=text,
        )
        self.write_file(bdes, index)
def process(self):
    """Write one BioDes document per record of the input data.

    Item 0 of a record is its id, item 1 the latin-1 encoded name and
    item 7 the optional text.  Records without id, without name, with an
    already processed name, or whose name contains
    "Berger, L.M., zie Morisset" are skipped.
    """
    # The local name ``DateTime`` must stay as it is: eval() below is
    # called without explicit namespaces and therefore resolves names in
    # this function's scope.
    # NOTE(review): presumably the input contains DateTime(...) calls
    # that are neutralised this way -- confirm against the dump.
    def DateTime(x):
        return None

    # NOTE(review): eval() executes the input; trusted dumps only.
    records = eval(self._get_input_data())
    self.total = len(records)

    for index, person in enumerate(records, 1):
        self.print_progress(index)

        person_id, raw_name = person[0], person[1]
        if not person_id:
            self.skip("id is None")
            continue
        if raw_name is None:
            self.skip("name is None")
            continue

        name = sanitize_name(raw_name.decode('latin1'))
        if self.name_already_processed(name):
            self.skip("duplicate name: %s" % name)
            continue

        biourl = "http://www.inghist.nl/Onderzoek/Projecten/RapportenCentraleInlichtingendienst1919-1940/data/GeavanceerdResult.html?batch_size=15&persoon=" + urllib.quote(name.encode('utf8'))

        text = person[7]
        if text is not None:
            text = text.strip().decode('latin1')

        if "Berger, L.M., zie Morisset" in name:
            self.skip("name causing unknwon encoding error: %s" % name)
            continue

        bdes = biodes.BioDesDoc()
        bdes.from_args(
            naam=name,
            naam_publisher="Instituut voor Nederlandse Geschiedenis",
            url_biografie=biourl,
            url_publisher="http://www.inghist.nl/",
            text=text,
        )
        self.write_file(bdes, index)
def process(self):
    """Write one BioDes document per ``(url, name)`` pair read from ``INPUT``.

    Newlines are removed from the file content before it is evaluated as
    a Python expression.  Pairs whose name was already processed are
    skipped as "dupe name".
    """
    with open(INPUT, 'r') as f:
        raw = f.read()
    # NOTE(review): eval() runs arbitrary code found in INPUT; acceptable
    # only while that file is a trusted, locally produced dump.
    people = eval(raw.replace("\n", ""))
    self.total = len(people)

    for index, (url, name) in enumerate(people, 1):
        if self.name_already_processed(name):
            self.skip("dupe name")
            continue

        bdes = biodes.BioDesDoc()
        bdes.from_args(
            naam=name,
            naam_publisher="Instituut voor Nederlandse Geschiedenis",
            url_biografie=url,
            url_publisher="http://www.inghist.nl/",
        )
        self.write_file(bdes, index)
def process(self):
    """Write one BioDes document per ``<item>`` element of ``INPUT``.

    The text of the item's first ``name`` child is sanitized and any
    trailing ``(`` characters are removed.  Items with an empty or
    already processed name are skipped, and so are items for which
    ``self.write_file`` raises ``etree.XMLSyntaxError``.
    """
    tree = etree.parse(INPUT)
    entries = tree.xpath("//item")
    self.total = len(entries)
    for index, person in enumerate(entries, 1):
        self.print_progress(index)

        name = person.xpath("name")[0].text
        if name:
            name = sanitize_name(name).rstrip('(')

        # Covers both a missing element text and a name that is empty
        # after clean-up; the former used to be dropped without being
        # reported through self.skip().
        if not name:
            self.skip("empty name")
            continue
        if self.name_already_processed(name):
            self.skip("duplicate name")
            continue

        # Quoted twice because the name is appended to an already
        # percent-encoded ``accessor_href`` value.
        encoded_name = urllib.quote(urllib.quote(name.encode('utf8')))
        biourl = "http://www.inghist.nl/retroboeken/schutte/#accessor=accessor_index&accessor_href=accessor_index%3FSearchSource%253Autf-8%253Austring%3D" + encoded_name

        bdes = biodes.BioDesDoc()
        bdes.from_args(
            naam=name,
            naam_publisher="Instituut voor Nederlandse Geschiedenis",
            url_biografie=biourl,
            url_publisher="http://www.inghist.nl/",
        )
        try:
            self.write_file(bdes, index)
        except etree.XMLSyntaxError as err:
            self.skip(str(err))