Exemplo n.º 1
0
def download_biographies(id):
    """Download all biographies listed by the source ``id`` into the repository.

    The list found at the source's ``url`` is read with ``biodes.parse_list``.
    Every entry is either an http(s) URL or a file-system path; relative
    paths are resolved against the directory of the source's ``url``.

    An entry that cannot be loaded is reported on stdout and skipped.  An
    error raised while adding a loaded biography to the repository is not
    caught here: it propagates to the caller.

    :param id: identifier of the source to download.  (The name shadows the
        builtin, but it is part of the signature and therefore kept.)
    :raises ValueError: if no source with this id can be found.
    """
    if id not in get_ids():
        raise ValueError("No source with id '%s' was found" % id)
    for src in _repo.get_sources():
        if src.id == id:
            break
    else:
        # Never continue with an unrelated (or unbound) ``src`` when the
        # repository does not list the requested source.
        raise ValueError("No source with id '%s' was found" % id)

    ls = biodes.parse_list(src.url)
    total = len(ls)
    for index, biourl in enumerate(ls):
        # https URLs must not be treated as file-system paths: normpath()
        # would collapse the '//' and the join below would corrupt them.
        if not biourl.startswith(("http:", "https:")):
            # we're dealing with a fs path
            biourl = os.path.normpath(biourl)
            if not os.path.isabs(biourl):
                biourl = os.path.join(os.path.dirname(src.url), biourl)

        bio = Biography(source_id=src.id, repository=src.repository)
        try:
            bio.from_url(biourl)
            print("%s/%s %s" % (index + 1, total, bio.get_names()))
        except Exception as err:
            # best effort: report the broken entry and go on with the next one
            print(err)
            continue

        # No debugger trap here: a failure to store the biography surfaces
        # as a normal exception.
        _repo.add_biography(bio)
Exemplo n.º 2
0
 def _create_bioport_biography(self, person):
     """Create, save and return a new Biography for ``person``.

     The biography belongs to a BioPortSource with id 'dummy'; its own id is
     '<source id>/<bioport id of person>', and both its ``local_id`` and its
     ``bioport_id`` values are set to the person's bioport id.
     """
     dummy_source = BioPortSource(id='dummy')
     biography_id = '%s/%s' % (dummy_source.id, person.get_bioport_id())
     biography = Biography(id=biography_id, source_id=dummy_source.id)
     biography._set_up_basic_structure()
     biography.set_value(local_id=person.get_bioport_id())
     biography.set_value(bioport_id=person.get_bioport_id())
     self.save_biography(biography, u'created Bioport biography')
     return biography
    def test_biography(self):
        """Exercise creating, saving and editing a Biography through the repository."""
        repository = self.repo

        # a biography that is already in the repository must have a bioport id
        existing_bio = list(repository.get_biographies())[5]
        assert existing_bio.get_bioport_id()

        # register a fresh source and create a new biography for it
        test_source = Source(id='bioport_test')
        repository.save_source(test_source)
        new_bio = Biography(id='bioport_test/test_bio', source_id=test_source.id)

        self.assertEqual(new_bio.id, 'bioport_test/test_bio')

        initial_values = {
            'url_biografie': 'http://ladida/didum',
            'naam_publisher': 'nogeensiets',
            'url_publisher': 'http://pbulihser_url',
            'naam': 'Tiedel Doodle Dum',
        }
        new_bio.from_args(**initial_values)

        # save it
        self._save_biography(new_bio)

        # once saved, the new biography has a bioport id as well
        assert new_bio.get_bioport_id()
        self.assertEqual(new_bio.title(), 'Tiedel Doodle Dum')

        # the biography and its person share the same bioport id
        person = new_bio.get_person()
        self.assertEqual(new_bio.get_bioport_id(), person.get_bioport_id())

        # setting a value twice keeps the last value
        new_bio.set_value('geboortedatum', '2009-01-01')
        new_bio.set_value('geboortedatum', '2009-01-02')
        self.assertEqual(new_bio.get_value('geboortedatum'), '2009-01-02')

        # setting the birth place leaves the birth date untouched
        new_bio.set_value('geboortedatum', '2009-01-02')
        new_bio.set_value('geboorteplaats', 'nog een test')
        self.assertEqual(new_bio.get_value('geboortedatum'), '2009-01-02')

        # the same holds for the death date and the death place
        new_bio.set_value('sterfdatum', '2010-01-02')
        new_bio.set_value('sterfplaats', 'nog een test')
        self.assertEqual(new_bio.get_value('sterfdatum'), '2010-01-02')

        # an empty string clears the value: it reads back as None
        new_bio.set_value('sterfdatum', u'')
        self.assertEqual(new_bio.get_value('sterfdatum'), None)
    def test_snippet(self):
        s = """<?xml version="1.0" encoding="UTF-8"?>
<!--2011-05-18 11:26:12-->
<biodes version="1.0">
  <fileDesc>
    <author>Nationaal Archief</author>
    <ref target="http://proxy.handle.net/10648/cd48fc47-2b91-42f6-bb63-c2de770135b1"/>
    <date when="1920"/>
    <publisher>
      <name>Nationaal Archief</name>
      <ref target="http://www.gahetna.nl/collectie/archief"/>
    </publisher>
  </fileDesc>
  <person>
    <persName>Jan Daniël Cornelis Carel Willem de Constant Rebecque</persName>
  </person>
  <biography>
    <text>
      <title>Inventaris van het archief van De Constant Rebecque</title>
      <span></span>
    </text>
  </biography>
</biodes>
        """
        bio = Biography().from_string(s)
        self.assertEqual(bio.snippet(1000), 'Inventaris van het archief van De Constant Rebecque')

        s = """<biodes version="1.0">
  <fileDesc>
    <author>Nationaal Archief</author>
    <ref target="http://proxy.handle.net/10648/cd48fc47-2b91-42f6-bb63-c2de770135b1"/>
    <date when="1920"/>
    <publisher>
      <name>Nationaal Archief</name>
      <ref target="http://www.gahetna.nl/collectie/archief"/>
    </publisher>
  </fileDesc>
  <person>
    <persName>Jan Daniël Cornelis Carel Willem de Constant Rebecque</persName>
  </person>
  <biography>
  </biography>
</biodes>
        """
        bio = Biography().from_string(s)
        self.assertEqual(bio.snippet(), '')

        source = Source(id='bioport_test')
        self.repo.save_source(source)
        # make a new biography
        bio = Biography(id='bioport_test/test_bio', source_id=source.id)

        self.assertEqual(bio.id, 'bioport_test/test_bio')
        # XXX Do we need this?
        bio.from_args(
              url_biografie='http://ladida/didum',
              naam_publisher='nogeensiets',
              url_publisher='http://pbulihser_url',
              naam='Tiedel Doodle Dum',
              tekst="""Lemuel is in charge, he raises his hatchet on which the blood will never dry, but not to hit anyone, he will not hit anyone, he will not hit anyone any more, he will not touch anyone any more, either with it or with it or with it or with or

or with it or with his hammer or with his stick or with his fist or in thought in dream I mean never he will never
or with his pencil or with his stick or

or light light I mean

never there he will never

never anything

there

any more""",
              )
        self.assertEqual(bio.snippet(size=20), 'Lemuel is in...')

        bio.set_value('text', 'abc')
        self.assertEqual(bio.snippet(), 'abc')

        bio.set_value('text', 'ca. 1800-1900')
        self.assertEqual(bio.snippet(), 'ca. 1800-1900')
    def _merge_biographies(bio1, bio2):
        """Try to merge bio1 and bio2 into a new Biography.

        The result is built from the serialized document of bio1 and is then
        extended with the names, states and events of bio2 that are not
        already present.  Neither bio1 nor bio2 is meant to be modified:
        elements taken from bio2 are deep-copied before they are added.

        :param bio1: the biography that forms the base of the merge.
        :param bio2: the biography whose additional data is merged in.
        :returns: the merged Biography, or None when a 'birth' or 'death'
            event occurs in both biographies with incompatible 'when' values.
        :raises Exception: when one of the single-valued fields has a
            different non-empty value in each biography.
        """
        # single values that must be equal in both biographies
        single_valued = ['name_publisher', 'url_publisher', 'url_biography', 'sex', 'title_biography']
        dct = {}
        for k in single_valued:
            v1 = bio1.get_value(k)
            v2 = bio2.get_value(k)
            if v1 and v2 and v1 != v2:
                raise Exception('Cannot merge biographies because values for %s are different (%s and %s)' % (k, v1, v2))
            dct[k] = v1 or v2

        # work on a copy, so that appending the names of bio2 can never
        # change a list that is owned by bio1
        names = list(bio1.get_names())
        for n2 in bio2.get_names():
            if n2 not in names:
                names.append(n2)
        dct['names'] = names

        merged_bio = Biography(source_id=bio1.source_id, biodes_document=bio1.to_string())
        merged_bio.from_args(**dct)

        # non-unique states: add every state of bio2 that bio1 does not have.
        # The states of bio1 do not change in this loop, so they are
        # serialized only once.
        known_states = set(etree.tostring(s).strip() for s in bio1.get_states())  # @UndefinedVariable
        for state in bio2.get_states():
            if etree.tostring(state).strip() not in known_states:  # @UndefinedVariable
                # copy the state (instead of moving it, which will change bio2 as well)
                merged_bio._add_state_element(copy.deepcopy(state))

        # events that may occur only once per biography
        unique_events = ['birth', 'death']
        events = merged_bio.get_events()  # at this point these are the events taken from bio1
        for bio2_event in bio2.get_events():
            # The existing events are serialized again on every pass, because
            # the 'when' update below may change an element of this list
            # (assumption: get_event returns one of these elements).
            if etree.tostring(bio2_event).strip() in [etree.tostring(e).strip() for e in events]:  # @UndefinedVariable
                continue

            event_type = bio2_event.get('type')
            if event_type not in unique_events:
                merged_bio._add_event_element(copy.deepcopy(bio2_event))
                continue

            # this event can occur only once: check it against an existing
            # event of the same type, if there is one
            bio1_event = merged_bio.get_event(type=event_type)
            if bio1_event is None:
                # no event of this type exists yet in bio1
                merged_bio._add_event_element(copy.deepcopy(bio2_event))
                continue

            when1 = bio1_event.get('when', '')
            when2 = bio2_event.get('when', '')
            if when1 and when2 and not (when1 in when2 or when2 in when1):
                # these are incompatible
                return None
            if when2 and when1 in when2:
                # bio2 has the same or a more specific date; without a date
                # in bio2 there is nothing to copy (and no empty 'when'
                # attribute should be written)
                bio1_event.set('when', when2)
        return merged_bio
Exemplo n.º 6
0
 def get_bio(self, bdate=None, ddate=None, bplace=None, dplace=None):
     """Create, save and return a Biography with a fresh id and fixed basic data.

     ``self.x`` is incremented and its string form is used as the biography
     id.  Each of the optional arguments is stored only when it is not None:
     ``bdate`` as 'birth_date', ``ddate`` as 'death_date', ``bplace`` as
     'birth_place' and ``dplace`` as 'death_place'.
     """
     self.x += 1
     bio = Biography(id=str(self.x), source_id=u"knaw", repository=self.repo)
     bio.from_args(
         url_biografie='http://google.it',
         naam_publisher='jelle',
         url_publisher='http://gerbrandy.com',
         naam="gino",
     )
     # same order as the arguments: dates first, then places
     optional_values = (
         ('birth_date', bdate),
         ('death_date', ddate),
         ('birth_place', bplace),
         ('death_place', dplace),
     )
     for key, value in optional_values:
         if value is not None:
             bio.set_value(key, value)
     self._save_biography(bio)
     return bio