예제 #1
0
    def get_package_ids(self, set_ids, config, last_time, client):
        ''' Yield the identifier of every header listed by the client.

        Only the ``from`` and ``until`` entries of ``config`` are used; each
        value goes through ``dp`` and has its tzinfo dropped, and ``from`` is
        forwarded under the keyword ``from_``.  If ``config`` gives no
        ``from`` and ``last_time`` is truthy, ``last_time`` becomes the lower
        bound.  One ``listIdentifiers`` request is made per entry of
        ``set_ids``, or a single request without ``set`` when ``set_ids`` is
        empty.  A ``NoRecordsMatchError`` from a request is ignored, so that
        request simply contributes nothing.
        '''
        def filter_map_args(list_tuple):
            # Keep only the date bounds; 'from' is renamed to 'from_'.
            for name, raw in list_tuple:
                if name not in ('until', 'from'):
                    continue
                keyword = 'from_' if name == 'from' else name
                yield (keyword, dp(raw).replace(tzinfo=None))

        request_args = dict(filter_map_args(config.items()))
        request_args['metadataPrefix'] = self.md_format
        if last_time and 'from_' not in request_args:
            request_args['from_'] = dp(last_time).replace(tzinfo=None)

        if not set_ids:
            # No sets requested: a single listing over the whole repository.
            try:
                for header in client.listIdentifiers(**request_args):
                    yield header.identifier()
            except oaipmh.error.NoRecordsMatchError:
                pass
            return

        for set_id in set_ids:
            try:
                for header in client.listIdentifiers(set=set_id,
                                                     **request_args):
                    yield header.identifier()
            except oaipmh.error.NoRecordsMatchError:
                pass
예제 #2
0
    def get_package_ids(self, set_ids, config, last_time, client):
        ''' Generate package identifiers for the given set identifiers.

        ``config`` may carry ``from`` / ``until`` bounds; other keys are
        ignored.  Each bound is run through ``dp`` and made timezone-naive,
        and ``from`` is passed on as ``from_``.  ``last_time`` is used as the
        ``from_`` bound only when it is truthy and ``config`` did not provide
        one.  Every entry of ``set_ids`` results in its own
        ``listIdentifiers`` call; with no set identifiers a single call
        without ``set`` is made.  ``NoRecordsMatchError`` is swallowed per
        call.
        '''
        def filter_map_args(list_tuple):
            wanted = {'until': 'until', 'from': 'from_'}
            for key, value in list_tuple:
                if key in wanted:
                    yield (wanted[key], dp(value).replace(tzinfo=None))

        kwargs = dict(filter_map_args(config.items()))
        kwargs['metadataPrefix'] = self.md_format
        if last_time and 'from_' not in kwargs:
            kwargs['from_'] = dp(last_time).replace(tzinfo=None)

        # One keyword dict per request; built lazily so set_ids is consumed
        # in step with the requests, exactly as a plain loop would.
        if set_ids:
            param_sets = (dict(kwargs, set=set_id) for set_id in set_ids)
        else:
            param_sets = (kwargs,)

        for params in param_sets:
            try:
                for header in client.listIdentifiers(**params):
                    yield header.identifier()
            except oaipmh.error.NoRecordsMatchError:
                pass
예제 #3
0
 def _identifier_generator(self, client):
     """
     pyoai generates the URL based on the given method parameters
     Therefore one may not use the set parameter if it is not there
     """
     if self.set_spec:
         for header in client.listIdentifiers(metadataPrefix=self.md_format, set=self.set_spec):
             yield header
     else:
         for header in client.listIdentifiers(metadataPrefix=self.md_format):
             yield header
예제 #4
0
 def _identifier_generator(self, client):
     """
     pyoai generates the URL based on the given method parameters
     Therefore one may not use the set parameter if it is not there
     """
     if self.set_spec:
         for header in client.listIdentifiers(metadataPrefix=self.md_format,
                                              set=self.set_spec):
             yield header
     else:
         for header in client.listIdentifiers(
                 metadataPrefix=self.md_format):
             yield header
예제 #5
0
 def test_get_record(self):
     # Fetch a ListIdentifiers response from the app, feed it to an OAI
     # client through a mocked ``urllib2.urlopen``, then request the first
     # listed identifier via GetRecord and validate that response.
     metadata_reg = MetadataRegistry()
     metadata_reg.registerReader('oai_dc', oai_dc_reader)
     client = Client(config.get('ckan.site_url') + self.base_url, metadata_reg)
     res = self._oai_get_method_and_validate('?verb=ListIdentifiers&metadataPrefix=oai_dc&set=roger')
     # Keep the current opener so the mock is undone even if the test fails;
     # otherwise the patched module attribute leaks into every later test.
     original_urlopen = urllib2.urlopen
     urllib2.urlopen = mock.Mock(return_value=StringIO(res))
     try:
         ids = client.listIdentifiers(metadataPrefix='oai_dc')
         # The builtin next() works on both Python 2 and 3 iterators.
         offset = self.base_url + '?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc' % next(ids).identifier()
         res = self.app.get(offset)
     finally:
         urllib2.urlopen = original_urlopen
     self.assert_(oaischema.validate(etree.fromstring(res.body)))
     self.assert_("abraham" in res.body)
    def test_list(self):
        '''
        Parse ListIdentifiers result

        Checks that the expected identifiers are present in the parsed
        fixture, independent of the order in which they are listed.
        '''

        registry = importformats.create_metadata_registry()
        client = oaipmh.client.Client(_get_fixture(FIXTURE_LISTIDENTIFIERS), registry)
        # Materialise the identifiers first: ``in`` on a generator consumes
        # it up to the match, so repeated membership tests would only pass
        # when the identifiers happen to appear in the asserted order.
        identifiers = [header.identifier() for header in client.listIdentifiers(metadataPrefix='oai_dc')]

        assert 'oai:arXiv.org:hep-th/9801001' in identifiers
        assert 'oai:arXiv.org:hep-th/9801002' in identifiers
        assert 'oai:arXiv.org:hep-th/9801005' in identifiers
        assert 'oai:arXiv.org:hep-th/9801010' in identifiers
예제 #7
0
 def test_resumption_identifiers(self):
     # List identifiers through a ServerClient wrapped around a
     # BatchingServer and check that every returned header is truthy.
     reader_registry = MetadataRegistry()
     reader_registry.registerReader('oai_dc', oai_dc_reader)
     # NOTE(review): realopen is presumably the unpatched urlopen saved at
     # module level -- confirm against the rest of the file.
     urllib2.urlopen = realopen
     ckan_server = CKANServer()
     server_registry = metadata.MetadataRegistry()
     server_registry.registerReader('oai_dc', oai_dc_reader)
     server_registry.registerWriter('oai_dc', oai_dc_writer)
     batching_server = BatchingServer(ckan_server,
                                      metadata_registry=server_registry)
     server_client = ServerClient(batching_server, reader_registry)
     for header in server_client.listIdentifiers(metadataPrefix='oai_dc'):
         self.assert_(header)
예제 #8
0
    def test_list(self):
        '''
        Parse ListIdentifiers result

        Checks that the expected identifiers are present in the parsed
        fixture, independent of the order in which they are listed.
        '''

        registry = importformats.create_metadata_registry()
        client = oaipmh.client.Client(_get_fixture(FIXTURE_LISTIDENTIFIERS),
                                      registry)
        # Collect into a set before asserting: membership tests against a
        # bare generator consume it, so each assertion would silently depend
        # on the position of the previous match.
        identifiers = set(header.identifier()
                          for header in client.listIdentifiers(
                              metadataPrefix='oai_dc'))

        assert 'oai:arXiv.org:hep-th/9801001' in identifiers
        assert 'oai:arXiv.org:hep-th/9801002' in identifiers
        assert 'oai:arXiv.org:hep-th/9801005' in identifiers
        assert 'oai:arXiv.org:hep-th/9801010' in identifiers
예제 #9
0
def test_list(url):
    """Return a generator over the identifiers listed at ``url``.

    A client is built for ``url`` with the registry from
    ``importformats.create_metadata_registry`` and ``listIdentifiers`` is
    called with the ``oai_dc`` metadata prefix; the returned generator yields
    ``identifier()`` of each header.
    """
    metadata_registry = importformats.create_metadata_registry()
    oai_client = oaipmh.client.Client(url, metadata_registry)
    headers = oai_client.listIdentifiers(metadataPrefix='oai_dc')
    return (entry.identifier() for entry in headers)
예제 #10
0
 def _fetch_import_set(self, harvest_object, master_data, client, group):
     """Fetch the member identifiers of a set and add them to a subgroup.

     When ``master_data`` contains a ``'set'`` key the identifiers are
     fetched with ``client.listIdentifiers`` (optionally bounded by
     ``'from_'`` / ``'until'``) and stored under ``'record_ids'``.  Without
     that key the call is treated as a retry and the ``'record_ids'``
     already present in ``master_data`` are used.

     Each identifier whose package exists is added to the subgroup named
     ``'<group.name> - <set_name>'`` (created if missing).  Identifiers
     without a package are written back as JSON to
     ``harvest_object.content`` so they can be retried; otherwise the
     content is cleared.

     Returns ``False`` when the fetch yields no records or fails with a
     socket error or a bad HTTP status line, ``True`` once the import
     stage has been committed.
     """
     # Could be genuine fetch or retry of set insertions.
     if 'set' in master_data:
         # Fetch stage.
         args = {self.metadata_prefix_key: self.metadata_prefix_value, 'set': master_data['set']}
         if 'from_' in master_data:
             args['from_'] = self._datetime_from_str(master_data['from_'])
         if 'until' in master_data:
             args['until'] = self._datetime_from_str(master_data['until'])
         ids = []
         try:
             for identity in client.listIdentifiers(**args):
                 ids.append(identity.identifier())
         except NoRecordsMatchError:
             return False  # Ok, empty set. Nothing to do.
         except socket.error:
             # NOTE: sys.exc_info()[:2] is (exception type, exception value),
             # so these two names hold the class and the instance.
             errno, errstr = sys.exc_info()[:2]
             self._save_object_error(
                 'Socket error OAI-PMH %s, details:\n%s' % (errno, errstr,),
                 harvest_object, stage='Fetch')
             return False
         except httplib.BadStatusLine:
             self._save_object_error(
                 'Bad HTTP response status line.',
                 harvest_object, stage='Fetch')
             return False
         master_data['record_ids'] = ids
     else:
         log.debug('Reinsert: %s %i' % (master_data['set_name'], len(master_data['record_ids']),))
     # Do not save to DB because we can't.
     # Import stage.
     model.repo.new_revision()
     subg_name = '%s - %s' % (group.name, master_data['set_name'],)
     subgroup = Group.by_name(subg_name)
     if not subgroup:
         # First time this set is seen for the group: create its subgroup.
         subgroup = Group(name=subg_name, description=subg_name)
         setup_default_user_roles(subgroup)
         subgroup.save()
     missed = []
     for ident in master_data['record_ids']:
         pkg_name = self._package_name_from_identifier(ident)
         # Package may have been omitted due to missing metadata.
         pkg = Package.get(pkg_name)
         if pkg:
             subgroup.add_package_by_name(pkg_name)
             subgroup.save()
             # Per-package logging only happens on the retry path.
             if 'set' not in master_data:
                 log.debug('Inserted %s into %s' % (pkg_name, subg_name,))
         else:
             # Either omitted due to missing metadata or fetch error.
             # In the latter case, we want to add record later once the
             # fetch succeeds after retry.
             missed.append(ident)
             if 'set' not in master_data:
                 log.debug('Omitted %s from %s' % (pkg_name, subg_name,))
     if len(missed):
         # Store missing names for retry.
         master_data['record_ids'] = missed
         if 'set' in master_data:
             del master_data['set']  # Omit fetch later.
         harvest_object.content = json.dumps(master_data)
         log.debug('Missed %s %i' % (master_data['set_name'], len(missed),))
     else:
         harvest_object.content = None  # Clear data.
     model.repo.commit()
     return True
예제 #11
0
 def _gather_stage(self, harvest_job):
     """Collect record identifiers and sets and create harvest objects.

     Lists the identifiers of the source (bounded by the job's time limits
     unless the ``force_all`` config option is set) and the source's sets,
     then saves one ``HarvestObject`` per record and per set whose JSON
     content describes what has to be fetched.

     Returns the list of ids of the created harvest objects.

     Raises ``RuntimeError`` when the source identifier cannot be obtained,
     when the identifier list cannot be fetched, or when fetching the set
     list fails with ``urllib2.URLError``.
     """
     from_until = self._get_time_limits(harvest_job)
     client, identifier = self._get_client_identifier(
         harvest_job.source.url, harvest_job)
     if not identifier:
         raise RuntimeError('Could not get source identifier.')
     # Get things to retry.
     # NOTE(review): nothing in this method fills these retry maps, so the
     # membership checks below never skip anything as written.
     ident2rec, ident2set = {}, {}
     rec_idents = []
     domain = identifier.repositoryName()
     try:
         args = {self.metadata_prefix_key: self.metadata_prefix_value}
         if not self.config.get('force_all', False):
             args.update(from_until)
         for ident in client.listIdentifiers(**args):
             record_id = ident.identifier()
             if record_id in ident2rec:
                 continue  # On our retry list already, do not fetch twice.
             rec_idents.append(record_id)
     except NoRecordsMatchError:
         # Ok. Just nothing to get.
         log.debug('No records matched: %s' % domain)
     except Exception:
         # Once we know of something specific, handle it separately.
         # format_exc() takes an optional *limit*, not the exception; it
         # formats the exception currently being handled.
         log.debug(traceback.format_exc())
         self._save_gather_error(
             'Could not fetch identifier list.', harvest_job)
         raise RuntimeError('Could not fetch an identifier list.')
     # Gathering the set list here. Member identifiers in fetch.
     sets = []
     try:
         for set_ in client.listSets():
             # Distinct names so the source ``identifier`` is not clobbered.
             set_spec, set_title, _ = set_
             # Is set due for retry and it is not missing member insertion?
             # Set either failed in retry of misses packages but not both.
             # Set with failed insertions may have new members.
             if set_title in ident2set:
                 continue
             sets.append((set_spec, set_title,))
     except NoSetHierarchyError:
         log.debug('No sets: %s' % domain)
     except urllib2.URLError:
         # Possibly timeout.
         self._save_gather_error(
             'Could not fetch a set list.', harvest_job)
         # We got something so perhaps records can gen gotten, hence [].
         raise RuntimeError('Could not fetch set list.')
     # Since network errors can't occur anymore, it's ok to create the
     # harvest objects to return to caller since we are not missing anything
     # crucial.
     harvest_objs, set_objs = [], []
     for ident in rec_idents:
         info = {'fetch_type': 'record', 'record': ident, 'domain': domain}
         harvest_obj = HarvestObject(job=harvest_job)
         harvest_obj.content = json.dumps(info)
         harvest_obj.save()
         harvest_objs.append(harvest_obj.id)
     log.info('Gathered %i records from %s.' % (len(harvest_objs), domain,))
     # Add sets to retry first.
     harvest_objs.extend(set_objs)
     for set_id, set_name in sets:
         harvest_obj = HarvestObject(job=harvest_job)
         info = {'fetch_type': 'set', 'set': set_id, 'set_name': set_name, 'domain': domain}
         # Datetimes are stored as strings so the info stays JSON-serialisable.
         if 'from_' in from_until:
             info['from_'] = self._str_from_datetime(from_until['from_'])
         if 'until' in from_until:
             info['until'] = self._str_from_datetime(from_until['until'])
         harvest_obj.content = json.dumps(info)
         harvest_obj.save()
         harvest_objs.append(harvest_obj.id)
     log.info(
         'Gathered %i records/sets from %s.' % (len(harvest_objs), domain,))
     return harvest_objs
예제 #12
0
def test_list(url):
    """List the ``oai_dc`` identifiers exposed by the repository at ``url``.

    The ``listIdentifiers`` request object is created immediately; the
    returned generator then yields ``identifier()`` for each header as it is
    consumed.
    """
    def _identifiers(headers):
        for header in headers:
            yield header.identifier()

    registry = importformats.create_metadata_registry()
    client = oaipmh.client.Client(url, registry)
    # iter() is applied up front, as a generator expression would do.
    return _identifiers(iter(client.listIdentifiers(metadataPrefix='oai_dc')))