示例#1
0
    def test_zaincremental_harvester(self):

        client = CKANServer()
        metadata_registry = metadata.MetadataRegistry()
        metadata_registry.registerReader('oai_dc', oai_dc_reader)
        metadata_registry.registerWriter('oai_dc', oai_dc_writer)
        serv = BatchingServer(client, metadata_registry=metadata_registry)
        oaipmh.client.Client = mock.Mock(return_value=ServerClient(serv, metadata_registry))
        harv = OAIPMHHarvester()
        harvest_job = HarvestJob()
        harvest_job.source = HarvestSource()
        harvest_job.source.title = "Test"
        harvest_job.source.url = "http://helda.helsinki.fi/oai/request"
        harvest_job.gather_started = ((datetime.now() + timedelta(days=1)))
        harvest_job.source.config = '{"incremental":"True"}'
        harvest_job.source.type = "OAI-PMH"
        Session.add(harvest_job)
        rev = model.repo.new_revision()
        rev.timestamp = ((datetime.now() + timedelta(days=2)))
        pkg = Package(name='footest', revision=rev)
        Session.add(pkg)
        pkg.save()
        roger = Group.get('roger')
        roger.add_package_by_name('footest')
        Session.add(roger)
        roger.save()
        gathered = harv.gather_stage(harvest_job)
        harvest_object = HarvestObject.get(gathered[0])
        harv.fetch_stage(harvest_object)
        harvobj = json.loads(harvest_object.content)
        self.assert_(harvobj['records'])
示例#2
0
    def import_stage(self, harvest_object):
        '''
        The import stage will receive a HarvestObject object and will be
        responsible for:
            - performing any necessary action with the fetched object (e.g
              create a CKAN package).
              Note: if this stage creates or updates a package, a reference
              to the package must be added to the HarvestObject.
              Additionally, the HarvestObject must be flagged as current.
            - creating the HarvestObject - Package relation (if necessary)
            - creating and storing any suitable HarvestObjectErrors that may
              occur.
            - returning True if everything went as expected, False otherwise.

        :param harvest_object: HarvestObject object
        :returns: True if everything went right, False if errors were found
        '''
        model.repo.new_revision()
        master_data = json.loads(harvest_object.content)
        domain = master_data['domain']
        group = Group.get(domain)
        if not group:
            group = Group(name=domain, description=domain)
        if 'records' in master_data:
            records = master_data['records']
            set_name = master_data['set_name']
            for rec in records:
                identifier, metadata, _ = rec
                if metadata:
                    name = metadata['title'][0] if len(metadata['title'])\
                                                else identifier
                    title = name
                    norm_title = unicodedata.normalize('NFKD', name)\
                                 .encode('ASCII', 'ignore')\
                                 .lower().replace(' ', '_')[:35]
                    slug = ''.join(e for e in norm_title
                                    if e in string.ascii_letters + '_')
                    name = slug
                    creator = metadata['creator'][0]\
                                if len(metadata['creator']) else ''
                    description = metadata['description'][0]\
                                if len(metadata['description']) else ''
                    pkg = Package.by_name(name)
                    if not pkg:
                        pkg = Package(name=name, title=title)
                    extras = {}
                    for met in metadata.items():
                        key, value = met
                        if len(value) > 0:
                            if key == 'subject' or key == 'type':
                                for tag in value:
                                    if tag:
                                        tag = munge_tag(tag[:100])
                                        tag_obj = model.Tag.by_name(tag)
                                        if not tag_obj:
                                            tag_obj = model.Tag(name=tag)
                                        if tag_obj:
                                            pkgtag = model.PackageTag(
                                                                  tag=tag_obj,
                                                                  package=pkg)
                                            Session.add(tag_obj)
                                            Session.add(pkgtag)
                            else:
                                extras[key] = ' '.join(value)
                    pkg.author = creator
                    pkg.author_email = creator
                    pkg.title = title
                    pkg.notes = description
                    pkg.extras = extras
                    pkg.url = \
                    "%s?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc"\
                                % (harvest_object.job.source.url, identifier)
                    pkg.save()
                    harvest_object.package_id = pkg.id
                    Session.add(harvest_object)
                    setup_default_user_roles(pkg)
                    url = ''
                    for ids in metadata['identifier']:
                        if ids.startswith('http://'):
                            url = ids
                    title = metadata['title'][0] if len(metadata['title'])\
                                                    else ''
                    description = metadata['description'][0]\
                                    if len(metadata['description']) else ''
                    pkg.add_resource(url, description=description, name=title)
                    group.add_package_by_name(pkg.name)
                    subg_name = "%s - %s" % (domain, set_name)
                    subgroup = Group.by_name(subg_name)
                    if not subgroup:
                        subgroup = Group(name=subg_name, description=subg_name)
                    subgroup.add_package_by_name(pkg.name)
                    Session.add(group)
                    Session.add(subgroup)
                    setup_default_user_roles(group)
                    setup_default_user_roles(subgroup)
            model.repo.commit()
        else:
            self._save_object_error('Could not receive any objects from fetch!'
                                    , harvest_object, stage='Import')
            return False
        return True
示例#3
0
def _oai_dc2ckan(data, namespaces, group, harvest_object):
    model.repo.new_revision()
    identifier = data['identifier']
    metadata_oai_dc = data['metadata']['oai_dc']
    titles = _handle_title(metadata_oai_dc.get('titleNode', []), namespaces)
    # Store title in pkg.title and keep all in extras as well. That way
    # UI will work some way in any case.
    title = titles.get('title_0', identifier)
    #title = metadata['title'][0] if len(metadata['title']) else identifier
    name = data['package_name']
    esc_identifier = identifier.replace('/','-')
    pkg = Package.get(esc_identifier)
    if not pkg:
        pkg = Package(name=name, title=title, id=esc_identifier)
        pkg.save()
        setup_default_user_roles(pkg)
    else:
        log.debug('Updating: %s' % name)
        # There are old resources which are replaced by new ones if they are
        # relevant anymore so "delete" all existing resources now.
        for r in pkg.resources:
            r.state = 'deleted'
    extras = titles
    idx = 0
    for s in ('subject', 'type'):
        for tag in metadata_oai_dc.get(s, []):
            # Turn each subject or type field into it's own tag.
            tagi = tag.strip()
            if tagi.startswith('http://www.yso.fi'):
                tags = label_list_yso(tagi)
                extras['tag_source_%i' % idx] = tagi
                idx += 1
            elif tagi.startswith('http://') or tagi.startswith('https://'):
                extras['tag_source_%i' % idx] = tagi
                idx += 1
                tags = []  # URL tags break links in UI.
            else:
                tags = [tagi]
            for tagi in tags:
                tagi = tagi[:100]  # 100 char limit in DB.
                #tagi = munge_tag(tagi[:100]) # 100 char limit in DB.
                tag_obj = model.Tag.by_name(tagi)
                if not tag_obj:
                    tag_obj = model.Tag(name=tagi)
                    tag_obj.save()
                pkgtag = model.Session.query(model.PackageTag).filter(
                    model.PackageTag.package_id == pkg.id).filter(
                    model.PackageTag.tag_id == tag_obj.id).limit(1).first()
                if pkgtag is None:
                    pkgtag = model.PackageTag(tag=tag_obj, package=pkg)
                    pkgtag.save()  # Avoids duplicates if tags have duplicates.
    lastidx = 0
    for auth in metadata_oai_dc.get('creator', []):
        extras['organization_%d' % lastidx] = ''
        extras['author_%d' % lastidx] = auth
        lastidx += 1
    extras.update(_handle_contributor(metadata_oai_dc.get('contributorNode', []), namespaces))
    extras.update(_handle_publisher(metadata_oai_dc.get('publisherNode', []), namespaces))
    # This value belongs to elsewhere.
    if 'package.maintainer_email' in extras:
        pkg.maintainer_email = extras['package.maintainer_email']
        del extras['package.maintainer_email']
    extras.update(_handle_rights(metadata_oai_dc.get('rightsNode', []), namespaces))
    if 'package.license' in extras:
        pkg.license = extras['package.license']
        del extras['package.license']
    # Causes failure in commit for some reason.
    #for f in _handle_format(metadata.get('formatNode', []), namespaces):
    #    pprint.pprint(f)
    #    pkg.add_resource(**f)
    # There may be multiple identifiers (URL, ISBN, ...) in the metadata.
    id_idx = 0
    for ident in metadata_oai_dc.get('identifier', []):
        extras['identifier_%i' % id_idx] = ident
        id_idx += 1
    # Check that we have a language.
    lang = metadata_oai_dc.get('language', [])
    if lang and len(lang) and len(lang[0]) > 1:
        pkg.language = lang[0]
    if 'date' in extras:
        pkg.version = extras['date']
        del extras['date']
    pkg.extras = extras
    pkg.url = data['package_url']
    
    # Metadata may have different identifiers, pick link, if exists.
    for ids in metadata_oai_dc['identifier']:
        if ids.startswith('http://') or ids.startswith('https://'):
            pkg.add_resource(ids, name=pkg.title, format='html')
    # All belong to the main group even if they do not belong to any set.
    if group:
        group.add_package_by_name(pkg.name)
    # The rest.
    # description below goes to pkg.notes. I think it should not added here.
    for mdp, metadata in data['metadata'].items():
        for key, value in metadata.items():
            if value is None or len(value) == 0 or key in ('titleNode', 'subject', 'type', 'rightsNode',
                                                           'publisherNode', 'creator', 'contributorNode',
                                                           'description', 'identifier', 'language', 'formatNode'):
                continue
            extras[key] = ' '.join(value)
        #description = metadata['description'][0] if len(metadata['description']) else ''
        notes = ' '.join(metadata.get('description', []))
        pkg.notes = notes.replace('\n', ' ').replace('  ', ' ')
    
    for mdp, resource in data['package_resource'].items():
        ofs = get_ofs()
        ofs.put_stream(BUCKET, data['package_xml_save'][mdp]['label'], data['package_xml_save'][mdp]['xml'], {})
        pkg.add_resource(**(resource))
    
    if harvest_object:
        harvest_object.package_id = pkg.id
        harvest_object.content = None
        harvest_object.current = True
        harvest_object.save()
    
    model.repo.commit()
    return pkg.id
def _oai_dc2ckan(data, namespaces, group, harvest_object):
    model.repo.new_revision()
    identifier = data['identifier']
    metadata = data['metadata']
    # Store title in pkg.title and keep all in extras as well. That way
    # UI will work some way in any case.
    title = metadata.get('title', identifier)[0]
    #title = metadata['title'][0] if len(metadata['title']) else identifier
    name = data['package_name']
    pkg = Package.get(name)
    if not pkg:
        pkg = Package(name=name, title=title, id=identifier)
        pkg.save()
        setup_default_user_roles(pkg)
    else:
        log.debug('Updating: %s' % name)
        # There are old resources which are replaced by new ones if they are
        # relevant anymore so "delete" all existing resources now.
        for r in pkg.resources:
            r.state = 'deleted'
    extras = {}
    idx = 0
    for s in ('subject', 'type',):
        for tag in metadata.get(s, []):
            # Turn each subject or type field into it's own tag.
            tagi = tag.strip()
            if tagi.startswith('http://') or tagi.startswith('https://'):
                extras['tag_source_%i' % idx] = tagi
                idx += 1
                tags = []  # URL tags break links in UI.
            else:
                tags = [tagi]
            for tagi in tags:
                tagi = tagi[:100]  # 100 char limit in DB.
                tag_obj = model.Tag.by_name(tagi)
                if not tag_obj:
                    tag_obj = model.Tag(name=tagi)
                    tag_obj.save()
                pkgtag = model.Session.query(model.PackageTag).filter(
                    model.PackageTag.package_id == pkg.id).filter(
                        model.PackageTag.tag_id == tag_obj.id
                    ).limit(1).first()
                if pkgtag is None:
                    pkgtag = model.PackageTag(tag=tag_obj, package=pkg)
                    pkgtag.save()  # Avoids duplicates if tags have duplicates.
    extras.update(
        _handle_contributor(metadata.get('contributorNode', []), namespaces))
    extras.update(
        _handle_publisher(metadata.get('publisherNode', []), namespaces))
    # This value belongs to elsewhere.
    if 'package.maintainer_email' in extras:
        pkg.maintainer_email = extras['package.maintainer_email']
        del extras['package.maintainer_email']
    extras.update(_handle_rights(metadata.get('rightsNode', []), namespaces))
    if 'package.license' in extras:
        pkg.license = extras['package.license']
        del extras['package.license']
    # Check that we have a language.
    lang = metadata.get('language', [])
    if lang is not None and len(lang) and len(lang[0]) > 1:
        pkg.language = lang[0]
    # The rest.
    # description below goes to pkg.notes. I think it should not added here.
    for key, value in metadata.items():
        if value is None or len(value) == 0 or key in (
            'title',
            'description',
            'publisherNode',
            'contributorNode',
            'formatNode',
            'identifier',
            'source',
            'rightsNode'
        ):
            continue
        extras[key] = value[0]
    #description = metadata['description'][0] if len(metadata['description']) else ''
    notes = ' '.join(metadata.get('description', []))
    pkg.notes = notes.replace('\n', ' ').replace('  ', ' ')
    if 'date' in extras:
        pkg.version = extras['date']
        extras['modified'] = extras['date']
        del extras['date']
    pkg.extras = extras
    pkg.url = data['package_url']
    if 'package_resource' in data:
        try:
            ofs = get_ofs()
            ofs.put_stream(BUCKET, data['package_xml_save']['label'], data['package_xml_save']['xml'], {})
            pkg.add_resource(**(data['package_resource']))
        except KeyError:
            pass
    if harvest_object is not None:
        harvest_object.package_id = pkg.id
        harvest_object.content = None
        harvest_object.current = True
        harvest_object.save()
    # Metadata may have different identifiers, pick link, if exists.

    # See: https://github.com/okfn/ckan/blob/master/ckan/public/base/images/sprite-resource-icons.png
    # "Data" format is used by CKAN to identify unknown resources.
    # You can use it if you want (default format is "html"). For example:
    # - http://my.data.com/my-generated-resource?data
    # - http://my.data.com/my-resource.data
    available_formats = ['data', 'rdf', 'pdf', 'api', 'zip', 'xls', 'csv', 'txt', 'xml', 'json', 'html']
    default_format = 'html'

    for ids in metadata['identifier']:
        if ids.startswith('http://') or ids.startswith('https://'):
            # The end of the URL must be the format, otherwise it will use "html" by default
            infer_format = default_format

            for ext in available_formats:
                if ids.endswith(ext):
                    infer_format = ext

            pkg.add_resource(ids, name=pkg.title, format=infer_format)
    # All belong to the main group even if they do not belong to any set.
    if group is not None:
        group.add_package_by_name(pkg.name)
    model.repo.commit()
    return pkg.id