Example #1
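A unit test for helpers.split_disciplines: a comma-separated pair must split into separate disciplines, while the compound Finnish name 'Tiede-, taide- ja liikuntakasvatus' must survive as a single entry.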
    def test_discipline_split(self):
        '''
        RDF export would look odd if the discipline-splitting function broke
        '''
        disciplines = helpers.split_disciplines('Matematiikka,Fysiikka')
        assert disciplines[1] == 'Fysiikka'
        assert helpers.split_disciplines('Tiede-, taide- ja liikuntakasvatus')[0] == 'Tiede-, taide- ja liikuntakasvatus'
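The helper itself is not shown on this page. A minimal sketch that satisfies both assertions, assuming the rule is "split on commas not preceded by a hyphen"; the real helpers.split_disciplines may differ:

    import re

    def split_disciplines(value):
        # Hypothetical reimplementation: split on commas that do not follow a
        # hyphen, so compound names like 'Tiede-, taide- ja liikuntakasvatus'
        # stay whole while 'Matematiikka,Fysiikka' splits in two.
        if not value:
            return []
        return [part.strip() for part in re.split(r'(?<!-),', value)]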
Example #2
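A graph_from_dataset profile method, the hook ckanext-dcat calls when serializing a dataset. It maps an Etsin dataset dict onto a DCAT graph: identifiers, translated titles and descriptions, agents (rights holders, authors, funders), contacts, tags, dates, events, citation, distribution, rights, and spatial/temporal coverage. split_disciplines is used near the end to emit one dct:subject per discipline.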
    def graph_from_dataset(self, dataset_dict, dataset_ref):

        g = self.g

        for prefix, namespace in namespaces.iteritems():
            g.bind(prefix, namespace)

        g.add((dataset_ref, RDF.type, DCAT.Dataset))

        # Etsin: homepage
        uri = url_for(controller='package', action='read',
                      id=dataset_dict.get('name'), qualified=True)
        g.add((dataset_ref, FOAF.homepage, URIRef(uri)))

        # Etsin: primary identifiers
        data_pids = get_pids_by_type('data', dataset_dict)
        for pid in data_pids:
            g.add((dataset_ref, ADMS.identifier, URIRef(pid.get('id'))))

        version_pids = get_pids_by_type('version', dataset_dict)
        for pid in version_pids:
            g.add((dataset_ref, DCT.identifier, URIRef(pid.get('id'))))
            g.add((dataset_ref, DCT.isVersionOf, URIRef(pid.get('id'))))

        # Etsin: Title and Description, including translations
        items = [
            (DCT.title, 'langtitle', 'title'),
            (DCT.description, 'notes'),
        ]

        for item in items:
            self._add_translated_triple_from_dict(
                dataset_dict, dataset_ref, *item)

        # Etsin: Agents
        for agent in dataset_dict.get('agent', []):
            agent_role = agent.get('role')
            agent_id = agent.get('id')

            # Rights Holders
            if agent_role in ['owner', 'distributor']:
                name = agent.get('name', None)

                if agent_role == 'owner':
                    if not get_if_url(agent.get('name')):
                        name = agent.get('name', agent.get('organisation', ''))
                    nodetype = DCT.rightsHolder

                if agent_role == 'distributor':
                    nodetype = DCT.publisher

                agent_node_ref = BNode()
                g.add((agent_node_ref, RDF.type, FOAF.Agent))
                g.add((dataset_ref, nodetype, agent_node_ref))
                g.add((agent_node_ref, FOAF.name, Literal(name)))
                if agent_id:
                    g.add((agent_node_ref, DCT.identifier, Literal(agent_id)))

            # Authors
            if agent_role in ['author', 'contributor']:
                if agent_role == 'author':
                    nodetype = DCT.creator

                if agent_role == 'contributor':
                    nodetype = DCT.contributor

                organization_ref = BNode()
                agent_ref = BNode()
                memberof_ref = BNode()
                creator_ref = BNode()

                g.add((organization_ref, FOAF.name, Literal(
                    agent.get('organisation', None))))
                g.add((memberof_ref, FOAF.organization, organization_ref))
                g.add((agent_ref, ORG.memberOf, memberof_ref))
                g.add((agent_ref, FOAF.name, Literal(agent.get('name', None))))
                g.add((creator_ref, FOAF.Agent, agent_ref))
                g.add((dataset_ref, nodetype, creator_ref))

                if agent_id:
                    g.add((agent_ref, DCT.identifier, Literal(agent_id)))

            # Funders
            if agent.get('role') == 'funder':
                organization_ref = BNode()
                memberof_ref = BNode()
                project_ref = BNode()
                isoutputof_ref = BNode()

                agent_url = agent.get('URL')
                if agent_url:
                    g.add((project_ref, FOAF.homepage, Literal(agent_url)))

                funding_id = agent.get('fundingid')
                if funding_id:
                    g.add((project_ref, RDFS.comment, Literal(funding_id)))

                g.add((organization_ref, FOAF.name, Literal(
                    agent.get('organisation', None))))
                g.add((memberof_ref, FOAF.organization, organization_ref))
                g.add((project_ref, ORG.memberOf, memberof_ref))

                agent_name = agent.get('name', None)
                g.add((project_ref, FOAF.name, Literal(agent_name)))

                if agent_id:
                    g.add((project_ref, DCT.identifier, Literal(agent_id)))

                g.add((isoutputof_ref, FOAF.Project, project_ref))
                g.add((dataset_ref, FRAPO.isOutputOf, isoutputof_ref))

        # Etsin: Publishers
        for contact in dataset_dict.get('contact', []):
            agent_node_ref = BNode()
            agent_id = contact.get('id')

            g.add((agent_node_ref, RDF.type, FOAF.Agent))
            g.add((dataset_ref, DCT.publisher, agent_node_ref))

            contact_name = contact.get('name', None)
            g.add((agent_node_ref, FOAF.name, Literal(contact_name)))
            if agent_id:
                g.add((agent_node_ref, DCT.identifier, Literal(agent_id)))

            contact_email = contact.get('email')
            if contact_email and contact_email != 'hidden':
                g.add((agent_node_ref, FOAF.mbox,
                       URIRef("mailto:" + contact_email)))

            contact_url = contact.get('URL')
            if contact_url:
                g.add((agent_node_ref, FOAF.homepage, URIRef(contact_url)))

            contact_phone = contact.get('phone')
            if contact_phone:
                g.add((agent_node_ref, FOAF.phone,
                       URIRef("tel:" + contact_phone)))

        # Etsin: Organization
        organization_name = resolve_org_name(dataset_dict.get('owner_org'))
        publisher_ref = BNode()
        g.add((dataset_ref, DCT.publisher, publisher_ref))
        g.add((publisher_ref, FOAF.organization, Literal(organization_name)))

        # Etsin: Tags - can be URLs or user inputted keywords
        # TODO: resolve URLs from Finto. Currently get_label_for_uri() breaks
        # RDFlib.
        for tag in dataset_dict.get('tags', []):
            display_name = tag.get('display_name')
            g.add((dataset_ref, DCAT.keyword, Literal(display_name)))
            tag_name = tag.get('name')
            if is_url(tag_name):
                g.add((dataset_ref, DCAT.theme, URIRef(tag_name)))

        # Etsin: Dates
        # Peter: Issued-field is new. This used to be inside CatalogRecord.
        items = [
            ('issued', DCT.issued, ['metadata_created'], Literal),
            ('modified', DCT.modified, ['metadata_modified'], Literal),
        ]
        self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

        # Etsin: Events
        for event in dataset_dict.get('event', []):
            event_ref = BNode()
            g.add((dataset_ref, DCT.event, event_ref))
            g.add((event_ref, DCT.type, Literal(event.get('type'))))
            g.add((event_ref, DCT.creator, Literal(event.get('who'))))
            g.add((event_ref, DCT.date, Literal(str(event.get('when')))))
            g.add((event_ref, DCT.description, Literal(event.get('descr'))))

        # Etsin: Citation
        citation = dataset_dict.get('citation')
        if citation:
            g.add((dataset_ref, DCT.bibliographicCitation, Literal(citation)))

        # Etsin: Distribution
        availability_list = ['access_application',
                             'access_request', 'through_provider']

        checksum_ref = BNode()
        checksum_parent_ref = BNode()
        distribution_ref = BNode()
        dist_parent_ref = BNode()

        if dataset_dict.get('availability') == 'direct_download':
            access_url = get_download_url(dataset_dict)
            g.add((distribution_ref, DCAT.downloadURL, Literal(access_url)))

        checksum = dataset_dict.get('checksum')
        algorithm = dataset_dict.get('algorithm')
        if checksum and algorithm:
            g.add((checksum_ref, SPDX.checksumValue, Literal(checksum)))
            g.add((checksum_ref, SPDX.algorithm, Literal(algorithm)))
            g.add((checksum_parent_ref, SPDX.Checksum, checksum_ref))
            g.add((distribution_ref, SPDX.checksum, checksum_parent_ref))

        if dataset_dict.get('availability') in availability_list:
            access_url = get_download_url(dataset_dict)
            g.add((distribution_ref, DCAT.accessURL, Literal(access_url)))

        mimetype = dataset_dict.get('mimetype')
        if mimetype:
            g.add((distribution_ref, DCAT.mediaType, Literal(mimetype)))

        dist_format = dataset_dict.get('format')
        if dist_format:
            g.add((distribution_ref, DCT['format'], Literal(dist_format)))

        g.add((dist_parent_ref, DCAT.Distribution, distribution_ref))
        g.add((dataset_ref, DCAT.distribution, dist_parent_ref))

        # Etsin: Disciplines
        disciplines = dataset_dict.get('discipline', '')
        for discipline in split_disciplines(disciplines):
            if is_url(discipline):
                disc = URIRef(discipline)

            else:
                disc = Literal(discipline)
            g.add((dataset_ref, DCT.subject, disc))

        # Etsin: Rights Declaration
        # Peter: There's no way to add an xmlns attribute under
        # the parent <DCT:rights> in rdflib
        category, declarations = get_rightscategory(dataset_dict)
        declaration_strings = ''
        for declaration in declarations:
            declaration_strings += u'<RightsDeclaration>{}</RightsDeclaration>\n'\
                .format(declaration)
        xml_string = u'<RightsDeclarationMD RIGHTSCATEGORY="{}" \
            xmlns="http://www.loc.gov/METS/" >\n{}</RightsDeclarationMD>'\
            .format(category, declaration_strings)

        license_url = dataset_dict.get('license_URL')

        rights_ref = BNode()
        g.add((dataset_ref, DCT.rights, rights_ref))
        g.add((rights_ref, DCT.RightsStatement, Literal(
            xml_string, datatype=RDF.XMLLiteral)))
        g.add((rights_ref, DCT.RightsStatement, Literal(license_url)))

        # Etsin: Spatial
        coverage = dataset_dict.get('geographic_coverage')
        if coverage:
            spatial_ref = BNode()
            location_ref = BNode()
            g.add((location_ref, RDFS.label, Literal(coverage)))
            g.add((spatial_ref, DCT.Location, location_ref))
            g.add((dataset_ref, DCT.spatial, spatial_ref))

        # Etsin: Temporal
        # Peter: hasBeginning and hasEnd left out
        temporal_coverage_begin = dataset_dict.get('temporal_coverage_begin')
        temporal_coverage_end = dataset_dict.get('temporal_coverage_end')
        if temporal_coverage_begin or temporal_coverage_end:
            temporal_extent = BNode()

            g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
            if temporal_coverage_begin:
                self._add_date_triple(
                    temporal_extent, SCHEMA.startDate, temporal_coverage_begin)

            if temporal_coverage_end:
                self._add_date_triple(
                    temporal_extent, SCHEMA.endDate, temporal_coverage_end)

            g.add((dataset_ref, DCT.temporal, temporal_extent))

        # Etsin: the language field needs its spaces stripped
        langs = self._get_dict_value(dataset_dict, 'language', '').split(', ')
        for lang in langs:
            params = (dataset_ref, DCAT.language, Literal(lang))
            self.g.add(params)
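A profile like this is normally driven by ckanext-dcat's serializer rather than called directly. A usage sketch, where the profile registration name 'etsin_dcat' and the dataset_dict variable are assumptions:

    from ckanext.dcat.processors import RDFSerializer

    # Serialize one dataset dict through the profile above; 'etsin_dcat'
    # is a hypothetical registration name for this profile class.
    serializer = RDFSerializer(profiles=['etsin_dcat'])
    print(serializer.serialize_dataset(dataset_dict, _format='xml'))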
Example #3
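An IPackageController before_index hook. Before the package dict reaches Solr it blanks the organization facet, indexes resource mimetypes and extracted text, flattens agent_N_role fields into role-keyed entries, hides contact emails, resolves discipline and keyword URIs to labels via split_disciplines and _resolve_labels, pads partial temporal dates to full ISO 8601, and encrypts the contact emails stored in validated_data_dict.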
    def before_index(self, pkg_dict):
        '''
        Modify the package dictionary before it is
        indexed in Solr: for example, add resource
        mimetypes to the index, reshape agents and
        hide email addresses.

        :param pkg_dict: pkg_dict to modify
        :returns: the modified package dict to be indexed
        '''
        EMAIL = re.compile(r'.*contact_\d*_email')

        # Add res_mimetype to pkg_dict. Can be removed after res_mimetype is
        # added to CKAN's index function.
        data = json.loads(pkg_dict['data_dict'])
        # We do not want owner_org in the organization facets. Note that
        # owner_org.name is an id in our case and thus not human-readable.
        pkg_dict['organization'] = ''
        pkg_dict['isopen'] = data.get('isopen')

        pkg_dict['res_mimetype'] = [res['mimetype'] for res in data.get('resources', []) if res.get('mimetype')]

        # Extract plain text from resources and add to the data dict for indexing
        for resource in data.get('resources', []):
            if resource['resource_type'] in ('file', 'file.upload'):
                try:
                    text = extractor.extract_text(resource['url'], resource['format'])
                except IOError as ioe:
                    log.debug(str(ioe))
                    text = ""
                if text:
                    all_text = pkg_dict.get('res_text_contents', '')
                    all_text += (text + '\n')
                    pkg_dict['res_text_contents'] = all_text

        # Separate agent roles for Solr indexing

        new_items = {}

        for key, value in pkg_dict.iteritems():
            tokens = key.split('_')
            if tokens[0] == 'agent' and tokens[2] == 'role':
                role_idx = '{role}_{id}'.format(role=value, id=tokens[1])        # Must not be unicode
                org_idx = 'organization_{id}'.format(id=tokens[1])

                agent_name = pkg_dict.get('agent_{id}_name'.format(id=tokens[1]))
                agent_org = pkg_dict.get('agent_{id}_organisation'.format(id=tokens[1]))
                agent_id = pkg_dict.get('agent_{id}_id'.format(id=tokens[1]))

                if agent_name:
                    new_items[role_idx] = agent_name
                    new_items['agent_name_' + tokens[1]] = agent_name
                if agent_org:
                    new_items[org_idx] = agent_org
                    new_items['agent_name_' + tokens[1] + '_org'] = agent_org
                if agent_id:
                    new_items['agent_name_' + tokens[1] + '_id'] = agent_id

            # hide sensitive data
            if EMAIL.match(key):
                pkg_dict[key] = u''

        pkg_dict.update(new_items)

        # hide sensitive data
        for item in data.get('extras', []):
            if EMAIL.match(item['key']):
                item['value'] = u''

        # Resolve uri labels and add them to the Solr index.
        # Discipline field in pkg_dict is of type comma-separated-string, while
        # tags are given already as a list
        split_disciplines = helpers.split_disciplines(pkg_dict.get('discipline'))
        pkg_dict['extras_discipline_resolved'] = self._resolve_labels(split_disciplines, 'okm-tieteenala')
        pkg_dict['extras_keywords_resolved'] = self._resolve_labels(pkg_dict.get('tags'), 'koko')

        # Make dates compliant with the ISO 8601 format used by Solr.
        # We assume the input is a partial date (YYYY or YYYY-MM) that already
        # complies with the standard, e.g. a four-digit year (1583-9999) and a
        # two-digit month.
        DATE_TEMPLATES = {'temporal_coverage_begin': '2000-01-01T00:00:00Z',
                          'temporal_coverage_end': '2000-12-31T23:59:59Z'}

        for temporal_field, date_template in DATE_TEMPLATES.iteritems():
            temporal_date = pkg_dict.get(temporal_field)

            # Remove time zone as Solr doesn't support it.
            # NOTE: Date time is not converted to UTC, but time zone is just stripped. Could be converted with arrow.
            if temporal_date:
                try:
                    datetime_obj = iso8601.parse_date(temporal_date)
                    temporal_date = datetime_obj.replace(tzinfo=None).isoformat()

                    pkg_dict[temporal_field] = temporal_date + date_template[len(temporal_date):]
                except iso8601.ParseError:
                    temporal_date = ''

            if temporal_date == '':
                # Remove empty strings as they won't fit into Solr's TrieDateField
                pkg_dict.pop(temporal_field)

        self._handle_titles(pkg_dict)

        validated = json.loads(pkg_dict.get('validated_data_dict'))

        _crypt = self._crypto()

        try:
            for item in validated.get('contact'):
                for k_ in item.iterkeys():
                    if k_ == u'email':
                        item[k_] = base64.b64encode(_crypt.encrypt(self._pad(unicode(item[k_], "utf-8"))))
        except TypeError:
            # Harvest sources
            pass

        pkg_dict['validated_data_dict'] = json.dumps(validated)

        pkg_dict['data_dict'] = json.dumps(data)

        return pkg_dict
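The date handling above completes a partial date against a full ISO 8601 template by slicing the template from the point where the partial value ends. A small demonstration with a hypothetical input:

    # 'temporal_coverage_begin' template from DATE_TEMPLATES above
    template = '2000-01-01T00:00:00Z'
    partial = '2014-06'  # hypothetical YYYY-MM value from pkg_dict
    padded = partial + template[len(partial):]
    assert padded == '2014-06-01T00:00:00Z'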
Example #4
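A later revision of the profile in Example #2: it returns early when no primary PID exists, emits that PID as adms:identifier, maps relation PIDs (isNewVersionOf, isPreviousVersionOf, hasPart, isPartOf) onto the corresponding DCT terms, and passes every minted URI through remove_trailing_spaces.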
    def graph_from_dataset(self, dataset_dict, dataset_ref):
        primary_pid = get_primary_pid(dataset_dict)
        if not primary_pid:
            return

        g = self.g

        for prefix, namespace in namespaces.iteritems():
            g.bind(prefix, namespace)

        g.add((dataset_ref, RDF.type, DCAT.Dataset))

        # Etsin: homepage
        uri = url_for(controller='package',
                      action='read',
                      id=dataset_dict.get('name'),
                      qualified=True)
        g.add(
            (dataset_ref, FOAF.homepage, URIRef(remove_trailing_spaces(uri))))

        # Etsin: primary identifier
        g.add((dataset_ref, ADMS.identifier,
               URIRef(remove_trailing_spaces(primary_pid))))

        # Etsin: Relation identifiers
        relation_pids = get_pids_by_type('relation', dataset_dict)
        for rpid in relation_pids:
            if rpid.get('relation') == 'isNewVersionOf' or rpid.get(
                    'relation') == 'isPreviousVersionOf':
                g.add((dataset_ref, DCT.isVersionOf,
                       URIRef(remove_trailing_spaces(rpid.get('id')))))
            elif rpid.get('relation') == 'hasPart':
                g.add((dataset_ref, DCT.hasPart,
                       URIRef(remove_trailing_spaces(rpid.get('id')))))
            elif rpid.get('relation') == 'isPartOf':
                g.add((dataset_ref, DCT.isPartOf,
                       URIRef(remove_trailing_spaces(rpid.get('id')))))
            else:
                g.add((dataset_ref, DCT.identifier,
                       URIRef(remove_trailing_spaces(rpid.get('id')))))

        # Etsin: Title and Description, including translations
        items = [
            (DCT.title, 'langtitle', 'title'),
            (DCT.description, 'notes'),
        ]

        for item in items:
            self._add_translated_triple_from_dict(dataset_dict, dataset_ref,
                                                  *item)

        # Etsin: Agents
        for agent in dataset_dict.get('agent', []):
            agent_role = agent.get('role')
            agent_id = agent.get('id')

            # Rights Holders
            if agent_role in ['owner', 'distributor']:
                name = agent.get('name', None)

                if agent_role == 'owner':
                    if not get_if_url(agent.get('name')):
                        name = agent.get('name', agent.get('organisation', ''))
                    nodetype = DCT.rightsHolder

                if agent_role == 'distributor':
                    nodetype = DCT.publisher

                agent_node_ref = BNode()
                g.add((agent_node_ref, RDF.type, FOAF.Agent))
                g.add((dataset_ref, nodetype, agent_node_ref))
                g.add((agent_node_ref, FOAF.name, Literal(name)))
                if agent_id:
                    g.add((agent_node_ref, DCT.identifier, Literal(agent_id)))

            # Authors
            if agent_role in ['author', 'contributor']:
                if agent_role == 'author':
                    nodetype = DCT.creator

                if agent_role == 'contributor':
                    nodetype = DCT.contributor

                organization_ref = BNode()
                agent_ref = BNode()
                memberof_ref = BNode()
                creator_ref = BNode()

                g.add((organization_ref, FOAF.name,
                       Literal(agent.get('organisation', None))))
                g.add((memberof_ref, FOAF.organization, organization_ref))
                g.add((agent_ref, ORG.memberOf, memberof_ref))
                g.add((agent_ref, FOAF.name, Literal(agent.get('name', None))))
                g.add((creator_ref, FOAF.Agent, agent_ref))
                g.add((dataset_ref, nodetype, creator_ref))

                if agent_id:
                    g.add((agent_ref, DCT.identifier, Literal(agent_id)))

            # Funders
            if agent.get('role') == 'funder':
                organization_ref = BNode()
                memberof_ref = BNode()
                project_ref = BNode()
                isoutputof_ref = BNode()

                agent_url = agent.get('URL')
                if agent_url:
                    g.add((project_ref, FOAF.homepage, Literal(agent_url)))

                funding_id = agent.get('fundingid')
                if funding_id:
                    g.add((project_ref, RDFS.comment, Literal(funding_id)))

                g.add((organization_ref, FOAF.name,
                       Literal(agent.get('organisation', None))))
                g.add((memberof_ref, FOAF.organization, organization_ref))
                g.add((project_ref, ORG.memberOf, memberof_ref))

                agent_name = agent.get('name', None)
                g.add((project_ref, FOAF.name, Literal(agent_name)))

                if agent_id:
                    g.add((project_ref, DCT.identifier, Literal(agent_id)))

                g.add((isoutputof_ref, FOAF.Project, project_ref))
                g.add((dataset_ref, FRAPO.isOutputOf, isoutputof_ref))

        # Etsin: Publishers
        for contact in dataset_dict.get('contact', []):
            agent_node_ref = BNode()
            agent_id = contact.get('id')

            g.add((agent_node_ref, RDF.type, FOAF.Agent))
            g.add((dataset_ref, DCT.publisher, agent_node_ref))

            contact_name = contact.get('name', None)
            g.add((agent_node_ref, FOAF.name, Literal(contact_name)))
            if agent_id:
                g.add((agent_node_ref, DCT.identifier, Literal(agent_id)))

            contact_email = contact.get('email')
            if contact_email and contact_email != 'hidden':
                g.add((agent_node_ref, FOAF.mbox,
                       URIRef("mailto:" +
                              remove_trailing_spaces(contact_email))))

            contact_url = contact.get('URL')
            if contact_url:
                g.add((agent_node_ref, FOAF.homepage,
                       URIRef(remove_trailing_spaces(contact_url))))

            contact_phone = remove_all_spaces(contact.get('phone'))
            if contact_phone:
                g.add((agent_node_ref, FOAF.phone,
                       URIRef("tel:" + remove_trailing_spaces(contact_phone))))

        # Etsin: Organization
        organization_name = resolve_org_name(dataset_dict.get('owner_org'))
        publisher_ref = BNode()
        g.add((dataset_ref, DCT.publisher, publisher_ref))
        g.add((publisher_ref, FOAF.organization, Literal(organization_name)))

        # Etsin: Tags - can be URLs or user inputted keywords
        # TODO: resolve URLs from Finto. Currently get_label_for_uri() breaks
        # RDFlib.
        for tag in dataset_dict.get('tags', []):
            display_name = tag.get('display_name')
            g.add((dataset_ref, DCAT.keyword, Literal(display_name)))
            tag_name = tag.get('name')
            if is_url(tag_name):
                g.add((dataset_ref, DCAT.theme,
                       URIRef(remove_trailing_spaces(tag_name))))

        # Etsin: Dates
        # Peter: Issued-field is new. This used to be inside CatalogRecord.
        items = [
            ('issued', DCT.issued, ['metadata_created'], Literal),
            ('modified', DCT.modified, ['metadata_modified'], Literal),
        ]
        self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

        # Etsin: Events
        for event in dataset_dict.get('event', []):
            event_ref = BNode()
            g.add((dataset_ref, DCT.event, event_ref))
            g.add((event_ref, DCT.type, Literal(event.get('type'))))
            g.add((event_ref, DCT.creator, Literal(event.get('who'))))
            g.add((event_ref, DCT.date, Literal(str(event.get('when')))))
            g.add((event_ref, DCT.description, Literal(event.get('descr'))))

        # Etsin: Citation
        citation = dataset_dict.get('citation')
        if citation:
            g.add((dataset_ref, DCT.bibliographicCitation, Literal(citation)))

        # Etsin: Distribution
        availability_list = [
            'access_application_rems', 'access_application_other',
            'access_request'
        ]

        checksum_ref = BNode()
        checksum_parent_ref = BNode()
        distribution_ref = BNode()
        dist_parent_ref = BNode()

        if dataset_dict.get('availability') == 'direct_download':
            access_url = get_download_url(dataset_dict)
            g.add((distribution_ref, DCAT.downloadURL, Literal(access_url)))

        checksum = dataset_dict.get('checksum')
        algorithm = dataset_dict.get('algorithm')
        if checksum and algorithm:
            g.add((checksum_ref, SPDX.checksumValue, Literal(checksum)))
            g.add((checksum_ref, SPDX.algorithm, Literal(algorithm)))
            g.add((checksum_parent_ref, SPDX.Checksum, checksum_ref))
            g.add((distribution_ref, SPDX.checksum, checksum_parent_ref))

        if dataset_dict.get('availability') in availability_list:
            access_url = get_download_url(dataset_dict)
            g.add((distribution_ref, DCAT.accessURL, Literal(access_url)))

        mimetype = dataset_dict.get('mimetype')
        if mimetype:
            g.add((distribution_ref, DCAT.mediaType, Literal(mimetype)))

        dist_format = dataset_dict.get('format')
        if dist_format:
            g.add((distribution_ref, DCT['format'], Literal(dist_format)))

        g.add((dist_parent_ref, DCAT.Distribution, distribution_ref))
        g.add((dataset_ref, DCAT.distribution, dist_parent_ref))

        # Etsin: Disciplines
        disciplines = dataset_dict.get('discipline', '')
        for discipline in split_disciplines(disciplines):
            if is_url(discipline):
                disc = URIRef(remove_trailing_spaces(discipline))

            else:
                disc = Literal(discipline)
            g.add((dataset_ref, DCT.subject, disc))

        # Etsin: Rights Declaration
        # Peter: There's no way to add an xmlns attribute under
        # the parent <DCT:rights> in rdflib
        category, declarations = get_rightscategory(dataset_dict)
        declaration_strings = ''
        for declaration in declarations:
            declaration_strings += u'<RightsDeclaration>{}</RightsDeclaration>\n'\
                .format(declaration)
        xml_string = u'<RightsDeclarationMD RIGHTSCATEGORY="{}" \
            xmlns="http://www.loc.gov/METS/" >\n{}</RightsDeclarationMD>'\
            .format(category, declaration_strings)

        license_url = dataset_dict.get('license_URL')

        rights_ref = BNode()
        g.add((dataset_ref, DCT.rights, rights_ref))
        g.add((rights_ref, DCT.RightsStatement,
               Literal(xml_string, datatype=RDF.XMLLiteral)))
        g.add((rights_ref, DCT.RightsStatement, Literal(license_url)))

        # Etsin: Spatial
        coverage = dataset_dict.get('geographic_coverage')
        if coverage:
            spatial_ref = BNode()
            location_ref = BNode()
            g.add((location_ref, RDFS.label, Literal(coverage)))
            g.add((spatial_ref, DCT.Location, location_ref))
            g.add((dataset_ref, DCT.spatial, spatial_ref))

        # Etsin: Temporal
        # Peter: hasBeginning and hasEnd left out
        temporal_coverage_begin = dataset_dict.get('temporal_coverage_begin')
        temporal_coverage_end = dataset_dict.get('temporal_coverage_end')
        if temporal_coverage_begin or temporal_coverage_end:
            temporal_extent = BNode()

            g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
            if temporal_coverage_begin:
                self._add_date_triple(temporal_extent, SCHEMA.startDate,
                                      temporal_coverage_begin)

            if temporal_coverage_end:
                self._add_date_triple(temporal_extent, SCHEMA.endDate,
                                      temporal_coverage_end)

            g.add((dataset_ref, DCT.temporal, temporal_extent))

        # Etsin: the language field needs its spaces stripped
        langs = self._get_dict_value(dataset_dict, 'language', '').split(', ')
        for lang in langs:
            params = (dataset_ref, DCAT.language, Literal(lang))
            self.g.add(params)
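remove_trailing_spaces is not shown on this page. rdflib's URIRef does not validate its input, so stray whitespace would end up inside the minted IRI; a plausible sketch of the helper, which may differ from the real implementation:

    def remove_trailing_spaces(value):
        # Hypothetical: strip trailing whitespace before minting a URIRef,
        # returning falsy inputs (None, '') unchanged.
        return value.rstrip() if value else value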