Example #1
    def get_context_data(self, **kwargs):
        context = super(NaveDocumentTemplateView, self).get_context_data(**kwargs)
        absolute_uri = self.request.build_absolute_uri()
        target_uri = RDFRecord.get_internal_rdf_base_uri(absolute_uri)

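        # foldout detail URLs resolve the record by its hub-id slug; all other
        # URLs are resolved through the source URI derived from the request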
        if "detail/foldout/" in target_uri:
            slug = self.kwargs.get('slug')
            record = ElasticSearchRDFRecord(hub_id=slug)
            graph = record.get_graph_by_id(slug)
            if graph is not None:
                target_uri = record.source_uri
            else:
                logger.warning("Unable to find source_uri for slug: {}".format(slug))
        else:
            record = ElasticSearchRDFRecord(hub_id=self.kwargs.get('slug'))
            graph = record.get_graph_by_source_uri(target_uri)
        if graph is None:
            raise UnknownGraph("URI {} is not known in our graph store".format(target_uri))
        if "/resource/cache/" in target_uri:
            target_uri = target_uri.rstrip('/')
            cache_resource = CacheResource.objects.filter(document_uri=target_uri)
            if cache_resource.exists():
                graph = cache_resource.first().get_graph()
        elif settings.RDF_USE_LOCAL_GRAPH:
            mode = self.request.GET.get('mode', 'default')
            acceptance = mode == 'acceptance'
            context['acceptance'] = acceptance

        elif '/resource/aggregation' in target_uri:
            target_named_graph = "{}/graph".format(target_uri.rstrip('/'))
            graph, nr_levels = RDFModel.get_context_graph(store=rdfstore.get_rdfstore(), named_graph=target_named_graph)
        else:
            graph, nr_levels = RDFModel.get_context_graph(
                store=rdfstore.get_rdfstore(),
                target_uri=target_uri
            )
        # todo: remove: should no longer be necessary with the addition of common.middleware.ForceLangMiddleware
        language = self.request.GET.get('lang', None)
        if language:
            activate(language)
        bindings = GraphBindings(
            about_uri=target_uri,
            graph=graph,
            excluded_properties=settings.RDF_EXCLUDED_PROPERTIES
        )
        context['resources'] = bindings
        context['absolute_uri'] = RDFRecord.get_external_rdf_url(target_uri, self.request)
        for rdf_type in bindings.get_about_resource().get_types():
            search_label = rdf_type.search_label.lower()
            content_template = settings.RDF_CONTENT_FOLDOUTS.get(search_label)
            if content_template:
                self.template_name = content_template
                break

        context['points'] = RDFModel.get_geo_points(graph)

        return context
Example #2
def purge_dataset(self, request, queryset):
    """Purge a dataset from Narthex and Nave."""
    store = rdfstore.get_rdfstore()
    for ds in queryset:
        delete_dataset_with_all_triples.delay(ds, store)
        ds.delete()
    self.message_user(request, "{} dataset(s) scheduled for purging from Narthex and Nave.".format(len(queryset)))
Example #3
def delete_dataset_records(self, request, queryset):
    """Purge a dataset from Narthex and Nave."""
    store = rdfstore.get_rdfstore()
    for ds in queryset:
        ds.delete_all_dataset_records(store)
    self.message_user(request,
                      "Records for {} dataset(s) scheduled for removal from Narthex and Nave.".format(len(queryset)))
Example #4
 def remove_orphaned_records(self, store=None, acceptance=False):
     logger.info("Start removing orphans for dataset {}".format(self.spec))
     if not store:
         store = rdfstore.get_rdfstore()
     es_actions = []
     sparql_update_queries = []
     records_removed = 0
     for record in EDMRecord.objects.filter(dataset=self, orphaned=True):
         records_removed += 1
         es_actions.append(
                 record.create_es_action(
                         action="delete",
                         store=store,
                         context=False,  # todo: fix issue with context indexing later
                         flat=True,
                         exclude_fields=None,
                         acceptance=acceptance
                 )
         )
         if settings.RDF_STORE_TRIPLES:
             sparql_update_queries.append(record.create_sparql_update_query(delete=True, acceptance=acceptance))
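         # flush in batches: SPARQL updates every 50 queries, ES delete actions every 1000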
         if len(sparql_update_queries) >= 50:
             store.update("\n".join(sparql_update_queries))
             sparql_update_queries[:] = []
         if len(es_actions) >= 1000:
             self.bulk_index(es_actions)
             es_actions[:] = []
     if settings.RDF_STORE_TRIPLES:
         store.update("\n".join(sparql_update_queries))
     if len(es_actions) > 0:
         self.bulk_index(es_actions)
     logger.info("Removed {} orphans for dataset {}".format(records_removed, self.spec))
     return records_removed
Example #5
 def _get_inline_preview(self, link, store=None):
     """Query RDFstore for graph and convert selected fields to JSON dictionary. """
     graph = None
     try:
         if settings.RDF_USE_LOCAL_GRAPH:
             record = RDFRecord(source_uri=link)
             if record.exists():
                 graph = record.get_graph()
             else:
                 raise UnknownGraph("unable to find {}".format(link))
         else:
             if not store:
                 store = get_rdfstore()
             store = store.get_graph_store
             named_graph = "{}/graph".format(link.rstrip('/'))
             graph = store.get(named_graph=named_graph, as_graph=True)
     except UnknownGraph:
         logger.warning("Unable to find Graph for: {}".format(link))
         return None
     preview_fields = settings.EDM_API_INLINE_PREVIEW
     preview_predicates = [URIRef(pred) for pred in preview_fields.keys()]
     inline_dict = {}
     for pred, obj in graph.predicate_objects():
         if pred in preview_predicates:
             inline_dict[preview_fields[str(pred)]] = str(obj)
     if 'delving_hubId' in preview_fields.values():
         hub_id, spec = self.get_hub_id()
         inline_dict['delving_hubId'] = hub_id
     return inline_dict
Example #6
def clean_all_related_nave_items(sender, instance, **kw):
    """
    Signal function to delete all traces of the dataset, its records and mappings from the Nave Storage System
    """
    from . import tasks
    store = rdfstore.get_rdfstore()
    tasks.delete_dataset_with_all_triples.delay(instance, store=store)
Example #7
def process_sparql_updates(sparql_updates, store=None):
    if store is None:
        store = rdfstore.get_rdfstore()

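    # inner helper: push the accumulated batch, retrying up to three times on timeouts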
    def store_with_updates(update_queries):
        retries = 0
        while retries < 3:
            try:
                store.update("\n".join(update_queries))
                update_queries.clear()
                return True
            except (URLError, socket.timeout) as e:
                retries += 1
                logger.error("sparql update timeout with retries {} and error {}".format(retries, e))
                time.sleep(3)
        if retries > 2:
            #   todo: log the items in the db as not synced
            pass
        return False

    updates = []
    for update in sparql_updates:
        updates.append(update)
        if len(updates) >= 25:
            store_with_updates(updates)
            updates[:] = []
    store_with_updates(updates)
Example #8
    def test_query_against_prod(self):
        target_uri = "http://data.cultureelerfgoed.nl/semnet/7403e26d-cf33-4372-ad72-a2f9fcf8f63b"
        store = rdfstore.get_rdfstore()
        query = """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            Construct {{
             ?s ?p ?o .
             ?s skos:broader ?broader.
             ?broader skos:prefLabel ?prefLabel .
             ?o skos:prefLabel ?prefLabel .
            }}

            WHERE {{
              bind(<{}> as ?s)
              {{
              ?s skos:broader* ?broader.
              FILTER ( ?s != ?broader )
              ?broader skos:prefLabel ?prefLabel.
               ?o skos:prefLabel ?prefLabel .
              }}
              union
              {{
                ?s ?p ?o .
                  Optional {{
                   ?o skos:prefLabel ?prefLabel
                }}
              }}}}
              LIMIT 100
        """.format(target_uri)
        skos_graph = store.query(query=query)
        assert skos_graph is not None
        assert isinstance(skos_graph, ConjunctiveGraph)
        broader_links = list(skos_graph.objects(predicate=SKOS.broader))
Example #9
def delete_rdf_resource(obj, store=None):
    if not store:
        store = get_rdfstore()
    if issubclass(obj.__class__, RDFModel):
        graph_store = store.get_graph_store
        response = graph_store.delete(obj.named_graph)
        logger.debug("Delete graph: {}".format(obj.named_graph))
        return response
    return False
Example #10
    def get_dataset_from_graph(store=None, graph=None, dataset_graph_uri=None):
        """Convert a  <http://schemas.delving.eu/narthex/terms/Dataset> to Dataset object. """

        def add_graph_name(ds):
            return ds if ds.endswith('/graph') else "{}/graph".format(ds.rstrip('/'))

        if dataset_graph_uri is None and graph is not None:
            dataset_graph_uri = graph.identifier

        if not store:
            store = rdfstore.get_rdfstore()

        if not graph:
            if not dataset_graph_uri:
                raise ValueError("when graph is None the dataset_graph_uri needs to be given")
            named_graph = add_graph_name(dataset_graph_uri)
            graph = store.get_graph_store.get(named_graph=named_graph, as_graph=True)
        subject = URIRef(dataset_graph_uri.replace('/graph', ''))
        if graph.value(subject=subject, predicate=RDF.type, any=True) != URIRef(
                'http://schemas.delving.eu/narthex/terms/Dataset'):
            return None
        value_of = partial(DataSet.get_first_graph_value, graph=graph, subject=subject)
        data_owner = value_of(predicate='http://schemas.delving.eu/narthex/terms/datasetOwner')
        spec = value_of(predicate='http://schemas.delving.eu/narthex/terms/datasetSpec')
        group, _ = Group.objects.get_or_create(name='dataset_admin')
        if not data_owner:
            data_owner = spec
        data_owner_group, _ = Group.objects.get_or_create(name=data_owner)
        # TODO add OAI-PMH and indexing
        update_values = {
            "description": value_of('http://schemas.delving.eu/narthex/terms/datasetDescription'),
            "name": value_of('http://schemas.delving.eu/narthex/terms/datasetName'),
            "dataset_type": DataSetType.aggregated,
            "total_records": value_of('http://schemas.delving.eu/narthex/terms/datasetRecordCount'),
            "invalid": value_of('http://schemas.delving.eu/narthex/terms/processedInvalid'),
            "valid": value_of('http://schemas.delving.eu/narthex/terms/processedValid'),
            "data_owner": data_owner,
            "document_uri": subject,
            "named_graph": graph.identifier,
            "last_full_harvest_date": value_of("http://schemas.delving.eu/narthex/terms/lastFullHarvestTime"),
        }
        for k, v in update_values.items():
            if k in ['total_records', 'invalid', 'valid'] and v is None:
                update_values[k] = 0
            if k in ['last_full_harvest_date'] and v is not None:
                update_values[k] = parser.parse(v)
        dataset, _ = DataSet.objects.update_or_create(spec=spec, defaults=update_values)
        dataset.groups.add(*[group, data_owner_group])
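        # if Narthex still marks the dataset as unsynced, flip the flag in the triple store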
        ds_synced = value_of('http://schemas.delving.eu/narthex/terms/synced')
        if not ds_synced and store is not None:
            update_switch = QueryType.remove_insert.format(
                    named_graph=dataset.named_graph,
                    remove="?s <http://schemas.delving.eu/narthex/terms/synced> false",
                    insert="?s <http://schemas.delving.eu/narthex/terms/synced> true"
            )
            store.update(query="{}".format(update_switch))
        return dataset
Example #11
    def get_context_data(self, **kwargs):
        context = super(NaveDocumentDetailView, self).get_context_data(**kwargs)
        target_uri = self.object.document_uri
        graph = None
        if "/resource/cache/" in target_uri:
            target_uri = target_uri.rstrip('/')
            cache_resource = CacheResource.objects.filter(document_uri=target_uri)
            if cache_resource.exists():
                graph = cache_resource.first().get_graph()
        elif settings.RDF_USE_LOCAL_GRAPH:
            mode = self.request.GET.get('mode', 'default')
            acceptance = mode == 'acceptance'
            context['acceptance'] = acceptance
            if isinstance(self.object, EDMRecord):
                graph = self.object.get_graph(with_mappings=True, include_mapping_target=True, acceptance=acceptance)
            else:
                graph = self.object.get_graph(acceptance=acceptance)
        elif '/resource/aggregation' in target_uri:
            target_named_graph = "{}/graph".format(target_uri.rstrip('/'))
            graph, nr_levels = RDFModel.get_context_graph(store=rdfstore.get_rdfstore(), named_graph=target_named_graph)
        else:
            graph, nr_levels = RDFModel.get_context_graph(
                store=rdfstore.get_rdfstore(),
                target_uri=target_uri
            )
        # todo: remove: should no longer be necessary with the addition of common.middleware.ForceLangMiddleware
        language = self.request.GET.get('lang', None)
        if language:
            activate(language)
        bindings = GraphBindings(
            about_uri=target_uri,
            graph=graph,
            excluded_properties=settings.RDF_EXCLUDED_PROPERTIES
        )
        context['resources'] = bindings
        for rdf_type in bindings.get_about_resource().get_types():
            search_label = rdf_type.search_label.lower()
            content_template = settings.RDF_CONTENT_FOLDOUTS.get(search_label)
            if content_template:
                self.template_name = content_template
                break

        context['points'] = RDFModel.get_geo_points(graph)

        return context
Example #12
def store_graph(obj):
    """ Store the RDFModel subclass in the production graph store
    :param obj: a subclass of RDFModel
    :return: Boolean
    """
    if issubclass(obj.__class__, RDFModel):
        store = rdfstore.get_rdfstore().get_graph_store
        store.put(obj.named_graph, obj.get_graph())
        logger.debug("Stored graph data in graph: {}".format(obj.named_graph))
        return True
    return False
Example #13
 def store_ds(self, spec, actor, base_url=None):
     # TODO fix this so new datasets can be created this way
     if not base_url:
         base_url = settings.RDF_BASE_URL
     store = get_rdfstore().get_graph_store
     actor_graph = self.create_actor(actor_name=actor)
     result = store.post(str(actor_graph.identifier), data=actor_graph)
     if not result:
         raise Exception("failed to store actor graph {}".format(actor_graph.identifier))
     dataset_graph = self.create_dataset(spec, actor, base_url)
     result = store.put(str(dataset_graph.identifier), data=dataset_graph)
     if not result:
         raise Exception("failed to store dataset graph {}".format(dataset_graph.identifier))
Example #14
def resynchronize_dataset(ds, store=None):
    """Force synchronise Nave dataset with Narthex. """
    if not store:
        store = rdfstore.get_rdfstore()
    # clear any error
    ds.sync_error_message = ""
    ds.has_sync_error = False
    ds.save()
    response = synchronise_dataset_records.delay(
        ds=ds,
        store=store
    )
    return response
Example #15
def add_cache_urls_to_remote_resources(store=None):
    if store is None:
        store = rdfstore.get_rdfstore()
    # query = """select distinct ?s where
    #     {?s ?p ?o .
    #     FILTER (!STRSTARTS(STR(?s), "http://localhost:8000"))
    #     } limit 10
    # """
    update_query = """insert {Graph ?g  {?s <http://schemas.delving.org/nave/terms/cacheUrl> ?o2 . } }
    WHERE { Graph ?g
      { ?s ?p ?o
        Filter not exists {?s <http://schemas.delving.org/nave/terms/cacheUrl> ?o}
        FILTER ( ! strstarts(str(?s), "http://localhost:8000") ).
        BIND(URI(REPLACE(str(?s), "http://.*?/", "http://localhost:8000/resource/cache/", "i")) AS ?o2)
      }}
    """
    return store.update(update_query)
Example #16
 def get_inline_dict(self, links=None, store=None):
     """ Extract all EDM links from the graph and return a dict with enrichments.
     """
     if not self.about_uri:
         return {}
     if not store:
         store = get_rdfstore()
     if not links:
         links = self._get_inline_links()
     inline_links = {}
     for pred, link_list in links.items():
         for link in link_list:
             preview = self._get_inline_preview(link=link, store=store)
             if preview:
                 inline_links[(pred, link)] = preview
     return inline_links
Example #17
 def __init__(self, payload, store=None, force_insert=False):
     self.payload = payload
     self.store = store if store else rdfstore.get_rdfstore()
     self.rdf_errors = []
     self.index_errors = []
     self.store_errors = []
     self.json_errors = []
     self.es_actions = {}
     self.records_stored = 0
     self.records_already_stored = 0
     self.records_with_errors = 0
     self.sparql_update_queries = {}
     self.force_insert = force_insert
     self.api_requests = self._get_json_entries_from_payload()
     self.current_dataset = None
     self.spec = None
Example #18
    def get_proxy_resource_from_uri(proxy_uri: str, ds=None, original_label: str = None, store: RDFStore = None):
        if ProxyResource.objects.filter(proxy_uri=proxy_uri).exists():
            return ProxyResource.objects.get(proxy_uri=proxy_uri)

        if not store:
            store = rdfstore.get_rdfstore()
        query = """
        ?predicate ?object
        WHERE {{
         <{}> ?predicate ?object
        }}
        LIMIT 50
        """.format(proxy_uri)
        response = store.select(query=query)
        response_dict = {entry['predicate']['value']: entry['object']['value'] for entry in
                         response['results']['bindings']}
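        # nothing known in the triple store for this URI: create the proxy resource locally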
        if not response_dict:
            return ProxyResource.create_proxy_resource_from_uri(proxy_uri, original_label=original_label, ds=ds)
        proxy_literal_field = response_dict['http://schemas.delving.eu/narthex/terms/proxyLiteralField']
        proxy_literal_value = response_dict['http://schemas.delving.eu/narthex/terms/proxyLiteralValue']
        frequency = response_dict['http://schemas.delving.eu/narthex/terms/skosFrequency']
        ds = DataSet.get_dataset(document_uri=response_dict['http://schemas.delving.eu/narthex/terms/belongsTo'])
        proxy_field = ProxyResourceField.objects.filter(dataset=ds, property_uri=proxy_literal_field)
        if not proxy_field:
            proxy_field = ProxyResourceField(
                    property_uri=proxy_literal_field,
                    dataset_uri=ds.document_uri,
                    dataset=ds
            )
            proxy_field.save()
        else:
            proxy_field = proxy_field[0]
        resource_dict = {
            'proxy_uri': proxy_uri,
            'proxy_field': proxy_field,
            'dataset': ds,
            'frequency': frequency,
            'label': proxy_literal_value
        }
        proxy_resource, created = ProxyResource.objects.update_or_create(**resource_dict)
        return proxy_resource
Example #19
 def stage_for_indexing(dataset_graph_uri):
     """Set synced=false for all records that belong to the dataset."""
     store = rdfstore.get_rdfstore()
     query = """
     ?g where {{
         {{GRAPH ?g
             {{
                 ?s  <http://schemas.delving.eu/narthex/terms/belongsTo> <{}> .
                 ?s2 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.openarchives.org/ore/terms/Aggregation>}} .
               }}
     }}
     """.format(dataset_graph_uri)
     res = store.select(query=query)
     graph_list = [graph['g']['value'] for graph in res['results']['bindings']]
     update_switch = [QueryType.remove_insert.format(
             named_graph=g,
             remove="?s <http://schemas.delving.eu/narthex/terms/synced> true",
             insert="?s <http://schemas.delving.eu/narthex/terms/synced> false"
     ) for g in graph_list]
     store.update(query="\n".join(update_switch))
     return None
Example #20
def schedule_out_of_sync_datasets(acceptance=False, store=None):
    """Find all out of sync datasets and schedule synchronisation tasks for each."""
    if not store:
        store = rdfstore.get_rdfstore(acceptance)
    nr_datasets, datasets = find_datasets_with_records_out_of_sync(store)
    if nr_datasets == 0:
        return 0
    logger.info("Found {} datasets that have records that are out of sync".format(nr_datasets))
    scheduled_for_indexing = 0
    for dataset_uri in datasets:
        ds = DataSet.get_dataset_from_graph(dataset_graph_uri=dataset_uri, store=store)
        ds.records_in_sync = False
        if ds.can_be_synchronised:
            logger.info("status: {}, {}, {}".format(ds.stay_in_sync, ds.sync_in_progress, ds.has_sync_error))
            process_key = str(uuid.uuid1())
            ds.process_key = process_key
            ds.save()
            async_result = synchronise_dataset_records.apply_async(kwargs={'store': store, 'ds': ds},
                                                                   task_id=process_key)
            scheduled_for_indexing += 1
            logger.info("Scheduled {} for indexing with {} records".format(ds.spec, ds.valid))
    return scheduled_for_indexing
Example #21
 def get_skos_from_uri(skos_uri, store=None):
     if store is None:
         store = rdfstore.get_rdfstore()
     response = EDMRecord.get_skos_broader_context_graph(store, skos_uri)
     g = Graph()
     g.namespace_manager = namespace_manager
     for entry in response['results']['bindings']:
         if all([key in ['broader', 'prefLabel', 's'] for key in entry.keys()]):
             triple = (URIRef(entry['broader']['value']), SKOS.prefLabel,
                       Literal(entry['prefLabel']['value'], lang=entry['prefLabel'].get('xml:lang')))
             g.add(triple)
             g.add((URIRef(entry['s']['value']), SKOS.broader, URIRef(entry['broader']['value'])))
         elif all([key in ['s', 'p', 'o'] for key in entry.keys()]):
             subject = URIRef(entry['s']['value'])
             predicate = URIRef(entry['p']['value'])
             obj_value = entry['o']
             if obj_value['type'] == 'literal':
                 obj = Literal(obj_value['value'], lang=obj_value.get('xml:lang'))
             else:
                 obj = URIRef(obj_value['value'])
             g.add((subject, predicate, obj))
     return g
Example #22
    def create_es_action(self, action="index", record_type=None, index=settings.SITE_NAME, store=None, doc_type=None,
                         context=True, flat=True, exclude_fields=None, acceptance=False):
        if doc_type is None:
            doc_type = self._generate_doc_type()
        if record_type is None:
            record_type = self.get_rdf_type()
        if not store:
            store = rdfstore.get_rdfstore()

        if acceptance:
            index = "{}_acceptance".format(index)

        if record_type == "http://www.openarchives.org/ore/terms/Aggregation":
            record_type = "mdr"

        if action == "delete":
            return {
                '_op_type': action,
                '_index': index,
                '_type': doc_type,
                '_id': self.hub_id
            }

        graph = None

        if not context:
            graph = self.get_graph()
        else:
            graph, nr_levels = self.get_context_graph(store=store, named_graph=self.named_graph)
            graph.namespace_manager = namespace_manager

        bindings = GraphBindings(
            about_uri=self.source_uri,
            graph=graph
        )
        index_doc = bindings.to_flat_index_doc() if flat else bindings.to_index_doc()
        if exclude_fields:
            index_doc = {k: v for k, v in index_doc.items() if k not in exclude_fields}
        # add delving spec for default searchability
        index_doc["delving_spec"] = [
            {'@type': "Literal",
             'value': self.get_spec_name(),
             'raw': self.get_spec_name(),
             'lang': None}
        ]
        logger.debug(index_doc)
        mapping = {
            '_op_type': action,
            '_index': index,
            '_type': doc_type,
            '_id': self.hub_id,
            '_source': index_doc
        }
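        # the 'system' block carries per-record metadata: slug, thumbnail, preview path,
        # captions and the serialized source graph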
        thumbnail = bindings.get_about_thumbnail
        mapping['_source']['system'] = {
            'slug': self.hub_id,
            'spec': self.get_spec_name(),
            'thumbnail': thumbnail if thumbnail else "",
            'preview': "detail/foldout/{}/{}".format(doc_type, self.hub_id),
            'caption': bindings.get_about_caption if bindings.get_about_caption else "",
            'about_uri': self.document_uri,
            'source_uri': self.source_uri,
            'graph_name': self.named_graph,
            'created_at': datetime.datetime.now().isoformat(),
            'modified_at': datetime.datetime.now().isoformat(),
            'source_graph': graph.serialize(format='nt', encoding="utf-8").decode(encoding="utf-8"),
            'proxy_resource_graph': None,
            'web_resource_graph': None,
            # 'about_type': [rdf_type.qname for rdf_type in bindings.get_about_resource().get_types()]
            # 'collections': None, todo find a way to add collections via link
        }
        data_owner = self.dataset.data_owner if hasattr(self, 'dataset') else None
        dataset_name = self.dataset.name if hasattr(self, 'dataset') else None
        mapping['_source']['legacy'] = {
            'delving_hubId': self.hub_id,
            'delving_recordType': record_type,
            'delving_spec': self.get_spec_name(),
            'delving_owner': data_owner,
            'delving_orgId': settings.ORG_ID,
            'delving_collection': dataset_name,
            'delving_title': self.get_first_literal(DC.title, graph),
            'delving_creator': self.get_first_literal(DC.creator, graph),
            'delving_description': self.get_first_literal(DC.description, graph),
            'delving_provider': index_doc.get('edm_provider')[0].get('value') if 'edm_provider' in index_doc else None,
            'delving_hasGeoHash': "true" if bindings.has_geo() else "false",
            'delving_hasDigitalObject': "true" if thumbnail else "false",
            'delving_hasLandingePage': "true" if 'edm_isShownAt' in index_doc else "false",
            'delving_hasDeepZoom': "true" if 'nave_deepZoom' in index_doc else "false",
        }
        return mapping
Example #23
    def get_context_data(self, **kwargs):
        target_uri = "http://*****:*****@tonsmitshuis.nl"
                }
            },
            {
                "thumbnail": "",
                "deepzoom": "",
                "mime_type": "audio/wav",
                "source_uri": "media/189467__speedenza__poem-darkness-voice.wav",
                "metadata": {
                    "dc_title": "Poem: Darkness (Voice)",
                    "dc_creator": "Speedenza (freesound.org)",
                    "dc_rights": "freesound.org"
                }
            },
            {
                "thumbnail": "",
                "deepzoom": "",
                "mime_type": "video/mp4",
                "source_uri": "media/NuclearExplosionwww.keepvid.com.mp4",
                "metadata": {
                    "dc_title": "Nuclear explosion",
                    "dc_creator": "Oppenheimer",
                    "dc_rights": "Destructive Commons"
                }
            }
        ]
        # Todo  add search results
        # * build query on uri or property
        # * add facets from configuration + property facet
        # * get NaveResponse
        # * add to context as data
        return context
Example #24
def retrieve_and_cache_remote_lod_resource(cache_uri, store=None):
    if not store:
        store = get_rdfstore()
    cache_resource, created = CacheResource.objects.get_or_create(source_uri=cache_uri)
    return store_cache_resource.delay(cache_resource, store), created
Example #25
    def process_narthex_file(self, store=None, acceptance=False, path=None, console=False):

        if not store:
            store = rdfstore.get_rdfstore()

        if not path:
            processed_fname = self.get_narthex_processed_fname()
        else:
            processed_fname = path
        logger.info("started processing {} for dataset {}".format(processed_fname, self.spec))

        with open(processed_fname, 'r') as f:
            record = []
            lines = 0
            records = 0
            stored = 0
            new = 0
            not_orphaned = []
            bulk_insert_records = []
            sparql_update_queries = []
            es_actions = []
            # set orphaned records
            self.mark_records_as_orphaned(state=True)
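            # stream the Narthex dump: lines accumulate into a record until a marker line is hit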
            for line in f:
                lines += 1
                exists, named_graph, content_hash = self.is_line_marker(line)
                if exists:
                    edm_record = EDMRecord.objects.filter(source_hash=content_hash, named_graph=named_graph).exists()
                    if not edm_record:
                        triples = " ".join(record)
                        new += 1
                        g = Graph(identifier=named_graph)
                        g.parse(data=triples)
                        if not EDMRecord.objects.filter(named_graph=named_graph).exists():
                            created_record = EDMRecord.graph_to_record(
                                    graph=g,
                                    ds=self,
                                    content_hash=None,
                                    acceptance=acceptance,
                                    bulk=True)

                            bulk_insert_records.append(created_record)

                            es_actions.append(
                                    created_record.create_es_action(
                                            action="index",
                                            store=store,
                                            context=False,  # todo: fix issue with context indexing later
                                            flat=True,
                                            exclude_fields=None,
                                            acceptance=acceptance
                                    )
                            )
                            if settings.RDF_STORE_TRIPLES:
                                sparql_update_queries.append(
                                    created_record.create_sparql_update_query(acceptance=acceptance))
                        else:
                            updated_record = EDMRecord.graph_to_record(
                                    graph=g,
                                    ds=self,
                                    content_hash=None,
                                    acceptance=acceptance
                            )
                            if settings.RDF_STORE_TRIPLES:
                                sparql_update_queries.append(
                                    updated_record.create_sparql_update_query(acceptance=acceptance))
                            es_actions.append(
                                    updated_record.create_es_action(
                                            action="index",
                                            store=store,
                                            context=False,  # todo: fix issue with context indexing later
                                            flat=True,
                                            exclude_fields=None,
                                            acceptance=acceptance
                                    )
                            )
                    else:
                        EDMRecord.objects.filter(source_hash=content_hash, named_graph=named_graph).update(
                                orphaned=False)
                        stored += 1
                    records += 1
                    record[:] = []
                    bulk_record_size = len(bulk_insert_records)
                    if bulk_record_size > 0 and bulk_record_size % 1000 == 0:
                        EDMRecord.objects.bulk_create(bulk_insert_records)
                        logger.info("inserted 1000 records of {} at {}".format(self.spec, time.ctime()))
                        bulk_insert_records[:] = []
                    nr_sparql_updates = len(sparql_update_queries)
                    if settings.RDF_STORE_TRIPLES and nr_sparql_updates > 0 and nr_sparql_updates % 50 == 0:
                        store.update("\n".join(sparql_update_queries))
                        sparql_update_queries[:] = []
                    if records % 1000 == 0:
                        logger.info("processed {} records of {} at {}".format(records, self.spec, time.ctime()))
                        if console:
                            print("processed {} records of {} at {}".format(records, self.spec, time.ctime()))
                        if len(es_actions) > 1000:
                            self.bulk_index(es_actions)
                            es_actions[:] = []
                else:
                    record.append(line)
            # store the remaining bulk items
            EDMRecord.objects.bulk_create(bulk_insert_records)
            self.bulk_index(es_actions)
            if settings.RDF_STORE_TRIPLES and len(sparql_update_queries) > 0:
                store.update("\n".join(sparql_update_queries))
            logger.info(
                    "Dataset {}: records inserted {}, records same content hash {}, lines parsed {}, total records processed {}".format(
                            self.spec, new, stored, lines, records)
            )
            return lines, records
Example #26
def store_cache_resource(obj, store=None):
    if not store:
        store = get_rdfstore()
    graph_store = store.get_graph_store
    response = obj.update_cached_resource(graph_store)
    return response
Example #27
def store_graphs(triples, named_graph, store=None):
    if store is None:
        store = rdfstore.get_rdfstore()
    stored = store.get_graph_store.post(data=triples, named_graph=named_graph)
    return stored
Example #28
    def process_narthex_file(self, spec, store=None, acceptance=False, path=None, console=False):

        start = datetime.now()

        if not store:
            store = rdfstore.get_rdfstore()

        if not path:
            processed_fname = self.get_narthex_processed_fname()
        else:
            processed_fname = path
        print("started processing {} for dataset {}".format(processed_fname, spec))

        with open(processed_fname, 'r') as f:
            rdf_record = []
            lines = 0
            records = 0
            stored = 0
            new = 0
            not_orphaned = []
            sparql_update_queries = []
            es_actions = []

            for line in f:
                lines += 1
                exists, named_graph, content_hash = self.is_line_marker(line)
                if exists:
                    new += 1
                    records += 1
                    triples = " ".join(rdf_record)
                    record = ElasticSearchRDFRecord(rdf_string=triples, spec=spec)
                    try:
                        record.from_rdf_string(named_graph=named_graph, rdf_string=triples, input_format="xml")
                        es_actions.append(record.create_es_action(doc_type="void_edmrecord", record_type="mdr", context=True))
                    except Exception as ex:
                        if console:
                            print("problem with {} for spec {} caused by {}".format(triples, spec, ex))
                        else:
                            logger.error("problem with {} for spec {} caused by {}".format(triples, spec, ex))
                    rdf_record[:] = []
                    if settings.RDF_STORE_TRIPLES:
                        sparql_update_queries.append(
                            record.create_sparql_update_query(acceptance=acceptance)
                        )
                    nr_sparql_updates = len(sparql_update_queries)
                    if settings.RDF_STORE_TRIPLES and nr_sparql_updates > 0 and nr_sparql_updates % 50 == 0:
                        store.update("\n".join(sparql_update_queries))
                        sparql_update_queries[:] = []
                    if records % 100 == 0 and records > 0:
                        logger.info("processed {} records of {} at {}".format(records, spec, ctime()))
                        if console:
                            print("processed {} records of {} at {}".format(records, spec, ctime()))
                        if len(es_actions) > 100:
                            self.bulk_index(es_actions, spec)
                            es_actions[:] = []
                else:
                    rdf_record.append(line)
            # store the remaining bulk items
            self.bulk_index(es_actions, spec)
            if settings.RDF_STORE_TRIPLES and len(sparql_update_queries) > 0:
                store.update("\n".join(sparql_update_queries))
            logger.info(
                "Dataset {}: records inserted {}, records same content hash {}, lines parsed {}, total records processed {}".format(
                    spec, new, stored, lines, records)
            )
            print("Finished loading {spec} with {lines} and {records} in {seconds}\n".format(
                spec=spec,
                lines=lines,
                records=records,
                seconds=datetime.now() - start
            ))

            RDFRecord.remove_orphans(spec, start.isoformat())
            return lines, records
Example #29
    def retrieve(self, request, pk=None, format=None, *args, **kwargs):
        def get_mode(default=None):
            params = request.GET
            return params.get('schema', default)

        self._clean_callback(request)

        query = NaveESQuery(
            index_name=self.get_index_name,
            doc_types=self.doc_types,
            default_facets=self.facets,
            cluster_geo=False,
            size=1,
            converter=self.get_converter()
        )
        try:
            query = query.build_item_query(query, request.query_params, pk)
        except ValueError as ve:
            logger.error("Unable to build request because: {}".format(ve))
            # todo display error message when bad/unknown hubId is given
            return HttpResponseBadRequest()
        mlt = request.query_params.get('mlt', 'false') == "true"
        mlt_count = int(request.query_params.get('mlt.count', 5))
        mlt_filter_queries = request.query_params.getlist('mlt.qf', [])
        mlt_fq_dict = {}
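        # parse mlt.qf entries of the form "field:value" into a filter dict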
        for fq in mlt_filter_queries:
            if ":" in fq:
                k, v = fq.split(":", maxsplit=1)
                mlt_fq_dict[k] = v
        record = ElasticSearchRDFRecord(hub_id=pk)
        record.get_graph_by_id(hub_id=pk)
        response = NaveItemResponse(
            query,
            self,
            index=self.get_index_name,
            mlt=mlt,
            mlt_count=mlt_count,
            mlt_filter_query=mlt_fq_dict,
            rdf_record=record
        )
        renderer_format = request.accepted_renderer.format
        if renderer_format in list(EXTENSION_TO_MIME_TYPE.keys()) and renderer_format not in ['xml', 'json']:
            graph = record.get_graph()
            graph_string = graph.serialize(format=renderer_format).decode('utf-8')
            mime_type = EXTENSION_TO_MIME_TYPE.get(renderer_format)
            return Response(data=graph_string, content_type=mime_type)
        target_uri = record.document_uri
        if settings.RDF_USE_LOCAL_GRAPH:
            graph = record.get_graph()
        else:
            store = rdfstore.get_rdfstore()
            graph, _ = RDFModel.get_context_graph(store, named_graph=record.named_graph)
        if not graph:
            from django.http import HttpResponseNotFound
            return HttpResponseNotFound()
        mode = get_mode(self.default_converter)
        bindings = GraphBindings(about_uri=target_uri, graph=graph)
        delving_fields = request.GET.get("delving_fields") != 'false'
        converter = None
        if mode in ['api', 'api-flat']:
            index_doc = bindings.to_index_doc() if mode == 'api' else bindings.to_flat_index_doc()
        elif mode in REGISTERED_CONVERTERS.keys():
            converter = REGISTERED_CONVERTERS.get(mode)
            index_doc = converter(
                bindings=bindings,
                graph=graph,
                about_uri=bindings.about_uri()
            ).convert(add_delving_fields=delving_fields)
        elif self.default_converter in REGISTERED_CONVERTERS.keys():
            converter = REGISTERED_CONVERTERS.get(self.default_converter)
            index_doc = converter(
                bindings=bindings,
                graph=graph,
                about_uri=bindings.about_uri()
            ).convert(add_delving_fields=delving_fields)
        else:
            logger.warning("unable to convert results to schema {}".format(mode))
            index_doc = bindings.to_index_doc()
        layout_fields = OrderedDict()
        layout_fields['layout'] = converter().get_layout_fields() if converter else []
        if response.get_mlt():
            mlt = {"item": [NaveESItemSerializer(item).data for item in response.get_mlt()]}
        else:
            mlt = ""
        result = {'result': {
            'layout': layout_fields,
            'item': {'fields': index_doc},
            "relatedItems": mlt}}
        return Response(result)