def get_context_data(self, **kwargs):
    context = super(NaveDocumentTemplateView, self).get_context_data(**kwargs)
    absolute_uri = self.request.build_absolute_uri()
    target_uri = RDFRecord.get_internal_rdf_base_uri(absolute_uri)
    if "detail/foldout/" in target_uri:
        slug = self.kwargs.get('slug')
        record = ElasticSearchRDFRecord(hub_id=slug)
        graph = record.get_graph_by_id(self.kwargs.get('slug'))
        if graph is not None:
            target_uri = record.source_uri
        else:
            logger.warn("Unable to find source_uri for slug: {}".format(slug))
    else:
        target_uri = RDFRecord.get_internal_rdf_base_uri(absolute_uri)
        record = ElasticSearchRDFRecord(hub_id=self.kwargs.get('slug'))
        graph = record.get_graph_by_source_uri(target_uri)
        if graph is None:
            raise UnknownGraph("URI {} is not known in our graph store".format(target_uri))
    if "/resource/cache/" in target_uri:
        target_uri = target_uri.rstrip('/')
        cache_resource = CacheResource.objects.filter(document_uri=target_uri)
        if cache_resource.exists():
            graph = cache_resource.first().get_graph()
    elif settings.RDF_USE_LOCAL_GRAPH:
        mode = self.request.REQUEST.get('mode', 'default')
        acceptance = True if mode == 'acceptance' else False
        context['acceptance'] = acceptance
    elif '/resource/aggregation' in target_uri:
        target_named_graph = "{}/graph".format(target_uri.rstrip('/'))
        graph, nr_levels = RDFModel.get_context_graph(store=rdfstore.get_rdfstore(), named_graph=target_named_graph)
    else:
        graph, nr_levels = RDFModel.get_context_graph(
            store=rdfstore.get_rdfstore(),
            target_uri=target_uri
        )
    # todo: remove: should no longer be necessary with the addition of common.middleware.ForceLangMiddleware
    language = self.request.GET.get('lang', None)
    if language:
        activate(language)
    bindings = GraphBindings(
        about_uri=target_uri,
        graph=graph,
        excluded_properties=settings.RDF_EXCLUDED_PROPERTIES
    )
    context['resources'] = bindings
    context['absolute_uri'] = RDFRecord.get_external_rdf_url(target_uri, self.request)
    for rdf_type in bindings.get_about_resource().get_types():
        search_label = rdf_type.search_label.lower()
        content_template = settings.RDF_CONTENT_FOLDOUTS.get(search_label)
        if content_template:
            self.template_name = content_template
            break
    context['points'] = RDFModel.get_geo_points(graph)
    return context
def purge_dataset(self, request, queryset):
    """Purge a dataset from Narthex and Nave."""
    store = rdfstore.get_rdfstore()
    for ds in queryset:
        delete_dataset_with_all_triples.delay(ds, store)
        ds.delete()
    self.message_user(
        request,
        "{} dataset(s) scheduled for purging from Narthex and Nave.".format(len(queryset))
    )
def delete_dataset_records(self, request, queryset):
    """Delete all records of a dataset from Narthex and Nave."""
    store = rdfstore.get_rdfstore()
    for ds in queryset:
        ds.delete_all_dataset_records(store)
    self.message_user(
        request,
        "Records for {} dataset(s) scheduled for removal from Narthex and Nave.".format(len(queryset))
    )
def remove_orphaned_records(self, store=None, acceptance=False):
    logger.info("Start removing orphans for dataset {}".format(self.spec))
    if not store:
        store = rdfstore.get_rdfstore()
    es_actions = []
    sparql_update_queries = []
    records_removed = 0
    for record in EDMRecord.objects.filter(dataset=self, orphaned=True):
        records_removed += 1
        es_actions.append(
            record.create_es_action(
                action="delete",
                store=store,
                context=False,  # todo: fix issue with context indexing later
                flat=True,
                exclude_fields=None,
                acceptance=acceptance
            )
        )
        if settings.RDF_STORE_TRIPLES:
            sparql_update_queries.append(record.create_sparql_update_query(delete=True, acceptance=acceptance))
            if len(sparql_update_queries) >= 50:
                store.update("\n".join(sparql_update_queries))
                sparql_update_queries[:] = []
        if len(es_actions) >= 1000:
            self.bulk_index(es_actions)
            es_actions[:] = []
    if settings.RDF_STORE_TRIPLES:
        store.update("\n".join(sparql_update_queries))
    if len(es_actions) > 0:
        self.bulk_index(es_actions)
    logger.info("Removed {} orphans for dataset {}".format(records_removed, self.spec))
    return records_removed
def _get_inline_preview(self, link, store=None):
    """Query RDFstore for graph and convert selected fields to JSON dictionary."""
    graph = None
    try:
        if settings.RDF_USE_LOCAL_GRAPH:
            record = RDFRecord(source_uri=link)
            if record.exists():
                graph = record.get_graph()
            else:
                raise UnknownGraph("unable to find {}".format(link))
        else:
            if not store:
                store = get_rdfstore()
            store = store.get_graph_store
            named_graph = "{}/graph".format(link.rstrip('/'))
            graph = store.get(named_graph=named_graph, as_graph=True)
    except UnknownGraph as ug:
        logger.warn("Unable to find Graph for: {}".format(link))
        return None
    preview_fields = settings.EDM_API_INLINE_PREVIEW
    preview_predicates = [URIRef(pred) for pred in preview_fields.keys()]
    inline_dict = {}
    for pred, obj in graph.predicate_objects():
        if pred in preview_predicates:
            inline_dict[preview_fields[str(pred)]] = str(obj)
    if 'delving_hubId' in preview_fields.values():
        hub_id, spec = self.get_hub_id()
        inline_dict['delving_hubId'] = hub_id
    return inline_dict
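For orientation, a hedged sketch of what the EDM_API_INLINE_PREVIEW setting consumed above could look like: keys are predicate URIs expected in the linked graph, values are the field names of the returned preview dictionary. The predicate URIs below are placeholders, not taken from a real configuration.

# Hypothetical settings example; only the shape (predicate URI -> preview field
# name) matters to _get_inline_preview. The predicate URIs are illustrative.
EDM_API_INLINE_PREVIEW = {
    "http://purl.org/dc/elements/1.1/title": "dc_title",
    "http://purl.org/dc/elements/1.1/creator": "dc_creator",
    # when 'delving_hubId' appears among the values, the record's hub id is added as well
    "http://schemas.delving.eu/nave/terms/hubId": "delving_hubId",
}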
def clean_all_related_nave_items(sender, instance, **kw):
    """Signal function to delete all traces of the dataset, its records and
    mappings from the Nave Storage System.
    """
    from . import tasks
    store = rdfstore.get_rdfstore()
    tasks.delete_dataset_with_all_triples.delay(instance, store=store)
def process_sparql_updates(sparql_updates, store=None):
    if store is None:
        store = rdfstore.get_rdfstore()

    def store_with_updates(update_queries):
        retries = 0
        while retries < 3:
            try:
                # join the batch that was passed in, not the full input list
                store.update("\n".join(update_queries))
                update_queries.clear()
                return True
            except (URLError, socket.timeout) as e:
                retries += 1
                logger.error("sparql update timeout with retries {} and error {}".format(retries, e))
                time.sleep(3)
        if retries > 2:
            # todo: log the items in the db as not synced
            pass
        return False

    updates = []
    for i, update in enumerate(sparql_updates):
        updates.append(update)
        if i % 25 == 0:
            store_with_updates(updates)
            updates[:] = []
    store_with_updates(updates)
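A usage sketch for the batching helper above; orphaned_records and custom_store are assumed to exist in the caller, and the records are assumed to expose create_sparql_update_query as in the other snippets here.

# Hypothetical usage: collect one SPARQL update per orphaned record and let
# process_sparql_updates flush them to the triple store in batches of 25.
updates = [record.create_sparql_update_query(delete=True, acceptance=False)
           for record in orphaned_records]
process_sparql_updates(updates)                      # default store from rdfstore.get_rdfstore()
process_sparql_updates(updates, store=custom_store)  # or an explicitly injected store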
def test_query_against_prod(self):
    target_uri = "http://data.cultureelerfgoed.nl/semnet/7403e26d-cf33-4372-ad72-a2f9fcf8f63b"
    store = rdfstore.get_rdfstore()
    query = """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    Construct {{
        ?s ?p ?o .
        ?s skos:broader ?broader.
        ?broader skos:prefLabel ?prefLabel .
        ?o skos:prefLabel ?prefLabel .
    }}
    WHERE {{
        bind(<{}> as ?s)
        {{
            ?s skos:broader* ?broader.
            FILTER ( ?s != ?broader )
            ?broader skos:prefLabel ?prefLabel.
            ?o skos:prefLabel ?prefLabel .
        }}
        union
        {{
            ?s ?p ?o .
            Optional {{ ?o skos:prefLabel ?prefLabel }}
        }}
    }} LIMIT 100
    """.format(target_uri)
    skos_graph = store.query(query=query)
    assert skos_graph is not None
    assert isinstance(skos_graph, ConjunctiveGraph)
    broader_links = list(skos_graph.objects(predicate=SKOS.broader))
def delete_rdf_resource(obj, store=None):
    if not store:
        store = get_rdfstore()
    if issubclass(obj.__class__, RDFModel):
        graph_store = store.get_graph_store
        response = graph_store.delete(obj.named_graph)
        logger.debug("Delete graph: {}".format(obj.named_graph))
        return response
    return False
def get_dataset_from_graph(store=None, graph=None, dataset_graph_uri=None):
    """Convert a <http://schemas.delving.eu/narthex/terms/Dataset> to Dataset object."""

    def add_graph_name(ds):
        return ds if ds.endswith('/graph') else "{}/graph".format(ds.rstrip('/'))

    if dataset_graph_uri is None and graph is not None:
        dataset_graph_uri = graph.identifier
    if not store:
        store = rdfstore.get_rdfstore()
    if not graph:
        if not dataset_graph_uri:
            raise ValueError("when graph is None the dataset_graph_uri needs to be given")
        named_graph = add_graph_name(dataset_graph_uri)
        graph = store.get_graph_store.get(named_graph=named_graph, as_graph=True)
    subject = URIRef(dataset_graph_uri.replace('/graph', ''))
    if graph.value(subject=subject, predicate=RDF.type, any=True) != URIRef(
            'http://schemas.delving.eu/narthex/terms/Dataset'):
        return None
    value_of = partial(DataSet.get_first_graph_value, graph=graph, subject=subject)
    data_owner = value_of(predicate='http://schemas.delving.eu/narthex/terms/datasetOwner')
    spec = value_of(predicate='http://schemas.delving.eu/narthex/terms/datasetSpec')
    group, _ = Group.objects.get_or_create(name='dataset_admin')
    if not data_owner:
        data_owner = spec
    data_owner_group, _ = Group.objects.get_or_create(name=data_owner)
    # TODO add OAI-PMH and indexing
    update_values = {
        "description": value_of('http://schemas.delving.eu/narthex/terms/datasetDescription'),
        "name": value_of('http://schemas.delving.eu/narthex/terms/datasetName'),
        "dataset_type": DataSetType.aggregated,
        "total_records": value_of('http://schemas.delving.eu/narthex/terms/datasetRecordCount'),
        "invalid": value_of('http://schemas.delving.eu/narthex/terms/processedInvalid'),
        "valid": value_of('http://schemas.delving.eu/narthex/terms/processedValid'),
        "data_owner": data_owner,
        "document_uri": subject,
        "named_graph": graph.identifier,
        "last_full_harvest_date": value_of("http://schemas.delving.eu/narthex/terms/lastFullHarvestTime"),
    }
    for k, v in update_values.items():
        if k in ['total_records', 'invalid', 'valid'] and v is None:
            update_values[k] = 0
        if k in ['last_full_harvest_date'] and v is not None:
            update_values[k] = parser.parse(v)
    dataset, _ = DataSet.objects.update_or_create(spec=spec, defaults=update_values)
    dataset.groups.add(*[group, data_owner_group])
    ds_synced = value_of('http://schemas.delving.eu/narthex/terms/synced')
    if not ds_synced and store is not None:
        update_switch = QueryType.remove_insert.format(
            named_graph=dataset.named_graph,
            remove="?s <http://schemas.delving.eu/narthex/terms/synced> false",
            insert="?s <http://schemas.delving.eu/narthex/terms/synced> true"
        )
        store.update(query="{}".format(update_switch))
    return dataset
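A short usage sketch for the converter above; the dataset graph URI is illustrative and assumes the default store is reachable.

# Hypothetical call: resolve the Narthex dataset description stored under the
# given named graph into a Django DataSet instance.
dataset = get_dataset_from_graph(
    dataset_graph_uri="http://localhost:8000/resource/dataset/example-spec/graph"
)
if dataset is not None:
    print(dataset.spec, dataset.valid, dataset.invalid)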
def get_context_data(self, **kwargs):
    context = super(NaveDocumentDetailView, self).get_context_data(**kwargs)
    target_uri = self.object.document_uri
    if "/resource/cache/" in target_uri:
        target_uri = target_uri.rstrip('/')
        cache_resource = CacheResource.objects.filter(document_uri=target_uri)
        if cache_resource.exists():
            graph = cache_resource.first().get_graph()
    elif settings.RDF_USE_LOCAL_GRAPH:
        mode = self.request.REQUEST.get('mode', 'default')
        acceptance = True if mode == 'acceptance' else False
        context['acceptance'] = acceptance
        if isinstance(self.object, EDMRecord):
            graph = self.object.get_graph(with_mappings=True, include_mapping_target=True, acceptance=acceptance)
        else:
            graph = self.object.get_graph(acceptance=acceptance)
    elif '/resource/aggregation' in target_uri:
        target_named_graph = "{}/graph".format(target_uri.rstrip('/'))
        graph, nr_levels = RDFModel.get_context_graph(store=rdfstore.get_rdfstore(), named_graph=target_named_graph)
    else:
        graph, nr_levels = RDFModel.get_context_graph(
            store=rdfstore.get_rdfstore(),
            target_uri=target_uri
        )
    # todo: remove: should no longer be necessary with the addition of common.middleware.ForceLangMiddleware
    language = self.request.GET.get('lang', None)
    if language:
        activate(language)
    bindings = GraphBindings(
        about_uri=target_uri,
        graph=graph,
        excluded_properties=settings.RDF_EXCLUDED_PROPERTIES
    )
    context['resources'] = bindings
    for rdf_type in bindings.get_about_resource().get_types():
        search_label = rdf_type.search_label.lower()
        content_template = settings.RDF_CONTENT_FOLDOUTS.get(search_label)
        if content_template:
            self.template_name = content_template
            break
    context['points'] = RDFModel.get_geo_points(graph)
    return context
def store_graph(obj):
    """Store the RDFModel subclass in the production graph store.

    :param obj: a subclass of RDFModel
    :return: Boolean
    """
    if issubclass(obj.__class__, RDFModel):
        store = rdfstore.get_rdfstore().get_graph_store
        store.put(obj.named_graph, obj.get_graph())
        logger.debug("Stored graph data in graph: {}".format(obj.named_graph))
        return True
    return False
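A minimal round-trip sketch pairing store_graph with delete_rdf_resource; edm_record is assumed to be an existing RDFModel subclass instance (for example an EDMRecord), and both helpers simply return False for anything else.

# Hypothetical round trip: push the record's named graph to the graph store,
# then remove it again.
if store_graph(edm_record):
    logger.debug("stored {}".format(edm_record.named_graph))
delete_rdf_resource(edm_record)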
def store_ds(self, spec, actor, base_url=None):
    # TODO fix this so new datasets can be created this way
    if not base_url:
        base_url = settings.RDF_BASE_URL
    store = get_rdfstore().get_graph_store
    actor_graph = self.create_actor(actor_name=actor)
    result = store.post(str(actor_graph.identifier), data=actor_graph)
    if not result:
        raise Exception()
    dataset_graph = self.create_dataset(spec, actor, base_url)
    result = store.put(str(dataset_graph.identifier), data=dataset_graph)
    if not result:
        raise Exception()
def resynchronize_dataset(ds, store=None):
    """Force synchronise Nave dataset with Narthex."""
    if not store:
        store = rdfstore.get_rdfstore()
    # clear any error
    ds.sync_error_message = ""
    ds.has_sync_error = False
    ds.save()
    response = synchronise_dataset_records.delay(
        ds=ds,
        store=store
    )
    return response
def add_cache_urls_to_remote_resources(store=None):
    # resolve the store lazily; a call in the signature default would run at import time
    if store is None:
        store = rdfstore.get_rdfstore()
    # query = """select distinct ?s where
    #    {?s ?p ?o .
    #     FILTER (!STRSTARTS(STR(?s), "http://localhost:8000"))
    #    } limit 10
    # """
    update_query = """insert {Graph ?g {?s <http://schemas.delving.org/nave/terms/cacheUrl> ?o2 . } }
    WHERE {
        Graph ?g {
            ?s ?p ?o
            Filter not exists {?s <http://schemas.delving.org/nave/terms/cacheUrl> ?o}
            FILTER ( ! strstarts(str(?s), "http://localhost:8000") ).
            BIND(URI(REPLACE(str(?s), "http://.*?/", "http://localhost:8000/resource/cache/", "i")) AS ?o2)
        }}
    """
    return store.update(update_query)
def get_inline_dict(self, links=None, store=None):
    """Extract all EDM links from the graph and return a dict with enrichments."""
    if not self.about_uri:
        return {}
    if not store:
        store = get_rdfstore()
    if not links:
        links = self._get_inline_links()
    inline_links = {}
    for pred, link_list in links.items():
        for link in link_list:
            preview = self._get_inline_preview(link=link, store=store)
            if preview:
                inline_links[(pred, link)] = preview
    return inline_links
def __init__(self, payload, store=None, force_insert=False):
    self.payload = payload
    # use the injected store when given instead of silently ignoring it
    self.store = store if store else rdfstore.get_rdfstore()
    self.rdf_errors = []
    self.index_errors = []
    self.store_errors = []
    self.json_errors = []
    self.es_actions = {}
    self.records_stored = 0
    self.records_already_stored = 0
    self.records_with_errors = 0
    self.sparql_update_queries = {}
    self.force_insert = force_insert
    self.api_requests = self._get_json_entries_from_payload()
    self.current_dataset = None
    self.spec = None
def get_proxy_resource_from_uri(proxy_uri: str, ds=None, original_label: str = None, store: RDFStore = None):
    if ProxyResource.objects.filter(proxy_uri=proxy_uri).exists():
        return ProxyResource.objects.get(proxy_uri=proxy_uri)
    if not store:
        store = rdfstore.get_rdfstore()
    query = """SELECT ?predicate ?object
    WHERE {{
        <{}> ?predicate ?object
    }} LIMIT 50
    """.format(proxy_uri)
    response = store.select(query=query)
    response_dict = {entry['predicate']['value']: entry['object']['value']
                     for entry in response['results']['bindings']}
    if not response_dict:
        return ProxyResource.create_proxy_resource_from_uri(proxy_uri, original_label=original_label, ds=ds)
    proxy_literal_field = response_dict['http://schemas.delving.eu/narthex/terms/proxyLiteralField']
    proxy_literal_value = response_dict['http://schemas.delving.eu/narthex/terms/proxyLiteralValue']
    frequency = response_dict['http://schemas.delving.eu/narthex/terms/skosFrequency']
    ds = DataSet.get_dataset(document_uri=response_dict['http://schemas.delving.eu/narthex/terms/belongsTo'])
    proxy_field = ProxyResourceField.objects.filter(dataset=ds, property_uri=proxy_literal_field)
    if not proxy_field:
        proxy_field = ProxyResourceField(
            property_uri=proxy_literal_field,
            dataset_uri=ds.document_uri,
            dataset=ds
        )
        proxy_field.save()
    else:
        proxy_field = proxy_field[0]
    resource_dict = {
        'proxy_uri': proxy_uri,
        'proxy_field': proxy_field,
        'dataset': ds,
        'frequency': frequency,
        'label': proxy_literal_value
    }
    proxy_resource, created = ProxyResource.objects.update_or_create(**resource_dict)
    return proxy_resource
def stage_for_indexing(dataset_graph_uri):
    """Set synced=false for all records that belong to the dataset."""
    store = rdfstore.get_rdfstore()
    query = """SELECT ?g WHERE {{
        {{GRAPH ?g {{
            ?s <http://schemas.delving.eu/narthex/terms/belongsTo> <{}> .
            ?s2 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.openarchives.org/ore/terms/Aggregation>}} .
        }}
    }}
    """.format(dataset_graph_uri)
    res = store.select(query=query)
    graph_list = [graph['g']['value'] for graph in res['results']['bindings']]
    update_switch = [QueryType.remove_insert.format(
        named_graph=g,
        remove="?s <http://schemas.delving.eu/narthex/terms/synced> true",
        insert="?s <http://schemas.delving.eu/narthex/terms/synced> false"
    ) for g in graph_list]
    store.update(query="\n".join(update_switch))
    return None
def schedule_out_of_sync_datasets(acceptance=False, store=None):
    """Find all out of sync datasets and schedule synchronisation tasks for each."""
    if not store:
        store = rdfstore.get_rdfstore(acceptance)
    nr_datasets, datasets = find_datasets_with_records_out_of_sync(store)
    if nr_datasets == 0:
        return 0
    logger.info("Found {} datasets that have records that are out of sync".format(nr_datasets))
    scheduled_for_indexing = 0
    for dataset_uri in datasets:
        ds = DataSet.get_dataset_from_graph(dataset_graph_uri=dataset_uri, store=store)
        ds.records_in_sync = False
        if ds.can_be_synchronised:
            logger.info("status: {}, {}, {}".format(ds.stay_in_sync, ds.sync_in_progress, ds.has_sync_error))
            process_key = str(uuid.uuid1())
            ds.process_key = process_key
            ds.save()
            async_result = synchronise_dataset_records.apply_async(
                kwargs={'store': store, 'ds': ds},
                task_id=process_key
            )
            scheduled_for_indexing += 1
            logger.info("Scheduled {} for indexing with {} records".format(ds.spec, ds.valid))
    return scheduled_for_indexing
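A hedged sketch of how the two helpers above are typically chained when a dataset needs to be re-indexed; the dataset graph URI is illustrative, and it is an assumption that both functions are importable from the same place.

# Hypothetical trigger: flip synced=false on every record graph that belongs to
# the dataset, then let the scheduler pick the dataset up again.
stage_for_indexing("http://localhost:8000/resource/dataset/example-spec/graph")
scheduled = schedule_out_of_sync_datasets()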
def get_skos_from_uri(skos_uri, store=None):
    if store is None:
        store = rdfstore.get_rdfstore()
    response = EDMRecord.get_skos_broader_context_graph(store, skos_uri)
    g = Graph()
    g.namespace_manager = namespace_manager
    for entry in response['results']['bindings']:
        if all([key in ['broader', 'prefLabel', 's'] for key in entry.keys()]):
            triple = (
                URIRef(entry['broader']['value']),
                SKOS.prefLabel,
                Literal(entry['prefLabel']['value'], lang=entry['prefLabel'].get('xml:lang'))
            )
            g.add(triple)
            g.add((URIRef(entry['s']['value']), SKOS.broader, URIRef(entry['broader']['value'])))
        elif all([key in ['s', 'p', 'o'] for key in entry.keys()]):
            subject = URIRef(entry['s']['value'])
            predicate = URIRef(entry['p']['value'])
            obj_value = entry['o']
            if obj_value['type'] == 'literal':
                obj = Literal(obj_value['value'], lang=obj_value.get('xml:lang'))
            else:
                obj = URIRef(obj_value['value'])
            g.add((subject, predicate, obj))
    return g
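A brief usage sketch for the SKOS helper above, reusing the concept URI from the test fixture; the decode() call assumes an rdflib version where serialize() returns bytes, as elsewhere in this code.

# Hypothetical usage: fetch the broader-term context for a thesaurus concept
# and inspect it for debugging.
skos_graph = get_skos_from_uri("http://data.cultureelerfgoed.nl/semnet/7403e26d-cf33-4372-ad72-a2f9fcf8f63b")
print(skos_graph.serialize(format="turtle").decode("utf-8"))
broader = list(skos_graph.objects(predicate=SKOS.broader))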
def create_es_action(self, action="index", record_type=None, index=settings.SITE_NAME, store=None,
                     doc_type=None, context=True, flat=True, exclude_fields=None, acceptance=False):
    if doc_type is None:
        doc_type = self._generate_doc_type()
    if record_type is None:
        record_type = self.get_rdf_type()
    if not store:
        store = rdfstore.get_rdfstore()
    if acceptance:
        index = "{}_acceptance".format(index)
    if record_type == "http://www.openarchives.org/ore/terms/Aggregation":
        record_type = "mdr"
    if action == "delete":
        return {
            '_op_type': action,
            '_index': index,
            '_type': doc_type,
            '_id': self.hub_id
        }
    graph = None
    if not context:
        graph = self.get_graph()
    else:
        graph, nr_levels = self.get_context_graph(store=store, named_graph=self.named_graph)
    graph.namespace_manager = namespace_manager
    bindings = GraphBindings(
        about_uri=self.source_uri,
        graph=graph
    )
    index_doc = bindings.to_flat_index_doc() if flat else bindings.to_index_doc()
    if exclude_fields:
        index_doc = {k: v for k, v in index_doc.items() if k not in exclude_fields}
    # add delving spec for default searchability
    index_doc["delving_spec"] = [
        {'@type': "Literal", 'value': self.get_spec_name(), 'raw': self.get_spec_name(), 'lang': None}
    ]
    logger.debug(index_doc)
    mapping = {
        '_op_type': action,
        '_index': index,
        '_type': doc_type,
        '_id': self.hub_id,
        '_source': index_doc
    }
    thumbnail = bindings.get_about_thumbnail
    mapping['_source']['system'] = {
        'slug': self.hub_id,
        'spec': self.get_spec_name(),
        'thumbnail': thumbnail if thumbnail else "",
        'preview': "detail/foldout/{}/{}".format(doc_type, self.hub_id),
        'caption': bindings.get_about_caption if bindings.get_about_caption else "",
        'about_uri': self.document_uri,
        'source_uri': self.source_uri,
        'graph_name': self.named_graph,
        'created_at': datetime.datetime.now().isoformat(),
        'modified_at': datetime.datetime.now().isoformat(),
        'source_graph': graph.serialize(format='nt', encoding="utf-8").decode(encoding="utf-8"),
        'proxy_resource_graph': None,
        'web_resource_graph': None,
        # 'about_type': [rdf_type.qname for rdf_type in bindings.get_about_resource().get_types()]
        # 'collections': None, todo find a way to add collections via link
    }
    data_owner = self.dataset.data_owner if hasattr(self, 'dataset') else None
    dataset_name = self.dataset.name if hasattr(self, 'dataset') else None
    mapping['_source']['legacy'] = {
        'delving_hubId': self.hub_id,
        'delving_recordType': record_type,
        'delving_spec': self.get_spec_name(),
        'delving_owner': data_owner,
        'delving_orgId': settings.ORG_ID,
        'delving_collection': dataset_name,
        'delving_title': self.get_first_literal(DC.title, graph),
        'delving_creator': self.get_first_literal(DC.creator, graph),
        'delving_description': self.get_first_literal(DC.description, graph),
        'delving_provider': index_doc.get('edm_provider')[0].get('value') if 'edm_provider' in index_doc else None,
        'delving_hasGeoHash': "true" if bindings.has_geo() else "false",
        'delving_hasDigitalObject': "true" if thumbnail else "false",
        'delving_hasLandingePage': "true" if 'edm_isShownAt' in index_doc else "false",
        'delving_hasDeepZoom': "true" if 'nave_deepZoom' in index_doc else "false",
    }
    return mapping
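The dictionaries produced by create_es_action follow the elasticsearch-py bulk helper format ('_op_type', '_index', '_type', '_id', '_source'). A minimal sketch of feeding them to elasticsearch.helpers.bulk is shown below; es_client is an assumed Elasticsearch instance, and this is not necessarily how the project's own bulk_index is implemented.

# Minimal sketch, assuming `es_client` is an elasticsearch.Elasticsearch instance.
from elasticsearch import helpers

def bulk_index_sketch(es_client, records):
    actions = [record.create_es_action(action="index", context=False) for record in records]
    success, errors = helpers.bulk(es_client, actions, raise_on_error=False)
    return success, errors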
def get_context_data(self, **kwargs):
    target_uri = "http://*****:*****@tonsmitshuis.nl"
            }
        }, {
            "thumbnail": "",
            "deepzoom": "",
            "mime_type": "audio/wav",
            "source_uri": "media/189467__speedenza__poem-darkness-voice.wav",
            "metadata": {
                "dc_title": "Poem: Darkness (Voice)",
                "dc_creator": "Speedenza (freesound.org)",
                "dc_rights": "freesound.org"
            }
        }, {
            "thumbnail": "",
            "deepzoom": "",
            "mime_type": "video/mp4",
            "source_uri": "media/NuclearExplosionwww.keepvid.com.mp4",
            "metadata": {
                "dc_title": "Nuclear explosion",
                "dc_creator": "Oppenheimer",
                "dc_rights": "Destructive Commons"
            }
        }
    ]
    # Todo add search results
    # * build query on uri or property
    # * add facets from configuration + property facet
    # * get NaveResponse
    # * add to context as data
    return context
def retrieve_and_cache_remote_lod_resource(cache_uri, store=None):
    if not store:
        store = get_rdfstore()
    cache_resource, created = CacheResource.objects.get_or_create(source_uri=cache_uri)
    return store_cache_resource.delay(cache_resource, store), created
def process_narthex_file(self, store=None, acceptance=False, path=None, console=False):
    if not store:
        store = rdfstore.get_rdfstore()
    if not path:
        processed_fname = self.get_narthex_processed_fname()
    else:
        processed_fname = path
    logger.info("started processing {} for dataset {}".format(processed_fname, self.spec))
    with open(processed_fname, 'r') as f:
        record = []
        lines = 0
        records = 0
        stored = 0
        new = 0
        not_orphaned = []
        bulk_insert_records = []
        sparql_update_queries = []
        es_actions = []
        # set orphaned records
        self.mark_records_as_orphaned(state=True)
        for line in f:
            lines += 1
            exists, named_graph, content_hash = self.is_line_marker(line)
            if exists:
                edm_record = EDMRecord.objects.filter(source_hash=content_hash, named_graph=named_graph).exists()
                if not edm_record:
                    triples = " ".join(record)
                    # print(is_marker)
                    new += 1
                    g = Graph(identifier=named_graph)
                    g.parse(data=triples)
                    if not EDMRecord.objects.filter(named_graph=named_graph).exists():
                        created_record = EDMRecord.graph_to_record(
                            graph=g,
                            ds=self,
                            content_hash=None,
                            acceptance=acceptance,
                            bulk=True
                        )
                        bulk_insert_records.append(created_record)
                        es_actions.append(
                            created_record.create_es_action(
                                action="index",
                                store=store,
                                context=False,  # todo: fix issue with context indexing later
                                flat=True,
                                exclude_fields=None,
                                acceptance=acceptance
                            )
                        )
                        if settings.RDF_STORE_TRIPLES:
                            sparql_update_queries.append(
                                created_record.create_sparql_update_query(acceptance=acceptance))
                    else:
                        updated_record = EDMRecord.graph_to_record(
                            graph=g,
                            ds=self,
                            content_hash=None,
                            acceptance=acceptance
                        )
                        if settings.RDF_STORE_TRIPLES:
                            sparql_update_queries.append(
                                updated_record.create_sparql_update_query(acceptance=acceptance))
                        es_actions.append(
                            updated_record.create_es_action(
                                action="index",
                                store=store,
                                context=False,  # todo: fix issue with context indexing later
                                flat=True,
                                exclude_fields=None,
                                acceptance=acceptance
                            )
                        )
                else:
                    EDMRecord.objects.filter(source_hash=content_hash, named_graph=named_graph).update(
                        orphaned=False)
                    stored += 1
                records += 1
                record[:] = []
                bulk_record_size = len(bulk_insert_records)
                if bulk_record_size > 0 and bulk_record_size % 1000 == 0:
                    EDMRecord.objects.bulk_create(bulk_insert_records)
                    logger.info("inserted 1000 records of {} at {}".format(self.spec, time.ctime()))
                    bulk_insert_records[:] = []
                nr_sparql_updates = len(sparql_update_queries)
                if settings.RDF_STORE_TRIPLES and nr_sparql_updates > 0 and nr_sparql_updates % 50 == 0:
                    store.update("\n".join(sparql_update_queries))
                    sparql_update_queries[:] = []
                if records % 1000 == 0:
                    logger.info("processed {} records of {} at {}".format(records, self.spec, time.ctime()))
                    if console:
                        print("processed {} records of {} at {}".format(records, self.spec, time.ctime()))
                if len(es_actions) > 1000:
                    self.bulk_index(es_actions)
                    es_actions[:] = []
            else:
                record.append(line)
        # store the remaining bulk items
        EDMRecord.objects.bulk_create(bulk_insert_records)
        self.bulk_index(es_actions)
        if settings.RDF_STORE_TRIPLES and len(sparql_update_queries) > 0:
            store.update("\n".join(sparql_update_queries))
    logger.info(
        "Dataset {}: records inserted {}, records same content hash {}, lines parsed {}, total records processed {}".format(
            self.spec, new, stored, lines, records)
    )
    return lines, records
def store_cache_resource(obj, store=None):
    if not store:
        store = get_rdfstore()
    graph_store = store.get_graph_store
    response = obj.update_cached_resource(graph_store)
    return response
def store_graphs(triples, named_graph, store=None):
    if store is None:
        store = rdfstore.get_rdfstore()
    stored = store.get_graph_store.post(data=triples, named_graph=named_graph)
    return stored
def process_narthex_file(self, spec, store=None, acceptance=False, path=None, console=False):
    start = datetime.now()
    if not store:
        store = rdfstore.get_rdfstore()
    if not path:
        processed_fname = self.get_narthex_processed_fname()
    else:
        processed_fname = path
    print("started processing {} for dataset {}".format(processed_fname, spec))
    with open(processed_fname, 'r') as f:
        rdf_record = []
        lines = 0
        records = 0
        stored = 0
        new = 0
        not_orphaned = []
        sparql_update_queries = []
        es_actions = []
        # set orphaned records
        for line in f:
            lines += 1
            exists, named_graph, content_hash = self.is_line_marker(line)
            if exists:
                new += 1
                records += 1
                triples = " ".join(rdf_record)
                record = ElasticSearchRDFRecord(rdf_string=triples, spec=spec)
                try:
                    record.from_rdf_string(named_graph=named_graph, rdf_string=triples, input_format="xml")
                    es_actions.append(record.create_es_action(doc_type="void_edmrecord", record_type="mdr", context=True))
                except Exception as ex:
                    if console:
                        print("problem with {} for spec {} caused by {}".format(triples, spec, ex))
                    else:
                        logger.error("problem with {} for spec {} caused by {}".format(triples, spec, ex))
                rdf_record[:] = []
                if settings.RDF_STORE_TRIPLES:
                    sparql_update_queries.append(
                        record.create_sparql_update_query(acceptance=acceptance)
                    )
                nr_sparql_updates = len(sparql_update_queries)
                if settings.RDF_STORE_TRIPLES and nr_sparql_updates > 0 and nr_sparql_updates % 50 == 0:
                    store.update("\n".join(sparql_update_queries))
                    sparql_update_queries[:] = []
                if records % 100 == 0 and records > 0:
                    logger.info("processed {} records of {} at {}".format(records, spec, ctime()))
                    if console:
                        print("processed {} records of {} at {}".format(records, spec, ctime()))
                if len(es_actions) > 100:
                    self.bulk_index(es_actions, spec)
                    es_actions[:] = []
            else:
                rdf_record.append(line)
        # store the remaining bulk items
        self.bulk_index(es_actions, spec)
        if settings.RDF_STORE_TRIPLES and len(sparql_update_queries) > 0:
            store.update("\n".join(sparql_update_queries))
    logger.info(
        "Dataset {}: records inserted {}, records same content hash {}, lines parsed {}, total records processed {}".format(
            spec, new, stored, lines, records)
    )
    print("Finished loading {spec} with {lines} and {records} in {seconds}\n".format(
        spec=spec, lines=lines, records=records, seconds=datetime.now() - start
    ))
    RDFRecord.remove_orphans(spec, start.isoformat())
    return lines, records
def retrieve(self, request, pk=None, format=None, *args, **kwargs):

    def get_mode(default=None):
        params = request.GET
        return params.get('schema', default)

    self._clean_callback(request)
    query = NaveESQuery(
        index_name=self.get_index_name,
        doc_types=self.doc_types,
        default_facets=self.facets,
        cluster_geo=False,
        size=1,
        converter=self.get_converter()
    )
    try:
        query = query.build_item_query(query, request.query_params, pk)
    except ValueError as ve:
        logger.error("Unable to build request because: {}".format(ve))
        # todo display error message when bad/unknown hubId is given
        return HttpResponseBadRequest()
    mlt = True if request.query_params.get('mlt', 'false') == "true" else False
    mlt_count = int(request.query_params.get('mlt.count', 5))
    mlt_filter_queries = request.query_params.getlist('mlt.qf', [])
    mlt_fq_dict = {}
    for fq in mlt_filter_queries:
        if ":" in fq:
            k, v = fq.split(":", maxsplit=1)
            mlt_fq_dict[k] = v
    record = ElasticSearchRDFRecord(hub_id=pk)
    record.get_graph_by_id(hub_id=pk)
    response = NaveItemResponse(
        query,
        self,
        index=self.get_index_name,
        mlt=mlt,
        mlt_count=mlt_count,
        mlt_filter_query=mlt_fq_dict,
        rdf_record=record
    )
    renderer_format = request.accepted_renderer.format
    if renderer_format in list(EXTENSION_TO_MIME_TYPE.keys()) and renderer_format not in ['xml', 'json']:
        graph = record.get_graph()
        graph_string = graph.serialize(format=renderer_format).decode('utf-8')
        mime_type = EXTENSION_TO_MIME_TYPE.get(renderer_format)
        return Response(data=graph_string, content_type=mime_type)
    target_uri = record.document_uri
    if settings.RDF_USE_LOCAL_GRAPH:
        graph = record.get_graph()
    else:
        store = rdfstore.get_rdfstore()
        graph, _ = RDFModel.get_context_graph(store, named_graph=record.named_graph)
    if not graph:
        from django.http import HttpResponseNotFound
        return HttpResponseNotFound()
    mode = get_mode(self.default_converter)
    bindings = GraphBindings(about_uri=target_uri, graph=graph)
    delving_fields = False if request.GET.get("delving_fields") == 'false' else True
    converter = None
    if mode in ['api', 'api-flat']:
        index_doc = bindings.to_index_doc() if mode == 'api' else bindings.to_flat_index_doc()
    elif mode in REGISTERED_CONVERTERS.keys():
        converter = REGISTERED_CONVERTERS.get(mode)
        index_doc = converter(
            bindings=bindings,
            graph=graph,
            about_uri=bindings.about_uri()
        ).convert(add_delving_fields=delving_fields)
    elif self.default_converter in REGISTERED_CONVERTERS.keys():
        converter = REGISTERED_CONVERTERS.get(self.default_converter)
        index_doc = converter(
            bindings=bindings,
            graph=graph,
            about_uri=bindings.about_uri()
        ).convert(add_delving_fields=delving_fields)
    else:
        logger.warn("unable to convert results to schema {}".format(mode))
        index_doc = bindings.to_index_doc()
    layout_fields = OrderedDict()
    layout_fields['layout'] = converter().get_layout_fields() if converter else []
    if response.get_mlt():
        mlt = {"item": [NaveESItemSerializer(item).data for item in response.get_mlt()]}
    else:
        mlt = ""
    result = {'result': {
        'layout': layout_fields,
        'item': {'fields': index_doc},
        "relatedItems": mlt
    }}
    return Response(result)