Example no. 1
 def test_creation_of_dataset_from_graph(self):
     ds = DataSet.get_dataset_from_graph(
         dataset_graph_uri=self.dataset_graph_uri,
         store=self.store
     )
     self.assertIsNotNone(ds)
     self.assertIsInstance(ds, DataSet)
     self.assertEquals(
         ds.named_graph,
         URIRef(self.dataset_graph_uri)
     )
     self.assertIn(
         "Ton Smits Huis",
         [group.name for group in ds.groups.all()]
     )
     ds_from_db = DataSet.objects.get(named_graph=self.dataset_graph_uri)
     self.assertEquals(
         ds_from_db.spec,
         str(ds.spec)
     )
     self.assertTrue(
         self.store.ask(
             named_graph=self.dataset_graph_uri,
             query="""where {{?s <http://schemas.delving.eu/narthex/terms/synced> true }}"""
         )
     )
Example no. 2
def purge_deleted_datasets(store):
    """Find datasets which are deleted and purge all their information in Nave. """
    nr_deleted, datasets_uris = find_datasets_by_sync_or_deleted_status(store, deleted=True)
    if nr_deleted == 0:
        logger.info("No deleted datasets found.")
        return 0
    for dataset_uri in datasets_uris:
        ds = DataSet.get_dataset_from_graph(dataset_graph_uri=dataset_uri, store=store)
        delete_dataset_with_all_triples(ds, store)
        ds.delete()
    logger.info("Purged {} datasets from Nave and Narthex: {}".format(nr_deleted, datasets_uris))
    return nr_deleted
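
A minimal usage sketch for the task above (obtaining the store via the `rdfstore.get_rdfstore()` helper is borrowed from Example no. 6 and is an assumption, not confirmed by this excerpt):

    store = rdfstore.get_rdfstore(acceptance=False)  # assumed helper, as used in Example no. 6
    purged = purge_deleted_datasets(store)           # returns the number of purged datasets
    logger.info("Purge run removed {} datasets".format(purged))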
Example no. 3
 def test_context_graph(self):
     ds = DataSet.get_dataset_from_graph(
         dataset_graph_uri=self.dataset_graph_uri,
         store=self.store
     )
     es_actions = []
     edm_record, es_action = tasks.synchronise_record(
         graph_uri="http://localhost:8000/resource/aggregation/ton-smits-huis/454/graph",
         ds=ds,
         store=self.store,
         es_actions=es_actions
     )
     context_graph, nr_levels = edm_record.get_context_graph(store=self.store, named_graph=edm_record.named_graph)
     self.assertIsNotNone(context_graph)
     self.assertIsInstance(context_graph, Graph)
     predicates = set(list(context_graph.predicates()))
     assert URIRef('http://www.openarchives.org/ore/terms/aggregates') in predicates
Example no. 4
 def test_graph_indexing(self):
     ds = DataSet.get_dataset_from_graph(
         dataset_graph_uri=self.dataset_graph_uri,
         store=self.store
     )
     es_actions = []
     edm_record, es_action = tasks.synchronise_record(
         graph_uri="http://localhost:8000/resource/aggregation/ton-smits-huis/454/graph",
         ds=ds,
         store=self.store,
         es_actions=es_actions
     )
     self.assertTrue(
         edm_record.hub_id.endswith("ton-smits-huis_454")
     )
     action = edm_record.create_es_action(
         index=self.index_name,
         record_type="Aggregation",
         store=self.store,
         exclude_fields=['dc_rights']
     )
     self.assertIsNotNone(action)
     assert 'dc_rights' not in action['_source']
     assert action['_source']['system']['delving_recordType'] == "Aggregation"
     required_fields = [
         "_op_type", "_index", "_type", "_id", "_source"
     ]
     #  "graph", "slug", "delving_hubId", "delving_spec", "delving_recordType"
     assert set(list(es_action.keys())).issuperset(set(required_fields))
     assert 'about' in es_action['_source']
     assert 'edm_object' in es_action['_source']
     assert 'rdf' in es_action['_source']
     assert 'system' in es_action['_source']
     subjects = es_action['_source']['dc_subject']
     assert 'dc_rights' in es_action['_source']
     inline_id = 'http://data.cultureelerfgoed.nl/semnet/7403e26d-cf33-4372-ad72-a2f9fcf8f63b'
     inlined_example = [subject for subject in subjects if 'id' in subject and subject['id'] in [inline_id]][0]
     assert inlined_example
     assert inlined_example['id'] == inline_id
     assert inlined_example['value'] == "bomen"
     assert inlined_example['lang'] == "nl"
Example no. 5
 def test_synchronise_record(self):
     ds = DataSet.get_dataset_from_graph(
         dataset_graph_uri=self.dataset_graph_uri,
         store=self.store
     )
     graph_list = tasks.get_out_of_sync_dataset_record_graph_uris(
         self.dataset_graph_uri,
         self.store
     )
     es_actions = []
     edm_record, es_action = tasks.synchronise_record(
         graph_uri=graph_list[0],
         ds=ds,
         store=self.store,
         es_actions=es_actions
     )
     self.assertIsNotNone(edm_record)
     self.assertIsNotNone(es_action)
     self.assertEqual(
         EDMRecord.objects.count(),
         1,
         "Only one record should be saved"
     )
     self.assertEquals(
         edm_record.dataset,
         ds
     )
     self.assertTrue(
         edm_record.hub_id.endswith("ton-smits-huis_454"),
     )
     self.assertRegex(
         edm_record.hub_id,
         "(.*?)_(.*?)_(.*?)"
     )
     assert edm_record.document_uri == 'http://localhost:8000/resource/aggregation/ton-smits-huis/454'
     self.assertEquals(
         edm_record.named_graph,
         URIRef('http://localhost:8000/resource/aggregation/ton-smits-huis/454/graph')
     )
Example no. 6
def schedule_out_of_sync_datasets(acceptance=False, store=None):
    """Find all out of sync datasets and schedule synchronisation tasks for each."""
    if not store:
        store = rdfstore.get_rdfstore(acceptance)
    nr_datasets, datasets = find_datasets_with_records_out_of_sync(store)
    if nr_datasets == 0:
        return 0
    logger.info("Found {} datasets that have records that are out of sync".format(nr_datasets))
    scheduled_for_indexing = 0
    for dataset_uri in datasets:
        ds = DataSet.get_dataset_from_graph(dataset_graph_uri=dataset_uri, store=store)
        ds.records_in_sync = False
        if ds.can_be_synchronised:
            logger.info("status: {}, {}, {}".format(ds.stay_in_sync, ds.sync_in_progress, ds.has_sync_error))
            process_key = str(uuid.uuid1())
            ds.process_key = process_key
            ds.save()
            async_result = synchronise_dataset_records.apply_async(kwargs={'store': store, 'ds': ds},
                                                                   task_id=process_key)
            scheduled_for_indexing += 1
            logger.info("Scheduled {} for indexing with {} records".format(ds.spec, ds.valid))
    return scheduled_for_indexing
Example no. 7
 def synchronise_dataset_metadata(store, dataset_graph_uri):
     """Synchronise the metadata of the dataset between Narthex and Nave."""
     ds = DataSet.get_dataset_from_graph(dataset_graph_uri=dataset_graph_uri, store=store)
     ds.save()
     return ds
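
A hedged usage sketch of the helper above; the dataset graph URI is the illustrative one that also appears in Example no. 8:

    dataset_graph_uri = "http://localhost:8000/resource/dataset/ton-smits-huis/graph"  # illustrative URI from Example no. 8
    ds = synchronise_dataset_metadata(store=store, dataset_graph_uri=dataset_graph_uri)
    logger.info("Synchronised metadata for {}".format(ds.spec))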
Example no. 8
 def test_synchronise_dataset(self):
     from search import get_es_client
     client = get_es_client()
     s = Search(client).index(self.index_name)
     del_response = client.delete_by_query(index=self.index_name, q="*:*")
     es_response = s.execute()
     self.assertEquals(
         es_response.hits.total,
         0
     )
     self.assertEquals(
         EDMRecord.objects.count(),
         0
     )
     assert self.store.ask(query="""where {{
         GRAPH <http://localhost:8000/resource/aggregation/ton-smits-huis/454/graph>
         {{?s <http://schemas.delving.eu/narthex/terms/synced> false}}
         }}"""
                           )
     response = tasks.synchronise_dataset_records(
         dataset_graph_uri=self.dataset_graph_uri,
         store=self.store,
         index=self.index_name
     )
     # self.assertTrue(response.successful)
     # self.assertEquals(
     #     response.result,
     #     1
     # )
     self.assertEquals(
         EDMRecord.objects.count(),
         1
     )
     time.sleep(2)
     es_response = s.execute()
     self.assertEquals(
         es_response.hits.total,
         1,
         "there should be one record in the test index"
     )
     record = es_response.hits[0]
     self.assertEquals(
         record.meta.doc_type,
         "void_edmrecord"
     )
     self.assertEquals(
         "_".join(record.meta.id.split('_')[1:]),
         "ton-smits-huis_454"
     )
     # test if switch is flipped
     assert self.store.ask(query="""where {{
         GRAPH <http://localhost:8000/resource/aggregation/ton-smits-huis/454/graph>
         {{?s <http://schemas.delving.eu/narthex/terms/synced> true}}
         }}"""
                           )
     # test if dataset is deleted from index
     ds = DataSet.get_dataset_from_graph(
         dataset_graph_uri=self.dataset_graph_uri, store=self.store)
     ds.delete_from_index(self.index_name)
     es_response = s.execute()
     self.assertEquals(
         es_response.hits.total,
         0,
         "there should be no records in the test index after the dataset is deleted"
     )
     rdf_store_response = ds.delete_from_triple_store(self.store)
     assert rdf_store_response
     assert not self.store.ask(query="""where {{
         GRAPH <http://localhost:8000/resource/dataset/ton-smits-huis/graph>
         {{?s ?p ?o}}
         }}"""
                           )
Example no. 9
def synchronise_dataset_records(store, dataset_graph_uri=None, ds=None, index=settings.SITE_NAME):
    """Iterate over all records that are out of sync for a dataset and update them in the index and database. """
    if not ds and dataset_graph_uri:
        ds = DataSet.get_dataset_from_graph(dataset_graph_uri=dataset_graph_uri, store=store)
    elif ds and not dataset_graph_uri:
        dataset_graph_uri = ds.document_uri
    elif not dataset_graph_uri and not ds:
        raise ValueError("Unable to find dataset due to missing value in dataset_graph_uri and/or ds")
    logger.info("Graph uri to synchronise: {}".format(dataset_graph_uri))
    # materialize nodes
    # ore:aggregates + remove ore:isAggregatedBy
    graph_list = get_out_of_sync_dataset_record_graph_uris(dataset_graph_uri, store, 200)
    if not ds.stay_in_sync:
        logger.warning("Not starting synchronisation for {} because it is marked to not stay in sync".format(ds.spec))
        return 0
    elif ds.has_sync_error:
        logger.warning("Can't start synchronisation of {} due to a previous sync error.".format(ds.spec))
        return 0
    ds.has_sync_error = False
    ds.sync_error_message = None
    ds.records_in_sync = False
    ds.processed_records = 0
    ds.save()
    records_processed = 0
    try:
        valid_records = ds.valid
        while len(graph_list) > 0:
            actions = []
            # todo use the graphs instead of the URIs
            for graph_uri in graph_list:
                synchronise_record(graph_uri, ds, store, actions, index=index)
            # index actions
            logger.info("number of actions scheduled: {}".format(len(actions)))
            response = helpers.bulk(client=get_es(), actions=actions, stats_only=True)
            records_processed += len(graph_list)
            logger.info("processed {}/{} for {}".format(records_processed, valid_records, ds.spec))
            logger.debug("ElasticSearch bulk update: {}".format(response))
            update_switch = [QueryType.remove_insert.format(
                named_graph=g,
                remove="?s <http://schemas.delving.eu/narthex/terms/synced> false",
                insert="?s <http://schemas.delving.eu/narthex/terms/synced> true"
            ) for g in graph_list]
            response = store.update(query="\n".join(update_switch))
            logger.debug("SPARQL update succeeded: {}".format(response))
            ds = DataSet.objects.get(id=ds.id)
            ds.processed_records = records_processed
            ds.save()
            graph_list = get_out_of_sync_dataset_record_graph_uris(dataset_graph_uri, store, 200)

        ds.process_key = None
        ds.records_in_sync = True
        ds.dataset_type = DataSetType.aggregated
        if not ds.oai_pmh.real > 0:
            ds.oai_pmh = OaiPmhPublished.none
        ds.save()
        logger.info("Finishing synchronising {} records from dataset: {}".format(records_processed, dataset_graph_uri))
    except Exception as e:
        logger.error("Unable to index all records for dataset {} due to {} at record {}.".format(ds.spec, e, graph_uri))
        ds.sync_error_message = "{} with error: {}".format(graph_uri, e)
        ds.has_sync_error = True
        ds.process_key = None
        ds.save()
        logger.warning("Only indexed {} of {} valid records for dataset {}".format(records_processed, valid_records, ds.spec))
    return records_processed
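
A sketch of how this task is dispatched asynchronously, mirroring the `apply_async` call in Example no. 6 (the process-key bookkeeping is repeated from that example; treat it as an assumption about the expected calling convention):

    ds = DataSet.get_dataset_from_graph(dataset_graph_uri=dataset_graph_uri, store=store)
    process_key = str(uuid.uuid1())  # task id used to track the synchronisation run, as in Example no. 6
    ds.process_key = process_key
    ds.save()
    synchronise_dataset_records.apply_async(
        kwargs={'store': store, 'ds': ds},
        task_id=process_key
    )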