def get_context_data(self, **kwargs):
    context = super(AggregatorExportView, self).get_context_data(**kwargs)
    context.update({
        'sd_prefix': settings.TRIPLE_DATABASE['PREFIXES']['sdv1'],
        'sparql_endpoint': get_virtuoso_endpoint(),
        'mastergraph_host': settings.TRIPLE_DATABASE_MASTER['HOST'],
        'mastergraph_port':
            settings.TRIPLE_DATABASE_MASTER['KWARGS']['rexpro_port'],
        'mastergraph_graphname':
            settings.TRIPLE_DATABASE_MASTER['KWARGS']['graph'],
        'resource_namespace':
            settings.TRIPLE_DATABASE_MASTER['PREFIXES']['sdres'],
    })
    return context
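# For orientation, a minimal sketch of the view this method belongs to. The
# class name comes from the super() call above and the export URL exercised
# by the tests below; the base class, template path, and content type are
# assumptions, not confirmed by the source.
from django.views.generic import TemplateView


class AggregatorExportView(TemplateView):
    # hypothetical template path: the Silk project file validated by
    # test_silk_project_file_is_valid below
    template_name = 'controller/aggregator/silk_project.xml'
    content_type = 'application/xml'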
def process_aggregator(aggregator, force=False):
    """
    Execute the aggregator workflow: run Silk on every archive item
    associated with the aggregator.
    """
    from tempfile import mkdtemp
    from webui.cnmain.utils import get_virtuoso_endpoint

    logger_name = process_aggregator.request.id
    loggy = get_redis_logger(logger_name)
    local_manager.cleanup()
    local.logger = loggy

    tmpdir = mkdtemp()
    scheduler = Scheduler.objects.create(
        content_type=ContentType.objects.get_for_model(aggregator),
        object_id=aggregator.pk,
        status=Scheduler.RUNNING,
        logger_name=logger_name,
    )

    try:
        loggy.info("Processing aggregator %s", unicode(aggregator))
        loggy.debug("Working dir: %s", tmpdir)

        context = {
            'aggregator': aggregator,
            'sd_prefix': settings.TRIPLE_DATABASE['PREFIXES']['sdv1'],
            'sparql_endpoint': get_virtuoso_endpoint(),
            'mastergraph_host': settings.TRIPLE_DATABASE_MASTER['HOST'],
            'mastergraph_port':
                settings.TRIPLE_DATABASE_MASTER['KWARGS']['rexpro_port'],
            'mastergraph_graphname':
                settings.TRIPLE_DATABASE_MASTER['KWARGS']['graph'],
            'resource_namespace':
                settings.TRIPLE_DATABASE_MASTER['PREFIXES']['sdres'],
        }

        loggy.info("Connecting to virtuoso")
        aggregator_archiveitems = aggregator.aggregatorarchiveitem_set\
            .all().order_by('first_workflow_success')

        if not force:
            # keep only the items whose data changed since the last run
            res = []
            for aggregator_archiveitem in aggregator_archiveitems:
                if aggregator_archiveitem.needs_update():
                    res.append(aggregator_archiveitem)
                else:
                    loggy.info('Skipped archiveitem %s',
                               unicode(aggregator_archiveitem.archiveitem))
            aggregator_archiveitems = res

        _aggregator_process_archiveitems(
            aggregator_archiveitems, scheduler, tmpdir, context
        )

        loggy.info('Workflow completed')
    except Exception as e:
        loggy.exception('Generic exception in the workflow')
        scheduler.status = Scheduler.FAIL
        scheduler.error = e.message or str(e)
        scheduler.save()  # persist the failure before re-raising
        # send the exception to sentry
        raise
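# Usage sketch: process_aggregator reads process_aggregator.request.id, so it
# is presumably registered as a Celery task. Enqueueing it would then look
# like this; the module path is hypothetical.
from webui.controller.tasks import process_aggregator

process_aggregator.delay(aggregator)               # incremental run
process_aggregator.delay(aggregator, force=True)   # reprocess every item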
def test_silk_project_file_is_valid(self):
    import xml.etree.ElementTree as ET

    self.client_login('admin')
    item1 = ArchiveItemFactory()
    item2 = ArchiveItemFactory()
    for item in (item1, item2):
        AggregatorArchiveItem.objects.create(
            aggregator=self.aggregator,
            archiveitem=item,
        )

    response = self.client.get(self.export_url)
    tree = ET.fromstring(response.content)

    self.assertIn(
        (settings.TRIPLE_DATABASE['PREFIXES']['sdv1'], 'sd'),
        [(x.get('namespace'), x.get('id'))
         for x in tree.findall('.//Prefix')]
    )

    # check datasources
    datasources = tree.findall('.//DataSource')
    self.assertEqual(len(datasources), 3)
    self.assertEqual(datasources[0].get('id'), 'master-graph')
    mastergraph = datasources[0]
    datasources = datasources[1:]

    # check datasources endpoints
    self.assertEqual(
        mastergraph.find('Param[@name="host"]').get('value'),
        settings.TRIPLE_DATABASE_MASTER['HOST']
    )
    self.assertEqual(
        [get_virtuoso_endpoint()] * 2,
        [x.find('Param[@name="endpointURI"]').get("value")
         for x in datasources]
    )

    # check datasources graph names
    self.assertEqual(
        mastergraph.find('Param[@name="graph"]').get('value'),
        settings.TRIPLE_DATABASE_MASTER["KWARGS"]["graph"]
    )
    self.assertEqual(
        [item1.datagraph_mapped_name, item2.datagraph_mapped_name],
        [x.find('Param[@name="graph"]').get("value") for x in datasources]
    )

    # check tasks
    datasource_ids = [x.get('id') for x in datasources]
    tasks = tree.findall('.//LinkingTask')
    self.assertEqual(len(tasks), 2)
    self.assertEqual(
        datasource_ids,
        [x.find('.//Interlink').get('id') for x in tasks]
    )

    # check task parameters
    for datasource_id, task in zip(datasource_ids, tasks):
        self.assertEqual(
            task.find('.//SourceDataset').get('dataSource'),
            datasource_id
        )
        self.assertEqual(
            task.find('.//TargetDataset').get('dataSource'),
            'master-graph'
        )
        self.assertEqual(
            task.find('.//SourceDataset').find('RestrictTo').text.strip(),
            '?a rdf:type <{}> .'.format(self.aggregator.entity_type)
        )
        self.assertEqual(
            task.find('.//TargetDataset').find('RestrictTo').text.strip(),
            'b -> {}'.format(self.aggregator.vertex_selector)
        )
        self.assertIsNone(task.find('.//LinkageRule').text)
        self.assertIsNone(task.find('.//Filter').text)
        self.assertIsNone(task.find('.//Outputs').text)
        self.assertIsNone(task.find('.//PositiveEntities').text)
        self.assertIsNone(task.find('.//NegativeEntities').text)
        self.assertIsNone(
            task.find('.//Alignment/')
                .find('{http://knowledgeweb.semanticweb.org'
                      '/heterogeneity/alignment#}Alignment')
                .text
        )
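# Note on the '{uri}tag' lookup above: ElementTree addresses namespaced
# elements with Clark notation, i.e. the namespace URI in braces followed by
# the local name. A standalone illustration:
import xml.etree.ElementTree as ET

doc = ET.fromstring(
    '<root xmlns:al="http://knowledgeweb.semanticweb.org'
    '/heterogeneity/alignment#"><al:Alignment/></root>'
)
assert doc.find('{http://knowledgeweb.semanticweb.org'
                '/heterogeneity/alignment#}Alignment') is not None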
def test_can_silk_rules_file_is_valid(self):
    import xml.etree.ElementTree as ET
    from django.template.loader import render_to_string

    archive_item = ArchiveItemFactory()
    AggregatorArchiveItem.objects.create(
        aggregator=self.aggregator,
        archiveitem=archive_item,
    )
    self.aggregator.silk_rule = \
        '<LinkageRule><smart data="now" /></LinkageRule>'
    self.aggregator.save()

    output_filename = 'a_really_cool_filename.thm'
    context = {
        'aggregator': self.aggregator,
        'sd_prefix': settings.TRIPLE_DATABASE['PREFIXES']['sdv1'],
        'sparql_endpoint': get_virtuoso_endpoint(),
        'archive_item': archive_item,
        'output_filename': output_filename,
        'mastergraph_host': settings.TRIPLE_DATABASE_MASTER['HOST'],
        'mastergraph_port':
            settings.TRIPLE_DATABASE_MASTER['KWARGS']['rexpro_port'],
        'mastergraph_graphname':
            settings.TRIPLE_DATABASE_MASTER['KWARGS']['graph'],
        'resource_namespace':
            settings.TRIPLE_DATABASE_MASTER['PREFIXES']['sdres'],
    }

    tree = ET.fromstring(render_to_string(
        'controller/aggregator/silk_rules.xml', context
    ))

    self.assertIn(
        (settings.TRIPLE_DATABASE['PREFIXES']['sdv1'], 'sd'),
        [(x.get('namespace'), x.get('id'))
         for x in tree.findall('.//Prefix')]
    )

    # check datasources
    datasources_dom = tree.findall('.//DataSource')
    self.assertEqual(len(datasources_dom), 2)
    self.assertEqual(datasources_dom[0].get('id'), 'master-graph')
    mastergraph, datasource = datasources_dom

    # check datasource endpoints
    self.assertEqual(
        get_virtuoso_endpoint(),
        datasource.find('Param[@name="endpointURI"]').get("value"),
    )

    # check datasources graph names
    self.assertEqual(
        mastergraph.find('Param[@name="graph"]').get('value'),
        settings.TRIPLE_DATABASE_MASTER["KWARGS"]["graph"]
    )
    self.assertEqual(
        archive_item.datagraph_mapped_name,
        datasource.find('Param[@name="graph"]').get("value")
    )

    # check tasks
    datasource_id = datasource.get('id')
    rules = tree.findall('.//Interlink')
    self.assertEqual(len(rules), 1)
    self.assertEqual(datasource_id, rules[0].get('id'))

    # check rules parameters
    rule = rules[0]
    self.assertEqual(
        rule.find('.//SourceDataset').get('dataSource'),
        datasource_id
    )
    self.assertEqual(
        rule.find('.//TargetDataset').get('dataSource'),
        'master-graph'
    )
    self.assertEqual(
        ET.tostring(rule.find('.//LinkageRule')).strip(),
        self.aggregator.silk_rule
    )
    self.assertEqual(
        rule.find('.//SourceDataset').find('RestrictTo').text.strip(),
        '?a rdf:type <{}> .'.format(self.aggregator.entity_type)
    )
    self.assertEqual(
        rule.find('.//TargetDataset').find('RestrictTo').text.strip(),
        'b -> {}'.format(self.aggregator.vertex_selector)
    )
    self.assertIsNone(rule.find('.//Filter').text)

    output = rule.find('.//Outputs').find('Output')
    self.assertEqual(output.get('type'), 'file')
    self.assertEqual(output.findall('Param')[0].get('name'), 'file')
    self.assertEqual(output.findall('Param')[0].get('value'), output_filename)
    self.assertEqual(output.findall('Param')[1].get('name'), 'format')
    self.assertEqual(output.findall('Param')[1].get('value'), 'ntriples')
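# A sketch of how _aggregator_process_archiveitems (called by
# process_aggregator above but not shown here) might render this template for
# each archive item. This is an assumption based on the context keys and the
# tmpdir used in process_aggregator, not the confirmed implementation.
import os
from django.template.loader import render_to_string


def _render_silk_rules(archive_item, context, tmpdir):
    # hypothetical helper: write the per-item Silk rules file into tmpdir
    item_context = dict(context)
    item_context.update({
        'archive_item': archive_item,
        'output_filename': os.path.join(tmpdir, 'output.nt'),
    })
    path = os.path.join(tmpdir, 'silk_rules.xml')
    with open(path, 'w') as fout:
        fout.write(render_to_string(
            'controller/aggregator/silk_rules.xml', item_context
        ))
    return path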