def process_dataset(dataset, logger_name=None):
    """Process a single dataset in the workflow."""
    # when no logger name is given, the task runs standalone and logs
    # under its own task id
    independent = not logger_name
    if independent:
        logger_name = process_dataset.request.id
    loggy = get_redis_logger(logger_name)
    local_manager.cleanup()
    local.logger = loggy

    scheduler = Scheduler.objects.create(
        content_type=ContentType.objects.get_for_model(dataset),
        object_id=dataset.pk,
        status=Scheduler.RUNNING,
        logger_name=logger_name,
    )

    wf_input_params = []
    wf_exec_results = []
    loggy.info("Processing dataset %s", unicode(dataset))
    try:
        file_meta = download_dataset(dataset)
        wf_input_params, wf_exec_results = \
            evaluate_dispatcher_and_run_workflow(scheduler, file_meta)
    except Exception as e:
        scheduler.status = Scheduler.FAIL
        scheduler.error = e.message
        loggy.exception('Process failure: %s', scheduler.error)
        if isinstance(e, IllegalRuleCheckSum):
            scheduler.status = Scheduler.INVALID
        else:
            raise
def __init__(self, config_file, input_params=None, logger_name=None,
             store_output=False):
    """Workflow init and configuration loading."""
    if not input_params:
        input_params = {}
    self.store_output = store_output
    if logger_name:
        self._logger_name = logger_name
        self.logger = get_redis_logger(logger_name)

    # make sure the input params are all strings
    for key, value in input_params.items():
        input_params[key] = unicode(value)

    self.status_log = []
    self._config_file = os.path.abspath(config_file)
    with open(config_file) as f:
        self.config = json.load(f)

    self._verify_input_params(self.config, input_params)
    self.config['vars'].update(input_params)
    self.config['vars'].update(self.extra_vars)
    self.logger.debug('Loaded configuration %s', self.config)
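# Illustration only: a minimal sketch of how the constructor above could be
# driven. The class name `Workflow` and the keys inside 'vars' are
# assumptions; the constructor itself only relies on a JSON file with a
# 'vars' mapping, which is then updated with the (stringified) input_params.
import json
import tempfile

example_config = {
    'vars': {
        'source_url': 'http://example.com/data.csv',  # hypothetical variable
        'encoding': 'utf-8',
    },
}

with tempfile.NamedTemporaryFile(suffix='.json', delete=False) as tmp:
    json.dump(example_config, tmp)
    config_path = tmp.name

# input_params override entries in config['vars'] and are coerced to unicode
wf = Workflow(config_path, input_params={'encoding': 'latin-1'},
              logger_name='my-task-id', store_output=True)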
def process_source(source, older_than=0):
    """Process a source."""
    red = redis.Redis()
    task_id = process_source.request.id
    local_manager.cleanup()
    loggy = get_redis_logger(task_id)
    local.logger = loggy

    red.zadd(
        'source:{}'.format(source.pk),
        get_redis_key(task_id),
        timestamp_now()
    )

    # Init Handler: if an error occurs, the Source will not be processed.
    loggy.info('Evaluating Init Handler')
    try:
        wf_output = _process_init_handler(source)
    except:
        loggy.exception(
            'An error occurred while processing Init Handler for source [%s]',
            unicode(source)
        )
        raise
    else:
        loggy.info(
            'Init handler executed successfully. Output %s', wf_output
        )

    # Select datasets that have not been scheduled within the given window.
    the_date = timezone.now() - datetime.timedelta(seconds=older_than)
    dataset_ctype = ContentType.objects.get_for_model(Dataset)
    already_scheduled_datasets = Scheduler.objects.filter(
        content_type=dataset_ctype,
        created__gte=the_date
    ).values('object_id')
    datasets = Dataset.objects.filter(source=source)\
        .exclude(pk__in=already_scheduled_datasets)

    count = datasets.count()
    if count:
        loggy.info('Processing %d datasets', count)
        result = group(
            [process_dataset.s(ds, logger_name=task_id) for ds in datasets]
        ).apply()
        if result.successful():
            dispose_sequence.delay(result.join(), source, task_id).get()
        else:
            loggy.info('An error occurred in a process_dataset')
    else:
        loggy.info('No datasets to process')
    loggy.info(END)
def dispose_sequence(results, source, task_id):
    """Execute the last step of the workflow: the dispose handler."""
    loggy = get_redis_logger(task_id)
    local.logger = loggy

    processed_dataset_count = len(results)
    # each result could be True, False or an exception
    failed_dataset_count = len(
        [result for result in results if result is not True]
    )

    loggy.info('Executing dispose handler')
    wf_output = _process_dispose_handler(source, {
        'processed_dataset_count': processed_dataset_count,
        'failed_dataset_count': failed_dataset_count,
    })
    if wf_output:
        loggy.info('Dispose handler executed. Output %r', wf_output)
def test_it_pushes_to_redis(self):
    identity = MagicMock(return_value='testkey')
    logger = logging.getLogger('test_logger')
    logger.handlers = []
    get_task_logger = MagicMock(return_value=logger)
    redis = MagicMock()
    Redis = MagicMock(return_value=redis)

    with patch.multiple(
        'webui.scheduler.log',
        Redis=Redis,
        get_redis_key=identity,
        get_task_logger=get_task_logger
    ):
        logger_ = get_redis_logger('test')
        logger_.info('LOG THIS')

        self.assertEqual(redis.rpush.call_count, 1)
        self.assertEqual(redis.rpush.call_args[0][0], 'testkey')
        record = json.loads(redis.rpush.call_args[0][1])
        self.assertEqual(record['msg'], 'LOG THIS')
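# Illustration only: a minimal logging handler consistent with what the test
# above asserts (one rpush per record, key derived via get_redis_key, JSON
# body carrying a 'msg' field). This is a sketch, not the actual
# webui.scheduler.log implementation; the class name RedisListHandler and the
# extra payload fields are hypothetical.
import json
import logging

from redis import Redis


class RedisListHandler(logging.Handler):
    """Push every log record onto a Redis list as a JSON document."""

    def __init__(self, key):
        logging.Handler.__init__(self)
        self.key = key
        self.redis = Redis()

    def emit(self, record):
        payload = {
            'msg': record.getMessage(),
            'levelname': record.levelname,
            'created': record.created,
        }
        self.redis.rpush(self.key, json.dumps(payload))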
def process_aggregator(aggregator, force=False):
    """
    Execute the aggregator workflow: run silk on every archive item
    associated to the aggregator.
    """
    from tempfile import mkdtemp
    from webui.cnmain.utils import get_virtuoso_endpoint

    logger_name = process_aggregator.request.id
    loggy = get_redis_logger(logger_name)
    local_manager.cleanup()
    local.logger = loggy
    tmpdir = mkdtemp()

    scheduler = Scheduler.objects.create(
        content_type=ContentType.objects.get_for_model(aggregator),
        object_id=aggregator.pk,
        status=Scheduler.RUNNING,
        logger_name=logger_name,
    )

    try:
        loggy.info("Processing aggregator %s", unicode(aggregator))
        loggy.debug("Working dir: %s", tmpdir)

        context = {
            'aggregator': aggregator,
            'sd_prefix': settings.TRIPLE_DATABASE['PREFIXES']['sdv1'],
            'sparql_endpoint': get_virtuoso_endpoint(),
            'mastergraph_host': settings.TRIPLE_DATABASE_MASTER['HOST'],
            'mastergraph_port':
                settings.TRIPLE_DATABASE_MASTER['KWARGS']['rexpro_port'],
            'mastergraph_graphname':
                settings.TRIPLE_DATABASE_MASTER['KWARGS']['graph'],
            'resource_namespace':
                settings.TRIPLE_DATABASE_MASTER['PREFIXES']['sdres'],
        }

        loggy.info("Connecting to virtuoso")

        aggregator_archiveitems = aggregator.aggregatorarchiveitem_set\
            .all().order_by('first_workflow_success')

        if not force:
            # keep only the archive items that actually need an update
            res = []
            for aggregator_archiveitem in aggregator_archiveitems:
                if aggregator_archiveitem.needs_update():
                    res.append(aggregator_archiveitem)
                else:
                    loggy.info('Skipped archiveitem %s',
                               unicode(aggregator_archiveitem.archiveitem))
            aggregator_archiveitems = res

        _aggregator_process_archiveitems(
            aggregator_archiveitems, scheduler, tmpdir, context
        )
        loggy.info('Workflow completed')
    except Exception as e:
        loggy.exception('Generic exception in the workflow')
        scheduler.status = Scheduler.FAIL
        scheduler.error = e.message or str(e)
        # re-raise so the exception is sent to sentry
        raise