Example No. 1
def process_dataset(dataset, logger_name=None):
    """ process a single dataset in the workflow
    """
    # No logger_name means the task was launched on its own, so log under
    # this task's own Celery request id.
    independent = not logger_name
    if independent:
        logger_name = process_dataset.request.id
    loggy = get_redis_logger(logger_name)
    local_manager.cleanup()
    local.logger = loggy

    scheduler = Scheduler.objects.create(
        content_type=ContentType.objects.get_for_model(dataset),
        object_id=dataset.pk,
        status=Scheduler.RUNNING,
        logger_name=logger_name,
    )

    wf_input_params = []
    wf_exec_results = []

    loggy.info("Processing dataset %s", unicode(dataset))

    try:
        file_meta = download_dataset(dataset)
        wf_input_params, wf_exec_results = \
            evaluate_dispatcher_and_run_workflow(scheduler, file_meta)
    except Exception as e:
        scheduler.status = Scheduler.FAIL
        scheduler.error = e.message

        loggy.exception('Process failure: %s', scheduler.error)
        if isinstance(e, IllegalRuleCheckSum):
            scheduler.status = Scheduler.INVALID
        else:
            raise
Example No. 2
    def __init__(self,
                 config_file,
                 input_params=None,
                 logger_name=None,
                 store_output=False):
        """Workflow init and configuration loading."""
        if not input_params:
            input_params = {}
        self.store_output = store_output

        if logger_name:
            self._logger_name = logger_name
            self.logger = get_redis_logger(logger_name)

        # make sure the input params are all strings
        for key, value in input_params.items():
            input_params[key] = unicode(value)

        self.status_log = []
        self._config_file = os.path.abspath(config_file)
        with open(config_file) as f:
            self.config = json.load(f)
        self._verify_input_params(self.config, input_params)
        self.config['vars'].update(input_params)
        self.config['vars'].update(self.extra_vars)
        self.logger.debug('Loaded configuration %s', self.config)
Example No. 3
def process_source(source, older_than=0):
    """Processes a source"""
    red = redis.Redis()
    task_id = process_source.request.id

    local_manager.cleanup()
    loggy = get_redis_logger(task_id)
    local.logger = loggy

    red.zadd(
        'source:{}'.format(source.pk),
        get_redis_key(task_id),
        timestamp_now()
    )

    # Init Handler, if an error occurs Source will not be processed.
    loggy.info('Evaluating Init Handler')
    try:
        wf_output = _process_init_handler(source)
    except:
        loggy.exception(
            'An error occurred while processing Init Handler for source [%s]',
            unicode(source)
        )
        raise
    else:
        loggy.info(
            'Init handler executed successfully. Output %s', wf_output
        )

    # Select never scheduled datasets.
    the_date = timezone.now() - datetime.timedelta(seconds=older_than)
    dataset_ctype = ContentType.objects.get_for_model(Dataset)
    already_scheduled_datasets = Scheduler.objects.filter(
        content_type=dataset_ctype, created__gte=the_date).values('object_id')
    datasets = Dataset.objects.filter(source=source)\
                              .exclude(pk__in=already_scheduled_datasets)

    count = datasets.count()
    if count:
        loggy.info('Processing %d datasets', count)

        result = group(
            [process_dataset.s(ds, logger_name=task_id)
                for ds in datasets]
        ).apply()

        if result.successful():
            dispose_sequence.delay(result.join(), source, task_id).get()
        else:
            loggy.info('An error occurred in a process_dataset')
    else:
        loggy.info('No datasets to process')

    loggy.info(END)
Example No. 4
def process_source(source, older_than=0):
    """Processes a source"""
    red = redis.Redis()
    task_id = process_source.request.id

    local_manager.cleanup()
    loggy = get_redis_logger(task_id)
    local.logger = loggy

    red.zadd('source:{}'.format(source.pk), get_redis_key(task_id),
             timestamp_now())

    # Init Handler, if an error occurs Source will not be processed.
    loggy.info('Evaluating Init Handler')
    try:
        wf_output = _process_init_handler(source)
    except:
        loggy.exception(
            'An error occurred while processing Init Handler for source [%s]',
            unicode(source))
        raise
    else:
        loggy.info('Init handler executed successfully. Output %s', wf_output)

    # Select never scheduled datasets.
    the_date = timezone.now() - datetime.timedelta(seconds=older_than)
    dataset_ctype = ContentType.objects.get_for_model(Dataset)
    already_scheduled_datasets = Scheduler.objects.filter(
        content_type=dataset_ctype, created__gte=the_date).values('object_id')
    datasets = Dataset.objects.filter(source=source)\
                              .exclude(pk__in=already_scheduled_datasets)

    count = datasets.count()
    if count:
        loggy.info('Processing %d datasets', count)

        result = group([
            process_dataset.s(ds, logger_name=task_id) for ds in datasets
        ]).apply()

        if result.successful():
            dispose_sequence.delay(result.join(), source, task_id).get()
        else:
            loggy.info('An error occurred in a process_dataset')
    else:
        loggy.info('No datasets to process')

    loggy.info(END)
Example No. 5
def dispose_sequence(results, source, task_id):
    """ execute the last step of the workflow: the dispose handler
    """
    loggy = get_redis_logger(task_id)
    local.logger = loggy

    processed_dataset_count = len(results)
    failed_dataset_count = len(
        # result could be True, False or an exception
        [result for result in results if result is not True])

    loggy.info('Executing dispose handler')
    wf_output = _process_dispose_handler(
        source, {
            'processed_dataset_count': processed_dataset_count,
            'failed_dataset_count': failed_dataset_count
        })
    if wf_output:
        loggy.info('Dispose handler executed. Output %r', wf_output)
Example No. 6
def dispose_sequence(results, source, task_id):
    """ execute the last step of the workflow: the dispose handler
    """
    loggy = get_redis_logger(task_id)
    local.logger = loggy

    processed_dataset_count = len(results)
    failed_dataset_count = len(
        # result could be True, False or an exception
        [result for result in results if result is not True]
    )

    loggy.info('Executing dispose handler')
    wf_output = _process_dispose_handler(source, {
        'processed_dataset_count': processed_dataset_count,
        'failed_dataset_count': failed_dataset_count
    })
    if wf_output:
        loggy.info(
            'Dispose handler executed. Output %r', wf_output
        )
Example No. 7
    def test_it_pushes_to_redis(self):
        identity = MagicMock(return_value='testkey')
        logger = logging.getLogger('test_logger')
        logger.handlers = []
        get_task_logger = MagicMock(return_value=logger)
        redis = MagicMock()
        Redis = MagicMock(return_value=redis)

        with patch.multiple(
            'webui.scheduler.log',
            Redis=Redis,
            get_redis_key=identity,
            get_task_logger=get_task_logger
        ):
            logger_ = get_redis_logger('test')
            logger_.info('LOG THIS')

            self.assertEqual(redis.rpush.call_count, 1)
            self.assertEqual(redis.rpush.call_args[0][0], 'testkey')
            record = json.loads(redis.rpush.call_args[0][1])
            self.assertEqual(record['msg'], 'LOG THIS')
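
This test pins down the contract that the other examples rely on: get_redis_logger hands back an ordinary task logger whose records are also serialised to JSON and pushed onto a Redis list under the key produced by get_redis_key. The sketch below is a minimal illustration of such a factory, assuming the real webui.scheduler.log module works roughly this way; the RedisListHandler class, the key scheme and the get_redis_key placeholder are illustrative stand-ins (the real module patches in get_task_logger, where plain logging.getLogger is used here), not the project's actual implementation.

import json
import logging

from redis import Redis


def get_redis_key(name):
    # Placeholder: the real key scheme lives in webui.scheduler.log and is
    # not shown in these examples.
    return 'scheduler:log:{}'.format(name)


class RedisListHandler(logging.Handler):
    """Serialise each log record to JSON and push it onto a Redis list."""

    def __init__(self, key):
        super(RedisListHandler, self).__init__()
        self.key = key
        self.redis = Redis()

    def emit(self, record):
        payload = json.dumps({
            'msg': record.getMessage(),
            'levelname': record.levelname,
            'created': record.created,
        })
        self.redis.rpush(self.key, payload)


def get_redis_logger(name):
    """Return a logger for `name` that mirrors its records to Redis."""
    logger = logging.getLogger(name)
    if not any(isinstance(h, RedisListHandler) for h in logger.handlers):
        logger.addHandler(RedisListHandler(get_redis_key(name)))
    return logger

Attaching the handler only once keeps records from being duplicated when the same logger name is reused, as process_source and process_dataset do when they share a single task_id as logger_name.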
Example No. 8
    def __init__(self, config_file, input_params=None, logger_name=None,
                 store_output=False):
        """Workflow init and configuration loading."""
        if not input_params:
            input_params = {}
        self.store_output = store_output

        if logger_name:
            self._logger_name = logger_name
            self.logger = get_redis_logger(logger_name)

        # make sure the input params are all strings
        for key, value in input_params.items():
            input_params[key] = unicode(value)

        self.status_log = []
        self._config_file = os.path.abspath(config_file)
        with open(config_file) as f:
            self.config = json.load(f)
        self._verify_input_params(self.config, input_params)
        self.config['vars'].update(input_params)
        self.config['vars'].update(self.extra_vars)
        self.logger.debug('Loaded configuration %s', self.config)
Example No. 9
def process_aggregator(aggregator, force=False):
    """ execute the aggregator workflow: run silk on every archive item
     associated to the aggregator.
    """
    from tempfile import mkdtemp
    from webui.cnmain.utils import get_virtuoso_endpoint

    logger_name = process_aggregator.request.id
    loggy = get_redis_logger(logger_name)
    local_manager.cleanup()
    local.logger = loggy
    tmpdir = mkdtemp()
    scheduler = Scheduler.objects.create(
        content_type=ContentType.objects.get_for_model(aggregator),
        object_id=aggregator.pk,
        status=Scheduler.RUNNING,
        logger_name=logger_name,
    )

    try:
        loggy.info("Processing aggregator %s", unicode(aggregator))
        loggy.debug("Working dir: %s", tmpdir)

        context = {
            'aggregator': aggregator,
            'sd_prefix': settings.TRIPLE_DATABASE['PREFIXES']['sdv1'],
            'sparql_endpoint': get_virtuoso_endpoint(),
            'mastergraph_host': settings.TRIPLE_DATABASE_MASTER['HOST'],
            'mastergraph_port':
            settings.TRIPLE_DATABASE_MASTER['KWARGS']['rexpro_port'],
            'mastergraph_graphname':
            settings.TRIPLE_DATABASE_MASTER['KWARGS']['graph'],
            'resource_namespace':
            settings.TRIPLE_DATABASE_MASTER['PREFIXES']['sdres'],
        }

        loggy.info("Connecting to virtuoso")

        aggregator_archiveitems = aggregator.aggregatorarchiveitem_set\
            .all().order_by('first_workflow_success')

        if not force:
            res = []
            for aggregator_archiveitem in aggregator_archiveitems:
                if aggregator_archiveitem.needs_update():
                    res.append(aggregator_archiveitem)
                else:
                    loggy.info('Skipped archiveitem %s',
                               unicode(aggregator_archiveitem.archiveitem))

            aggregator_archiveitems = res

        _aggregator_process_archiveitems(aggregator_archiveitems, scheduler,
                                         tmpdir, context)

        loggy.info('Workflow completed')
    except Exception as e:
        loggy.exception('Generic exception in the workflow')
        scheduler.status = Scheduler.FAIL
        scheduler.error = e.message or str(e)
        # send the exception to sentry
        raise
Example No. 10
def process_aggregator(aggregator, force=False):
    """ execute the aggregator workflow: run silk on every archive item
     associated to the aggregator.
    """
    from tempfile import mkdtemp
    from webui.cnmain.utils import get_virtuoso_endpoint

    logger_name = process_aggregator.request.id
    loggy = get_redis_logger(logger_name)
    local_manager.cleanup()
    local.logger = loggy
    tmpdir = mkdtemp()
    scheduler = Scheduler.objects.create(
        content_type=ContentType.objects.get_for_model(aggregator),
        object_id=aggregator.pk,
        status=Scheduler.RUNNING,
        logger_name=logger_name,
    )

    try:
        loggy.info("Processing aggregator %s", unicode(aggregator))
        loggy.debug("Working dir: %s", tmpdir)

        context = {
            'aggregator': aggregator,
            'sd_prefix': settings.TRIPLE_DATABASE['PREFIXES']['sdv1'],
            'sparql_endpoint': get_virtuoso_endpoint(),
            'mastergraph_host': settings.TRIPLE_DATABASE_MASTER['HOST'],
            'mastergraph_port':
            settings.TRIPLE_DATABASE_MASTER['KWARGS']['rexpro_port'],
            'mastergraph_graphname':
            settings.TRIPLE_DATABASE_MASTER['KWARGS']['graph'],
            'resource_namespace':
            settings.TRIPLE_DATABASE_MASTER['PREFIXES']['sdres'],
        }

        loggy.info("Connecting to virtuoso")

        aggregator_archiveitems = aggregator.aggregatorarchiveitem_set\
            .all().order_by('first_workflow_success')

        if not force:
            res = []
            for aggregator_archiveitem in aggregator_archiveitems:
                if aggregator_archiveitem.needs_update():
                    res.append(aggregator_archiveitem)
                else:
                    loggy.info('Skipped archiveitem %s',
                               unicode(aggregator_archiveitem.archiveitem))

            aggregator_archiveitems = res

        _aggregator_process_archiveitems(
            aggregator_archiveitems, scheduler, tmpdir, context
        )

        loggy.info('Workflow completed')
    except Exception as e:
        loggy.exception('Generic exception in the workflow')
        scheduler.status = Scheduler.FAIL
        scheduler.error = e.message or str(e)
        # send the exception to sentry
        raise