Example #1
    def retry_or_fail(self, msg, retry_time):
        """
        If a positive retry time has been specified, the workflow message will be resent.
        If not, an error message is logged.

        :param msg: workflow message
        :param retry_time: time to retry starting the workflow

        :return:
        """
        if retry_time > 0:
            if not msg.get('workflow'):
                # Initialize the workflow part of the message for the current workflow
                msg['workflow'] = {
                    'workflow_name': self._workflow_name,
                    'step_name': self._step_name,
                    'retry_time': retry_time
                }
            if retry_workflow(msg):
                # Successfully retried workflow
                return

        # No retries left or retry_workflow has failed
        action = self._workflow_name.upper()
        logger.configure(msg, action)
        logger.error(f"Job {action} start rejected, job is already active")
Example #2
def handle_export_test_msg(msg):
    header = msg.get('header', {})
    assert_message_attributes(header, ["catalogue"])

    catalogue = header['catalogue']

    start_timestamp = int(
        datetime.datetime.utcnow().replace(microsecond=0).timestamp())
    process_id = header.get('process_id',
                            f"{start_timestamp}.export_test.{catalogue}")

    msg["header"].update({
        'process_id': process_id,
        'application': "GOBExportTest",
        'entity': catalogue
    })

    logger.configure(msg, "EXPORT_TEST")

    test(catalogue)

    summary = logger.get_summary()
    msg = {"header": msg.get("header"), "summary": summary, "contents": None}

    # To overcome distribution problems with locked files,
    # distribute is decoupled and starts at a certain
    # time, triggered by Jenkins.
    #
    # Send out a notification for a successful export test
    #
    # if len(summary['errors']) == 0:
    #     add_notification(msg, ExportTestNotification(header['catalogue'],
    #                                                  header.get('collection'),
    #                                                  header.get('product')))
    return msg
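
The assert_message_attributes helper is only called here, not defined. A minimal sketch of what it plausibly does, assuming it fails fast on missing or empty header keys (the real GOB helper may differ):

def assert_message_attributes(header: dict, attributes: list):
    # Hypothetical reconstruction: raise when a required header key
    # is missing or empty.
    for attribute in attributes:
        assert header.get(attribute), f"Missing attribute {attribute} in header"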
Example #3
def handle_export_file_msg(msg):
    header = msg['header']
    logger.configure(msg, "EXPORT")
    export(catalogue=header['catalogue'],
           collection=header['collection'],
           product=header['product'],
           destination=header['destination'])
Example #4
def end_to_end_test_handler(msg):
    """Request to run E2E tests.

    Return message with new generated dynamic workflow in the header.

    :param msg:
    :return:
    """
    now = datetime.datetime.utcnow()
    start_timestamp = int(now.replace(microsecond=0).timestamp())
    header = msg.get('header', {})

    # Set process id before call to logger.configure
    header['process_id'] = header.get('process_id',
                                      f"{start_timestamp}.e2e_test")

    logger.configure(msg, 'E2E Test')
    logger.info("Clear any previous test data")

    e2etest = E2ETest(header['process_id'])
    e2etest.cleartests()
    logger.info("Start E2E Test")

    return {
        'header': {
            **header,
            'timestamp': now.isoformat(),
            'workflow': e2etest.get_workflow(),
        },
        'contents': ''
    }
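
The returned header carries the generated dynamic workflow. A sketch of the result shape; process_id and timestamp follow the formats used above, while the workflow value is only a placeholder (the real structure returned by E2ETest.get_workflow is not shown in this corpus):

result = {
    'header': {
        'process_id': '1700000000.e2e_test',   # f"{start_timestamp}.e2e_test"
        'timestamp': '2023-11-14T22:13:20',    # now.isoformat()
        'workflow': {},                        # placeholder for e2etest.get_workflow()
    },
    'contents': ''
}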
Example #5
def apply(msg):
    mode = msg['header'].get('mode', FULL_UPLOAD)

    logger.configure(msg, "UPDATE")
    logger.info("Apply events")

    storage = GOBStorageHandler()
    combinations = _get_source_catalog_entity_combinations(storage, msg)

    # Gather statistics of update process
    stats = UpdateStatistics()
    before = None
    after = None
    for result in combinations:
        model = f"{result.source} {result.catalogue} {result.entity}"
        logger.info(f"Apply events {model}")
        storage = GOBStorageHandler(result)

        # Track eventId before event application
        entity_max_eventid, last_eventid = get_event_ids(storage)
        before = min(entity_max_eventid or 0, before or sys.maxsize)

        if is_corrupted(entity_max_eventid, last_eventid):
            logger.error(
                f"Model {model} is inconsistent! data is more recent than events"
            )
        elif entity_max_eventid == last_eventid:
            logger.info(f"Model {model} is up to date")
            apply_confirm_events(storage, stats, msg)
        else:
            logger.info(f"Start application of unhandled {model} events")
            with storage.get_session():
                last_events = storage.get_last_events()  # { tid: last_event, ... }

            apply_events(storage, last_events, entity_max_eventid, stats)
            apply_confirm_events(storage, stats, msg)

        # Track eventId after event application
        entity_max_eventid, last_eventid = get_event_ids(storage)
        after = max(entity_max_eventid or 0, after or 0)

        # Collect the statistics for this model
        results = stats.results()
        if mode == FULL_UPLOAD and _should_analyze(stats):
            logger.info("Running VACUUM ANALYZE on table")
            storage.analyze_table()

        stats.log()
        logger.info(f"Apply events {model} completed", {'data': results})

    msg['summary'] = logger.get_summary()

    # Add an events notification telling which types of event have been applied
    if not msg['header'].get('suppress_notifications', False):
        add_notification(msg, EventNotification(stats.applied,
                                                [before, after]))

    return msg
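
is_corrupted is imported elsewhere; a plausible reading, given the error text above, is that the entity data references an event id beyond the last stored event. A minimal sketch under that assumption (the real implementation may differ):

def is_corrupted(entity_max_eventid, last_eventid):
    # Assumed semantics: the entities have been updated past the newest
    # stored event, i.e. the data is more recent than the events.
    return (entity_max_eventid or 0) > (last_eventid or 0)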
Example #6
def prepare_relate(msg):
    """
    The starting point for the relate process. A relate job is split into individual relate jobs at
    attribute level. If the message only contains a catalog, all collections of that catalog are related.
    When a previously split job is received, the relation name is added and the job is forwarded
    to the next step of the relate process, where the relations are built.

    :param msg: a message from the broker containing the catalog and collections (optional)
    :return: the result message of the relate preparation step
    """
    header = msg.get('header', {})
    catalog_name = header.get('catalogue')
    collection_name = header.get('collection')
    attribute_name = header.get('attribute')

    application = "GOBRelate"
    msg["header"] = {
        **msg.get("header", {}),
        "version": "0.1",
        "source": "GOB",
        "application": application,
        "entity": collection_name
    }

    timestamp = datetime.datetime.utcnow().isoformat()

    msg["header"].update({
        "timestamp": timestamp,
    })

    logger.configure(msg, "RELATE")

    if not catalog_name or not collection_name or not attribute_name:
        # A job is split when catalog, collection or attribute is not provided
        logger.info("Splitting relate job")

        _split_job(msg)
        msg['header']['is_split'] = True

        return publish_result(msg, [])
    else:
        # If the job has all attributes, add the relation name and forward to the next step in the relate process
        logger.info(f"** Relate {catalog_name} {collection_name} {attribute_name}")

        relation_name = get_relation_name(GOBModel(), catalog_name, collection_name, attribute_name)

        msg["header"].update({
            "catalogue": "rel",
            "collection": relation_name,
            "entity": relation_name,
            "original_catalogue": catalog_name,
            "original_collection": collection_name,
            "original_attribute": attribute_name,
        })

        return msg
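
For illustration, forwarding a job for catalogue=meetbouten, collection=meetbouten, attribute=ligt_in_buurt would rewrite the header fields as below; the relation name is taken from the update_materialized_view docstring later in this corpus (Example #22):

forwarded_header = {
    'catalogue': 'rel',
    'collection': 'mbn_mbt_gbd_brt_ligt_in_buurt',
    'entity': 'mbn_mbt_gbd_brt_ligt_in_buurt',
    'original_catalogue': 'meetbouten',
    'original_collection': 'meetbouten',
    'original_attribute': 'ligt_in_buurt',
}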
Example #7
def handle_export_dump_msg(msg):
    header = msg['header']
    logger.configure(msg, "DUMP")
    Dumper().dump_catalog(catalog_name=header['catalogue'],
                          collection_name=header['collection'],
                          include_relations=header.get('include_relations',
                                                       True),
                          force_full=header.get('full', False))

    add_notification(
        msg, DumpNotification(header['catalogue'], header['collection']))
Example #8
    def end_of_workflow(self, msg):
        logger.configure(msg, "WORKFLOW")
        on_complete = msg['header'].pop('on_workflow_complete', None)
        if on_complete is not None:
            if not isinstance(on_complete, dict) or not all([key in on_complete for key in ['exchange', 'key']]):
                logger.error("on_workflow_complete should be a dict with keys 'exchange' and 'key'")
            else:
                publish(on_complete['exchange'], on_complete['key'], msg)
                logger.info(f"Publish on_workflow_complete to {on_complete['exchange']} with {on_complete['key']}")

        logger.info("End of workflow")
        job_end(msg["header"].get("jobid"))
Example #9
def handle_distribute_msg(msg):
    header = msg['header']
    logger.configure(msg, "DISTRIBUTE")

    distribute(catalogue=header['catalogue'], fileset=header.get('fileset'))

    return {
        "header": msg.get("header"),
        "summary": {
            "warnings": logger.get_warnings(),
            "errors": logger.get_errors()
        },
        "contents": None
    }
Example #10
def process_relate(msg: dict):
    """
    This function starts the actual relate process. The message is checked for completeness, after which
    the Relater builds the new or updated relations and returns the result to be compared as if it were
    the result of an import job.

    :param msg: a message from the broker containing the catalog, collection and attribute
    :return: the result message of the relate process
    """
    logger.configure(msg, "RELATE SRC")

    _check_message(msg)
    header = msg.get('header')

    logger.info("Relate table started")

    full_update = header.get('mode', "update") == "full"

    if full_update:
        logger.info("Full relate requested")

    updater = Relater(header[CATALOG_KEY], header[COLLECTION_KEY], header[ATTRIBUTE_KEY])

    filename, confirms = updater.update(full_update)

    logger.info("Relate table completed")

    relation_name = get_relation_name(GOBModel(), header[CATALOG_KEY], header[COLLECTION_KEY], header[ATTRIBUTE_KEY])

    result_msg = {
        "header": {
            **msg["header"],
            "catalogue": "rel",
            "collection": relation_name,
            "entity": relation_name,
            "source": "GOB",
            "application": "GOB",
            "version": RELATE_VERSION,
            "timestamp": msg.get("timestamp", datetime.datetime.utcnow().isoformat()),
        },
        "summary": logger.get_summary(),
        "contents_ref": filename,
        "confirms": confirms,
    }

    return result_msg
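
CATALOG_KEY, COLLECTION_KEY and ATTRIBUTE_KEY are imported constants not shown here. Given that prepare_relate (Example #6) stores the source names under original_* keys, plausible values are the following, though this is an inference rather than the confirmed definition:

# Assumed values, inferred from the prepare_relate header rewrite:
CATALOG_KEY = 'original_catalogue'
COLLECTION_KEY = 'original_collection'
ATTRIBUTE_KEY = 'original_attribute'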
Example #11
def end_to_end_wait_handler(msg):
    logger.configure(msg, 'E2E Test')
    process_id = msg['header'].get('process_id')
    wait_for_process_id = msg['header'].get('wait_for_process_id')
    seconds = msg['header'].get('seconds')

    assert all([process_id, wait_for_process_id, seconds]), \
        "Expecting attributes 'process_id', 'wait_for_process_id' and 'seconds' in header"

    E2ETest(process_id).wait(wait_for_process_id, seconds)

    return {
        'header': {
            **msg.get('header', {}),
        },
        'summary': logger.get_summary(),
    }
Example #12
def handle_brp_regression_test_msg(msg):
    logger.configure(msg, 'BRP Regression test')

    results = BrpRegression(logger).run()
    writer = ObjectstoreResultsWriter(results, 'regression_tests/results/brp')
    writer.write()
    logger.info(
        "Written test results to Objectstore at regression_tests/results/brp")

    return {
        'header': {
            **msg.get('header', {}),
            'timestamp': datetime.datetime.utcnow().isoformat(),
        },
        'summary': logger.get_summary(),
    }
Example #13
def handle_import_object_msg(msg):
    logger.configure(msg, "IMPORT OBJECT")
    logger.info("Start import object")
    importer = MappinglessConverterAdapter(msg['header'].get('catalogue'),
                                           msg['header'].get('entity'),
                                           msg['header'].get('entity_id_attr'))
    entity = importer.convert(msg['contents'])

    return {
        'header': {
            **msg['header'],
            'mode': ImportMode.SINGLE_OBJECT.value,
            'collection': msg['header'].get('entity'),
        },
        'summary': logger.get_summary(),
        'contents': [entity]
    }
Example #14
def end_to_end_execute_workflow_handler(msg):
    logger.configure(msg, 'E2E Test')
    workflow_to_execute = msg['header'].get('execute')
    workflow_process_id = msg['header'].get('execute_process_id')
    process_id = msg['header'].get('process_id')

    assert all([workflow_to_execute, workflow_process_id, process_id]), \
        "Expecting attributes 'execute', 'execute_process_id' and 'process_id' in header"

    E2ETest(process_id).execute_workflow(workflow_to_execute,
                                         workflow_process_id)

    return {
        'header': {
            **msg.get('header', {}),
        },
        'summary': logger.get_summary(),
    }
Example #15
    def reject(self, action, msg, job):
        """
        Reject a message because the job is already active within GOB

        :param action: the workflow action being rejected
        :param msg:
        :param job:
        :return:
        """
        # Start a workflow step to reject the message
        msg["header"]["process_id"] = job['id']
        msg["header"]["entity"] = msg["header"].get('collection')
        step = step_start("accept", msg['header'])
        step_status(job['id'], step['id'], STATUS_START)
        logger.configure(msg, action.upper())
        logger.error(f"Job {action} start rejected, job is already active")
        # End the workflow step and then the workflow job
        step_status(job['id'], step['id'], STATUS_REJECTED)
        return job_end(job['id'], STATUS_REJECTED)
Example #16
def end_to_end_check_handler(msg):
    logger.configure(msg, 'E2E Test')

    endpoint = msg['header'].get('endpoint')
    expect = msg['header'].get('expect')
    description = msg['header'].get('description')
    process_id = msg['header'].get('process_id')

    assert all([endpoint, expect, description, process_id]), \
        "Expecting attributes 'endpoint', 'expect', 'description' and 'process_id' in header"

    E2ETest(process_id).check(endpoint, expect, description)
    return {
        'header': {
            **msg.get('header', {}),
        },
        'summary': logger.get_summary(),
    }
Example #17
def kafka_produce_handler(msg):
    logger.configure(msg, "KAFKA_PRODUCE")
    logger.info("Produce Kafka events")

    catalogue = msg.get('header', {}).get('catalogue')
    collection = msg.get('header', {}).get('collection')

    assert catalogue and collection, "Missing catalogue or collection in header"

    event_producer = KafkaEventProducer(catalogue, collection, logger)
    event_producer.produce()

    return {
        'header': msg['header'],
        'summary': {
            'produced': event_producer.total_cnt,
        }
    }
Example #18
def on_workflow_progress(msg):
    """
    Process a workflow progress message

    The progress report is START, OK or FAIL
    :param msg: The message that contains the progress info
    :return: None
    """
    status = msg['status']
    step_info = step_status(msg['jobid'], msg['stepid'], status)
    if step_info and status in [STATUS_OK, STATUS_FAIL]:
        logger.configure(msg, "WORKFLOW")
        logger.info(
            f"Duration {str(step_info.end - step_info.start).split('.')[0]}")
        if status == STATUS_FAIL:
            logger.error(f"Program error: {msg['info_msg']}")
            logger.info("End of workflow")
    hooks.on_workflow_progress(msg)
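
The duration line drops sub-second precision by splitting the timedelta string on the decimal point; a runnable illustration of that trick:

import datetime

start = datetime.datetime(2023, 1, 1, 12, 0, 0)
end = datetime.datetime(2023, 1, 1, 12, 3, 7, 250000)
print(str(end - start))                # 0:03:07.250000
print(str(end - start).split('.')[0])  # 0:03:07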
Example #19
def full_update(msg):
    """Store the events for the current dataset

    :param msg: the result of the compare step, containing the events to store
    :return: Result message
    """
    logger.configure(msg, "UPDATE")
    logger.info(
        f"Update to GOB Database {GOBStorageHandler.user_name} started")

    # Interpret the message header
    message = ImportMessage(msg)
    metadata = message.metadata

    storage = GOBStorageHandler(metadata)
    model = f"{metadata.source} {metadata.catalogue} {metadata.entity}"
    logger.info(f"Store events {model}")

    # Get events from message
    events = msg["contents"]

    # Gather statistics of update process
    stats = UpdateStatistics()

    _process_events(storage, events, stats)

    # Build result message
    results = stats.results()

    stats.log()
    logger.info(f"Store events {model} completed", {'data': results})

    results.update(logger.get_summary())

    # Return the result message, with no log, no contents but pass-through any confirms
    message = {
        "header": msg["header"],
        "summary": results,
        "contents": None,
        "confirms": msg.get('confirms')
    }
    return message
Example #20
def has_no_errors(msg):
    """
    Checks the message

    Interprets the message info and either returns True to signal that the message was OK,
    or returns False and logs an error message explaining why the result was rejected.
    :param msg: The message to check
    :return: True if the message is OK to proceed to the next step
    """
    summary = msg.get('summary')
    is_ok = True
    if summary:
        num_errors = len(summary.get('errors', []))
        is_ok = num_errors == 0
        if not is_ok:
            logger.configure(msg, "WORKFLOW")
            logger.warning(
                f"Workflow stopped because of {num_errors} error{'s' if num_errors > 1 else ''}"
            )
    return is_ok
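
A quick illustration of how the check behaves on plain summary dicts (has_no_errors only touches the logger when errors are present):

ok_msg = {'summary': {'errors': [], 'warnings': ['some warning']}}
bad_msg = {'summary': {'errors': ['something failed']}}
no_summary_msg = {'header': {}}

# has_no_errors(ok_msg)          -> True
# has_no_errors(no_summary_msg)  -> True (no summary means nothing to reject)
# has_no_errors(bad_msg)         -> False, after logging the warning above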
Example #21
def handle_import_msg(msg):
    dataset = extract_dataset_from_msg(msg)

    msg['header'] |= {
        'source': dataset['source']['name'],
        'application': dataset['source']['application'],
        'catalogue': dataset['catalogue'],
        'entity': dataset['entity'],
    }

    logger.configure(msg, "IMPORT")
    header = msg.get('header', {})

    # Create a new import client and start the process
    mode = ImportMode(header.get('mode', ImportMode.FULL.value))

    import_client = ImportClient(dataset=dataset,
                                 msg=msg,
                                 mode=mode,
                                 logger=logger)
    return import_client.import_dataset()
Example #22
def update_materialized_view(msg):
    """Updates materialized view for a relation for a given catalog, collection and attribute or relation name.

    Expects a message with headers:
    - catalogue
    - collection (if catalogue is 'rel' this should be the relation_name)
    - attribute (optional if catalogue is 'rel')

    Examples of correct headers that are functionally equivalent:
    header = {
        "catalogue": "meetbouten",
        "collection": "meetbouten",
        "attribute": "ligt_in_buurt",
    }
    header = {
        "catalogue": "rel",
        "collection": "mbn_mbt_gbd_brt_ligt_in_buurt",
    }

    :param msg:
    :return:
    """
    header = msg.get('header', {})
    catalog_name = header.get('catalogue')
    collection_name = header.get('collection')
    attribute_name = header.get('attribute')

    logger.configure(msg, "UPDATE_VIEW")
    storage_handler = GOBStorageHandler()

    view = _get_materialized_view(catalog_name, collection_name, attribute_name)
    view.refresh(storage_handler)
    logger.info(f"Update materialized view {view.name}")

    timestamp = datetime.datetime.utcnow().isoformat()
    msg['header'].update({
        "timestamp": timestamp
    })

    return msg
Example #23
def check_relation(msg):
    """
    Check for any dangling relations

    :param msg:
    :return:
    """
    header = msg.get('header', {})
    catalog_name = header.get('original_catalogue')
    collection_name = header.get('original_collection')
    attribute_name = header.get('original_attribute')

    model = GOBModel()

    logger.configure(msg, "RELATE_CHECK")
    logger.info("Relate check started")

    collection = model.get_collection(catalog_name, collection_name)
    assert collection is not None, f"Invalid catalog/collection combination {catalog_name}/{collection_name}"

    reference = model._extract_references(collection['attributes']).get(attribute_name)

    try:
        is_very_many = reference['type'] == fully_qualified_type_name(VeryManyReference)
        check_function = check_very_many_relations if is_very_many else check_relations
        check_function(catalog_name, collection_name, attribute_name)
    except Exception as e:
        _log_exception(f"{attribute_name} check FAILED", e)

    logger.info("Relation conflicts check started")
    check_relation_conflicts(catalog_name, collection_name, attribute_name)

    logger.info("Relate check completed")

    return {
        "header": msg["header"],
        "summary": logger.get_summary(),
        "contents": None
    }
Example #24
def data_consistency_test_handler(msg):
    """Request to run data consistency tests.

    :param msg:
    :return:
    """
    catalog = msg['header'].get('catalogue')
    collection = msg['header'].get('collection')
    application = msg['header'].get('application')
    msg['header']['entity'] = msg['header'].get('entity', collection)

    logger.configure(msg, 'Data consistency test')

    assert all([catalog, collection]), \
        "Expecting header attributes 'catalogue' and 'collection'"
    test_id = f"{catalog} {collection} {application or ''}"
    # No return value. Results are captured by logger.
    logger.info(f"Data consistency test {test_id} started")
    try:
        DataConsistencyTest(catalog, collection, application).run()
    except GOBConfigException as e:
        logger.error(f"Dataset connection failed: {str(e)}")
    except (NotImplementedCatalogError, NotImplementedApplicationError,
            GOBException) as e:
        logger.error(f"Dataset test failed: {str(e)}")
    else:
        logger.info(f"Data consistency test {id} ended")

    return {
        'header': {
            **msg.get('header', {}),
            'timestamp': datetime.datetime.utcnow().isoformat(),
        },
        'summary': logger.get_summary(),
    }
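
Note the try/except/else layout above: the "... ended" line is only logged when the test ran without raising. A minimal standalone illustration of that control flow:

def run(fail: bool):
    try:
        if fail:
            raise ValueError("dataset test failed")
    except ValueError as e:
        print(f"error: {e}")
    else:
        # Only reached when the try block completed without raising,
        # mirroring the "... ended" log line above.
        print("ended")

run(fail=False)  # -> ended
run(fail=True)   # -> error: dataset test failed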
Example #25
def compare(msg):
    """Compare new data in msg (contents) with the current data

    :param msg: The new data, including header and summary
    :return: result message
    """
    logger.configure(msg, "COMPARE")
    header = msg.get('header', {})
    mode = header.get('mode', FULL_UPLOAD)
    logger.info(
        f"Compare (mode = {mode}) to GOB Database {GOBStorageHandler.user_name} started"
    )

    # Parse the message header
    message = ImportMessage(msg)
    metadata = message.metadata

    # Get the model for the collection to be compared
    gob_model = GOBModel()
    entity_model = gob_model.get_collection(metadata.catalogue,
                                            metadata.entity)

    # Initialize a storage handler for the collection
    storage = GOBStorageHandler(metadata)
    model = f"{metadata.source} {metadata.catalogue} {metadata.entity}"
    logger.info(f"Compare {model}")

    stats = CompareStatistics()

    tmp_table_name = None
    with storage.get_session():
        with ProgressTicker("Collect compare events", 10000) as progress:
            # Check any dependencies
            if not meets_dependencies(storage, msg):
                return {
                    "header": msg["header"],
                    "summary": logger.get_summary(),
                    "contents": None
                }

            enricher = Enricher(storage, msg)
            populator = Populator(entity_model, msg)

            # If there are no records in the database, all data are ADD events
            initial_add = not storage.has_any_entity()
            if initial_add:
                logger.info("Initial load of new collection detected")
                # Write ADD events directly, without using a temporary table
                contents_writer = ContentsWriter()
                contents_writer.open()
                # Pass a None confirms_writer because only ADD events are written
                collector = EventCollector(contents_writer,
                                           confirms_writer=None,
                                           version=entity_model['version'])
                collect = collector.collect_initial_add
            else:
                # Collect entities in a temporary table
                collector = EntityCollector(storage)
                collect = collector.collect
                tmp_table_name = collector.tmp_table_name

            for entity in msg["contents"]:
                progress.tick()
                stats.collect(entity)
                enricher.enrich(entity)
                populator.populate(entity)
                collect(entity)

            collector.close()

    if initial_add:
        filename = contents_writer.filename
        confirms = None
        contents_writer.close()
    else:
        # Compare entities from temporary table
        with storage.get_session():
            diff = storage.compare_temporary_data(tmp_table_name, mode)
            filename, confirms = _process_compare_results(
                storage, entity_model, diff, stats)

    # Build result message
    results = stats.results()

    logger.info(f"Compare {model} completed", {'data': results})

    results.update(logger.get_summary())

    message = {
        "header": msg["header"],
        "summary": results,
        "contents_ref": filename,
        "confirms": confirms
    }

    return message
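
ProgressTicker is used above as a context manager with a tick interval; a hypothetical minimal reconstruction, assuming it reports every `interval` ticks (the real GOB class may behave differently):

class ProgressTicker:
    # Hypothetical sketch of the progress reporter used above.
    def __init__(self, name: str, interval: int):
        self._name, self._interval, self._count = name, interval, 0

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        print(f"{self._name}: {self._count} total")

    def tick(self):
        self._count += 1
        if self._count % self._interval == 0:
            print(f"{self._name}: {self._count}")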
Example #26
    def end_of_workflow(self, msg):
        logger.configure(msg, "WORKFLOW")
        logger.info("End of workflow")
        job_end(msg["header"].get("jobid"))