Example #1
def load_all_data_with_uri(source, source_json_dir, transform_config, pool, logger):
    logger.debug("Loading BrAPI JSON from {}...".format(source_json_dir))

    entity_files = list(list_entity_files(source_json_dir))
    if transform_config.get('restricted-documents'):
        document_configs = transform_config['documents']
        required_entities = get_required_entities(document_configs, source_json_dir)
        entity_files = list(filter(compose(required_entities.__contains__, first), entity_files))
    logger.debug("Loading entities: {}".format(', '.join(list(map(first, entity_files)))))

    # Load stream of file lines
    all_lines = itertools.chain.from_iterable(map(load_entity_lines, entity_files))

    # Parse JSON to python objects
    all_data = pool.imap_unordered(parse_data, all_lines, CHUNK_SIZE)

    # Generate URIs (and create dict from entity/id to URI)
    uri_map = dict()
    data_list = list()
    for entity_name, data in all_data:
        data_id, data_uri = generate_uri_global_id(source, entity_name, data)
        uri_map[(entity_name, data_id)] = data_uri
        uri_map[(entity_name, get_identifier(entity_name, data))] = data_uri
        if is_checkpoint(len(data_list)):
            logger.debug("checkpoint: {} BrAPI objects loaded".format(len(data_list)))
        data_list.append(data)
    logger.debug("Loaded total of {} BrAPI objects.".format(len(data_list)))

    # Replace all entity links using global ids (ex: studyDbId: 1 => studyDbId: urn:source%2Fstudy%2F1)
    generate_links = partial(generate_global_id_links, source, uri_map)
    return pool.imap_unordered(generate_links, data_list, CHUNK_SIZE)
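
This function streams its results through pool.imap_unordered, so the caller can consume the transformed objects lazily. A minimal invocation sketch, assuming a standard multiprocessing.Pool and logging logger; the source and transform_config dictionaries shown are hypothetical stand-ins for what the project builds from its configuration files:

import logging
import multiprocessing

# Hypothetical inputs (assumptions): the real values come from the ETL configuration.
source = {'schema:identifier': 'mysource'}
transform_config = {'documents': [], 'restricted-documents': None}
source_json_dir = '/tmp/brapi-json/mysource'  # directory of per-entity JSON files (assumption)

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('mysource')

with multiprocessing.Pool(4) as pool:
    for brapi_object in load_all_data_with_uri(source, source_json_dir, transform_config, pool, logger):
        pass  # each item is a BrAPI object whose DbId links were replaced with global URIs
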
def get_required_entities(document_configs, source_json_dir):
    """
    Returns set of required entities for all documents in configuration
    """
    source_entities = set(
        remove_none(map(lambda d: d.get('source-entity'), document_configs)))

    def collect_entities(parsed_template):
        if is_list_like(parsed_template):
            return set(flatten_it(map(collect_entities, parsed_template)))
        if isinstance(parsed_template, dict):
            if '{lark}' in parsed_template:
                entities = set()
                for object_path in as_list(
                        resolve_path(parsed_template,
                                     ['start', 'object_path'])):
                    fields = resolve_path(object_path, ['field_path', 'FIELD'])
                    match = re.search(r"^(\w+)DbId(s?)$", fields[-1])
                    if match:
                        entities.add(match.groups()[0])
                return entities
            return set(
                flatten_it(map(collect_entities, parsed_template.values())))
        return set()

    document_transforms = remove_none(
        map(lambda d: d.get('document-transform'), document_configs))
    required_entities = source_entities.union(
        flatten_it(map(collect_entities, document_transforms)))

    if source_json_dir:
        all_files = list_entity_files(source_json_dir)
        filtered_files = list(
            filter(lambda x: x[0] in source_entities, all_files))
        for entity_name, file_path in filtered_files:
            with open(file_path, 'r') as file:
                line = file.readline()
                if line:
                    data = json.loads(line)
                    links = get_entity_links(data, 'DbId', 'PUI')
                    entity_names = set(map(first, links))
                    required_entities.update(entity_names)

    return required_entities
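
A minimal sketch of what get_required_entities resolves when the configuration only declares source-entity fields and no JSON directory is given (hypothetical entity names; this assumes the project's remove_none helper simply drops None values):

document_configs = [
    {'source-entity': 'study'},
    {'source-entity': 'germplasm'},
    {'document-transform': None},  # entries without a source entity contribute nothing here
]

# With source_json_dir=None the file scan is skipped, so only the declared
# source entities (plus entities referenced in 'document-transform' templates) remain.
print(get_required_entities(document_configs, None))  # -> {'study', 'germplasm'}
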
Example #3
def load_source(source, config, source_bulk_dir, log_dir):
    """
    Full Elasticsearch documents indexing
    """
    source_name = source['schema:identifier']
    action = 'load-elasticsearch-' + source_name
    log_file = get_file_path([log_dir, action], ext='.log', recreate=True)
    logger = create_logger(source_name, log_file, config['verbose'])

    load_config = config['load-elasticsearch']
    es_client = init_es_client(load_config['url'], logger)

    logger.info("Loading '{}' into elasticsearch '{}'...".format(source_bulk_dir, load_config['url']))
    try:
        if not os.path.exists(source_bulk_dir):
            raise FileNotFoundError(
                'No such file or directory: \'{}\'.\n'
                'Please make sure you have run the BrAPI extraction and Elasticsearch document transformation'
                ' before trying to launch the loading process.'
                .format(source_bulk_dir))

        bulk_files = list(list_entity_files(source_bulk_dir))
        all_document_types = set(map(first, bulk_files))
        document_types = load_config.get('document-types') or all_document_types
        document_types = document_types.intersection(all_document_types)

        index_by_document = dict()

        logger.info("Preparing index with template mapping...")
        timestamp = int(time.time())
        for document_type in document_types:
            base_index_name = replace_template(
                load_config['index-template'],
                {'source': source['schema:identifier'], 'documentType': document_type}
            ).lower()
            create_template(es_client, load_config, document_type, base_index_name, logger)

            index_name = base_index_name + '-d' + str(timestamp)
            create_index(es_client, index_name, logger)
            index_by_document[document_type] = base_index_name, index_name

        logger.info("Bulk indexing...")
        for document_type, file_path in bulk_files:
            if document_type in index_by_document:
                base_index_name, index_name = index_by_document[document_type]
                bulk_index(es_client, index_name, file_path, logger)

        logger.info("Creating index aliases and deleting old indices...")
        for document_type, (base_index_name, index_name) in index_by_document.items():
            create_alias(es_client, index_name, base_index_name, logger)
            new_index, *old_indices = get_indices(es_client, base_index_name)
            # old_indices[1:] leaves the first previous index untouched; only older ones are deleted
            for old_index in old_indices[1:]:
                delete_index(es_client, old_index, logger)

        logger.info("SUCCEEDED Loading {}.".format(source_name))
    except Exception as e:
        logger.debug(traceback.format_exc())
        logger.debug(getattr(e, 'long_message', ''))
        logger.info("FAILED Loading {} Elasticsearch documents.\n"
                    "=> Check the logs ({}) for more details."
                    .format(source_name, log_file))
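
The accesses above imply a particular configuration shape. A sketch with hypothetical values; only the keys actually read by load_source are shown, and the placeholder syntax of the index template depends on the project's replace_template helper:

source = {'schema:identifier': 'mysource'}
config = {
    'verbose': True,
    'load-elasticsearch': {
        'url': 'localhost:9200',
        'index-template': '{source}-{documentType}',  # placeholder syntax is an assumption
        'document-types': {'germplasm', 'study'},      # optional; defaults to every bulk file found
        # ... plus whatever create_template() needs for the mapping templates
    },
}

# load_source(source, config, source_bulk_dir='/tmp/bulk/mysource', log_dir='/tmp/logs')
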
Example #4
def launch_etl(options, config):
    def handler(*_):
        sys.exit(0)

    signal.signal(signal.SIGINT, handler)
    default_index_template = config['load-elasticsearch']['index-template']

    # Execute ETL actions based on CLI arguments:
    if 'extract' in options or 'etl_es' in options or 'etl_virtuoso' in options:
        etl.extract.brapi.main(config)

    if 'transform_elasticsearch' in options or 'etl_es' in options:
        transform_config = config['transform-elasticsearch']

        # Restrict list of generated documents if requested
        input_doc_types = options.get('document_types')
        if input_doc_types:
            transform_config['restricted-documents'] = set(
                remove_empty(input_doc_types.split(',')))

        # Copy base jsonschema definitions into each document jsonschema
        validation_config = transform_config['validation']
        base_definitions = validation_config['base-definitions']
        for (document_type,
             document_schema) in validation_config['documents'].items():
            document_schema['definitions'] = base_definitions

        # Run transform
        etl.transform.elasticsearch.main(config)

    if 'transform_jsonld' in options or 'transform_rdf' in options or 'etl_virtuoso' in options:
        # Replace JSON-LD context path with absolute path
        for (entity_name,
             entity) in config['transform-jsonld']['entities'].items():
            if '@context' in entity:
                entity['@context'] = get_file_path(
                    [config['conf-dir'], entity['@context']])
                if not os.path.exists(entity['@context']):
                    raise Exception(
                        'JSON-LD context file "{}" defined in "{}" does not exist'
                        .format(
                            entity['@context'],
                            os.path.join(config['conf-dir'],
                                         'transform-jsonld.json')))

        # Replace JSON-LD model path with an absolute path
        config['transform-jsonld']['model'] = get_file_path(
            [config['conf-dir'], config['transform-jsonld']['model']])

        etl.transform.jsonld.main(config)

    if 'transform_rdf' in options or 'etl_virtuoso' in options:
        etl.transform.rdf.main(config)

    if 'load_elasticsearch' in options or 'etl_es' in options:
        mapping_files = list_entity_files(
            os.path.join(config['conf-dir'], 'elasticsearch'))

        selected_document_types = None
        if 'document_types' in options and options['document_types']:
            selected_document_types = set(options['document_types'].split(','))
        config['load-elasticsearch']['url'] = '{}:{}'.format(
            options['host'], options['port'])
        config['load-elasticsearch']['mappings'] = {
            document_type: file_path
            for document_type, file_path in mapping_files
        }
        config['load-elasticsearch']['index-template'] = options.get(
            'index_template') or default_index_template
        config['load-elasticsearch'][
            'document-types'] = selected_document_types
        etl.load.elasticsearch.main(config)

    if 'load_virtuoso' in options or 'etl_virtuoso' in options:
        etl.load.virtuoso.main(config)
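
The options mapping behaves like a set of parsed CLI flags: each stage is switched on by the presence of its key (membership is tested, not truthiness). A minimal hypothetical run of the full Elasticsearch pipeline might look like this, assuming config was loaded beforehand:

# Hypothetical pre-parsed CLI options; key names mirror the membership checks in launch_etl.
options = {
    'etl_es': True,                      # runs extract, transform-elasticsearch and load-elasticsearch
    'document_types': 'germplasm,study', # optional restriction, comma separated
    'host': 'localhost',
    'port': '9200',
    'index_template': None,              # None falls back to the configured default
}

# launch_etl(options, config)
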