Exemplo n.º 1
0
def main(config):
    """Extract BrAPI data from every configured source, one thread per source.

    EVA is skipped because its data cannot be extracted through BrAPI.

    :param config: global ETL configuration dict; reads the
        'extract-brapi' entities, 'data-dir' and 'sources' keys
    """
    entities = config["extract-brapi"]["entities"]
    # Tag each entity dict with its own name so downstream code can
    # identify an entity without access to the enclosing mapping.
    for (entity_name, entity) in entities.items():
        entity['name'] = entity_name

    json_dir = get_folder_path([config['data-dir'], 'json'], create=True)
    sources = config['sources']

    threads = []
    for source_name in sources:
        if source_name == 'EVA':
            print("# INFO: EVA data can't be extracted, EVA Skipped ..")
            continue
        source_json_dir = get_folder_path([json_dir, source_name], recreate=True)
        # Remove any leftover '-failed' output folder from a previous run.
        source_json_dir_failed = source_json_dir + '-failed'
        if os.path.exists(source_json_dir_failed):
            shutil.rmtree(source_json_dir_failed)

        # Deep copies so each extraction thread mutates its own data only.
        source = deepcopy(sources[source_name])
        entities_copy = deepcopy(entities)

        thread = threading.Thread(target=extract_source,
                                  args=(source, entities_copy, config, source_json_dir))
        thread.daemon = True
        thread.start()
        threads.append(thread)

    # Join with a timeout in a loop (rather than a bare join()) so the main
    # thread stays responsive to signals such as SIGINT.
    for thread in threads:
        # BUGFIX: Thread.isAlive() was removed in Python 3.9; the method
        # has been named is_alive() since Python 2.6.
        while thread.is_alive():
            thread.join(500)
def main(config):
    """Turn extracted JSON documents into JSON-LD for each active institution."""
    print()
    entities = config['jsonld_entities']
    # Derive the DbId / PUI field names from each entity's own name.
    for entity_name, entity in entities.items():
        entity['id'] = entity_name + 'DbId'
        entity['pui'] = entity_name + 'PUI'

    json_dir = get_folder_path([config['data-dir'], 'json'])
    if not os.path.exists(json_dir):
        raise Exception('No json folder found in {}'.format(json_dir))
    jsonld_dir = get_folder_path([config['data-dir'], 'json-ld'], create=True)

    for institution_name, institution in config['institutions'].items():
        if not institution['active']:
            continue
        institution_json_dir = get_folder_path([json_dir, institution_name])
        if not os.path.exists(institution_json_dir):
            continue
        institution_jsonld_dir = get_folder_path(
            [jsonld_dir, institution_name], recreate=True)

        # Base URI for generated identifiers; fall back to the BrAPI URL.
        if 'uri_base' in institution:
            uri_base = institution['uri_base']
        else:
            uri_base = institution['brapi_url']

        # Bind the URI base and entity config once (partial application);
        # transform_folder then only supplies the per-document arguments.
        institution_add_jsonld = functools.partial(add_jsonld, uri_base, entities)

        transform_folder(institution_add_jsonld, institution_json_dir,
                         institution_jsonld_dir)
def transform_source(source, config):
    """Transform one source's extracted JSON and return its URI index.

    Runs the two transform steps (index with URIs, then rewrite DbId links)
    and wraps the resulting indices in a UriIndex.
    """
    # Prepare configs
    ignore_links = set(config['transform-uri']['ignore-links'])
    entities = config['extract-brapi']['entities']
    source_id = source['schema:identifier']

    source_json_dir = get_folder_path([config['data-dir'], 'json', source_id])
    index_dir = get_folder_path(
        [config['data-dir'], 'uri-index', source_id], recreate=True)

    # Step 1: load JSON data into indices (adding URIs as we go)
    id_indices = step1(source, entities, source_json_dir, index_dir)

    # Step 2: replace all DbId links with b64-encoded URIs
    uri_indices = step2(source, entities, ignore_links, source_json_dir,
                        index_dir, id_indices)
    return UriIndex(uri_indices)
Exemplo n.º 4
0
def main(config):
    """Load the bulk JSON output of every configured source."""
    log_dir = config['log-dir']
    bulk_dir = os.path.join(config['data-dir'], 'json-bulk')
    if not os.path.exists(bulk_dir):
        raise Exception('No json bulk folder found in ' + bulk_dir)

    for source_name, source in config['sources'].items():
        source_bulk_dir = get_folder_path([bulk_dir, source_name])
        load_source(source, config, source_bulk_dir, log_dir)
Exemplo n.º 5
0
def load_file_config():
    """Build the base configuration from the JSON files in the config folder.

    Returns a dict pre-populated with the standard directory paths plus the
    merged contents of every '*.json' file found under 'conf-dir'.
    """
    root_dir = os.path.dirname(__file__)
    config = {
        'root-dir': root_dir,
        'conf-dir': os.path.join(root_dir, 'config'),
    }
    config['source-dir'] = os.path.join(config['conf-dir'], 'sources')
    config['log-dir'] = get_folder_path([root_dir, 'log'], create=True)

    # Merge every JSON configuration file found in the config directory.
    for conf_file in os.listdir(config['conf-dir']):
        if conf_file.endswith('.json'):
            config.update(load_config(config['conf-dir'], conf_file))

    return config
Exemplo n.º 6
0
def main():
    """CLI entry point: assemble the configuration and run the requested ETL steps."""
    def handler(*_):
        sys.exit(0)

    # Trap SIGINT so Ctrl-C force-exits the program immediately.
    signal.signal(signal.SIGINT, handler)

    # Initialize defaults
    root_dir = os.path.dirname(__file__)
    config = {
        'root-dir': root_dir,
        'default-data-dir': os.path.join(root_dir, 'data'),
        'conf-dir': os.path.join(root_dir, 'config'),
        'log-dir': get_folder_path([root_dir, 'log'], create=True),
    }

    # Layer file configuration, then CLI arguments, on top of the defaults.
    config = load_file_config(config)
    options = parse_cli_arguments(config)
    config = extend_config(config, options)

    def requested(*flags):
        # True when any of the given CLI actions was selected.
        return any(flag in options for flag in flags)

    # Execute ETL actions based on CLI arguments:
    if requested('extract', 'etl_es', 'etl_virtuoso'):
        etl.extract.brapi.main(config)

    if requested('transform_elasticsearch', 'etl_es'):
        etl.transform.elasticsearch.main(config)

    if requested('transform_jsonld', 'transform_rdf', 'etl_virtuoso'):
        etl.transform.jsonld.main(config)

    if requested('transform_rdf', 'etl_virtuoso'):
        etl.transform.rdf.main(config)

    if requested('load_elasticsearch', 'etl_es'):
        etl.load.elasticsearch.main(config)

    if requested('load_virtuoso', 'etl_virtuoso'):
        etl.load.virtuoso.main(config)
Exemplo n.º 7
0
def main(config):
    """Load generated RDF into Virtuoso, one named graph per active institution."""
    rdf_dir = os.path.join(config['data-dir'], 'rdf')
    if not os.path.exists(rdf_dir):
        raise Exception('No rdf folder found in ' + rdf_dir)

    virtuoso_config = config['virtuoso']

    for institution_name, institution in config['institutions'].items():
        if not institution['active']:
            continue
        institution_rdf_dir = get_folder_path([rdf_dir, institution_name])
        if not os.path.exists(institution_rdf_dir):
            continue

        # Derive the (lowercase) graph URI for this institution from the template.
        graph_uri = replace_template(
            virtuoso_config['graph_uri_template'],
            {'institution': institution_name},
        ).lower()

        # Replace the graph content wholesale: drop the graph, then reload it.
        delete_graph(virtuoso_config, graph_uri)
        load_folder(graph_uri, institution_rdf_dir, virtuoso_config)
Exemplo n.º 8
0
def extend_config(config, arguments):
    """Extend the configuration with values parsed from CLI arguments.

    Records the verbosity flag, resolves the data output directory, and
    indexes each source configuration file by its 'schema:identifier'.

    :param config: configuration dict to mutate and return
    :param arguments: parsed CLI argument mapping; 'sources' holds open
        file objects for the per-source JSON configuration files
    :raises Exception: when a source file lacks the identifier field or two
        source files declare the same identifier
    """
    config['verbose'] = arguments['verbose']
    # BUGFIX: 'default_data_dir' was an undefined name (NameError); use the
    # default stored in the config, as the sibling extend_config does.
    config['data-dir'] = get_folder_path(
        [arguments.get('data_dir') or config['default-data-dir']], create=True)

    # Sources config: index each source file by its unique identifier.
    config['sources'] = dict()
    source_id_field = 'schema:identifier'
    for source_file in (arguments.get('sources') or list()):
        source_config = json.loads(source_file.read())
        if source_id_field not in source_config:
            raise Exception(
                "No field '{}' in data source JSON configuration file '{}'".
                format(source_id_field, source_file.name))
        identifier = source_config[source_id_field]
        if identifier in config['sources']:
            raise Exception(
                "Source id '{}' found twice in source list: {}\n"
                "Please verify the '{}' field in your files.".format(
                    identifier, arguments['sources'], source_id_field))
        config['sources'][identifier] = source_config

    return config
Exemplo n.º 9
0
def extend_config(config, options):
    """
    Extend the configuration with the options provided in CLI arguments

    Mutates and returns ``config``: records the parsed options, resolves the
    data output directory, indexes source configuration files by their
    'schema:identifier', and prepares per-step settings for whichever
    transform/load phases were requested on the command line.
    """
    config['options'] = options

    # Data output dir (a CLI-provided path wins over the configured default)
    config['data-dir'] = get_folder_path(
        [options.get('data_dir') or config['default-data-dir']], create=True)

    # Sources config: one JSON config file per data source, keyed by its
    # unique 'schema:identifier' field.
    config['sources'] = dict()
    source_id_field = 'schema:identifier'
    for source_file in (options.get('sources') or list()):
        source_config = json.loads(source_file.read())
        if source_id_field not in source_config:
            raise Exception(
                "No field '{}' in data source JSON configuration file '{}'".
                format(source_id_field, source_file.name))
        identifier = source_config[source_id_field]
        if identifier in config['sources']:
            raise Exception(
                "Source id '{}' found twice in source list: {}\n"
                "Please verify the '{}' field in your files.".format(
                    identifier, options['sources'], source_id_field))
        config['sources'][identifier] = source_config

    if 'transform_elasticsearch' in options or 'etl_es' in options:
        transform_config = config['transform-elasticsearch']
        # Flatten the document mapping into a list of document configs.
        transform_config['documents'] = list(
            transform_config['documents'].values())

        # Restrict the list of generated documents if requested on the CLI
        input_doc_types = options.get('document_types')
        if input_doc_types:
            transform_config['restricted-documents'] = set(
                remove_empty(input_doc_types.split(',')))

        # Copy base jsonschema definitions into each document jsonschema.
        # NOTE(review): the same 'base_definitions' dict is assigned by
        # reference to every schema, and entries are skipped by value
        # equality with the base definitions (presumably to skip the
        # 'base-definitions' entry itself) -- confirm aliasing is intended.
        validation_schemas = transform_config['validation-schemas']
        base_definitions = validation_schemas['base-definitions']
        for (document_type, document_schema) in validation_schemas.items():
            if document_schema != base_definitions:
                document_schema['definitions'] = base_definitions

    if 'transform_jsonld' in options or 'transform_rdf' in options or 'etl_virtuoso' in options:
        # Replace JSON-LD context path with absolute path
        for (entity_name,
             entity) in config['transform-jsonld']['entities'].items():
            if '@context' in entity:
                entity['@context'] = get_file_path(
                    [config['conf-dir'], entity['@context']])
                # Fail fast here; a missing context file would otherwise
                # only surface later, during the transform itself.
                if not os.path.exists(entity['@context']):
                    raise Exception(
                        'JSON-LD context file "{}" defined in "{}" does not exist'
                        .format(
                            entity['@context'],
                            os.path.join(config['conf-dir'],
                                         'transform-jsonld.json')))

        # Replace JSON-LD model path with an absolute path
        config['transform-jsonld']['model'] = get_file_path(
            [config['conf-dir'], config['transform-jsonld']['model']])

    if 'load_elasticsearch' in options or 'etl_es' in options:
        load_elasticsearch = config['load-elasticsearch']

        # CLI selected list of document types (None means "load everything")
        selected_document_types = None
        if 'document_types' in options and options['document_types']:
            selected_document_types = set(options['document_types'].split(','))
        load_elasticsearch['document-types'] = selected_document_types

        elasticsearch_config = load_elasticsearch['config']

        # CLI values take precedence over the file configuration.
        load_elasticsearch['index-template'] = options.get(
            'index_template') or elasticsearch_config['index-template']

        load_elasticsearch['url'] = '{}:{}'.format(
            options['host'] or elasticsearch_config['host'], options['port']
            or elasticsearch_config['port'])
    return config