def main(config):
    """Extract BrAPI data from every configured source, one thread per source.

    EVA is skipped explicitly (no extractable BrAPI endpoint). Each source
    gets a fresh '<data-dir>/json/<source>' folder; a leftover
    '<source>-failed' folder from a previous run is removed first.
    """
    entities = config["extract-brapi"]["entities"]
    # Tag each entity config with its own name for downstream use
    for (entity_name, entity) in entities.items():
        entity['name'] = entity_name

    json_dir = get_folder_path([config['data-dir'], 'json'], create=True)
    sources = config['sources']
    threads = list()
    for source_name in sources:
        if source_name == 'EVA':
            print("# INFO: EVA data can't be extracted, EVA Skipped ..")
            continue
        source_json_dir = get_folder_path([json_dir, source_name],
                                          recreate=True)

        # Clean up the failure-marker folder from a previous run
        source_json_dir_failed = source_json_dir + '-failed'
        if os.path.exists(source_json_dir_failed):
            shutil.rmtree(source_json_dir_failed)

        # Deep copies so each worker thread mutates isolated objects
        source = deepcopy(sources[source_name])
        entities_copy = deepcopy(entities)

        thread = threading.Thread(
            target=extract_source,
            args=(source, entities_copy, config, source_json_dir))
        thread.daemon = True
        thread.start()
        threads.append(thread)

    for thread in threads:
        # BUGFIX: Thread.isAlive() was removed in Python 3.9; the correct
        # name is is_alive(). Join with a timeout so the process stays
        # responsive to signals while daemon threads run.
        while thread.is_alive():
            thread.join(500)
def main(config):
    """Transform each active institution's extracted JSON into JSON-LD.

    Reads '<data-dir>/json/<institution>' and writes the result to a
    recreated '<data-dir>/json-ld/<institution>' folder.
    """
    print()
    entities = config['jsonld_entities']
    # Derive the id/pui field names for every entity from its name
    for entity_name in entities:
        entities[entity_name]['id'] = entity_name + 'DbId'
        entities[entity_name]['pui'] = entity_name + 'PUI'

    json_dir = get_folder_path([config['data-dir'], 'json'])
    if not os.path.exists(json_dir):
        raise Exception('No json folder found in {}'.format(json_dir))
    jsonld_dir = get_folder_path([config['data-dir'], 'json-ld'], create=True)

    for institution_name, institution in config['institutions'].items():
        # Skip disabled institutions and those without extracted data
        if not institution['active']:
            continue
        institution_json_dir = get_folder_path([json_dir, institution_name])
        if not os.path.exists(institution_json_dir):
            continue
        institution_jsonld_dir = get_folder_path(
            [jsonld_dir, institution_name], recreate=True)

        # URIs are minted from 'uri_base' when present, else the BrAPI URL
        uri_base = (institution['uri_base'] if 'uri_base' in institution
                    else institution['brapi_url'])
        # Partial function application
        institution_add_jsonld = functools.partial(
            add_jsonld, uri_base, entities)

        transform_folder(institution_add_jsonld, institution_json_dir,
                         institution_jsonld_dir)
def transform_source(source, config):
    """Transform one source's JSON data into a URI index.

    Step 1 loads the JSON into id indices (adding URIs); step 2 rewrites
    every DbId link as a base64-encoded URI. Returns a UriIndex built
    from the step-2 result.
    """
    # Prepare configs
    ignore_links = set(config['transform-uri']['ignore-links'])
    entities = config['extract-brapi']['entities']
    source_id = source['schema:identifier']
    source_json_dir = get_folder_path(
        [config['data-dir'], 'json', source_id])
    index_dir = get_folder_path(
        [config['data-dir'], 'uri-index', source_id], recreate=True)

    # Step 1: Load JSON data into indices (& add URI)
    id_indices = step1(source, entities, source_json_dir, index_dir)

    # Step 2: Replace all DbIds links with b64 encoded URIs
    uri_indices = step2(source, entities, ignore_links, source_json_dir,
                        index_dir, id_indices)

    return UriIndex(uri_indices)
def main(config):
    """Load every source's bulk JSON folder into the target store.

    Raises Exception when '<data-dir>/json-bulk' does not exist.
    """
    log_dir = config['log-dir']
    bulk_dir = os.path.join(config['data-dir'], 'json-bulk')
    if not os.path.exists(bulk_dir):
        raise Exception('No json bulk folder found in ' + bulk_dir)

    for source_name, source in config['sources'].items():
        source_bulk_dir = get_folder_path([bulk_dir, source_name])
        load_source(source, config, source_bulk_dir, log_dir)
def load_file_config():
    """Build the base configuration: standard directories plus every JSON
    config file found directly in the 'config' directory, merged in
    os.listdir order."""
    config = dict()
    config['root-dir'] = os.path.dirname(__file__)
    config['conf-dir'] = os.path.join(config['root-dir'], 'config')
    config['source-dir'] = os.path.join(config['conf-dir'], 'sources')
    config['log-dir'] = get_folder_path([config['root-dir'], 'log'],
                                        create=True)

    # Other configs: merge each '*.json' file from the conf directory
    for file_name in os.listdir(config['conf-dir']):
        if file_name.endswith('.json'):
            config.update(load_config(config['conf-dir'], file_name))
    return config
def main():
    """CLI entry point: build config, parse arguments, then run the
    requested ETL stages in fixed order (extract, transform, load)."""
    def handler(*_):
        sys.exit(0)

    # Trap SIGINT to force exit the program
    signal.signal(signal.SIGINT, handler)

    # Initialize defaults
    config = dict()
    config['root-dir'] = os.path.dirname(__file__)
    config['default-data-dir'] = os.path.join(config['root-dir'], 'data')
    config['conf-dir'] = os.path.join(config['root-dir'], 'config')
    config['log-dir'] = get_folder_path([config['root-dir'], 'log'],
                                        create=True)

    # Load file configs
    config = load_file_config(config)

    # Parse command line arguments
    options = parse_cli_arguments(config)

    # Extend config with CLI arguments
    config = extend_config(config, options)

    def requested(*keys):
        # True when any of the given CLI actions was selected
        return any(key in options for key in keys)

    # Execute ETL actions based on CLI arguments:
    if requested('extract', 'etl_es', 'etl_virtuoso'):
        etl.extract.brapi.main(config)
    if requested('transform_elasticsearch', 'etl_es'):
        etl.transform.elasticsearch.main(config)
    if requested('transform_jsonld', 'transform_rdf', 'etl_virtuoso'):
        etl.transform.jsonld.main(config)
    if requested('transform_rdf', 'etl_virtuoso'):
        etl.transform.rdf.main(config)
    if requested('load_elasticsearch', 'etl_es'):
        etl.load.elasticsearch.main(config)
    if requested('load_virtuoso', 'etl_virtuoso'):
        etl.load.virtuoso.main(config)
def main(config):
    """Load each active institution's RDF folder into its Virtuoso graph.

    The graph URI comes from 'graph_uri_template' (lower-cased); the
    existing graph is deleted before the folder is loaded.
    """
    rdf_dir = os.path.join(config['data-dir'], 'rdf')
    if not os.path.exists(rdf_dir):
        raise Exception('No rdf folder found in ' + rdf_dir)
    virtuoso_config = config['virtuoso']

    for institution_name, institution in config['institutions'].items():
        # Skip disabled institutions and those without RDF output
        if not institution['active']:
            continue
        institution_rdf_dir = get_folder_path([rdf_dir, institution_name])
        if not os.path.exists(institution_rdf_dir):
            continue

        graph_uri = replace_template(
            virtuoso_config['graph_uri_template'],
            {'institution': institution_name}).lower()

        # Replace the graph wholesale: drop it, then reload from disk
        delete_graph(virtuoso_config, graph_uri)
        load_folder(graph_uri, institution_rdf_dir, virtuoso_config)
def extend_config(config, arguments):
    """Extend the configuration with CLI arguments: verbosity, the data
    output directory, and the per-source JSON configurations.

    Raises Exception when a source configuration file has no
    'schema:identifier' field, or when two files declare the same
    identifier.
    """
    config['verbose'] = arguments['verbose']
    # BUGFIX: 'default_data_dir' was an undefined name here (NameError as
    # soon as no --data-dir argument is given). The sibling extend_config
    # variant reads the default from config['default-data-dir'] — do the
    # same here. TODO(review): confirm no module-level 'default_data_dir'
    # global exists outside this view.
    config['data-dir'] = get_folder_path(
        [arguments.get('data_dir') or config['default-data-dir']],
        create=True)

    # Sources config
    config['sources'] = dict()
    source_id_field = 'schema:identifier'
    for source_file in (arguments.get('sources') or list()):
        source_config = json.loads(source_file.read())
        if source_id_field not in source_config:
            raise Exception(
                "No field '{}' in data source JSON configuration file '{}'".
                format(source_id_field, source_file.name))
        identifier = source_config[source_id_field]
        if identifier in config['sources']:
            raise Exception(
                "Source id '{}' found twice in source list: {}\n"
                "Please verify the '{}' field in your files.".format(
                    identifier, arguments['sources'], source_id_field))
        config['sources'][identifier] = source_config
    return config
def extend_config(config, options):
    """ Extend the configuration with the options provided in CLI arguments """
    config['options'] = options

    # Data output dir
    config['data-dir'] = get_folder_path(
        [options.get('data_dir') or config['default-data-dir']], create=True)

    # Sources config
    config['sources'] = dict()
    source_id_field = 'schema:identifier'
    for source_file in (options.get('sources') or list()):
        source_config = json.loads(source_file.read())
        if source_id_field not in source_config:
            raise Exception(
                "No field '{}' in data source JSON configuration file '{}'".
                format(source_id_field, source_file.name))
        identifier = source_config[source_id_field]
        if identifier in config['sources']:
            raise Exception(
                "Source id '{}' found twice in source list: {}\n"
                "Please verify the '{}' field in your files.".format(
                    identifier, options['sources'], source_id_field))
        config['sources'][identifier] = source_config

    if 'transform_elasticsearch' in options or 'etl_es' in options:
        transform_config = config['transform-elasticsearch']
        transform_config['documents'] = list(
            transform_config['documents'].values())

        # Restrict the list of generated documents if requested on the CLI
        input_doc_types = options.get('document_types')
        if input_doc_types:
            transform_config['restricted-documents'] = set(
                remove_empty(input_doc_types.split(',')))

        # Copy base jsonschema definitions into each document jsonschema
        validation_schemas = transform_config['validation-schemas']
        base_definitions = validation_schemas['base-definitions']
        for document_schema in validation_schemas.values():
            if document_schema != base_definitions:
                document_schema['definitions'] = base_definitions

    if ('transform_jsonld' in options or 'transform_rdf' in options
            or 'etl_virtuoso' in options):
        # Replace JSON-LD context path with absolute path
        for entity in config['transform-jsonld']['entities'].values():
            if '@context' in entity:
                entity['@context'] = get_file_path(
                    [config['conf-dir'], entity['@context']])
                if not os.path.exists(entity['@context']):
                    raise Exception(
                        'JSON-LD context file "{}" defined in "{}" does not exist'
                        .format(
                            entity['@context'],
                            os.path.join(config['conf-dir'],
                                         'transform-jsonld.json')))

        # Replace JSON-LD model path with an absolute path
        config['transform-jsonld']['model'] = get_file_path(
            [config['conf-dir'], config['transform-jsonld']['model']])

    if 'load_elasticsearch' in options or 'etl_es' in options:
        load_elasticsearch = config['load-elasticsearch']

        # CLI selected list of document types
        selected_document_types = None
        if 'document_types' in options and options['document_types']:
            selected_document_types = set(options['document_types'].split(','))
        load_elasticsearch['document-types'] = selected_document_types

        elasticsearch_config = load_elasticsearch['config']
        load_elasticsearch['index-template'] = options.get(
            'index_template') or elasticsearch_config['index-template']
        load_elasticsearch['url'] = '{}:{}'.format(
            options['host'] or elasticsearch_config['host'],
            options['port'] or elasticsearch_config['port'])

    return config