def get_jsonld_contexts(base_dir, config):
    jsonld_contexts = dict()
    for entity_name in config['entities']:
        if '@context' in config['entities'][entity_name]:
            jsonld_contexts[entity_name] = get_file_path(
                [base_dir, config['entities'][entity_name]['@context']])
    return jsonld_contexts

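# Illustrative usage sketch for get_jsonld_contexts (not part of the original module):
# the entity names and the relative '@context' path below are hypothetical examples of
# the transform-jsonld configuration layout.
def _example_get_jsonld_contexts():
    example_config = {
        'entities': {
            'germplasm': {'@context': 'jsonld/germplasm-context.jsonld'},  # hypothetical path
            'study': {},  # no '@context' key, so this entity is skipped
        }
    }
    # Returns something like {'germplasm': '<conf dir>/jsonld/germplasm-context.jsonld'}
    return get_jsonld_contexts('/path/to/conf', example_config)
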
def list_entity_files(json_dir):
    for file_name in os.listdir(json_dir):
        matches = re.search(r'^([a-zA-Z]+).*\.json$', file_name)
        if not matches:
            continue
        entity_name = matches.groups()[0]
        json_path = get_file_path([json_dir, file_name])
        yield entity_name, json_path

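# Illustrative consumer for list_entity_files (not part of the original module): file
# names such as "germplasm-1.json" are hypothetical examples of the extraction output.
def _example_list_entity_files(json_dir):
    # Each yielded pair maps the leading alphabetic prefix of a "*.json" file name
    # (e.g. 'germplasm' for "germplasm-1.json") to its resolved file path.
    for entity_name, json_path in list_entity_files(json_dir):
        print(entity_name, json_path)
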
def _new_file(self):
    json_path = None
    while not json_path or os.path.exists(json_path):
        self.file_index += 1
        json_path = get_file_path(
            [self.output_dir, self.base_json_name],
            ext="-" + str(self.file_index) + ".json")
        if self.file_index > 1000000:
            raise Exception('Max file index exceeded')
    return open(json_path, 'a')

def load_folder(graph_uri, endpoint_rdf_dir, virtuoso_config):
    print('Loading RDF from "{}" \n\tinto Virtuoso graph <{}> on "{}"'.format(
        endpoint_rdf_dir, graph_uri, virtuoso_config['url']))
    for file_name in os.listdir(endpoint_rdf_dir):
        rdf_path = get_file_path([endpoint_rdf_dir, file_name])
        if not os.path.exists(rdf_path):
            continue
        load_rdf(virtuoso_config, rdf_path, graph_uri)

def extract_source(source, entities, config, output_dir):
    """
    Full JSON BrAPI source extraction process
    """
    source_name = source['schema:identifier']
    action = 'extract-' + source_name
    log_file = get_file_path([config['log-dir'], action], ext='.log', recreate=True)
    logger = create_logger(action, log_file, config['options']['verbose'])
    pool = ThreadPool(10)

    logger.info("Extracting BrAPI {}...".format(source_name))
    try:
        # Initialize JSON merge stores
        for (entity_name, entity) in entities.items():
            entity['store'] = MergeStore(source['schema:identifier'], entity['name'])

        # Fetch server implemented calls
        if 'implemented-calls' not in source:
            source['implemented-calls'] = get_implemented_calls(source, logger)

        # Fetch entities lists
        fetch_all_list(source, logger, entities, pool)

        # Detail entities
        fetch_all_details(source, logger, entities, pool)

        # Link entities (internal links, internal object links and external object links)
        fetch_all_links(source, logger, entities)

        # Detail entities (for objects that might have been discovered by links)
        fetch_all_details(source, logger, entities, pool)

        remove_internal_objects(entities)

        logger.info("SUCCEEDED Extracting BrAPI {}.".format(source_name))
    except:
        logger.debug(traceback.format_exc())
        shutil.rmtree(output_dir)
        output_dir = output_dir + '-failed'
        logger.info(
            "FAILED Extracting BrAPI {}.\n"
            "=> Check the logs ({}) and data ({}) for more details.".format(
                source_name, log_file, output_dir))
    pool.close()

    # Save to file
    logger.info("Saving BrAPI {} to '{}'...".format(source_name, output_dir))
    for (entity_name, entity) in entities.items():
        entity['store'].save(output_dir)
        entity['store'].clear()

def save(self, output_dir):
    if len(self) <= 0:
        return
    json_path = get_file_path([output_dir, self.entity_name], ext='.json', create=True)
    with open(json_path, 'w') as json_file:
        for data in self.values():
            if 'etl:detailed' in data:
                del data['etl:detailed']
            CustomJSONEncoder.dump(data, json_file)
            json_file.write('\n')

def index_by(index_dir: str, index_extension: str, data_iter: iter, key_fn: Callable,
             value_fn: Callable, checkpoint: int, object_name: str):
    """
    Generate UnQLite data indices for each entity

    :param index_dir: index directory
    :param index_extension: index file extension
    :param data_iter: iterable on data
    :param key_fn: function to use on data to get the index key
    :param value_fn: function to use on data to get the index value
    :param checkpoint: commit indices every `checkpoint` items
    :param object_name: object name used in progress messages
    :return: dict of index paths by entity name
    """
    i = 0
    index_path_by_entity = {}
    index_by_entity = {}
    for data in data_iter:
        entity = data['@type']
        if entity not in index_path_by_entity:
            index_path = get_file_path([index_dir, entity], ext=index_extension)
            index_path_by_entity[entity] = index_path
            index = UnQLite(index_path_by_entity[entity])
            index.begin()
            index_by_entity[entity] = index
        index = index_by_entity[entity]

        # Index
        index[str(key_fn(data))] = value_fn(data)

        i += 1
        # Log
        if i % 50000 == 0:
            print(f'checkpoint: {i} {object_name}')

        # Checkpoint
        if i % checkpoint == 0:
            # Flush indices
            for index in index_by_entity.values():
                index.commit()
                index.begin()
            print(f'checkpoint: {i} {object_name}')

    # Close indices
    for index in index_by_entity.values():
        index.commit()
        index.close()

    # Output all indices
    return index_path_by_entity

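# Minimal usage sketch for index_by (illustrative only): the in-memory records, the
# '.unqlite' extension and the key/value extractors below are hypothetical; real callers
# stream transformed JSON documents and pass their own extractors.
def _example_index_by(index_dir):
    import json

    data = [
        {'@type': 'germplasm', '@id': 'urn:example/g1', 'name': 'G1'},
        {'@type': 'study', '@id': 'urn:example/s1', 'name': 'S1'},
    ]
    return index_by(
        index_dir=index_dir,
        index_extension='.unqlite',
        data_iter=iter(data),
        key_fn=lambda d: d['@id'],         # index key: the record URI
        value_fn=lambda d: json.dumps(d),  # index value: the serialized record
        checkpoint=10000,
        object_name='example records')
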
def transform_folder(institution_add_jsonld, json_dir, jsonld_dir):
    print('Transforming JSON from "{}" \n\tto JSON-LD in "{}"'.format(
        json_dir, jsonld_dir))

    # List of options
    options = list()
    for file_name in os.listdir(json_dir):
        matches = re.search(r'(\D+)(\d+).json', file_name)
        if matches:
            (entity_name, index) = matches.groups()
            src_path = get_file_path([json_dir, entity_name], ext=str(index) + '.json')
            dest_path = get_file_path([jsonld_dir, entity_name], ext=str(index) + '.jsonld')

            # Partial function application
            entity_add_jsonld = functools.partial(institution_add_jsonld, entity_name)
            options.append((entity_add_jsonld, src_path, dest_path))

    # Run transform_to_jsonld on a thread pool
    pool_worker(transform_to_jsonld, options)

def read_json_lines(json_dir: str, out_queue: Queue):
    """
    Read JSON in source dir for each entity and output into queue
    """
    # List JSON files for each entity
    try:
        file_names = filter(lambda f: f.endswith(".json"), os.listdir(json_dir))
    except FileNotFoundError:
        raise FileNotFoundError(
            f"No such file or directory: '{json_dir}'.\n"
            'Please make sure you have run the BrAPI extraction before trying to launch the transformation process.'
        )

    file_readers = {}
    for file_name in file_names:
        # Use the file base name as the entity name
        entity = os.path.splitext(os.path.basename(file_name))[0]
        file_readers[entity] = open(get_file_path([json_dir, file_name]), 'r')

        # Read lines:
        # with open(get_file_path([json_dir, file_name]), 'r') as file:
        #     for line in file:
        #         out_queue.put((entity, line))

    # Alternatively, read lines from each file in turn (uniformize the data flow)
    while file_readers:
        for entity, file in list(file_readers.items()):
            line = file.readline()
            if not line:
                file.close()
                del file_readers[entity]
            else:
                out_queue.put((entity, line))

    # Signal no more data
    out_queue.put(None)

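# Illustrative consumer for read_json_lines (not part of the original module): it runs
# the reader in a thread and parses each queued line. queue.Queue and threading come
# from the standard library; the original module may use a different Queue class, and
# the maxsize value is an arbitrary example.
def _example_consume_json_lines(json_dir):
    import json
    import threading
    from queue import Queue

    out_queue = Queue(maxsize=1000)
    reader = threading.Thread(target=read_json_lines, args=(json_dir, out_queue))
    reader.start()
    while True:
        item = out_queue.get()
        if item is None:  # read_json_lines signals the end of data with None
            break
        entity, line = item
        data = json.loads(line)  # one JSON object per line
        print(entity, data.get('@id'))
    reader.join()
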
def load_source(source, config, source_bulk_dir, log_dir):
    """
    Full Elasticsearch documents indexing
    """
    source_name = source['schema:identifier']
    action = 'load-elasticsearch-' + source_name
    log_file = get_file_path([log_dir, action], ext='.log', recreate=True)
    logger = create_logger(source_name, log_file, config['verbose'])

    load_config = config['load-elasticsearch']
    es_client = init_es_client(load_config['url'], logger)

    logger.info("Loading '{}' into elasticsearch '{}'...".format(
        source_bulk_dir, load_config['url']))
    try:
        if not os.path.exists(source_bulk_dir):
            raise FileNotFoundError(
                'No such file or directory: \'{}\'.\n'
                'Please make sure you have run the BrAPI extraction and Elasticsearch document transformation'
                ' before trying to launch the loading process.'
                .format(source_bulk_dir))

        bulk_files = list(list_entity_files(source_bulk_dir))
        all_document_types = set(map(first, bulk_files))
        document_types = load_config.get('document-types') or all_document_types
        document_types = document_types.intersection(all_document_types)

        index_by_document = dict()

        logger.info("Preparing index with template mapping...")
        timestamp = int(time.time())
        for document_type in document_types:
            base_index_name = replace_template(
                load_config['index-template'],
                {'source': source['schema:identifier'], 'documentType': document_type}
            ).lower()
            create_template(es_client, load_config, document_type, base_index_name, logger)

            index_name = base_index_name + '-d' + str(timestamp)
            create_index(es_client, index_name, logger)
            index_by_document[document_type] = base_index_name, index_name

        logger.info("Bulk indexing...")
        for document_type, file_path in bulk_files:
            if document_type in index_by_document:
                base_index_name, index_name = index_by_document[document_type]
                bulk_index(es_client, index_name, file_path, logger)

        logger.info("Creating index aliases and deleting old indices...")
        for document_type, (base_index_name, index_name) in index_by_document.items():
            create_alias(es_client, index_name, base_index_name, logger)
            new_index, *old_indices = get_indices(es_client, base_index_name)
            for old_index in old_indices[1:]:
                delete_index(es_client, old_index, logger)

        logger.info("SUCCEEDED Loading {}.".format(source_name))
    except Exception as e:
        logger.debug(traceback.format_exc())
        logger.debug(getattr(e, 'long_message', ''))
        logger.info("FAILED Loading {} Elasticsearch documents.\n"
                    "=> Check the logs ({}) for more details."
                    .format(source_name, log_file))

def launch_etl(options, config):
    def handler(*_):
        sys.exit(0)
    signal.signal(signal.SIGINT, handler)

    default_index_template = config['load-elasticsearch']['index-template']

    # Execute ETL actions based on CLI arguments:
    if 'extract' in options or 'etl_es' in options or 'etl_virtuoso' in options:
        etl.extract.brapi.main(config)

    if 'transform_elasticsearch' in options or 'etl_es' in options:
        transform_config = config['transform-elasticsearch']

        # Restrict the list of generated documents if requested
        input_doc_types = options.get('document_types')
        if input_doc_types:
            transform_config['restricted-documents'] = set(
                remove_empty(input_doc_types.split(',')))

        # Copy base jsonschema definitions into each document jsonschema
        validation_config = transform_config['validation']
        base_definitions = validation_config['base-definitions']
        for (document_type, document_schema) in validation_config['documents'].items():
            document_schema['definitions'] = base_definitions

        # Run transform
        etl.transform.elasticsearch.main(config)

    if 'transform_jsonld' in options or 'transform_rdf' in options or 'etl_virtuoso' in options:
        # Replace JSON-LD context path with absolute path
        for (entity_name, entity) in config['transform-jsonld']['entities'].items():
            if '@context' in entity:
                entity['@context'] = get_file_path(
                    [config['conf-dir'], entity['@context']])
                if not os.path.exists(entity['@context']):
                    raise Exception(
                        'JSON-LD context file "{}" defined in "{}" does not exist'
                        .format(entity['@context'],
                                os.path.join(config['conf-dir'], 'transform-jsonld.json')))

        # Replace JSON-LD model path with an absolute path
        config['transform-jsonld']['model'] = get_file_path(
            [config['conf-dir'], config['transform-jsonld']['model']])

        etl.transform.jsonld.main(config)

    if 'transform_rdf' in options or 'etl_virtuoso' in options:
        etl.transform.rdf.main(config)

    if 'load_elasticsearch' in options or 'etl_es' in options:
        mapping_files = list_entity_files(
            os.path.join(config['conf-dir'], 'elasticsearch'))

        selected_document_types = None
        if 'document_types' in options and options['document_types']:
            selected_document_types = set(options['document_types'].split(','))

        config['load-elasticsearch']['url'] = '{}:{}'.format(
            options['host'], options['port'])
        config['load-elasticsearch']['mappings'] = {
            document_type: file_path
            for document_type, file_path in mapping_files
        }
        config['load-elasticsearch']['index-template'] = options.get(
            'index_template') or default_index_template
        config['load-elasticsearch']['document-types'] = selected_document_types
        etl.load.elasticsearch.main(config)

    if 'load_virtuoso' in options or 'etl_virtuoso' in options:
        etl.load.virtuoso.main(config)

def extend_config(config, options):
    """
    Extend the configuration with the options provided in CLI arguments
    """
    config['options'] = options

    # Data output dir
    config['data-dir'] = get_folder_path(
        [options.get('data_dir') or config['default-data-dir']], create=True)

    # Sources config
    config['sources'] = dict()
    source_id_field = 'schema:identifier'
    for source_file in (options.get('sources') or list()):
        source_config = json.loads(source_file.read())
        if source_id_field not in source_config:
            raise Exception(
                "No field '{}' in data source JSON configuration file '{}'".format(
                    source_id_field, source_file.name))
        identifier = source_config[source_id_field]
        if identifier in config['sources']:
            raise Exception(
                "Source id '{}' found twice in source list: {}\n"
                "Please verify the '{}' field in your files.".format(
                    identifier, options['sources'], source_id_field))
        config['sources'][identifier] = source_config

    if 'transform_elasticsearch' in options or 'etl_es' in options:
        transform_config = config['transform-elasticsearch']
        transform_config['documents'] = list(transform_config['documents'].values())

        # Restrict the list of generated documents if requested
        input_doc_types = options.get('document_types')
        if input_doc_types:
            transform_config['restricted-documents'] = set(
                remove_empty(input_doc_types.split(',')))

        # Copy base jsonschema definitions into each document jsonschema
        validation_schemas = transform_config['validation-schemas']
        base_definitions = validation_schemas['base-definitions']
        for (document_type, document_schema) in validation_schemas.items():
            if document_schema != base_definitions:
                document_schema['definitions'] = base_definitions

    if 'transform_jsonld' in options or 'transform_rdf' in options or 'etl_virtuoso' in options:
        # Replace JSON-LD context path with absolute path
        for (entity_name, entity) in config['transform-jsonld']['entities'].items():
            if '@context' in entity:
                entity['@context'] = get_file_path(
                    [config['conf-dir'], entity['@context']])
                if not os.path.exists(entity['@context']):
                    raise Exception(
                        'JSON-LD context file "{}" defined in "{}" does not exist'
                        .format(entity['@context'],
                                os.path.join(config['conf-dir'], 'transform-jsonld.json')))

        # Replace JSON-LD model path with an absolute path
        config['transform-jsonld']['model'] = get_file_path(
            [config['conf-dir'], config['transform-jsonld']['model']])

    if 'load_elasticsearch' in options or 'etl_es' in options:
        load_elasticsearch = config['load-elasticsearch']

        # CLI selected list of document types
        selected_document_types = None
        if 'document_types' in options and options['document_types']:
            selected_document_types = set(options['document_types'].split(','))
        load_elasticsearch['document-types'] = selected_document_types

        elasticsearch_config = load_elasticsearch['config']
        load_elasticsearch['index-template'] = options.get(
            'index_template') or elasticsearch_config['index-template']
        load_elasticsearch['url'] = '{}:{}'.format(
            options['host'] or elasticsearch_config['host'],
            options['port'] or elasticsearch_config['port'])

    return config

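# Illustrative sketch (not part of the original module) of the CLI 'options' dict that
# extend_config expects. The keys mirror the lookups above; the values are made-up, and
# io.StringIO stands in for the open source-description files the real CLI provides.
# Keys such as 'document_types', 'host', 'port' or 'index_template' are only consulted
# when the corresponding ETL action is selected.
def _example_extend_config(config):
    import io

    options = {
        'data_dir': 'data',  # hypothetical output folder
        'sources': [io.StringIO('{"schema:identifier": "EXAMPLE"}')],
    }
    return extend_config(config, options)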