def generate_global_id_links(source, uri_map, data):
    """
    Complete `data` with the missing side of each entity link.

    First pass: for every 'DbId' link reported by `get_entity_links`, a
    '<entity>PUI' (or '<entity>PUIs') field is added unless that field is
    already present in `data`. URIs are obtained through
    `get_dict_or_generate` using `uri_map` and the key (entity name, id),
    falling back on `get_uri_by_id` for `source`.

    Second pass: for every 'PUI' link, the matching '<entity>DbId(s)' field
    is (over)written with the `uri_encode`d value(s) of the URI(s).

    Plural fields receive a `set` of values; singular fields receive the
    value returned by `first` on that set. `data` is modified in place and
    returned.
    """

    def uri_from_key(key):
        # Key layout is the one built in `lookup_uri` below
        name, identifier = key
        return get_uri_by_id(source, name, identifier)

    def lookup_uri(entity_name, object_id):
        return get_dict_or_generate(uri_map, (entity_name, object_id), uri_from_key)

    # DbId -> PUI
    for entity, _id_field, plural, ids in get_entity_links(data, 'DbId'):
        uri_field = entity + 'PUI' + plural
        if uri_field not in data:
            uris = set(remove_none(
                map(partial(lookup_uri, entity), as_list(ids))))
            if uris:
                data[uri_field] = uris if plural else first(uris)

    # PUI -> encoded DbId (existing DbId fields are replaced)
    for entity, _uri_field, plural, uris in get_entity_links(data, 'PUI'):
        encoded_ids = {uri_encode(uri) for uri in as_list(uris)}
        if encoded_ids:
            id_field = entity + 'DbId' + plural
            data[id_field] = encoded_ids if plural else first(encoded_ids)

    return data
def test_no_ids(self):
    """
    An empty object yields an empty list of DbId links
    """
    self.assertEqual([], get_entity_links({}, 'DbId'))
def test_nested_URIs(self):
    """
    URI links of `self.data` are listed as (entity, path, value) tuples,
    including those found inside nested lists
    """
    uri_links = get_entity_links(self.data, 'URI')
    self.assertEqual(
        [
            ('study', ['studyURI'], 'urn:S1'),
            ('location', ['locationURI'], 'urn:L1'),
            ('trial', ['trials', 0, 'trialURI'], 'urn:T1'),
            ('trial', ['trials', 1, 'trialURI'], 'urn:T2'),
            ('germplasm', ['germplasmURIs'], ['urn:G1', 'urn:2']),
        ],
        uri_links,
    )
def test_database_id(self):
    """
    DbId links of a flat object are listed as
    [entity, field, plural suffix, value]
    """
    brapi_object = {
        "studyDbId": "S1",
        "locationDbId": 1,
        "germplasmDbIds": ["G1", 2],
    }
    db_id_links = get_entity_links(brapi_object, 'DbId')
    self.assertEqual(
        [
            ["study", "studyDbId", "", "S1"],
            ['location', 'locationDbId', '', 1],
            ['germplasm', 'germplasmDbIds', 's', ['G1', 2]],
        ],
        db_id_links,
    )
def test_nested_DbIds(self):
    """
    DbId links of `self.data` are listed as (entity, path, value) tuples,
    including those found inside nested lists
    """
    db_id_links = get_entity_links(self.data, 'DbId')
    self.assertEqual(
        [
            ('study', ['studyDbId'], 'S1'),
            ('location', ['locationDbId'], 1),
            ('trial', ['trials', 0, 'trialDbId'], 'T1'),
            ('trial', ['trials', 1, 'trialDbId'], 'T2'),
            ('contact', ['trials', 1, 'contacts', 0, 'contactDbId'], 'C1'),
            ('germplasm', ['germplasmDbIds'], ['G1', 2]),
        ],
        db_id_links,
    )
def get_required_entities(document_configs, source_json_dir):
    """
    Returns set of required entities for all documents in configuration

    The result is the union of:
      * every 'source-entity' declared in `document_configs`;
      * every entity named by a '<entity>DbId' / '<entity>DbIds' field at
        the end of an object path inside the parsed 'document-transform'
        templates (dicts carrying a '{lark}' key);
      * when `source_json_dir` is truthy, every entity linked by 'DbId' or
        'PUI' in the first line of each source entity JSON file.

    :param document_configs: re-iterable collection of dicts; the optional
        keys 'source-entity' and 'document-transform' are read
    :param source_json_dir: directory given to `list_entity_files`, or a
        falsy value to skip the inspection of JSON files
    :return: set of entity names
    """
    # Raw string: "\w" in a regular string literal is an invalid escape
    # sequence (warning today, error in a future Python version).
    db_id_field = re.compile(r"^(\w+)DbId(s?)$")

    source_entities = set(
        remove_none(map(lambda d: d.get('source-entity'), document_configs)))

    def collect_entities(parsed_template):
        """Collect entity names referenced by DbId fields in a template."""
        if is_list_like(parsed_template):
            return set(flatten_it(map(collect_entities, parsed_template)))

        if isinstance(parsed_template, dict):
            if '{lark}' in parsed_template:
                entities = set()
                object_paths = as_list(
                    resolve_path(parsed_template, ['start', 'object_path']))
                for object_path in object_paths:
                    # NOTE(review): assumes resolve_path returns a non-empty
                    # sequence of field names here - confirm
                    fields = resolve_path(object_path, ['field_path', 'FIELD'])
                    match = db_id_field.search(fields[-1])
                    if match:
                        entities.add(match.group(1))
                return entities
            return set(
                flatten_it(map(collect_entities, parsed_template.values())))

        return set()

    document_transforms = remove_none(
        map(lambda d: d.get('document-transform'), document_configs))
    required_entities = source_entities.union(
        flatten_it(map(collect_entities, document_transforms)))

    if source_json_dir:
        for entity_name, file_path in list_entity_files(source_json_dir):
            if entity_name not in source_entities:
                continue
            # Only the first JSON line of each file is inspected
            with open(file_path, 'r') as file:
                line = file.readline()
            if line:
                data = json.loads(line)
                links = get_entity_links(data, 'DbId', 'PUI')
                required_entities.update(map(first, links))

    return required_entities
def test_pui(self):
    """
    Asking for both 'DbId' and 'PUI' links lists all DbId links first,
    followed by all PUI links
    """
    brapi_object = {
        "studyPUI": "urn:S1",
        "studyDbId": "S1",
        "locationPUI": "urn:1",
        "locationDbId": 1,
        "germplasmPUIs": ["urn:G1", "urn:2"],
        "germplasmDbIds": ["G1", 2],
    }
    db_id_links = [
        ["study", "studyDbId", "", "S1"],
        ['location', 'locationDbId', '', 1],
        ['germplasm', 'germplasmDbIds', 's', ['G1', 2]],
    ]
    pui_links = [
        ["study", "studyPUI", "", "urn:S1"],
        ['location', 'locationPUI', '', "urn:1"],
        ['germplasm', 'germplasmPUIs', 's', ['urn:G1', "urn:2"]],
    ]
    self.assertEqual(
        db_id_links + pui_links,
        get_entity_links(brapi_object, 'DbId', 'PUI'),
    )
def transform_uri_link(source: dict, entities: dict, ignore_links, id_index_files: dict,
                       entity_line: Tuple[str, str]) -> dict:
    """
    Transform BrAPI data by adding URI links translated from DbId links and replacing DbIds with encoded URIs.
    Also checks entity links to make sure every referenced entity exists.

    :param source: data source description; its '@id' is stored in 'schema:includedInDataCatalog'
    :param entities: entity configuration; `entities[<entity>]['links']` is searched for an 'entity-alias'
        when the linked entity has no index file of its own
    :param ignore_links: collection of entity names whose links are left untouched
    :param id_index_files: mapping from entity name to the UnQLite index file used to resolve an id into a URI
    :param entity_line: tuple (entity name, JSON encoded object)
    :return: the transformed object
    :raises MissingDataLink: when no index file exists for a linked entity (or its alias) or when a linked
        id is absent from the index
    """
    entity, line = entity_line
    data = remove_empty(json.loads(line))

    data_id = get_identifier(entity, data)
    data[f"{entity}DbId"] = str(data_id)

    data_uri = get_generate_uri(source, entity, data)
    data[f"{entity}URI"] = data_uri

    # Add basic JSON-LD fields (store URI as @id)
    data['@type'] = entity
    data['@id'] = data_uri

    # Add basic schema.org fields
    data['schema:includedInDataCatalog'] = source['@id']
    data['schema:identifier'] = data_id
    data['schema:name'] = data.get('schema:name') or data.get(entity + 'Name')

    # Create URI links for each DbId link
    id_links = get_entity_links(data, 'DbId')
    for linked_entity, link_path, link_value in id_links:
        if linked_entity in ignore_links:
            continue
        plural = 's' if is_collection(link_value) else ''
        link_uri_field = f"{linked_entity}URI{plural}"
        link_uri_path = [*link_path[:-1], link_uri_field]

        alias = None
        if linked_entity not in id_index_files:
            # Try to find an alias for the linked entity (ex: parent1 in pedigree is a germplasm)
            entity_links = get_in(entities, [data['@type'], 'links']) or []
            alias = next(
                (link['entity-alias'] for link in entity_links
                 if link['entity'] == linked_entity and 'entity-alias' in link),
                None)
        index_name = alias or linked_entity

        # Linked entity index by Id
        try:
            id_index_file = id_index_files[index_name]
        except KeyError as e:
            raise MissingDataLink(
                f"No '{index_name}' data available to verify '{link_path}' data link "
                f"in JSON object:\n"
                f"{data}\n"
                f"If you want to ignore the '{index_name}' data links add it to the 'ignore-links' "
                f"config option.\n"
                f"If you want to extract the '{index_name}' from '{data['@type']}', add an "
                f"'internal-object' link in the 'config/extract-brapi/entities/{data['@type']}' config file.\n"
                f"If the path '{link_path}' corresponds to another type of entity, add an 'internal' link "
                f"with a 'entity-alias' in the 'config/extract-brapi/entities/{data['@type']}' config file."
            ) from e

        # open read only
        uri_index = UnQLite(id_index_file, flags=UNQLITE_OPEN_READONLY | UNQLITE_OPEN_MMAP)

        def get_in_index(link_id):
            try:
                return uri_index[link_id].decode()
            except KeyError as e:
                raise MissingDataLink(
                    f"Could not find '{index_name}' with id '{link_id}' "
                    f"found in '{link_path}' of object:\n{data}") from e

        try:
            if plural:
                link_uri = [get_in_index(link_id) for link_id in link_value]
            else:
                link_uri = get_in_index(link_value)
        finally:
            # One handle is opened per link: release it as soon as the lookup is
            # done (also on error) so handles do not pile up.
            uri_index.close()
        update_in(data, link_uri_path, link_uri)

    def encode_uri(uri):
        return base64.b64encode(str(uri).encode()).decode()

    # Replace DbId with b64 encoded URI
    uri_links = get_entity_links(data, 'URI')
    for linked_entity, link_path, link_value in uri_links:
        if linked_entity in ignore_links:
            continue
        plural = 's' if is_collection(link_value) else ''
        link_id_field = f"{linked_entity}DbId{plural}"
        link_id_path = [*link_path[:-1], link_id_field]

        if plural:
            link_id = [encode_uri(uri) for uri in link_value]
        else:
            link_id = encode_uri(link_value)
        update_in(data, link_id_path, link_id)

    return data