Exemplo n.º 1
0
def generate_global_id_links(source, uri_map, data):
    """
    Complete the DbId/PUI link fields of a data object, in place.

    First pass: for every 'DbId' link reported by ``get_entity_links``, the
    matching '<entity>PUI<plural>' field is filled with the URIs obtained
    through ``get_dict_or_generate`` on ``uri_map`` (falling back to
    ``get_uri_by_id``), unless that field is already present in ``data``.

    Second pass: for every 'PUI' link, the matching '<entity>DbId<plural>'
    field is (over)written with the ``uri_encode``d URIs.

    Plural fields receive a set; singular fields receive ``first`` of it.

    :param source: forwarded untouched to ``get_uri_by_id``
    :param uri_map: forwarded to ``get_dict_or_generate``, keyed by
        (entity name, object id) -- presumably a URI cache, confirm in helper
    :param data: object whose link fields are completed (mutated)
    :return: the same ``data`` object
    """
    def generate_uri(key):
        entity_name, object_id = key
        return get_uri_by_id(source, entity_name, object_id)

    def get_or_generate_uri(entity_name, object_id):
        lookup_key = (entity_name, object_id)
        return get_dict_or_generate(uri_map, lookup_key, generate_uri)

    # DbId links -> PUI fields (existing PUI fields are left untouched)
    for entity, _id_field, suffix, ids in get_entity_links(data, 'DbId'):
        uri_field = entity + 'PUI' + suffix
        if uri_field in data:
            continue
        uris = set(remove_none(
            get_or_generate_uri(entity, object_id)
            for object_id in as_list(ids)))
        if not uris:
            continue
        data[uri_field] = uris if suffix else first(uris)

    # PUI links -> DbId fields holding the encoded URIs
    for entity, _uri_field, suffix, uris in get_entity_links(data, 'PUI'):
        id_field = entity + 'DbId' + suffix
        encoded_ids = {uri_encode(uri) for uri in as_list(uris)}
        if not encoded_ids:
            continue
        data[id_field] = encoded_ids if suffix else first(encoded_ids)

    return data
 def test_no_ids(self):
     """
     Listing DbIds of an empty object yields an empty list
     """
     self.assertEqual([], get_entity_links({}, 'DbId'))
 def test_nested_URIs(self):
     """
     URI links of the nested BrAPI fixture are listed as
     (entity, path to the field, value) tuples
     """
     uri_links = [
         ('study', ['studyURI'], 'urn:S1'),
         ('location', ['locationURI'], 'urn:L1'),
         ('trial', ['trials', 0, 'trialURI'], 'urn:T1'),
         ('trial', ['trials', 1, 'trialURI'], 'urn:T2'),
         ('germplasm', ['germplasmURIs'], ['urn:G1', 'urn:2']),
     ]
     self.assertEqual(uri_links, get_entity_links(self.data, 'URI'))
Exemplo n.º 4
0
 def test_database_id(self):
     """
     DbId links of a flat object are listed as
     [entity, field name, plural suffix, value]
     """
     brapi_object = {
         "studyDbId": "S1",
         "locationDbId": 1,
         "germplasmDbIds": ["G1", 2],
     }
     dbid_links = [
         ["study", "studyDbId", "", "S1"],
         ['location', 'locationDbId', '', 1],
         ['germplasm', 'germplasmDbIds', 's', ['G1', 2]],
     ]
     self.assertEqual(dbid_links, get_entity_links(brapi_object, 'DbId'))
 def test_nested_DbIds(self):
     """
     DbId links of the nested BrAPI fixture are listed as
     (entity, path to the field, value) tuples
     """
     dbid_links = [
         ('study', ['studyDbId'], 'S1'),
         ('location', ['locationDbId'], 1),
         ('trial', ['trials', 0, 'trialDbId'], 'T1'),
         ('trial', ['trials', 1, 'trialDbId'], 'T2'),
         ('contact', ['trials', 1, 'contacts', 0, 'contactDbId'], 'C1'),
         ('germplasm', ['germplasmDbIds'], ['G1', 2]),
     ]
     self.assertEqual(dbid_links, get_entity_links(self.data, 'DbId'))
def get_required_entities(document_configs, source_json_dir):
    """
    Returns set of required entities for all documents in configuration

    Entities are gathered from three places:

    1. the 'source-entity' value of each document config;
    2. the '<entity>DbId' / '<entity>DbIds' fields referenced by the parsed
       templates found under each config's 'document-transform';
    3. when ``source_json_dir`` is truthy, the DbId/PUI links found in the
       first line of each JSON file belonging to a source entity.

    :param document_configs: document configurations; each one is queried
        with ``.get('source-entity')`` and ``.get('document-transform')``
    :param source_json_dir: directory handed to ``list_entity_files``;
        a falsy value skips the file inspection step
    :return: set of required entity names
    """
    # Raw string: "\w" inside a plain literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning since Python 3.12). Compiled once
    # since it is applied to every field path of every template.
    dbid_field_pattern = re.compile(r"^(\w+)DbId(s?)$")

    source_entities = set(
        remove_none(map(lambda d: d.get('source-entity'), document_configs)))

    def collect_entities(parsed_template):
        # Recursively walk lists and dicts of the parsed template
        if is_list_like(parsed_template):
            return set(flatten_it(map(collect_entities, parsed_template)))
        if isinstance(parsed_template, dict):
            if '{lark}' in parsed_template:
                # Node carrying a '{lark}' key: inspect the last field of
                # each object path for an '<entity>DbId(s)' reference
                entities = set()
                for object_path in as_list(
                        resolve_path(parsed_template,
                                     ['start', 'object_path'])):
                    fields = resolve_path(object_path, ['field_path', 'FIELD'])
                    match = dbid_field_pattern.search(fields[-1])
                    if match:
                        entities.add(match.group(1))
                return entities
            return set(
                flatten_it(map(collect_entities, parsed_template.values())))
        return set()

    document_transforms = remove_none(
        map(lambda d: d.get('document-transform'), document_configs))
    required_entities = source_entities.union(
        flatten_it(map(collect_entities, document_transforms)))

    if source_json_dir:
        all_files = list_entity_files(source_json_dir)
        # Only files of source entities are inspected
        source_files = filter(lambda x: x[0] in source_entities, all_files)
        for entity_name, file_path in source_files:
            # JSON text is UTF-8; do not depend on the platform default
            with open(file_path, 'r', encoding='utf-8') as file:
                # Only the first line of the file is sampled for links
                line = file.readline()
                if line:
                    data = json.loads(line)
                    links = get_entity_links(data, 'DbId', 'PUI')
                    required_entities.update(map(first, links))

    return required_entities
Exemplo n.º 7
0
 def test_pui(self):
     """
     Requesting 'DbId' then 'PUI' lists every DbId link first,
     followed by every PUI link
     """
     brapi_object = {
         "studyPUI": "urn:S1",
         "studyDbId": "S1",
         "locationPUI": "urn:1",
         "locationDbId": 1,
         "germplasmPUIs": ["urn:G1", "urn:2"],
         "germplasmDbIds": ["G1", 2],
     }
     dbid_links = [
         ["study", "studyDbId", "", "S1"],
         ['location', 'locationDbId', '', 1],
         ['germplasm', 'germplasmDbIds', 's', ['G1', 2]],
     ]
     pui_links = [
         ["study", "studyPUI", "", "urn:S1"],
         ['location', 'locationPUI', '', "urn:1"],
         ['germplasm', 'germplasmPUIs', 's', ['urn:G1', "urn:2"]],
     ]
     self.assertEqual(dbid_links + pui_links,
                      get_entity_links(brapi_object, 'DbId', 'PUI'))
def transform_uri_link(source: dict, entities: dict, ignore_links,
                       id_index_files: dict, entity_line: Tuple[str,
                                                                str]) -> dict:
    """
    Transform BrAPI data by adding URI links translated from DbId links and replacing DbIds with encoded URIs.
    Also checks entity links to make sure every referenced entity exists.

    :param source: data source description; its '@id' is stored under
        'schema:includedInDataCatalog' and it is forwarded to
        ``get_generate_uri``
    :param entities: entity configurations; ``entities[<entity>]['links']``
        is searched for an 'entity-alias' when a linked entity has no index
    :param ignore_links: collection of linked entity names to leave as is
    :param id_index_files: maps an entity name to the UnQLite index file
        used to resolve its ids into URIs
    :param entity_line: ``(entity name, JSON encoded object)`` pair
    :return: the transformed object
    :raises MissingDataLink: when no index exists for a linked entity (or
        its alias), or when a linked id is absent from that index
    """
    entity, line = entity_line
    data = remove_empty(json.loads(line))

    data_id = get_identifier(entity, data)
    data[f"{entity}DbId"] = str(data_id)

    data_uri = get_generate_uri(source, entity, data)
    data[f"{entity}URI"] = data_uri

    # Add basic JSON-LD fields (store URI as @id)
    data['@type'] = entity
    data['@id'] = data_uri

    # Add basic schema.org fields
    data['schema:includedInDataCatalog'] = source['@id']
    data['schema:identifier'] = data_id
    data['schema:name'] = data.get('schema:name') or data.get(entity + 'Name')

    # Create URI links for each DbId link
    id_links = get_entity_links(data, 'DbId')
    for linked_entity, link_path, link_value in id_links:
        if linked_entity in ignore_links:
            continue
        plural = 's' if is_collection(link_value) else ''
        link_uri_field = f"{linked_entity}URI{plural}"
        # The URI field is written next to the DbId field it derives from
        link_uri_path = [*link_path[:-1], link_uri_field]

        alias = None
        if linked_entity not in id_index_files:
            # Try to find an alias for the linked entity (ex: parent1 in pedigree is a germplasm)
            entity_links = get_in(entities, [data['@type'], 'links']) or []
            alias = next(
                (link['entity-alias'] for link in entity_links
                 if link['entity'] == linked_entity
                 and 'entity-alias' in link),
                None)

        # Linked entity index by Id
        try:
            id_index_file = id_index_files[alias or linked_entity]
        except KeyError as e:
            raise MissingDataLink(
                f"No '{alias or linked_entity}' data available to verify '{link_path}' data link "
                f"in JSON object:\n"
                f"{data}\n"
                f"If you want to ignore the '{alias or linked_entity}' data links add it to the 'ignore-links' "
                f"config option.\n"
                f"If you want to extract the '{alias or linked_entity}' from '{data['@type']}', add an "
                f"'internal-object' link in the 'config/extract-brapi/entities/{data['@type']}' config file.\n"
                f"If the path '{link_path}' corresponds to another type of entity, add an 'internal' link "
                f"with an 'entity-alias' in the 'config/extract-brapi/entities/{data['@type']}' config file."
            ) from e

        # open read only
        uri_index = UnQLite(id_index_file,
                            flags=UNQLITE_OPEN_READONLY | UNQLITE_OPEN_MMAP)

        def get_in_index(link_id):
            # Only called within the current iteration, so the loop
            # variables captured here always refer to the current link
            try:
                return uri_index[link_id].decode()
            except KeyError as e:
                raise MissingDataLink(
                    f"Could not find '{alias or linked_entity}' with id '{link_id}' "
                    f"found in '{link_path}' of object:\n{data}") from e

        if plural:
            link_uri = list(map(get_in_index, link_value))
        else:
            link_uri = get_in_index(link_value)

        update_in(data, link_uri_path, link_uri)

    def encode_uri(uri):
        # Base64 of the URI text, returned as str
        return base64.b64encode(str(uri).encode()).decode()

    # Replace DbId with b64 encoded URI
    uri_links = get_entity_links(data, 'URI')
    for linked_entity, link_path, link_value in uri_links:
        if linked_entity in ignore_links:
            continue
        plural = 's' if is_collection(link_value) else ''
        link_id_field = f"{linked_entity}DbId{plural}"
        link_id_path = [*link_path[:-1], link_id_field]

        if plural:
            link_id = list(map(encode_uri, link_value))
        else:
            link_id = encode_uri(link_value)

        update_in(data, link_id_path, link_id)

    return data