# Example 1
def fetch_all_links(source, logger, entities):
    """
    Link objects across entities.
     - Internal: link an object (ex: study) to another using an identifier inside the JSON object
      (ex: link a location via study.locationDbId)
     - Internal object: link an object (ex: study) to another contained inside the first
      (ex: link a location via study.location.locationDbId)
     - External object: link an object (ex: study) to another using a dedicated call
      (ex: link to observation variables via /brapi/v1/studies/{id}/observationVariables)
    """
    for entity_name, entity in entities.items():
        if 'links' not in entity:
            continue

        for link in entity['links']:
            link_type = link['type']
            for object_id, data_object in entity['store'].items():
                target_entity_name = link['entity']
                target_entity = entities[target_entity_name]
                targets_by_id = {}

                if link_type.startswith('internal'):
                    link_path = link['json-path']
                    path_parts = remove_empty(link_path.split('.'))

                    values = remove_none(as_list(get_in(data_object, path_parts)))
                    if not values:
                        if link.get('required'):
                            raise BrokenLink("Could not find required field '{}' in {} object id '{}'"
                                             .format(link_path, entity_name, object_id))
                        # Optional link with no value: nothing to do for this object
                        continue

                    if link_type == 'internal-object':
                        # Linked objects are embedded in the value itself
                        targets_by_id = {
                            get_identifier(target_entity_name, value): value
                            for value in values
                        }

                    elif link_type == 'internal':
                        # Linked objects are referenced by '<entity>DbId' fields
                        id_field = target_entity['name'] + 'DbId'
                        name_field = target_entity['name'] + 'Name'
                        for value in values:
                            target_id = value.get(id_field)
                            if target_id:
                                targets_by_id[target_id] = {
                                    id_field: target_id,
                                    name_field: value.get(name_field),
                                }

                elif link_type == 'external-object':
                    # Linked objects are fetched through a dedicated BrAPI call
                    call = get_implemented_call(source, link, context=data_object)
                    if not call:
                        continue

                    for value in BreedingAPIIterator.fetch_all(source['brapi:endpointUrl'], call, logger):
                        targets_by_id[get_identifier(target_entity_name, value)] = value

                link_objects(entity, data_object, target_entity, targets_by_id)
    def test_get_generate_identifier(self):
        """get_identifier derives a stable hash from dict content, not key order."""
        entity = 'buzz'
        first = get_identifier(entity, {'foo': 'bar', 'baz': 'fizz'})
        self.assertEqual('148068838', first)

        # Re-inserting the same keys/values in a different order must not
        # change the generated identifier
        second = get_identifier(entity, {'baz': 'fizz', 'foo': 'bar'})
        self.assertEqual(first, second)
# Example 3
def load_all_data_with_uri(source, source_json_dir, transform_config, pool, logger):
    """
    Load all BrAPI JSON objects from *source_json_dir*, generate a URI for
    each, then return an iterator over the objects with their DbId links
    rewritten as global (URI-based) ids.
    """
    logger.debug("Loading BrAPI JSON from {}...".format(source_json_dir))

    entity_files = list(list_entity_files(source_json_dir))
    if transform_config.get('restricted-documents'):
        # Only load the entities actually required by the configured documents
        required = get_required_entities(transform_config['documents'], source_json_dir)
        entity_files = [entity_file for entity_file in entity_files
                        if first(entity_file) in required]
    logger.debug("Loading entities: {}".format(', '.join(map(first, entity_files))))

    # Stream every line of every entity file lazily
    all_lines = itertools.chain.from_iterable(map(load_entity_lines, entity_files))

    # Parse JSON lines into python objects in parallel
    parsed = pool.imap_unordered(parse_data, all_lines, CHUNK_SIZE)

    # Generate URIs and build an (entity, id) -> URI lookup table
    uri_map = dict()
    loaded = list()
    for entity_name, data in parsed:
        data_id, data_uri = generate_uri_global_id(source, entity_name, data)
        # Index the URI under both the declared id and the derived identifier
        uri_map[(entity_name, data_id)] = data_uri
        uri_map[(entity_name, get_identifier(entity_name, data))] = data_uri
        if is_checkpoint(len(loaded)):
            logger.debug("checkpoint: {} BrAPI objects loaded".format(len(loaded)))
        loaded.append(data)
    logger.debug("Loaded total of {} BrAPI objects.".format(len(loaded)))

    # Replace all entity links using global ids (ex: studyDbId: 1 => studyDbId: urn:source%2Fstudy%2F1)
    generate_links = partial(generate_global_id_links, source, uri_map)
    return pool.imap_unordered(generate_links, loaded, CHUNK_SIZE)
def get_generate_uri(source: dict, entity: str, data: dict) -> str:
    """
    Get or generate a URI for a BrAPI object.

    Precedence:
      1. If the object carries an '<entity>PUI' field that is already a valid
         RFC 3987 URI, return it unchanged.
      2. If no PUI is present, build "urn:<source>/<entity>/<id>" from the
         source identifier, the entity name and the object identifier.
      3. If a PUI is present but not a valid URI, namespace it as
         "urn:<source>/<percent-encoded-pui>".

    :param source: source description; 'schema:identifier' is used as the URN namespace
    :param entity: BrAPI entity name (ex: 'study')
    :param data: BrAPI object
    :return: a valid URI for the object
    :raises ValueError: if the resulting URI is still malformed
        (ValueError subclasses Exception, so existing broad handlers still match)
    """
    pui_field = entity + 'PUI'
    data_uri = data.get(pui_field)

    if data_uri and rfc3987.match(data_uri, rule='URI'):
        # The original PUI is already a valid URI
        return data_uri

    source_id = urllib.parse.quote(source['schema:identifier'])
    data_id = get_identifier(entity, data)
    if not data_uri:
        # Generate URI from source id, entity name and data id
        encoded_entity = urllib.parse.quote(entity)
        # str() guards against non-string identifiers; quote() requires str
        encoded_id = urllib.parse.quote(str(data_id))
        data_uri = f"urn:{source_id}/{encoded_entity}/{encoded_id}"
    else:
        # Generate URI by prepending the original (invalid) URI with the source identifier
        encoded_uri = urllib.parse.quote(data_uri)
        data_uri = f"urn:{source_id}/{encoded_uri}"

    if not rfc3987.match(data_uri, rule='URI'):
        raise ValueError(
            f'Could not get or create a correct URI for "{entity}" object id "{data_id}"'
            f' (malformed URI: "{data_uri}")')
    return data_uri
# Example 5
def fetch_all_details(source, logger, entities, pool):
    """
    Fetch all details for each object of each entity
    """
    # One fetch task per stored object, across all entities
    fetch_args = [
        (source, logger, entity, get_identifier(entity_name, stored_object))
        for entity_name, entity in entities.items()
        for stored_object in entity['store'].values()
    ]
    fetch_all_in_store(entities, fetch_details, fetch_args, pool)
 def add(self, data):
     """Insert *data* into the store, merging with any existing entry for the same id."""
     # Compact the object first so nulls/empties never overwrite real values
     compacted = remove_empty(data)
     if not compacted:
         return
     compacted['source'] = self.source_id
     data_id = get_identifier(self.entity_name, compacted)
     if data_id in self:
         dict_merge(self[data_id], compacted)
     else:
         self[data_id] = compacted
# Example 7
def generate_uri_global_id(source, entity_name, data):
    """
    Annotate *data* in place with its URI-based identity fields and return
    the (original identifier, URI) pair.
    """
    data_id = get_identifier(entity_name, data)
    data_uri = get_uri(source, entity_name, data)

    # The DbId is replaced by the encoded URI so links become globally unique
    data.update({
        'brapi:type': entity_name,
        'source': source['@id'],
        '@type': entity_name,
        '@id': data_uri,
        entity_name + 'PUI': data_uri,
        entity_name + 'DbId': uri_encode(data_uri),
    })

    return data_id, data_uri
 def load_file(options):
     """
     Index one entity JSON-lines file: for each line, record the entity
     name, the object identifier and the byte offset of the line so the
     object can later be re-read directly from disk.
     """
     entity_name, json_path = options
     locations = []
     with open(json_path, 'r') as json_file:
         # readline() is used instead of line iteration so that tell()
         # stays usable to capture each line's starting offset
         offset = json_file.tell()
         line = json_file.readline()
         while line:
             data = json.loads(line)
             locations.append((
                 entity_name,
                 get_identifier(entity_name, data),
                 {
                     'file': json_path,
                     'offset': offset,
                     'brapi:type': entity_name
                 },
             ))
             offset = json_file.tell()
             line = json_file.readline()
     return locations
def link_objects(entity, object, linked_entity, linked_objects_by_id):
    """
    Cross-link *object* with each linked object: the stored copy of a linked
    object wins over the provided one, and newly seen linked objects are
    added to the linked entity's store.
    """
    entity_name = entity['name']
    linked_entity_name = linked_entity['name']
    store = linked_entity['store']
    object_id = get_identifier(entity_name, object)

    for link_id, candidate in linked_objects_by_id.items():
        in_store = link_id in store
        # Prefer the store's version of the linked object when it exists
        linked_object = store[link_id] if in_store else candidate

        if not linked_object:
            raise BrokenLink(
                f"{linked_entity_name} object id {link_id} not found in store while trying to link with "
                f"{entity_name} object id {object_id}")

        # Link in both directions
        link_object(entity_name, linked_object, object_id)
        link_object(linked_entity_name, object, link_id)

        if not in_store:
            store.add(linked_object)
 def dump(self, data):
     """Write *data* to its entity's JSON store and record where it landed."""
     entity_name = data['brapi:type']
     try:
         json_store = self.json_stores[entity_name]
     except KeyError:
         # Lazily open one split store per entity
         json_store = self.json_stores[entity_name] = JSONSplitStore(
             self.json_dir,
             entity_name,
             buffer_size=1,
             max_file_byte_size=self.max_file_byte_size)
     data_id = get_identifier(entity_name, data)
     # Capture the location before the write moves the file pointer
     data_location = {
         'file': json_store.json_file.name,
         'offset': json_store.json_file.tell(),
         'brapi:type': entity_name
     }
     json_store.dump(data)
     json_store.flush()
     self._add_location(entity_name, data_id, data_location)
    def test_get_identifier(self):
        """An explicit '<entity>DbId' field is used verbatim as the identifier."""
        actual = get_identifier('germplasm', {'germplasmDbId': 'foo'})
        self.assertEqual('foo', actual)
def transform_uri_link(source: dict, entities: dict, ignore_links,
                       id_index_files: dict, entity_line: Tuple[str,
                                                                str]) -> dict:
    """
    Transform BrAPI data by adding URI links translated from DbId links and replacing DbIds with encoded URIs.
    Also checks entity links to make sure every referenced entity exists.

    :param source: BrAPI source description (JSON-LD dict; '@id' is used)
    :param entities: entity configurations, used to resolve link aliases
        (ex: parent1 in pedigree is a germplasm)
    :param ignore_links: collection of linked-entity names to leave untouched
    :param id_index_files: map from entity name to the path of an UnQLite
        index file mapping object ids to their URIs
    :param entity_line: (entity name, raw JSON line) pair
    :return: the transformed data dict
    :raises MissingDataLink: when no index exists for a linked entity, or a
        linked id cannot be found in that entity's index
    """
    entity, line = entity_line
    data = remove_empty(json.loads(line))

    data_id = get_identifier(entity, data)
    data[f"{entity}DbId"] = str(data_id)

    data_uri = get_generate_uri(source, entity, data)
    data[f"{entity}URI"] = data_uri

    # Add basic JSON-LD fields (store URI as @id)
    data['@type'] = entity
    data['@id'] = data_uri

    # Add basic schema.org fields
    data['schema:includedInDataCatalog'] = source['@id']
    data['schema:identifier'] = data_id
    data['schema:name'] = data.get('schema:name') or data.get(entity + 'Name')

    # Create URI links for each DbId link
    id_links = get_entity_links(data, 'DbId')
    for linked_entity, link_path, link_value in id_links:
        if linked_entity in ignore_links:
            continue
        # Collections get a plural 'URIs' field (ex: germplasmURIs)
        plural = 's' if is_collection(link_value) else ''
        link_uri_field = f"{linked_entity}URI{plural}"
        link_uri_path = [*link_path[:-1], link_uri_field]

        alias = None
        if linked_entity not in id_index_files:
            # Try to find an alias for the linked entity (ex: parent1 in pedigree is a germplasm)
            aliases = map(
                lambda l: l['entity-alias'],
                filter(
                    # Find a link for current entity
                    lambda l: l['entity'] == linked_entity and 'entity-alias'
                    in l,
                    # In entity links
                    get_in(entities, [data['@type'], 'links']) or []))
            alias = next(aliases, None)

        # Linked entity index by Id
        try:
            id_index_file = id_index_files[alias or linked_entity]
        except KeyError as e:
            raise MissingDataLink(
                f"No '{alias or linked_entity}' data available to verify '{link_path}' data link "
                f"in JSON object:\n"
                f"{data}\n"
                f"If you want to ignore the '{alias or linked_entity}' data links add it to the 'ignore-links' "
                f"config option.\n"
                f"If you want to extract the '{alias or linked_entity}' from '{data['@type']}', add an "
                f"'internal-object' link in the 'config/extract-brapi/entities/{data['@type']}' config file.\n"
                f"If the path '{link_path}' corresponds to another type of entity, add an 'internal' link"
                f"with a 'entity-alias' in the 'config/extract-brapi/entities/{data['@type']}' config file."
            ) from e

        # open read only
        uri_index = UnQLite(id_index_file,
                            flags=UNQLITE_OPEN_READONLY | UNQLITE_OPEN_MMAP)

        # Resolve one linked id to its URI, failing loudly on a broken link
        def get_in_index(link_id):
            try:
                return uri_index[link_id].decode()
            except KeyError as e:
                raise MissingDataLink(
                    f"Could not find '{alias or linked_entity}' with id '{link_id}' "
                    f"found in '{link_path}' of object:\n{data}") from e

        if plural:
            link_uri = list(map(get_in_index, link_value))
        else:
            link_uri = get_in_index(link_value)

        update_in(data, link_uri_path, link_uri)

    # b64-encode a URI so it can stand in for a DbId
    def encode_uri(uri):
        return base64.b64encode(str(uri).encode()).decode()

    # Replace DbId with b64 encoded URI
    uri_links = get_entity_links(data, 'URI')
    for linked_entity, link_path, link_value in uri_links:
        if linked_entity in ignore_links:
            continue
        plural = 's' if is_collection(link_value) else ''
        link_id_field = f"{linked_entity}DbId{plural}"
        link_id_path = [*link_path[:-1], link_id_field]

        if plural:
            link_id = list(map(encode_uri, link_value))
        else:
            link_id = encode_uri(link_value)

        update_in(data, link_id_path, link_id)

    return data
 def get_or_generate_uri(source, entity, data):
     """Return a minimal JSON-LD identity stub (@type, @id, schema:identifier) for *data*."""
     identifier = get_identifier(entity, data)
     uri = get_generate_uri(source, entity, data)
     stub = {'@type': entity}
     stub['@id'] = uri
     stub['schema:identifier'] = identifier
     return stub