Пример #1
0
def nested_item_as_graph(tree, cfg):
    """Build a node/link graph representation of a nested data item.

    The tree is first given identifiers (``ensure_ident``), then indexed
    with a single identifier per item (``unique_id_index``), and finally
    every indexed item is split into a node and its outgoing links via
    ``item_and_links``.

    :param tree: a nested data item
    :param cfg: config options. When the key ``ensureUrls`` is present
      (regardless of its value) every node is passed through
      ``ensure_url``.
    :returns: a graph dict with fields "@context", "@type", "nodes",
      "links" and "mainNode"
    :rtype: dict
    """
    assert content.is_item(tree)
    logger.debug('extracting item and linked items from %s' % (list(tree.keys())))
    ident_tree = ensure_ident(tree, cfg)
    index_cfg = {**cfg, 'unique_id_index': True}
    ident2items = index_ident_tree(ident_tree, index_cfg)

    # split every indexed item into its node and the links leaving it
    nodes = []
    links = []
    for indexed_item in ident2items.values():
        node, node_links = item_and_links(indexed_item, cfg)
        nodes.append(node)
        links.extend(node_links)

    # only the presence of the key matters here, not its value
    if 'ensureUrls' in cfg:
        nodes = [ensure_url(node, cfg) for node in nodes]

    main_node = get_item_identifiers(ident_tree, cfg)[0]
    return {'@context': 'http://coinform.eu',
            '@type': 'Graph',
            'nodes': nodes,
            'links': links,
            'mainNode': main_node}
Пример #2
0
def trim_tree(tree, prop, depth):
    """Limit how many times a property may be followed inside a nested item.

    :param tree: a nested data item or a list of such items. Values that
      are neither lists nor items are returned untouched.
    :param prop: name of the property to trim. Its values are assumed
      to be either a single nested data item or a list of nested data
      items.
    :param depth: non-negative int, the maximum number of ``prop`` jumps
      to keep, counted from ``tree``. With ``0`` the property is removed
      from ``tree`` itself.
    :returns: a trimmed version of the input; trimmed items are shallow
      copies, so the input is not modified
    :rtype: dict or list of dicts
    :raises ValueError: if ``depth`` is not an int or is negative
    """
    assert type(prop) is str, '%s: %s' % (type(prop), prop)
    if type(depth) is not int or depth < 0:
        raise ValueError('depth %s' % (depth))

    if type(tree) is list:
        return [trim_tree(branch, prop, depth) for branch in tree]
    if not content.is_item(tree) or prop not in tree:
        # nothing to trim at this level
        return tree

    if depth == 0:
        # budget exhausted: drop the property altogether
        return {key: value for key, value in tree.items() if key != prop}

    trimmed = dict(tree)
    trimmed[prop] = trim_tree(tree[prop], prop, depth - 1)
    return trimmed
Пример #3
0
def item_with_refs(item, cfg):
    """Returns a copy of item where all nested items have been replaced by refs

    :param item: a (nested) item
    :param cfg: config options. Passed on to ``get_item_identifiers``
      and ``has_identifier``; option ``debug_identifiers`` enables a
      debug message for nested dicts which have an identifier but are
      not items.
    :returns: a copy of the input item, where all nested items have
      been replaced by refs. It is assumed that all nested items
      already have either an `identifier` field or a `url` field which
      will serve as the reference. Nested dicts which are not
      identifiable items are shallow-copied.
    :rtype: dict
    :raises ValueError: if a nested identifiable item has no identifier
    :raises AssertionError: if ``item`` is not a dict, or if a value has
      an unsupported type
    """
    def value_as_ref(v, for_k=None):
        # lists are converted element by element
        if type(v) is list:
            return [value_as_ref(sv, for_k) for sv in v]
        if content.is_item(v) and v['@type'] not in no_ident_types:
            ids = get_item_identifiers(v, cfg)
            if len(ids) > 0:
                return ids[0] # return the first (main) identifier
            else:
                raise ValueError('Nested item does not have an identifier %s' % (v))
        elif type(v) is dict:
            if bool(cfg.get('debug_identifiers', False)) and has_identifier(v, cfg):
                logger.debug('Nested dict value has identifier or url, but no @type %s' % (v))
            return {**v}
        elif v is None:
            return v
        else:
            assert type(v) in [int, float, str, bool], 'Unsupported value type %s %s for %s in %s' % (
                type(v), v, for_k, item)
            return v

    if not content.is_item(item):
        # `warn` is a deprecated alias; `warning` is the supported method
        logger.warning('Expecting an item. @type field not included? %s' % (item))
    # the message used to be built as `'%s' (type(item))`, i.e. a call on
    # a str, which raised TypeError instead of the intended AssertionError
    assert type(item) is dict, '%s' % (type(item),)
    return {k: value_as_ref(v, k) for k, v in item.items()}
Пример #4
0
def ensure_url(item, cfg):
    """Creates a copy of the input tree whereby all the items have a url value

    Lists and dicts are processed recursively. An item (as decided by
    ``content.is_item``) which has no ``url`` field and whose ``@type``
    is not in ``no_url_types`` receives a ``url`` field holding whatever
    ``calc_item_url`` returns for it.

    :param item: a datastructure nested schema.org compatible item
    :param cfg: config options
    :returns: a copy of tree but any suitable item and subitem in the tree has
      a url field. Values which are neither lists nor dicts are returned
      as they are.
    :rtype: any
    """
    if type(item) is list:
        return [ensure_url(it, cfg) for it in item]
    if type(item) is not dict:
        # all other types are returned as they are
        return item

    assert dictu.is_value(item)
    result = {k: ensure_url(v, cfg) for k, v in item.items()}
    if not content.is_item(result):
        # A plain dict needs no url of its own, but it may contain items.
        # Return the processed copy (previously a shallow copy of the
        # *unprocessed* input was returned, which dropped the urls that
        # had just been added to nested items).
        return result
    if 'url' in result:
        # optionally, make sure it matches the calculated url
        #  if not a match, replace url value and put old value in sameAs?
        return result
    if result['@type'] in no_url_types:
        return result
    return {**result,
            'url': calc_item_url(result, cfg)}
Пример #5
0
 def value_as_links(v, src_id, rel):
     """Convert a field value into the links it gives rise to.

     ``cfg`` and ``item`` are free variables taken from the enclosing
     scope (not visible in this fragment).

     :param v: the field value
     :param src_id: identifier used as the ``source`` of every link
     :param rel: the field name, used as the ``rel`` of every link
     :returns: ``None`` when the value yields no links at all (composite
       relations, plain dicts, ``None`` and primitive values). A list
       of ``{'source', 'target', 'rel'}`` dicts for an identifiable
       nested item. For a list value, the concatenated links of its
       elements; note this is ``[]`` rather than ``None`` when no
       element yields links.
     :raises ValueError: if a nested identifiable item has no identifier
     """
     if rel in cfg.get('composite_rels', []):
         return None
     if type(v) is list:
         # Evaluate every element exactly once. Calling the function
         # both in the filter and in the inner loop doubled the work at
         # each level of list nesting and duplicated debug messages.
         links = []
         for sv in v:
             sub_links = value_as_links(sv, src_id, rel)
             if sub_links is not None:
                 links.extend(sub_links)
         return links
     if content.is_item(v) and v['@type'] not in no_ident_types:
         ids = get_item_identifiers(v, cfg)
         if len(ids) > 0:
             # link to the first (main) identifier
             return [{'source': src_id,
                     'target': ids[0],
                     'rel': rel}]
         else:
             raise ValueError('Nested item does not have an identifier %s' % (v))
     elif type(v) is dict:
         if bool(cfg.get('debug_identifiers', False)) and has_identifier(v, cfg):
             logger.debug('Nested dict value has identifier or url, but no @type %s' % (v))
         return None
     elif v is None:
         return v
     else:
         assert type(v) in [int, float, str, bool], 'Unsupported value type %s %s for %s in %s' % (
             type(v), v, src_id, item)
         return None
Пример #6
0
def ensure_ident(item, cfg):
    """Creates a copy of the input tree whereby all the items have a unique identifier

    Lists and dicts are processed recursively, children first, so that
    nested items already carry their identifier when the identifier of
    the enclosing item is calculated. An item (as decided by
    ``content.is_item``) which has no ``identifier`` field and whose
    ``@type`` is not in ``no_ident_types`` receives an ``identifier``
    field computed by ``calc_identifier``.

    :param item: a datastructure nested schema.org compatible item
    :param cfg: config options
    :returns: a copy of tree but any item and subitem in the tree has
      a unique identifier field. Values which are neither lists nor
      dicts are returned as they are.
    :rtype: any
    """
    if type(item) is list:
        return [ensure_ident(it, cfg) for it in item]
    if type(item) is not dict:
        # all other types are returned as they are
        return item

    assert dictu.is_value(item)
    result = {k: ensure_ident(v, cfg) for k, v in item.items()}
    if not content.is_item(result):
        # A plain dict needs no identifier of its own, but it may
        # contain items. Return the processed copy (previously a shallow
        # copy of the *unprocessed* input was returned, which dropped
        # the identifiers that had just been added to nested items).
        return result
    if 'identifier' in result:
        return result
    if result['@type'] in no_ident_types:
        return result
    return {**result,
            'identifier': calc_identifier(result, cfg)}
Пример #7
0
def partition_ident_index(ident_index, partition_types):
    """Split an identity index into labelled partitions according to type.

    :param ident_index: an identity item index; i.e. a dict with
      identifiers as keys and data items as values
    :param partition_types: a dict mapping each partition label (str)
      to the list of type names belonging to that partition. The types
      are assumed to be disjoint; the label ``_rest`` is reserved.
    :returns: a dict with one entry per partition label plus ``_rest``,
      each holding a subset of ``ident_index``. Items matching no
      partition end up in ``_rest``; an item matching several
      partitions is logged and stored under the first matching label.
      Values for which ``content.is_item`` is false are left out of
      the result.
    :rtype: dict
    """
    assert type(partition_types) is dict
    assert '_rest' not in partition_types, 'partition label _rest is reserved'

    partitions = {label: {} for label in partition_types}
    partitions['_rest'] = {}

    for ident, item in ident_index.items():
        if not content.is_item(item):
            continue
        labels = [label
                  for label, qtypes in partition_types.items()
                  if content.item_matches_type(item, qtypes)]
        if len(labels) > 1:
            logger.warning('Multiple partitions match item %s: %s' % (ident, labels))
        # first matching partition wins; unmatched items go to the rest
        bucket = labels[0] if labels else '_rest'
        partitions[bucket][ident] = item
    return partitions
Пример #8
0
def filter_ident_index_by_type(ident_index, qtypes):
    """Select the entries of an identity index which match the query types.

    :param ident_index: a dict with identifiers as keys and (typically)
      data items as values
    :param qtypes: a single typename or a list of typenames to match
    :returns: a new dict holding only those entries whose value is an
      item (``content.is_item``) matching ``qtypes``
      (``content.item_matches_type``)
    :rtype: dict
    """
    # a single type name is shorthand for a one-element list
    wanted = [qtypes] if type(qtypes) is str else qtypes
    assert type(wanted) is list

    selected = {}
    for ident, value in ident_index.items():
        if content.is_item(value) and content.item_matches_type(value, wanted):
            selected[ident] = value
    return selected
Пример #9
0
def calc_identifier(item, cfg):
    """Compute the identifier for a data item that does not have one yet.

    The identifier is a hash over the subset of fields selected by
    ``ident_keys``, with nested items replaced by their references
    (``item_with_refs``). Any nested items must therefore already have
    an identifier.

    :param item: the item for which to calculate the identifier; it
      must not have an ``identifier`` field
    :param cfg: config options
    :returns: the result of ``hashu.hash_dict`` on the identifying fields
    :rtype: str (per the original documentation; determined by
      ``hashu.hash_dict``)
    """
    assert content.is_item(item)
    assert 'identifier' not in item
    identifying_fields = dictu.select_keys(item, ident_keys(item, cfg))
    hashable = item_with_refs(identifying_fields, cfg)
    return hashu.hash_dict(hashable)
Пример #10
0
def calc_item_url(item, cfg):
    """Compute the url of a data item from its route template.

    The url is ``content.ci_context`` followed by the template returned
    by ``route_template``, formatted with the fields of the item.

    :param item: the item for which to calculate the url
    :param cfg: config options, passed on to ``route_template``
    :returns: the url, or ``None`` when there is no route template for
      the item
    :rtype: str or None
    """
    assert content.is_item(item)
    template = route_template(item, cfg)
    if template is None:
        return None
    base = content.ci_context
    # the item's own fields fill in the placeholders of the template
    path = template.format(**item)
    return '%s%s' % (base, path)
Пример #11
0
def normalise_nested_item(tree, cfg):
    """Flatten a nested data item into an identifier index.

    :param tree: a nested data item
    :param cfg: config options
    :returns: a dict with identifiers as keys and relatively flat
      items as values (nested items are replaced by references, see
      ``item_with_refs``). The special key ``mainItem`` holds the main
      identifier string of the input tree.
    :rtype: dict
    """
    assert content.is_item(tree)
    logger.debug('extracting item and linked items from %s' % (list(tree.keys())))
    ident_tree = ensure_ident(tree, cfg)

    normalised = {}
    for ident, branch in index_ident_tree(ident_tree, cfg).items():
        normalised[ident] = item_with_refs(branch, cfg)

    # the first identifier of the root is the main one
    normalised['mainItem'] = get_item_identifiers(ident_tree, cfg)[0]
    return normalised
Пример #12
0
 def value_as_ref(v, for_k=None):
     """Replace a nested identifiable item by its main identifier.

     ``cfg`` and ``item`` are free variables taken from the enclosing
     scope (not visible in this fragment).

     :param v: the value to convert; lists are converted element-wise
     :param for_k: the key the value belongs to, only used in the
       assertion message
     :returns: the first identifier for an identifiable item, a shallow
       copy for any other dict, and the value itself otherwise
     :raises ValueError: if an identifiable item has no identifier
     """
     if type(v) is list:
         return [value_as_ref(element, for_k) for element in v]
     if content.is_item(v) and v['@type'] not in no_ident_types:
         ids = get_item_identifiers(v, cfg)
         if len(ids) == 0:
             raise ValueError('Nested item does not have an identifier %s' % (v))
         # the first identifier is the main one
         return ids[0]
     if type(v) is dict:
         debug_enabled = bool(cfg.get('debug_identifiers', False))
         if debug_enabled and has_identifier(v, cfg):
             logger.debug('Nested dict value has identifier or url, but no @type %s' % (v))
         return {**v}
     if v is not None:
         # only primitive values are expected at this point
         assert type(v) in [int, float, str, bool], 'Unsupported value type %s %s for %s in %s' % (
             type(v), v, for_k, item)
     return v
Пример #13
0
def item_and_links(item, cfg):
    """Splits an item into a flat node and the links to its nested items

    :param item: a (nested) item
    :param cfg: config options. In particular option
      `composite_rels` specifies a list of relation names which should not be decomposed.
    :returns: a tuple ``(node, links)``. ``node`` is a copy of the item
      keeping only the fields which yield no links (``value_as_links``
      returns ``None`` for them). ``links`` is a list of dicts with
      keys ``source``, ``target`` and ``rel``, one per nested
      identifiable item. Note that a list-valued field is never kept
      in the node, even when none of its elements yields a link.
    :rtype: tuple
    :raises ValueError: if a nested identifiable item has no identifier
    :raises AssertionError: if ``item`` is not an item
    """
    def value_as_links(v, src_id, rel):
        # returns None when the value yields no links, else a list of links
        if rel in cfg.get('composite_rels', []):
            return None
        if type(v) is list:
            # Evaluate every element exactly once; calling the function
            # both in the filter and in the inner loop doubled the work
            # at each level of list nesting.
            links = []
            for sv in v:
                sub_links = value_as_links(sv, src_id, rel)
                if sub_links is not None:
                    links.extend(sub_links)
            return links
        if content.is_item(v) and v['@type'] not in no_ident_types:
            ids = get_item_identifiers(v, cfg)
            if len(ids) > 0:
                # link to the first (main) identifier
                return [{'source': src_id,
                        'target': ids[0],
                        'rel': rel}]
            else:
                raise ValueError('Nested item does not have an identifier %s' % (v))
        elif type(v) is dict:
            if bool(cfg.get('debug_identifiers', False)) and has_identifier(v, cfg):
                logger.debug('Nested dict value has identifier or url, but no @type %s' % (v))
            return None
        elif v is None:
            return v
        else:
            assert type(v) in [int, float, str, bool], 'Unsupported value type %s %s for %s in %s' % (
                type(v), v, src_id, item)
            return None

    # validate before looking up the identifier, so that a wrong input
    # is reported by this message rather than by a lookup failure
    assert content.is_item(item), 'Expecting an item. @type field not included? %s' % (item,)
    src_id = get_item_identifiers(item, cfg)[0]

    # single pass: each value is converted once and routed either to
    # the node (no links) or to the list of links
    node = {}
    links = []
    for k, v in item.items():
        value_links = value_as_links(v, src_id, k)
        if value_links is None:
            node[k] = v
        else:
            links.extend(value_links)
    return node, links
Пример #14
0
def index_ident_tree(tree, cfg):
    """Build an identifier index for a (possibly nested) value.

    :param tree: a possibly nested value data structure which may
      contain data items. All data items must have an identifier or
      other identifying field.
    :param cfg: configuration options. ``composite_rels`` lists keys
      whose values are not descended into; ``unique_id_index`` keeps
      only the first identifier of every item.
    :returns: an identifier index for the tree; it contains identifier
      strings as keys and branches of the input tree as values. The
      tree and its branches are not modified. For a more trimmed
      index, map the values using `item_with_refs`.
    :rtype: dict
    """
    if type(tree) is list:
        index = {}
        for element in tree:
            index = _index_merge(index, index_ident_tree(element, cfg), cfg)
        return index

    if type(tree) is not dict:
        # assume simple values, these are never indexed
        return {}

    # first build the index for any nested values
    index = {}
    for key, value in tree.items():
        if key not in cfg.get('composite_rels', []):
            index = _index_merge(index, index_ident_tree(value, cfg), cfg)

    identifiable = content.is_item(tree) and tree['@type'] not in no_ident_types
    if not identifiable:
        return index

    # finally, add the entries for this item itself
    ids = get_item_identifiers(tree, cfg)
    assert len(ids) > 0, 'Cannot index an item without identifiers'
    if cfg.get('unique_id_index', False):
        ids = ids[:1]  # keep only the first id
    for idval in ids:
        assert type(idval) == str
        index = _index_merge(index, {idval: tree}, cfg)
    return index