示例#1
0
class Source(object):
    """
    A Source is responsible for reading data as records
    from a store where the store is a file or a database.
    """
    def __init__(self):
        self.graph_metadata: Dict = {}
        self.node_filters = {}
        self.edge_filters = {}
        self.node_properties = set()
        self.edge_properties = set()
        self.prefix_manager = PrefixManager()

    def set_prefix_map(self, m: Dict) -> None:
        """
        Update default prefix map.

        Parameters
        ----------
        m: Dict
            A dictionary with prefix to IRI mappings

        """
        self.prefix_manager.update_prefix_map(m)

    def parse(self, **kwargs: Any) -> Generator:
        """
        This method reads from the underlying store, using the
        arguments provided in ``config`` and yields records.

        Parameters
        ----------
        **kwargs: Any

        Returns
        -------
        Generator

        """
        pass

    def check_node_filter(self, node: Dict) -> bool:
        """
        Check if a node passes defined node filters.

        Parameters
        ----------
        node: Dict
            A node

        Returns
        -------
        bool
            Whether the given node has passed all defined node filters

        """
        pass_filter = False
        if self.node_filters:
            for k, v in self.node_filters.items():
                if k in node:
                    # filter key exists in node
                    if isinstance(v, (list, set, tuple)):
                        if any(x in node[k] for x in v):
                            pass_filter = True
                        else:
                            return False
                    elif isinstance(v, str):
                        if node[k] == v:
                            pass_filter = True
                        else:
                            return False
                    else:
                        log.error(
                            f"Unexpected {k} node filter of type {type(v)}")
                        return False
                else:
                    # filter key does not exist in node
                    return False
        else:
            # no node filters defined
            pass_filter = True
        return pass_filter

    def check_edge_filter(self, edge: Dict) -> bool:
        """
        Check if an edge passes defined edge filters.

        Parameters
        ----------
        edge: Dict
            An edge

        Returns
        -------
        bool
            Whether the given edge has passed all defined edge filters

        """
        pass_filter = False
        if self.edge_filters:
            for k, v in self.edge_filters.items():
                if k in {'subject_category', 'object_category'}:
                    pass_filter = True
                    continue
                if k in edge:
                    # filter key exists in edge
                    if isinstance(v, (list, set, tuple)):
                        if any(x in edge[k] for x in v):
                            pass_filter = True
                        else:
                            return False
                    elif isinstance(v, str):
                        if edge[k] == v:
                            pass_filter = True
                        else:
                            return False
                    else:
                        log.error(
                            f"Unexpected {k} edge filter of type {type(v)}")
                        return False
                else:
                    # filter does not exist in edge
                    return False
        else:
            # no edge filters defined
            pass_filter = True
        return pass_filter

    def set_node_filter(self, key: str, value: Union[str, set]) -> None:
        """
        Set a node filter, as defined by a key and value pair.
        These filters are used to filter (or reduce) the
        search space when fetching nodes from the underlying store.

        .. note::
            When defining the 'category' filter, the value should be of type ``set``.
            This method also sets the 'subject_category' and 'object_category'
            edge filters, to get a consistent set of nodes in the subgraph.

        Parameters
        ----------
        key: str
            The key for node filter
        value: Union[str, set]
            The value for the node filter.
            Can be either a string or a set.

        """
        if key == 'category':
            if isinstance(value, set):
                if 'subject_category' in self.edge_filters:
                    self.edge_filters['subject_category'].update(value)
                else:
                    self.edge_filters['subject_category'] = value
                if 'object_category' in self.edge_filters:
                    self.edge_filters['object_category'].update(value)
                else:
                    self.edge_filters['object_category'] = value
            else:
                raise TypeError(
                    "'category' node filter should have a value of type 'set'")

        if key in self.node_filters:
            self.node_filters[key].update(value)
        else:
            self.node_filters[key] = value

    def set_node_filters(self, filters: Dict) -> None:
        """
        Set node filters.

        Parameters
        ----------
        filters: Dict
            Node filters

        """
        if filters:
            for k, v in filters.items():
                if isinstance(v, (list, set, tuple)):
                    self.set_node_filter(k, set(v))
                else:
                    self.set_node_filter(k, v)

    def set_edge_filters(self, filters: Dict) -> None:
        """
        Set edge filters.

        Parameters
        ----------
        filters: Dict
            Edge filters

        """
        if filters:
            for k, v in filters.items():
                if isinstance(v, (list, set, tuple)):
                    self.set_edge_filter(k, set(v))
                else:
                    self.set_edge_filter(k, v)

    def set_edge_filter(self, key: str, value: set) -> None:
        """
        Set an edge filter, as defined by a key and value pair.
        These filters are used to filter (or reduce) the
        search space when fetching nodes from the underlying store.

        .. note::
            When defining the 'subject_category' or 'object_category' filter,
            the value should be of type ``set``.
            This method also sets the 'category' node filter, to get a
            consistent set of nodes in the subgraph.

        Parameters
        ----------
        key: str
            The key for edge filter
        value: Union[str, set]
            The value for the edge filter.
            Can be either a string or a set.

        """
        if key in {'subject_category', 'object_category'}:
            if isinstance(value, set):
                if 'category' in self.node_filters:
                    self.node_filters['category'].update(value)
                else:
                    self.node_filters['category'] = value
            else:
                raise TypeError(
                    f"'{key}' edge filter should have a value of type 'set'")

        if key in self.edge_filters:
            self.edge_filters[key].update(value)
        else:
            self.edge_filters[key] = value
示例#2
0
文件: source.py 项目: deepakunni3/kgx
class Source(object):
    """
    A Source is responsible for reading data as records
    from a store where the store is a file or a database.
    """

    def __init__(self, owner):
        self.owner = owner
        self.graph_metadata: Dict = {}
        self.node_filters = {}
        self.edge_filters = {}
        self.node_properties = set()
        self.edge_properties = set()
        self.prefix_manager = PrefixManager()
        self.infores_context: Optional[InfoResContext] = None

    def set_prefix_map(self, m: Dict) -> None:
        """
        Update default prefix map.

        Parameters
        ----------
        m: Dict
            A dictionary with prefix to IRI mappings

        """
        self.prefix_manager.update_prefix_map(m)

    def check_node_filter(self, node: Dict) -> bool:
        """
        Check if a node passes defined node filters.

        Parameters
        ----------
        node: Dict
            A node

        Returns
        -------
        bool
            Whether the given node has passed all defined node filters

        """
        pass_filter = False
        if self.node_filters:
            for k, v in self.node_filters.items():
                if k in node:
                    # filter key exists in node
                    if isinstance(v, (list, set, tuple)):
                        if any(x in node[k] for x in v):
                            pass_filter = True
                        else:
                            return False
                    elif isinstance(v, str):
                        if node[k] == v:
                            pass_filter = True
                        else:
                            return False
                    else:
                        self.owner.log_error(
                            entity=node["id"],
                            error_type=ErrorType.INVALID_NODE_PROPERTY,
                            message=f"Unexpected '{k}' node filter of type '{type(v)}'"
                        )
                        return False
                else:
                    # filter key does not exist in node
                    return False
        else:
            # no node filters defined
            pass_filter = True
        return pass_filter

    def check_edge_filter(self, edge: Dict) -> bool:
        """
        Check if an edge passes defined edge filters.

        Parameters
        ----------
        edge: Dict
            An edge

        Returns
        -------
        bool
            Whether the given edge has passed all defined edge filters

        """
        pass_filter = False
        if self.edge_filters:
            for k, v in self.edge_filters.items():
                if k in {"subject_category", "object_category"}:
                    pass_filter = True
                    continue
                if k in edge:
                    # filter key exists in edge
                    if isinstance(v, (list, set, tuple)):
                        if any(x in edge[k] for x in v):
                            pass_filter = True
                        else:
                            return False
                    elif isinstance(v, str):
                        if edge[k] == v:
                            pass_filter = True
                        else:
                            return False
                    else:
                        subobj = f"{edge['subject']}->{edge['object']}"
                        self.owner.log_error(
                            entity=subobj,
                            error_type=ErrorType.INVALID_EDGE_PROPERTY,
                            message=f"Unexpected '{k}' edge filter of type '{type(v)}'"
                        )
                        return False
                else:
                    # filter does not exist in edge
                    return False
        else:
            # no edge filters defined
            pass_filter = True
        return pass_filter

    def set_node_filter(self, key: str, value: Union[str, set]) -> None:
        """
        Set a node filter, as defined by a key and value pair.
        These filters are used to filter (or reduce) the
        search space when fetching nodes from the underlying store.

        .. note::
            When defining the 'category' filter, the value should be of type ``set``.
            This method also sets the 'subject_category' and 'object_category'
            edge filters, to get a consistent set of nodes in the subgraph.

        Parameters
        ----------
        key: str
            The key for node filter
        value: Union[str, set]
            The value for the node filter.
            Can be either a string or a set.

        """
        if key == "category":
            if isinstance(value, set):
                if "subject_category" in self.edge_filters:
                    self.edge_filters["subject_category"].update(value)
                else:
                    self.edge_filters["subject_category"] = value
                if "object_category" in self.edge_filters:
                    self.edge_filters["object_category"].update(value)
                else:
                    self.edge_filters["object_category"] = value
            else:
                raise TypeError(
                    "'category' node filter should have a value of type 'set'"
                )

        if key in self.node_filters:
            self.node_filters[key].update(value)
        else:
            self.node_filters[key] = value

    def set_node_filters(self, filters: Dict) -> None:
        """
        Set node filters.

        Parameters
        ----------
        filters: Dict
            Node filters

        """
        if filters:
            for k, v in filters.items():
                if isinstance(v, (list, set, tuple)):
                    self.set_node_filter(k, set(v))
                else:
                    self.set_node_filter(k, v)

    def set_edge_filters(self, filters: Dict) -> None:
        """
        Set edge filters.

        Parameters
        ----------
        filters: Dict
            Edge filters

        """
        if filters:
            for k, v in filters.items():
                if isinstance(v, (list, set, tuple)):
                    self.set_edge_filter(k, set(v))
                else:
                    self.set_edge_filter(k, v)

    def set_edge_filter(self, key: str, value: set) -> None:
        """
        Set an edge filter, as defined by a key and value pair.
        These filters are used to filter (or reduce) the
        search space when fetching nodes from the underlying store.

        .. note::
            When defining the 'subject_category' or 'object_category' filter,
            the value should be of type ``set``.
            This method also sets the 'category' node filter, to get a
            consistent set of nodes in the subgraph.

        Parameters
        ----------
        key: str
            The key for edge filter
        value: Union[str, set]
            The value for the edge filter.
            Can be either a string or a set.

        """
        if key in {"subject_category", "object_category"}:
            if isinstance(value, set):
                if "category" in self.node_filters:
                    self.node_filters["category"].update(value)
                else:
                    self.node_filters["category"] = value
            else:
                raise TypeError(
                    f"'{key}' edge filter should have a value of type 'set'"
                )

        if key in self.edge_filters:
            self.edge_filters[key].update(value)
        else:
            self.edge_filters[key] = value

    def clear_graph_metadata(self):
        """
        Clears a Source graph's internal graph_metadata. The value of such graph metadata is (now)
        generally a Callable function. This operation can be used in the code when the metadata is
        no longer needed, but may cause peculiar Python object persistent problems downstream.
        """
        self.infores_context = None

    def set_provenance_map(self, kwargs):
        """
        Set up a provenance (Knowledge Source to InfoRes) map
        """
        self.infores_context = InfoResContext()
        self.infores_context.set_provenance_map(kwargs)

    def get_infores_catalog(self) -> Dict[str, str]:
        """
        Return the InfoRes Context of the source
        """
        if not self.infores_context:
            return dict()
        return self.infores_context.get_catalog()

    def set_node_provenance(self, node_data):
        """
        Set a specific node provenance value.
        """
        self.infores_context.set_node_provenance(node_data)

    def set_edge_provenance(self, edge_data):
        """
        Set a specific edge provenance value.
        """
        self.infores_context.set_edge_provenance(edge_data)

    def validate_node(self, node: Dict) -> Optional[Dict]:
        """
        Given a node as a dictionary, check for required properties.
        This method will return the node dictionary with default
        assumptions applied, if any.

        Parameters
        ----------
        node: Dict
            A node represented as a dict

        Returns
        -------
        Dict
            A node represented as a dict, with default assumptions applied.

        """
        if "id" not in node or not node["id"]:
            self.owner.log_error(
                entity=str(node),
                error_type=ErrorType.MISSING_NODE_PROPERTY,
                message=f"Node missing 'id' property or empty 'id' value"
            )
            return None
        if "name" not in node:
            self.owner.log_error(
                entity=node["id"],
                error_type=ErrorType.MISSING_NODE_PROPERTY,
                message=f"Node missing 'name' property",
                message_level=MessageLevel.WARNING
            )
        if "category" not in node:
            self.owner.log_error(
                entity=node["id"],
                error_type=ErrorType.MISSING_CATEGORY,
                message=f"Node missing 'category' property? Using '{DEFAULT_NODE_CATEGORY}' as default.",
                message_level=MessageLevel.WARNING
            )
            node["category"] = [DEFAULT_NODE_CATEGORY]

        return node

    def validate_edge(self, edge: Dict) -> Optional[Dict]:
        """
        Given an edge as a dictionary, check for required properties.
        This method will return the edge dictionary with default
        assumptions applied, if any.

        Parameters
        ----------
        edge: Dict
            An edge represented as a dict

        Returns
        -------
        Dict
            An edge represented as a dict, with default assumptions applied.

        """
        incomplete_edge: bool = False
        if "subject" not in edge or not edge["subject"]:
            self.owner.log_error(
                entity=str(edge),
                error_type=ErrorType.MISSING_NODE,
                message=f"Edge missing 'subject'?"
            )
            incomplete_edge = True
        if "predicate" not in edge or not edge["predicate"]:
            self.owner.log_error(
                entity=str(edge),
                error_type=ErrorType.MISSING_EDGE_PREDICATE,
                message=f"Edge missing 'predicate'?"
            )
            incomplete_edge = True
        if "object" not in edge or not edge["object"]:
            self.owner.log_error(
                entity=str(edge),
                error_type=ErrorType.MISSING_NODE,
                message=f"Edge missing 'object'?"
            )
            incomplete_edge = True

        if not incomplete_edge:
            return edge
        else:
            return None