Exemplo n.º 1
0
    def validate_node_property_values(node: str, data: dict) -> list:
        """
        Check that a node identifier is a CURIE whose prefix is known.

        Parameters
        ----------
        node: str
            Node identifier
        data: dict
            Node properties (not inspected by this check)

        Returns
        -------
        list
            ``ValidationError`` entries raised for the node; empty when the
            identifier passes

        """
        error_type = ErrorType.INVALID_NODE_PROPERTY_VALUE

        # Guard clause: an identifier that is not a CURIE fails outright.
        if not PrefixManager.is_curie(node):
            return [
                ValidationError(
                    node,
                    error_type,
                    "Node property 'id' expected to be of type 'CURIE'",
                    MessageLevel.ERROR,
                )
            ]

        # An empty prefix is tolerated; only a non-empty, unknown prefix is
        # reported.
        prefix = PrefixManager.get_prefix(node)
        if not prefix or prefix in Validator.get_all_prefixes():
            return []

        return [
            ValidationError(
                node,
                error_type,
                f"Node property 'id' has a value '{node}' with a CURIE prefix '{prefix}' is not represented in Biolink Model JSON-LD context",
                MessageLevel.ERROR,
            )
        ]
Exemplo n.º 2
0
 def __init__(self):
     """
     Start with empty metadata, filters and property sets, plus a fresh
     ``PrefixManager``.
     """
     # Free-form metadata describing the graph
     self.graph_metadata: Dict = dict()

     # Filters, kept separately for nodes and edges
     self.node_filters = dict()
     self.edge_filters = dict()

     # Property names, kept separately for nodes and edges
     self.node_properties = set()
     self.edge_properties = set()

     self.prefix_manager = PrefixManager()
Exemplo n.º 3
0
    def _prepare_object(self, prop: str, prop_type: str, value: Any) -> rdflib.term.Identifier:
        """
        Build the RDF term used as the object of a triple.

        Parameters
        ----------
        prop: str
            property name (not consulted when choosing the term)
        prop_type: str
            property type
        value: Any
            property value

        Returns
        -------
        rdflib.term.Identifier
            A ``URIRef`` for CURIE/IRI values of URI-typed properties,
            otherwise a ``Literal``

        """
        if prop_type in ('uriorcurie', 'xsd:anyURI'):
            # Only strings can be turned into a URI reference.
            if not isinstance(value, str):
                return Literal(value)
            if PrefixManager.is_curie(value):
                return self.uriref(value)
            if PrefixManager.is_iri(value) and _is_valid_uri(value):
                return URIRef(value)
            # Neither a CURIE nor a valid IRI: keep it as an untyped literal.
            return Literal(value)

        # Every other property becomes a typed literal; types outside the
        # xsd namespace fall back to xsd:string.
        datatype_curie = prop_type if prop_type.startswith('xsd') else "xsd:string"
        return Literal(value, datatype=self.prefix_manager.expand(datatype_curie))
Exemplo n.º 4
0
class Sink:
    """
    Base class for objects that write records to a store
    (a file or a database).

    The write and finalize hooks are no-ops here.

    Parameters
    ----------
    owner: Transformer
        Transformer to which the Sink belongs
    """

    def __init__(self, owner):
        self.owner = owner
        self.prefix_manager = PrefixManager()
        # Property names, kept separately for nodes and edges
        self.node_properties = set()
        self.edge_properties = set()

    def set_reverse_prefix_map(self, m: Dict) -> None:
        """
        Update the default reverse prefix map of the prefix manager.

        Parameters
        ----------
        m: Dict
            A dictionary with IRI to prefix mappings

        """
        self.prefix_manager.update_reverse_prefix_map(m)

    def write_node(self, record) -> None:
        """
        Write a node record to the underlying store (no-op in this class).

        Parameters
        ----------
        record: Any
            A node record

        """

    def write_edge(self, record) -> None:
        """
        Write an edge record to the underlying store (no-op in this class).

        Parameters
        ----------
        record: Any
            An edge record

        """

    def finalize(self) -> None:
        """
        Hook for work that must happen after all incoming data
        has been written (no-op in this class).

        """
Exemplo n.º 5
0
    def validate_node_property_values(
            self,
            node: str,
            data: dict
    ):
        """
        Check that a node identifier is a CURIE whose prefix is known,
        logging an error through ``self.log_error`` otherwise.

        Parameters
        ----------
        node: str
            Node identifier
        data: dict
            Node properties (not inspected by this check)

        """
        error_type = ErrorType.INVALID_NODE_PROPERTY_VALUE

        # Guard clause: an identifier that is not a CURIE is reported at once.
        if not PrefixManager.is_curie(node):
            self.log_error(
                node,
                error_type,
                "Node property 'id' is expected to be of type 'CURIE'",
                MessageLevel.ERROR,
            )
            return

        # An empty prefix is tolerated; only a non-empty, unknown prefix is
        # reported.
        prefix = PrefixManager.get_prefix(node)
        if not prefix or prefix in self.get_all_prefixes():
            return

        message = (
            f"Node property 'id' has a value '{node}' with a CURIE prefix '{prefix}'"
            " is not represented in Biolink Model JSON-LD context"
        )
        self.log_error(node, error_type, message, MessageLevel.ERROR)
Exemplo n.º 6
0
    def validate_edge_predicate(
            self,
            subject: str,
            object: str,
            data: dict,
            toolkit: Optional[Toolkit] = None
    ):
        """
        Validate the ``predicate`` field of an edge, logging every problem
        through ``self.log_error``.

        Parameters
        ----------
        subject: str
            Subject identifier
        object: str
            Object identifier
        data: dict
            Edge properties
        toolkit: Optional[Toolkit]
            Optional externally provided toolkit (default: use Validator class defined toolkit)

        """
        toolkit = toolkit or Validator.get_toolkit()

        error_type = ErrorType.INVALID_EDGE_PREDICATE
        edge_id = f"{subject}->{object}"

        def report(message: str) -> None:
            # All findings of this check share entity, type and level.
            self.log_error(edge_id, error_type, message, MessageLevel.ERROR)

        edge_predicate = data.get("predicate")
        if edge_predicate is None:
            report("Edge does not have an 'predicate' property")
            return
        if not isinstance(edge_predicate, str):
            report("Edge property 'edge_predicate' is expected to be of type 'string'")
            return

        # Compare on the bare reference, without any CURIE prefix.
        if PrefixManager.is_curie(edge_predicate):
            edge_predicate = PrefixManager.get_reference(edge_predicate)

        # NOTE(review): the nested quantifiers in this pattern may backtrack
        # heavily on long non-matching input -- worth confirming.
        if not re.match(r"^([a-z_][^A-Z\s]+_?[a-z_][^A-Z\s]+)+$", edge_predicate):
            report(f"Edge predicate '{edge_predicate}' is not in snake_case form")
            return

        element = toolkit.get_element(snakecase_to_sentencecase(edge_predicate))
        if element is None:
            report(f"Edge predicate '{edge_predicate}' is not in Biolink Model")
        elif edge_predicate != element.name and edge_predicate in element.aliases:
            report(
                f"Edge predicate '{edge_predicate}' is actually an alias for {element.name}; "
                f"Should replace {edge_predicate} with {element.name}"
            )
Exemplo n.º 7
0
    def validate_categories(self,
                            node: str,
                            data: dict,
                            toolkit: Optional[Toolkit] = None):
        """
        Validate the ``category`` field of a node, logging every problem
        through ``self.log_error``.

        Parameters
        ----------
        node: str
            Node identifier
        data: dict
            Node properties
        toolkit: Optional[Toolkit]
            Optional externally provided toolkit (default: use Validator class defined toolkit)

        """
        toolkit = toolkit or Validator.get_toolkit()

        error_type = ErrorType.INVALID_CATEGORY

        def report(message: str) -> None:
            # All findings of this check share entity, type and level.
            self.log_error(node, error_type, message, MessageLevel.ERROR)

        categories = data.get("category")
        if categories is None:
            report("Node does not have a 'category' property")
            return
        if not isinstance(categories, list):
            report(f"Node property 'category' is expected to be of type {list}")
            return

        for category in categories:
            # Compare on the bare reference, without any CURIE prefix.
            if PrefixManager.is_curie(category):
                category = PrefixManager.get_reference(category)

            # A casing problem is reported, but the model checks below still
            # run for the same category.
            if re.match(r"^([A-Z][a-z\d]+)+$", category) is None:
                report(f"Category '{category}' is not in CamelCase form")

            sentence_form = camelcase_to_sentencecase(category)
            if toolkit.is_mixin(sentence_form):
                report(f"Category '{category}' is a mixin in the Biolink Model")
                continue
            if not toolkit.is_category(sentence_form):
                report(f"Category '{category}' is unknown in the current Biolink Model")
                continue

            element = toolkit.get_element(sentence_form.lower())
            if element and category != element.name and category in element.aliases:
                report(
                    f"Category {category} is actually an alias for {element.name}; "
                    f"Should replace '{category}' with '{element.name}'"
                )
Exemplo n.º 8
0
 def __init__(self, owner):
     """
     Remember the owner and start with empty metadata, filters and
     property sets, a fresh ``PrefixManager`` and no InfoRes context.
     """
     self.owner = owner

     # Free-form metadata describing the graph
     self.graph_metadata: Dict = dict()

     # Filters, kept separately for nodes and edges
     self.node_filters = dict()
     self.edge_filters = dict()

     # Property names, kept separately for nodes and edges
     self.node_properties = set()
     self.edge_properties = set()

     self.prefix_manager = PrefixManager()

     # Stays None until a context is assigned elsewhere
     self.infores_context: Optional[InfoResContext] = None
Exemplo n.º 9
0
    def validate_edge_predicate(subject: str, object: str, data: dict) -> list:
        """
        Validate the ``predicate`` field of an edge.

        Parameters
        ----------
        subject: str
            Subject identifier
        object: str
            Object identifier
        data: dict
            Edge properties

        Returns
        -------
        list
            ``ValidationError`` entries for the edge; at most one is
            produced, and the list is empty when the predicate passes

        """
        toolkit = get_toolkit()
        error_type = ErrorType.INVALID_EDGE_PREDICATE
        edge_id = f"{subject}-{object}"

        def failure(message: str) -> list:
            # Every finding of this check shares entity, type and level.
            return [ValidationError(edge_id, error_type, message, MessageLevel.ERROR)]

        edge_predicate = data.get('predicate')
        if edge_predicate is None:
            return failure("Edge does not have an 'predicate' property")
        if not isinstance(edge_predicate, str):
            return failure("Edge property 'edge_predicate' expected to be of type 'string'")

        # Compare on the bare reference, without any CURIE prefix.
        if PrefixManager.is_curie(edge_predicate):
            edge_predicate = PrefixManager.get_reference(edge_predicate)

        snake_case = re.match(r"^([a-z_][^A-Z\s]+_?[a-z_][^A-Z\s]+)+$", edge_predicate)
        if not snake_case:
            return failure(f"Edge label '{edge_predicate}' is not in snake_case form")

        element = toolkit.get_element(snakecase_to_sentencecase(edge_predicate))
        if element is None:
            return failure(f"Edge label '{edge_predicate}' not in Biolink Model")
        if edge_predicate != element.name and edge_predicate in element.aliases:
            return failure(
                f"Edge label '{edge_predicate}' is actually an alias for {element.name}; Should replace {edge_predicate} with {element.name}"
            )
        return []
Exemplo n.º 10
0
    def validate_categories(node: str, data: dict) -> list:
        """
        Validate the ``category`` field of a node.

        Parameters
        ----------
        node: str
            Node identifier
        data: dict
            Node properties

        Returns
        -------
        list
            ``ValidationError`` entries for the node; a single category may
            contribute more than one entry, and the list is empty when every
            category passes

        """
        toolkit = get_toolkit()
        error_type = ErrorType.INVALID_CATEGORY
        errors = []

        def add_error(message: str) -> None:
            # Every finding of this check shares entity, type and level.
            errors.append(
                ValidationError(node, error_type, message, MessageLevel.ERROR))

        categories = data.get('category')
        if categories is None:
            add_error("Node does not have a 'category' property")
            return errors
        if not isinstance(categories, list):
            add_error(f"Node property 'category' expected to be of type {list}")
            return errors

        for category in categories:
            # Compare on the bare reference, without any CURIE prefix.
            if PrefixManager.is_curie(category):
                category = PrefixManager.get_reference(category)

            # A casing problem is reported, but the model check below still
            # runs for the same category.
            if not re.match(r"^([A-Z][a-z\d]+)+$", category):
                add_error(f"Category '{category}' is not in CamelCase form")

            formatted_category = camelcase_to_sentencecase(category)
            if not toolkit.is_category(formatted_category):
                add_error(f"Category '{category}' not in Biolink Model")
                continue

            element = toolkit.get_element(formatted_category.lower())
            # The lookup may yield nothing; skip the alias check in that
            # case instead of failing on attribute access.
            if element is None:
                continue
            if category != element.name and category in element.aliases:
                add_error(
                    f"Category {category} is actually an alias for {element.name}; Should replace '{category}' with '{element.name}'"
                )
        return errors
Exemplo n.º 11
0
    def __init__(self, source_graph: nx.MultiDiGraph = None):
        """
        Bind the instance to a graph and set up the namespaces it uses.

        Parameters
        ----------
        source_graph: nx.MultiDiGraph
            Graph to operate on. When omitted (``None``) a new, empty
            ``nx.MultiDiGraph`` is created.

        """
        # Test against None rather than truthiness: an empty networkx graph
        # is falsy, so a truthiness test would silently swap a caller's
        # empty graph for a new one.
        if source_graph is not None:
            self.graph = source_graph
        else:
            self.graph = nx.MultiDiGraph()

        self.graph_metadata = {}
        self.prefix_manager = PrefixManager()
        self.DEFAULT = Namespace(self.prefix_manager.prefix_map[':'])
        # TODO: use OBO IRI from biolink model context once https://github.com/biolink/biolink-model/issues/211 is resolved
        self.OBO = Namespace('http://purl.obolibrary.org/obo/')
        self.OBAN = Namespace(self.prefix_manager.prefix_map['OBAN'])
        self.PMID = Namespace(self.prefix_manager.prefix_map['PMID'])
        self.BIOLINK = Namespace(self.prefix_manager.prefix_map['biolink'])
Exemplo n.º 12
0
    def __init__(
            self,
            verbose: bool = False,
            progress_monitor: Optional[Callable[[GraphEntityType, List], None]] = None,
            schema: Optional[str] = None,
            error_log: str = None
    ):
        """
        Initialise error logging, store the caller's options and load the
        toolkit, JSON-LD context, prefixes and required properties.

        Parameters
        ----------
        verbose: bool
            Stored as ``self.verbose``
        progress_monitor: Optional[Callable[[GraphEntityType, List], None]]
            Stored as ``self.progress_monitor``
        schema: Optional[str]
            Stored as ``self.schema``
        error_log: str
            Passed on to ``ErrorDetecting.__init__``

        """
        ErrorDetecting.__init__(self, error_log)

        # Caller-supplied options
        self.verbose: bool = verbose
        self.progress_monitor: Optional[
            Callable[[GraphEntityType, List], None]
        ] = progress_monitor
        # TODO: fix... this attribute is not used anywhere at the moment?
        self.schema: Optional[str] = schema

        # Internal state: the toolkit currently associated with this
        # instance, then the prefix manager.
        self.validating_toolkit = self.get_toolkit()
        self.prefix_manager = PrefixManager()

        # The JSON-LD context is fetched once and feeds the prefix lookup.
        jsonld_context = get_jsonld_context()
        self.jsonld = jsonld_context
        self.prefixes = self.get_all_prefixes(jsonld_context)

        self.required_node_properties = self.get_required_node_properties()
        self.required_edge_properties = self.get_required_edge_properties()
Exemplo n.º 13
0
    def uriref(self, identifier: str) -> URIRef:
        """
        Turn a string identifier into a rdflib.URIRef.

        Parameters
        ----------
        identifier: str
            Identifier as string.

        Returns
        -------
        rdflib.URIRef
            URIRef form of the input ``identifier``

        """
        # UUID URNs are used verbatim.
        if identifier.startswith('urn:uuid:'):
            return URIRef(identifier)

        # Known properties come straight from the reverse property mapping.
        if identifier in reverse_property_mapping:
            return URIRef(reverse_property_mapping[identifier])

        # Anything else is treated as an entity.
        if identifier.startswith(':'):
            # TODO: this should be handled upstream by prefixcommons-py
            expanded = self.DEFAULT.term(identifier.replace(':', '', 1))
        else:
            expanded = self.prefix_manager.expand(identifier)

        if identifier == expanded:
            # Expansion left the identifier unchanged: sanitise it and place
            # it in the default namespace.
            local_name = identifier
            if PrefixManager.is_curie(local_name):
                local_name = local_name.replace(':', '_')
            local_name = local_name.replace(' ', '_')
            expanded = self.DEFAULT.term(local_name)

        return URIRef(expanded)
Exemplo n.º 14
0
def curie_lookup(curie: str) -> Optional[str]:
    """
    Find the label for a CURIE.

    For a fixed set of prefixes the label is the snake_case form of the
    CURIE reference. Otherwise the lookup service's ``curie_map`` is
    consulted, then its ``ontology_graph``.

    Parameters
    ----------
    curie: str
        A CURIE

    Returns
    -------
    Optional[str]
        The label corresponding to the given CURIE, or None when no source
        knows it

    """
    service = get_curie_lookup_service()
    prefix = PrefixManager.get_prefix(curie)

    if prefix in ('OIO', 'OWL', 'owl', 'OBO', 'rdfs'):
        reference = curie.split(':', 1)[1]
        return stringcase.snakecase(reference)
    if curie in service.curie_map:
        return service.curie_map[curie]
    if curie in service.ontology_graph:
        return service.ontology_graph.nodes()[curie]['name']
    return None
Exemplo n.º 15
0
    def get_category(self, curie: str, node: dict) -> Optional[str]:
        """
        Work out the category for a node.

        The category is first derived from the node's
        ``meta.basicPropertyValues`` entries whose ``pred`` equals
        ``self.HAS_OBO_NAMESPACE``. If that gives nothing, or only
        ``biolink:OntologyClass``, a fixed CURIE-prefix table is consulted.

        Parameters
        ----------
        curie: str
            Curie for node
        node: dict
            Node data

        Returns
        -------
        Optional[str]
            Category for the given node CURIE.

        """
        category = None

        # use meta.basicPropertyValues
        if "meta" in node and "basicPropertyValues" in node["meta"]:
            for entry in node["meta"]["basicPropertyValues"]:
                if entry["pred"] != self.HAS_OBO_NAMESPACE:
                    continue
                category = entry["val"]
                element = self.toolkit.get_element(category)
                if element:
                    category = f"biolink:{stringcase.pascalcase(stringcase.snakecase(element.name))}"
                    continue
                mapped = self.toolkit.get_element_by_mapping(category)
                if mapped:
                    category = f"biolink:{stringcase.pascalcase(stringcase.snakecase(mapped))}"
                else:
                    category = "biolink:OntologyClass"

        if category and category != "biolink:OntologyClass":
            return category

        # TODO: the mapping should be via biolink-model lookups
        category_by_prefix = {
            "HP": "biolink:PhenotypicFeature",
            "CHEBI": "biolink:ChemicalSubstance",
            "MONDO": "biolink:Disease",
            "UBERON": "biolink:AnatomicalEntity",
            "SO": "biolink:SequenceFeature",
            "CL": "biolink:Cell",
            "PR": "biolink:Protein",
            "NCBITaxon": "biolink:OrganismalEntity",
        }
        prefix = PrefixManager.get_prefix(curie)
        fallback = category_by_prefix.get(prefix)
        if fallback is not None:
            return fallback

        # NOTE(review): the warning text mentions defaulting to
        # 'biolink:OntologyClass', but this branch only logs; the category
        # is returned unchanged and may still be None.
        self.owner.log_error(
            entity=f"{str(category)} for node {curie}",
            error_type=ErrorType.MISSING_CATEGORY,
            message="Missing category; Defaulting to 'biolink:OntologyClass'",
            message_level=MessageLevel.WARNING)
        return category
Exemplo n.º 16
0
    def _add_attribute(self, attr_dict: Dict, key: str, value: str) -> None:
        """
        Adds an attribute to the attribute dictionary, respecting whether or not
        that attribute should be multi-valued.
        Multi-valued attributes will not contain duplicates.

        Some attributes are singular form of others. In such cases overflowing values
        will be placed into the correlating multi-valued attribute.
        For example, `name` attribute will hold only one value while any additional
        value will be stored as `synonym` attribute.

        Any other single-valued attribute is simply overwritten by the latest value.

        Parameters
        ----------
        attr_dict: dict
            Dictionary representing the attribute set of a node or an edge in a networkx graph
        key: str
            The name of the attribute
        value: str
            The value of the attribute

        """
        if PrefixManager.is_iri(value):
            value = process_iri(value)

        if key in is_property_multivalued and is_property_multivalued[key]:
            # Multi-valued: collect into a list, skipping duplicates.
            values = attr_dict.setdefault(key, [])
            if value not in values:
                values.append(value)
        elif key == 'name' and key in attr_dict:
            # A name is already stored: keep it and overflow any different
            # value into `synonym`. A repeat of the stored name is not an
            # additional value, so it is dropped.
            if attr_dict[key] != value:
                self._add_attribute(attr_dict, 'synonym', value)
        else:
            # First `name`, or any other single-valued attribute.
            attr_dict[key] = value
0
    def get_category(self, curie: str, node: dict) -> Optional[str]:
        """
        Work out the category for a node.

        The category is first derived from the node's
        ``meta.basicPropertyValues`` entries whose ``pred`` equals
        ``self.HAS_OBO_NAMESPACE``. If that gives nothing, or only
        ``biolink:OntologyClass``, a fixed CURIE-prefix table is consulted.

        Parameters
        ----------
        curie: str
            Curie for node
        node: dict
            Node data

        Returns
        -------
        Optional[str]
            Category for the given node CURIE.

        """
        category = None

        # use meta.basicPropertyValues
        if 'meta' in node and 'basicPropertyValues' in node['meta']:
            for entry in node['meta']['basicPropertyValues']:
                if entry['pred'] != self.HAS_OBO_NAMESPACE:
                    continue
                category = entry['val']
                # Direct element lookup first; the mapping lookup runs only
                # when the direct one comes back empty.
                element = (
                    self.toolkit.get_element(category)
                    or self.toolkit.get_element_by_mapping(category)
                )
                if element:
                    category = f"biolink:{stringcase.pascalcase(stringcase.snakecase(element.name))}"
                else:
                    category = 'biolink:OntologyClass'

        if category and category != 'biolink:OntologyClass':
            return category

        # TODO: the mapping should be via biolink-model lookups
        category_by_prefix = {
            'HP': "biolink:PhenotypicFeature",
            'CHEBI': "biolink:ChemicalSubstance",
            'MONDO': "biolink:Disease",
            'UBERON': "biolink:AnatomicalEntity",
            'SO': "biolink:SequenceFeature",
            'CL': "biolink:Cell",
            'PR': "biolink:Protein",
            'NCBITaxon': "biolink:OrganismalEntity",
        }
        prefix = PrefixManager.get_prefix(curie)
        fallback = category_by_prefix.get(prefix)
        if fallback is not None:
            return fallback

        # No table entry: only a debug message is emitted and the category
        # is returned unchanged.
        log.debug(
            f"{curie} Could not find a category mapping for '{category}'; Defaulting to 'biolink:OntologyClass'"
        )
        return category
Exemplo n.º 18
0
def test_prefix_manager():
    """
    A fresh PrefixManager has non-empty prefix maps that include the
    'biolink' and empty-string prefixes.
    """
    manager = PrefixManager()
    assert manager.prefix_map
    assert manager.reverse_prefix_map
    for expected_prefix in ('biolink', ''):
        assert expected_prefix in manager.prefix_map
Exemplo n.º 19
0
    def add_edge(self, subject_iri: URIRef, object_iri: URIRef, predicate_iri: URIRef) -> Tuple[str, str, str]:
        """
        This method should be used by all derived classes when adding an edge to the networkx.MultiDiGraph.
        This ensures that the `subject` and `object` identifiers are CURIEs, and that `edge_label` is in the correct form.

        Returns the CURIE identifiers used for the `subject` and `object` in the
        networkx.MultiDiGraph, and the processed `edge_label`.

        An edge whose key is already present between the two nodes is left untouched.

        Parameters
        ----------
        subject_iri: rdflib.URIRef
            Subject IRI for the subject in a triple
        object_iri: rdflib.URIRef
            Object IRI for the object in a triple
        predicate_iri: rdflib.URIRef
            Predicate IRI for the predicate in a triple

        Returns
        -------
        Tuple[str, str, str]
            A 3-nary tuple (of the form subject, object, predicate) that represents the edge

        """
        s = self.add_node(subject_iri)
        o = self.add_node(object_iri)
        relation = self.prefix_manager.contract(predicate_iri)
        edge_label = process_iri(predicate_iri)
        if ' ' in edge_label:
            logging.debug("predicate IRI '{}' yields edge_label '{}' that not in snake_case form; replacing ' ' with '_'".format(predicate_iri, edge_label))
            # Perform the replacement announced by the log message.
            edge_label = edge_label.replace(' ', '_')
        if edge_label.startswith(self.BIOLINK):
            logging.debug("predicate IRI '{}' yields edge_label '{}' that starts with '{}'; removing IRI prefix".format(predicate_iri, edge_label, self.BIOLINK))
            edge_label = edge_label.replace(self.BIOLINK, '')

        if PrefixManager.is_curie(edge_label):
            # The label is still a CURIE: use its looked-up name, or fall
            # back to the default edge label when the lookup finds nothing.
            name = curie_lookup(edge_label)
            if name:
                logging.debug("predicate IRI '{}' yields edge_label '{}' that is actually a CURIE; Using its mapping instead: {}".format(predicate_iri, edge_label, name))
                edge_label = name
            else:
                logging.debug("predicate IRI '{}' yields edge_label '{}' that is actually a CURIE; defaulting back to {}".format(predicate_iri, edge_label, self.DEFAULT_EDGE_LABEL))
                edge_label = self.DEFAULT_EDGE_LABEL

        kwargs = {
            'subject': s,
            'predicate': str(predicate_iri),
            'object': o,
            'relation': relation,
            'edge_label': f"biolink:{edge_label}"
        }
        if 'provided_by' in self.graph_metadata:
            kwargs['provided_by'] = self.graph_metadata['provided_by']

        # The edge key is built from the un-prefixed label.
        key = generate_edge_key(s, edge_label, o)
        if not self.graph.has_edge(s, o, key=key):
            self.graph.add_edge(s, o, key=key, **kwargs)
        # TODO: support append
        return s, o, edge_label
Exemplo n.º 20
0
 def __init__(self, verbose: bool = False):
     """
     Load the toolkit, JSON-LD context, prefixes and required properties,
     and store the ``verbose`` flag.
     """
     self.toolkit = get_toolkit()
     self.prefix_manager = PrefixManager()

     # The JSON-LD context is fetched once and feeds the prefix lookup.
     jsonld_context = get_jsonld_context()
     self.jsonld = jsonld_context
     self.prefixes = Validator.get_all_prefixes(jsonld_context)

     self.required_node_properties = Validator.get_required_node_properties()
     self.required_edge_properties = Validator.get_required_edge_properties()
     self.verbose = verbose
Exemplo n.º 21
0
def test_process_predicate(query):
    """
    process_predicate must return, position by position, the four
    expected values stored in ``query[1:5]`` for the input ``query[0]``.
    """
    result = process_predicate(PrefixManager(), query[0])
    for position in range(4):
        assert result[position] == query[position + 1]
Exemplo n.º 22
0
 def _compile_prefix_stats(self, n: str):
     """
     Record the CURIE prefix of node id ``n`` in
     ``self.category_stats["id_prefixes"]``, or log a warning through
     ``self.mkg`` when the id has no prefix.
     """
     prefix = PrefixManager.get_prefix(n)
     if prefix:
         known_prefixes = self.category_stats["id_prefixes"]
         if prefix not in known_prefixes:
             known_prefixes.add(prefix)
         return

     self.mkg.log_error(
         entity=n,
         error_type=ErrorType.MISSING_NODE_CURIE_PREFIX,
         message="Node 'id' has no CURIE prefix",
         message_level=MessageLevel.WARNING,
     )
Exemplo n.º 23
0
    def validate_edge_property_values(
            self,
            subject: str,
            object: str,
            data: dict
    ):
        """
        Validate an edge property's value.

        Both endpoints get the same treatment: each must be a CURIE, and a
        non-empty CURIE prefix must be among the prefixes returned by
        ``self.get_all_prefixes()``. Problems are logged through
        ``self.log_error``.

        Parameters
        ----------
        subject: str
            Subject identifier
        object: str
            Object identifier
        data: dict
            Edge properties (not inspected by this check)

        """
        error_type = ErrorType.INVALID_EDGE_PROPERTY_VALUE
        prefixes = self.get_all_prefixes()
        edge_id = f"{subject}->{object}"

        for slot, identifier in (("subject", subject), ("object", object)):
            if not PrefixManager.is_curie(identifier):
                message = f"Edge property '{slot}' has a value '{identifier}' which is not a proper CURIE"
                self.log_error(edge_id, error_type, message, MessageLevel.ERROR)
                continue

            prefix = PrefixManager.get_prefix(identifier)
            # Same rule for both endpoints: an empty prefix is not reported
            # as an unknown one.
            if prefix and prefix not in prefixes:
                message = f"Edge property '{slot}' has a value '{identifier}' with a CURIE prefix " + \
                          f"'{prefix}' that is not represented in Biolink Model JSON-LD context"
                self.log_error(edge_id, error_type, message, MessageLevel.ERROR)
Exemplo n.º 24
0
    def __init__(self, verbose: bool = False):
        """
        Load the toolkit and download the JSON-LD context.

        Parameters
        ----------
        verbose: bool
            Stored as ``self.verbose``

        Raises
        ------
        Exception
            When the JSON-LD context cannot be downloaded or decoded; the
            underlying error is attached as the cause

        """
        self.toolkit = get_toolkit()
        self.prefix_manager = PrefixManager()
        # Left unset here
        self.prefixes = None
        self.required_node_properties = None
        self.required_edge_properties = None
        self.verbose = verbose

        try:
            self.jsonld = requests.get(CONTEXT_JSONLD).json()
        except Exception as err:
            # `except Exception` lets KeyboardInterrupt/SystemExit propagate,
            # and chaining keeps the original failure in the traceback.
            raise Exception('Unable to download JSON-LD context from {}'.format(CONTEXT_JSONLD)) from err
Exemplo n.º 25
0
def get_category_via_superclass(graph: BaseGraph,
                                curie: str,
                                load_ontology: bool = True) -> Set[str]:
    """
    Derive categories for a CURIE by walking its ``subclass_of`` ancestors.

    Ancestors are visited in the order returned by ``get_ancestors``. At the
    first ancestor for which the toolkit reports a mapping, the ``name`` of
    every ancestor visited so far (where the graph node has one) and the
    toolkit ancestors of that mapping are collected, and the walk stops.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        Graph whose ``subclass_of`` hierarchy is traversed
    curie: str
        The CURIE to categorise
    load_ontology: bool
        When the graph yields no ancestors and this is true, the lookup is
        repeated once against the ontology graph of the CURIE lookup service

    Returns
    -------
    Set[str]
        The collected categories; empty if ``curie`` is not a CURIE or
        nothing was found

    """
    log.debug("curie: {}".format(curie))
    categories = []
    toolkit = get_toolkit()
    if not PrefixManager.is_curie(curie):
        return set(categories)

    ancestors = get_ancestors(graph, curie, relations=['subclass_of'])
    if len(ancestors) == 0 and load_ontology:
        # Nothing in the supplied graph: retry against the ontology graph,
        # with load_ontology disabled so the recursion happens only once.
        lookup_service = get_curie_lookup_service()
        fallback_graph = lookup_service.ontology_graph
        categories.extend(
            get_category_via_superclass(fallback_graph, curie, False))
    log.debug("Ancestors for CURIE {} via subClassOf: {}".format(
        curie, ancestors))

    visited = []
    for ancestor in ancestors:
        mapping = toolkit.get_by_mapping(ancestor)
        visited.append(ancestor)
        if not mapping:
            continue
        # This ancestor maps directly onto the Biolink Model.
        log.debug("Ancestor {} mapped to {}".format(ancestor, mapping))
        for node_id in visited:
            if 'name' in graph.nodes()[node_id]:
                categories.append(graph.nodes()[node_id]['name'])
        categories.extend(toolkit.ancestors(mapping))
        break
    return set(categories)
Exemplo n.º 26
0
 def analyse_node_category(self, n, data):
     """
     Update ``self.category_stats`` for one node.

     Increments the overall ``count``, records the prefix of ``n`` in
     ``id_prefixes``, and bumps ``count_by_source`` once per entry of
     ``data['provided_by']``, or the ``'unknown'`` entry when the node
     has no ``provided_by`` key.
     """
     node_prefix = PrefixManager.get_prefix(n)
     stats = self.category_stats
     stats['count'] += 1

     seen_prefixes = stats['id_prefixes']
     if node_prefix not in seen_prefixes:
         seen_prefixes.add(node_prefix)

     if 'provided_by' not in data:
         stats['count_by_source']['unknown'] += 1
         return

     for source in data['provided_by']:
         per_source = stats['count_by_source']
         # First sighting of a source starts its tally at 1.
         per_source[source] = per_source.get(source, 0) + 1
Exemplo n.º 27
0
 def _capture_prefix(self, n: str):
     """
     Tally the prefix of node identifier ``n``.

     When ``PrefixManager.get_prefix`` yields a prefix, its counter in
     ``self.category_stats["count_by_id_prefix"]`` is incremented (starting
     at 1). Otherwise a MISSING_NODE_CURIE_PREFIX warning is logged on
     ``self.summary``.
     """
     prefix = PrefixManager.get_prefix(n)
     if prefix:
         prefix_counts = self.category_stats["count_by_id_prefix"]
         prefix_counts[prefix] = prefix_counts.get(prefix, 0) + 1
         return

     missing_prefix = ErrorType.MISSING_NODE_CURIE_PREFIX
     self.summary.log_error(
         entity=n,
         error_type=missing_prefix,
         message="Node 'id' has no CURIE prefix",
         message_level=MessageLevel.WARNING,
     )
Exemplo n.º 28
0
def get_biolink_element(prefix_manager: PrefixManager,
                        predicate: Any) -> Optional[Element]:
    """
    Look up the Biolink Model element for a predicate.

    The predicate is first reduced to a reference: an IRI is contracted
    via the prefix manager, and a CURIE is stripped down to its reference
    part. If the toolkit has no element for that reference, a lookup by
    mapping is attempted using the predicate exactly as it was passed in.

    Parameters
    ----------
    prefix_manager: PrefixManager
        Prefix manager used to recognise and contract IRIs/CURIEs
    predicate: Any
        The predicate, as an IRI, a CURIE or a plain name

    Returns
    -------
    Optional[Element]
        The element found, or the falsy result of the direct lookup when
        neither lookup succeeds. A ``ValueError`` raised during the mapping
        lookup is logged rather than propagated.

    """
    toolkit = get_toolkit()
    predicate_curie = (prefix_manager.contract(predicate)
                       if prefix_manager.is_iri(predicate) else predicate)
    reference = (prefix_manager.get_reference(predicate_curie)
                 if prefix_manager.is_curie(predicate_curie)
                 else predicate_curie)

    element = toolkit.get_element(reference)
    if element:
        return element

    # No direct hit: fall back to a mapping-based lookup on the raw predicate.
    try:
        mapped_name = toolkit.get_element_by_mapping(predicate)
        if mapped_name:
            element = toolkit.get_element(mapped_name)
    except ValueError as e:
        log.error(e)
    return element
Exemplo n.º 29
0
    def add_edge_attribute(self, subject_iri: Union[URIRef, str], object_iri: URIRef, predicate_iri: URIRef, key: str, value: str) -> None:
        """
        Attach an attribute to the edge identified by subject, predicate and object.

        ``key`` is normalised first. If its lower-cased form is listed in
        ``is_property_multivalued``, that lower-cased form is used as the
        property name. Otherwise the key is treated as a URI (wrapped in
        ``rdflib.URIRef`` if needed) and translated through ``property_mapping``.
        When no property name results, or ``value`` is None, nothing is done.

        The edge's attribute dictionary is then fetched from ``self.graph``
        and handed to ``self._add_attribute`` together with the key and value.

        NOTE(review): the previous docstring stated that missing nodes/edges
        are created and that multi-valued properties are de-duplicated. This
        method itself only reads existing edge data, so any such behaviour
        would have to come from ``_add_attribute`` -- confirm.

        Parameters
        ----------
        subject_iri: [rdflib.URIRef, str]
            The IRI of the subject node of the edge
        object_iri: rdflib.URIRef
            The IRI of the object node of the edge
        predicate_iri: rdflib.URIRef
            The IRI of the predicate representing the edge
        key: str
            The name of the attribute. Can be a rdflib.URIRef or URI string
        value: str
            The value of the attribute

        """
        if key.lower() in is_property_multivalued:
            key = key.lower()
        else:
            key_ref = key if isinstance(key, URIRef) else URIRef(key)
            key = property_mapping.get(key_ref)

        if key is None or value is None:
            return

        subject_curie = self.prefix_manager.contract(subject_iri)
        object_curie = self.prefix_manager.contract(object_iri)
        predicate_label = process_iri(predicate_iri)
        if PrefixManager.is_curie(predicate_label):
            predicate_label = curie_lookup(predicate_label)
        edge_key = generate_edge_key(subject_curie, predicate_label, object_curie)
        edge_attributes = self.graph.get_edge_data(subject_curie, object_curie, key=edge_key)
        self._add_attribute(edge_attributes, key, value)
Exemplo n.º 30
0
    def __init__(self,
                 verbose: bool = False,
                 progress_monitor: Optional[Callable[[GraphEntityType, List],
                                                     None]] = None):
        """
        Set up the validator's configuration and working state.

        Parameters
        ----------
        verbose: bool
            Stored on the instance as ``self.verbose``
        progress_monitor: Optional[Callable[[GraphEntityType, List], None]]
            Stored on the instance as ``self.progress_monitor``.
            NOTE(review): presumably a callback invoked while records are
            processed; its use is not visible in this constructor -- confirm.

        """
        # formal arguments
        self.verbose: bool = verbose
        self.progress_monitor: Optional[Callable[[GraphEntityType, List],
                                                 None]] = progress_monitor

        # internal attributes
        self.toolkit = get_toolkit()
        self.prefix_manager = PrefixManager()
        self.jsonld = get_jsonld_context()
        # Prefixes are derived from the JSON-LD context loaded just above.
        self.prefixes = Validator.get_all_prefixes(self.jsonld)
        self.required_node_properties = Validator.get_required_node_properties(
        )
        self.required_edge_properties = Validator.get_required_edge_properties(
        )
        # Starts empty; typed as a list of ValidationError.
        self.errors: List[ValidationError] = list()