Exemplo n.º 1
0
def test_check_all_categories2():
    """
    Exercise ``check_all_categories`` with three different category lists.

    Each call returns three lists; the test only checks their lengths.
    """
    # Pin the Biolink release to a specific one so that the
    # category ancestry used below is known in advance.
    get_toolkit(biolink_release="2.2.11")

    # Case 1: every ancestor reported for biolink:Gene.
    gene_ancestors = get_biolink_ancestors("biolink:Gene")
    valid, invalid_biolink, invalid = check_all_categories(gene_ancestors)
    # Mixins are not valid biolink categories, but they are ancestors.
    assert (len(valid), len(invalid_biolink), len(invalid)) == (4, 0, 8)

    # Case 2: a short, hand-written list ending in biolink:Gene.
    valid, invalid_biolink, invalid = check_all_categories(
        ["biolink:NamedThing", "biolink:GeneOrGeneProduct", "biolink:Gene"]
    )
    assert (len(valid), len(invalid_biolink), len(invalid)) == (2, 0, 1)

    # Case 3: the same list with the last entry replaced by "Node".
    valid, invalid_biolink, invalid = check_all_categories(
        ["biolink:NamedThing", "biolink:GeneOrGeneProduct", "Node"]
    )
    assert (len(valid), len(invalid_biolink), len(invalid)) == (1, 0, 2)
Exemplo n.º 2
0
    def clean_categories(self, threashold=100):
        """
        Replace node categories and edge labels that are not in the Biolink Model.

        Values that fail the toolkit check are moved to ``alt_category``
        (nodes) or ``alt_edge_label`` (edges) so the information is not lost.

        * Node with a list of categories: the valid entries are kept; the
          invalid ones are stored in ``alt_category``. If nothing valid
          remains, ``category`` becomes ``['named thing']``.
        * Node with a single category: an invalid value is replaced by
          ``'named thing'`` and stored in ``alt_category``.
        * Edge: an invalid ``edge_label`` is replaced by ``'related_to'`` and
          stored in ``alt_edge_label``.

        Parameters
        ----------
        threashold: int
            Not used by this method; kept (with its original spelling) so
            existing callers keep working.
        """
        toolkit = get_toolkit()

        with click.progressbar(self.graph.nodes(data='category'),
                               label='cleaning up category for nodes') as bar:
            for n, category in bar:
                node_data = self.graph.nodes[n]
                if isinstance(category, list):
                    # category is a list: split it into valid / invalid parts
                    valid, invalid = [], []
                    for c in category:
                        (valid if toolkit.is_category(c) else invalid).append(c)
                    node_data['category'] = valid or ['named thing']
                    if invalid:
                        node_data['alt_category'] = invalid
                else:
                    # category is string
                    # TODO: This behavior needs to be consolidated, post merge
                    if not toolkit.is_category(category):
                        node_data['category'] = 'named thing'
                        node_data['alt_category'] = category

        # Iterate with data=True so that the yielded attribute dict can be
        # updated directly on the edge itself, whatever the graph class is.
        with click.progressbar(
                self.graph.edges(data=True),
                label='cleaning up edge_label for edges') as bar:
            for _s, _o, edge_data in bar:
                edgelabel = edge_data.get('edge_label')
                if not toolkit.is_edgelabel(edgelabel):
                    edge_data['edge_label'] = 'related_to'
                    edge_data['alt_edge_label'] = edgelabel
Exemplo n.º 3
0
 def __init__(
     self,
     owner,
     filename: str,
     format: str = "nt",
     compression: Optional[str] = None,
     reify_all_edges: bool = False,
     **kwargs: Any,
 ):
     """
     Set up namespaces and lookup tables, then open the output file.

     Parameters
     ----------
     owner:
         Passed through to the parent class constructor
     filename: str
         Path of the file to write; it is opened in binary write mode
     format: str
         Serialization format; only ``'nt'`` is accepted
     compression: Optional[str]
         When equal to ``'gz'`` the file is opened with ``gzip.open``;
         any other value results in a plain ``open``
     reify_all_edges: bool
         Stored on the instance as ``reify_all_edges``
     kwargs: Any
         Accepted but not used by this constructor

     Raises
     ------
     ValueError
         If ``format`` is anything other than ``'nt'``
     """
     super().__init__(owner)
     if format not in {"nt"}:
         raise ValueError(
             "Only RDF N-Triples ('nt') serialization supported.")
     self.DEFAULT = Namespace(self.prefix_manager.prefix_map[""])
     # self.OBO = Namespace('http://purl.obolibrary.org/obo/')
     self.OBAN = Namespace(self.prefix_manager.prefix_map["OBAN"])
     self.PMID = Namespace(self.prefix_manager.prefix_map["PMID"])
     self.BIOLINK = Namespace(self.prefix_manager.prefix_map["biolink"])
     self.toolkit = get_toolkit()
     self.reverse_predicate_mapping = {}
     self.property_types = get_biolink_property_types()
     self.cache = {}
     self.reify_all_edges = reify_all_edges
     self.reification_types = {
         RDF.Statement,
         self.BIOLINK.Association,
         self.OBAN.association,
     }
     # The handle stays open after construction and is kept on self.FH.
     opener = gzip.open if compression == "gz" else open
     self.FH = opener(filename, "wb")
     self.encoding = "ascii"
Exemplo n.º 4
0
 def get_toolkit(cls) -> Toolkit:
     """
     Return the toolkit currently held on the class.

     If ``cls._currently_active_toolkit`` is falsy, it is first filled in
     with the result of the module-level ``get_toolkit()``.
     """
     active = cls._currently_active_toolkit
     if not active:
         active = get_toolkit()
         cls._currently_active_toolkit = active
     return active
Exemplo n.º 5
0
 def __init__(self, prefix_prioritization_map: dict = None):
     """
     Create an empty clique graph and optionally extend the prefix map.

     Parameters
     ----------
     prefix_prioritization_map: dict
         Optional entries that are written, key by key, into
         ``PREFIX_PRIORITIZATION_MAP``
     """
     self.toolkit = get_toolkit()
     self.clique_graph = nx.Graph()
     self.target_graph = None
     # Caller-supplied entries overwrite same-named keys in the shared map.
     for prefix, priority in (prefix_prioritization_map or {}).items():
         PREFIX_PRIORITIZATION_MAP[prefix] = priority
Exemplo n.º 6
0
    def get_biolink_element(self, predicate: Any) -> Optional[Element]:
        """
        Look up the Biolink Model element that corresponds to a predicate.

        Parameters
        ----------
        predicate: Any
            The CURIE of a predicate

        Returns
        -------
        Optional[Element]
            The corresponding Biolink Model element

        """
        toolkit = get_toolkit()
        pm = self.prefix_manager

        # Normalise: IRI -> CURIE -> reference, skipping steps that don't apply.
        curie = pm.contract(predicate) if pm.is_iri(predicate) else predicate
        reference = pm.get_reference(curie) if pm.is_curie(curie) else curie

        element = toolkit.get_element(reference)
        if element:
            return element

        # Direct lookup failed: fall back to a lookup by mapping.
        try:
            mapping = toolkit.get_element_by_mapping(predicate)
            if mapping:
                element = toolkit.get_element(mapping)
        except ValueError as e:
            log.error(e)
        return element
Exemplo n.º 7
0
 def __init__(self,
              source_graph: nx.MultiDiGraph = None,
              node_properties: Set = None,
              edge_properties: Set = None):
     """
     Initialise property sets, counters and the cache.

     Parameters
     ----------
     source_graph: nx.MultiDiGraph
         Passed through to the parent class constructor
     node_properties: Set
         Optional set of node properties; a non-empty set passed in is
         extended in place with the built-in entries
     edge_properties: Set
         Optional set of edge properties; a non-empty set passed in is
         extended in place with the built-in entries
     """
     super().__init__(source_graph)
     self.toolkit = get_toolkit()

     self.node_properties = node_properties or set()
     self.edge_properties = edge_properties or set()

     self.node_properties.update({
         'biolink:same_as',
         'OBAN:association_has_subject',
         'OBAN:association_has_predicate',
         'OBAN:association_has_object',
     })
     self.edge_properties.update({
         'biolink:has_modifier',
         'biolink:has_gene_product',
         'biolink:has_db_xref',
         'biolink:in_taxon',
         'biolink:subclass_of',
         'biolink:same_as',
         'biolink:part_of',
         'biolink:has_part',
     })

     self.assocs = set()
     self.count = 0
     self.start = 0
     self.cache = {}
Exemplo n.º 8
0
def test_get_toolkit():
    """
    Check that ``get_toolkit`` returns a ``Toolkit`` whose model version
    equals that of a freshly constructed ``Toolkit()``.
    """
    toolkit = get_toolkit()
    assert isinstance(toolkit, Toolkit)

    actual_version = toolkit.get_model_version()
    expected_version = Toolkit().get_model_version()
    assert actual_version == expected_version
Exemplo n.º 9
0
 def get_default_model_version(cls):
     """
     Return the default Biolink Model version held on the class.

     If ``cls._default_model_version`` is falsy, it is first filled in with
     the model version reported by ``get_toolkit()``.
     """
     version = cls._default_model_version
     if not version:
         # nothing stored yet: ask the toolkit for its model version
         version = get_toolkit().get_model_version()
         cls._default_model_version = version
     return version
Exemplo n.º 10
0
    def validate_edge_predicate(subject: str, object: str, data: dict) -> list:
        """
        Check the ``predicate`` entry of an edge's property dictionary.

        Parameters
        ----------
        subject: str
            Subject identifier
        object: str
            Object identifier
        data: dict
            Edge properties

        Returns
        -------
        list
            A list of errors for a given edge

        """
        toolkit = get_toolkit()
        error_type = ErrorType.INVALID_EDGE_PREDICATE
        errors = []

        def report(message: str) -> None:
            # Every problem is recorded against the "<subject>-<object>" key.
            errors.append(
                ValidationError(f"{subject}-{object}", error_type, message,
                                MessageLevel.ERROR))

        edge_predicate = data.get('predicate')
        if edge_predicate is None:
            report("Edge does not have an 'predicate' property")
            return errors
        if not isinstance(edge_predicate, str):
            report("Edge property 'edge_predicate' expected to be of type 'string'")
            return errors

        # Work on the reference part when the predicate is a CURIE.
        if PrefixManager.is_curie(edge_predicate):
            edge_predicate = PrefixManager.get_reference(edge_predicate)

        if not re.match(r"^([a-z_][^A-Z\s]+_?[a-z_][^A-Z\s]+)+$",
                        edge_predicate):
            report(f"Edge label '{edge_predicate}' is not in snake_case form")
            return errors

        p = toolkit.get_element(snakecase_to_sentencecase(edge_predicate))
        if p is None:
            report(f"Edge label '{edge_predicate}' not in Biolink Model")
        elif edge_predicate != p.name and edge_predicate in p.aliases:
            report(
                f"Edge label '{edge_predicate}' is actually an alias for {p.name}; Should replace {edge_predicate} with {p.name}"
            )
        return errors
Exemplo n.º 11
0
    def __init__(self):
        """
        Create the toolkit and prefix manager, and download the JSON-LD context.

        Raises
        ------
        Exception
            If the context at ``CONTEXT_JSONLD`` cannot be fetched or parsed
            as JSON; the underlying error is attached as the cause.
        """
        self.toolkit = get_toolkit()
        self.prefix_manager = PrefixManager()
        self.errors = []

        try:
            self.jsonld = requests.get(CONTEXT_JSONLD).json()
        except Exception as err:
            # Catch Exception rather than everything, so KeyboardInterrupt and
            # SystemExit still propagate, and keep the original error chained.
            raise Exception('Unable to download jsonld file from {}'.format(CONTEXT_JSONLD)) from err
Exemplo n.º 12
0
 def __init__(self, verbose: bool = False):
     """
     Collect the toolkit, prefix manager, JSON-LD context and the values
     derived from them.

     Parameters
     ----------
     verbose: bool
         Stored on the instance as ``verbose``
     """
     self.toolkit = get_toolkit()
     self.prefix_manager = PrefixManager()

     jsonld_context = get_jsonld_context()
     self.jsonld = jsonld_context
     self.prefixes = Validator.get_all_prefixes(jsonld_context)

     self.required_node_properties = Validator.get_required_node_properties()
     self.required_edge_properties = Validator.get_required_edge_properties()
     self.verbose = verbose
Exemplo n.º 13
0
    def validate_categories(node: str, data: dict) -> list:
        """
        Check the ``category`` entry of a node's property dictionary.

        Parameters
        ----------
        node: str
            Node identifier
        data: dict
            Node properties

        Returns
        -------
        list
            A list of errors for a given node

        """
        toolkit = get_toolkit()
        error_type = ErrorType.INVALID_CATEGORY
        errors = []

        def report(message: str) -> None:
            errors.append(
                ValidationError(node, error_type, message, MessageLevel.ERROR))

        categories = data.get('category')
        if categories is None:
            report("Node does not have a 'category' property")
            return errors
        if not isinstance(categories, list):
            report(f"Node property 'category' expected to be of type {list}")
            return errors

        for category in categories:
            # Work on the reference part when the category is a CURIE.
            if PrefixManager.is_curie(category):
                category = PrefixManager.get_reference(category)

            # A failed CamelCase check is reported, but the Biolink lookup
            # below still runs for the same category.
            if not re.match(r"^([A-Z][a-z\d]+)+$", category):
                report(f"Category '{category}' is not in CamelCase form")

            formatted_category = camelcase_to_sentencecase(category)
            if not toolkit.is_category(formatted_category):
                report(f"Category '{category}' not in Biolink Model")
                continue

            c = toolkit.get_element(formatted_category.lower())
            if category != c.name and category in c.aliases:
                report(
                    f"Category {category} is actually an alias for {c.name}; Should replace '{category}' with '{c.name}'"
                )
        return errors
Exemplo n.º 14
0
def check_categories(
    categories: List, closure: List, category_mapping: Optional[Dict[str, str]] = None
) -> Tuple[List, List, List]:
    """
    Sort the values in ``categories`` into three buckets.

    Valid biolink categories are classes that descend from 'NamedThing'.
    Mixins, while valid ancestors, are not valid categories.

    Parameters
    ----------
    categories: List
        A list of categories to check
    closure: List
        A list of nodes in a clique
    category_mapping: Optional[Dict[str, str]]
        A map that provides mapping from a non-biolink category to a biolink category

    Returns
    -------
    Tuple[List, List, List]
        A tuple consisting of valid biolink categories, invalid biolink categories, and invalid categories

    """
    valid_biolink_categories = []
    invalid_biolink_categories = []
    invalid_categories = []
    tk = get_toolkit()

    for candidate in categories:
        # A declared category that is actually a mixin is rejected outright.
        if tk.is_mixin(candidate):
            invalid_categories.append(candidate)
            continue

        # get biolink element corresponding to category
        element = get_biolink_element(candidate)
        if not element:
            log.warning(f"category '{candidate}' is not in Biolink Model")
            invalid_categories.append(candidate)
            continue

        mapped_category = format_biolink_category(element["name"])
        if mapped_category in closure:
            valid_biolink_categories.append(candidate)
            continue

        log.warning(f"category '{mapped_category}' not in closure: {closure}")
        if not category_mapping:
            invalid_biolink_categories.append(candidate)
            continue

        # NOTE(review): when the mapped value IS in the closure, the category
        # ends up in none of the three lists — confirm this is intended.
        mapped = category_mapping.get(candidate, candidate)
        if mapped not in closure:
            log.warning(
                f"category '{mapped_category}' is not in category_mapping."
            )
            invalid_biolink_categories.append(candidate)

    return valid_biolink_categories, invalid_biolink_categories, invalid_categories
Exemplo n.º 15
0
    def __init__(self, verbose: bool = False):
        """
        Create the toolkit and prefix manager, and download the JSON-LD context.

        Parameters
        ----------
        verbose: bool
            Stored on the instance as ``verbose``

        Raises
        ------
        Exception
            If the context at ``CONTEXT_JSONLD`` cannot be fetched or parsed
            as JSON; the underlying error is attached as the cause.
        """
        self.toolkit = get_toolkit()
        self.prefix_manager = PrefixManager()
        self.prefixes = None
        self.required_node_properties = None
        self.required_edge_properties = None
        self.verbose = verbose

        try:
            self.jsonld = requests.get(CONTEXT_JSONLD).json()
        except Exception as err:
            # Catch Exception rather than everything, so KeyboardInterrupt and
            # SystemExit still propagate, and keep the original error chained.
            raise Exception('Unable to download JSON-LD context from {}'.format(CONTEXT_JSONLD)) from err
Exemplo n.º 16
0
def get_category_via_superclass(graph: BaseGraph,
                                curie: str,
                                load_ontology: bool = True) -> Set[str]:
    """
    Derive categories for a CURIE by walking its ``subclass_of`` ancestors
    until one of them maps onto the Biolink Model.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        Graph to traverse
    curie: str
        Input CURIE
    load_ontology: bool
        Determines whether to load ontology, based on CURIE prefix, or to simply
        rely on ``subclass_of`` hierarchy from graph

    Returns
    -------
    Set[str]
        A set containing one (or more) category for the given CURIE

    """
    log.debug("curie: {}".format(curie))
    new_categories = []
    toolkit = get_toolkit()

    if not PrefixManager.is_curie(curie):
        return set(new_categories)

    ancestors = get_ancestors(graph, curie, relations=['subclass_of'])
    if len(ancestors) == 0 and load_ontology:
        # No ancestors in this graph: retry once against the ontology graph
        # of the lookup service, with further ontology loading switched off.
        lookup_service = get_curie_lookup_service()
        new_categories.extend(
            get_category_via_superclass(lookup_service.ontology_graph, curie,
                                        False))
    log.debug("Ancestors for CURIE {} via subClassOf: {}".format(
        curie, ancestors))

    visited = []
    for anc in ancestors:
        mapping = toolkit.get_by_mapping(anc)
        visited.append(anc)
        if not mapping:
            continue
        # there is direct mapping to BioLink Model
        log.debug("Ancestor {} mapped to {}".format(anc, mapping))
        # Names of every ancestor visited so far, then the Biolink ancestors
        # of the mapped element; the walk stops at the first mapping found.
        new_categories.extend(
            graph.nodes()[x]['name'] for x in visited
            if 'name' in graph.nodes()[x])
        new_categories.extend(toolkit.ancestors(mapping))
        break
    return set(new_categories)
Exemplo n.º 17
0
    def __init__(self, owner):
        """
        Initialise namespaces, the toolkit, predicate sets and caches.

        Parameters
        ----------
        owner:
            Passed through to the parent class constructor
        """
        super().__init__(owner)
        prefix_map = self.prefix_manager.prefix_map
        self.DEFAULT = Namespace(prefix_map[""])
        # TODO: use OBO IRI from biolink model context once
        #  https://github.com/biolink/biolink-model/issues/211 is resolved
        # self.OBO = Namespace('http://purl.obolibrary.org/obo/')
        self.OBAN = Namespace(prefix_map["OBAN"])
        self.PMID = Namespace(prefix_map["PMID"])
        self.BIOLINK = Namespace(prefix_map["biolink"])
        self.predicate_mapping = {}
        self.cache: Dict = {}
        self.toolkit = get_toolkit()

        # The set holds expanded URIRefs of the node properties as well as the
        # unexpanded node and edge property names returned by the toolkit.
        self.node_property_predicates = {
            URIRef(self.prefix_manager.expand(name))
            for name in self.toolkit.get_all_node_properties(formatted=True)
        }
        self.node_property_predicates.update(
            self.toolkit.get_all_node_properties(formatted=True))
        self.node_property_predicates.update(
            self.toolkit.get_all_edge_properties(formatted=True))

        # TODO: validate expansion of the scope of this statement to include 'knowledge_source' and its descendants?
        for ksf in knowledge_provenance_properties:
            expanded = self.prefix_manager.expand("biolink:" + ksf)
            self.node_property_predicates.add(URIRef(expanded))

        self.reification_types = {
            RDF.Statement,
            self.BIOLINK.Association,
            self.OBAN.association,
        }
        self.reification_predicates = {
            self.BIOLINK.subject,
            self.BIOLINK.predicate,
            self.BIOLINK.object,
            RDF.subject,
            RDF.object,
            RDF.predicate,
            self.OBAN.association_has_subject,
            self.OBAN.association_has_predicate,
            self.OBAN.association_has_object,
        }

        # Bookkeeping: counters, record buffers and caches all start empty.
        self.reified_nodes: Set = set()
        self.start: int = 0
        self.count: int = 0
        self.CACHE_SIZE = 10000
        self.node_record = {}
        self.edge_record = {}
        self.node_cache = {}
        self.edge_cache = {}
        self._incomplete_nodes = {}
Exemplo n.º 18
0
 def __init__(self):
     """
     Initialise namespaces, the toolkit, predicate sets and caches.
     """
     super().__init__()
     prefix_map = self.prefix_manager.prefix_map
     self.DEFAULT = Namespace(prefix_map[''])
     # TODO: use OBO IRI from biolink model context once
     #  https://github.com/biolink/biolink-model/issues/211 is resolved
     # self.OBO = Namespace('http://purl.obolibrary.org/obo/')
     self.OBAN = Namespace(prefix_map['OBAN'])
     self.PMID = Namespace(prefix_map['PMID'])
     self.BIOLINK = Namespace(prefix_map['biolink'])
     self.predicate_mapping = {}
     self.cache: Dict = {}
     self.toolkit = get_toolkit()

     # The set holds expanded URIRefs of the node properties as well as the
     # unexpanded node and edge property names returned by the toolkit.
     self.node_property_predicates = {
         URIRef(self.prefix_manager.expand(name))
         for name in self.toolkit.get_all_node_properties(formatted=True)
     }
     self.node_property_predicates.update(
         self.toolkit.get_all_node_properties(formatted=True))
     self.node_property_predicates.update(
         self.toolkit.get_all_edge_properties(formatted=True))
     provided_by = self.prefix_manager.expand('biolink:provided_by')
     self.node_property_predicates.add(URIRef(provided_by))

     self.reification_types = {
         RDF.Statement,
         self.BIOLINK.Association,
         self.OBAN.association,
     }
     self.reification_predicates = {
         self.BIOLINK.subject,
         self.BIOLINK.predicate,
         self.BIOLINK.object,
         RDF.subject,
         RDF.object,
         RDF.predicate,
         self.OBAN.association_has_subject,
         self.OBAN.association_has_predicate,
         self.OBAN.association_has_object,
     }

     # Bookkeeping: counters, record buffers and caches all start empty.
     self.reified_nodes: Set = set()
     self.start: int = 0
     self.count: int = 0
     self.CACHE_SIZE = 10000
     self.node_record = {}
     self.edge_record = {}
     self.node_cache = {}
     self.edge_cache = {}
     self._incomplete_nodes = {}
Exemplo n.º 19
0
    def __init__(self,
                 verbose: bool = False,
                 progress_monitor: Optional[Callable[[GraphEntityType, List],
                                                     None]] = None):
        """
        Store the caller's options and prepare the validator's internal state.

        Parameters
        ----------
        verbose: bool
            Stored on the instance as ``verbose``
        progress_monitor: Optional[Callable[[GraphEntityType, List], None]]
            Optional callable, stored on the instance as ``progress_monitor``
        """
        # formal arguments
        self.verbose: bool = verbose
        self.progress_monitor: Optional[Callable[[GraphEntityType, List],
                                                 None]] = progress_monitor

        # internal attributes
        self.toolkit = get_toolkit()
        self.prefix_manager = PrefixManager()

        jsonld_context = get_jsonld_context()
        self.jsonld = jsonld_context
        self.prefixes = Validator.get_all_prefixes(jsonld_context)

        self.required_node_properties = Validator.get_required_node_properties()
        self.required_edge_properties = Validator.get_required_edge_properties()
        self.errors: List[ValidationError] = []
Exemplo n.º 20
0
    def get_required_edge_properties() -> list:
        """
        Collect the snake_case names of the required children of
        'association slot', as reported by the toolkit.

        Returns
        -------
        list
            A list of required edge properties

        """
        toolkit = get_toolkit()
        # Elements are fetched lazily, one per child, in the toolkit's order.
        elements = (toolkit.get_element(child)
                    for child in toolkit.children('association slot'))
        # TODO: this should be handled by bmt
        return [
            sentencecase_to_snakecase(element.name)
            for element in elements
            if hasattr(element, 'required') and element.required
        ]
Exemplo n.º 21
0
    def get_required_edge_properties() -> list:
        """
        Collect the snake_case names of all required, non-deprecated edge
        properties reported by the toolkit.

        Returns
        -------
        list
            A list of required edge properties

        """
        toolkit = get_toolkit()
        required_properties = []
        for prop in toolkit.get_all_edge_properties():
            element = toolkit.get_element(prop)
            # Skip unknown elements and anything carrying a deprecation value.
            if not element or element.deprecated is not None:
                continue
            if hasattr(element, 'required') and element.required:
                required_properties.append(
                    sentencecase_to_snakecase(element.name))
        return required_properties
Exemplo n.º 22
0
    def get_biolink_element(self, predicate: Any) -> Optional[Element]:
        """
        Look up the Biolink Model element that corresponds to a predicate.

        A ``ValueError`` raised during the fallback lookup by mapping is
        reported through ``self.owner.log_error`` and ``None`` is returned.

        Parameters
        ----------
        predicate: Any
            The CURIE of a predicate

        Returns
        -------
        Optional[Element]
            The corresponding Biolink Model element

        """
        toolkit = get_toolkit()
        pm = self.prefix_manager

        # Normalise: IRI -> CURIE -> reference, skipping steps that don't apply.
        curie = pm.contract(predicate) if pm.is_iri(predicate) else predicate
        reference = pm.get_reference(curie) if pm.is_curie(curie) else curie

        element = toolkit.get_element(reference)
        if element:
            return element

        # Direct lookup failed: fall back to a lookup by mapping.
        try:
            mapping = toolkit.get_element_by_mapping(predicate)
            if mapping:
                element = toolkit.get_element(mapping)
        except ValueError as e:
            self.owner.log_error(
                entity=str(predicate),
                error_type=ErrorType.INVALID_EDGE_PREDICATE,
                message=str(e))
            element = None
        return element
Exemplo n.º 23
0
def test_distinct_validator_class_versus_default_toolkit_biolink_version():
    """
    After setting the Validator's Biolink Model to version 1.8.2, its toolkit
    must report a different model version than the default toolkit.
    """
    Validator.set_biolink_model(version="1.8.2")
    default_tk, validator_tk = get_toolkit(), Validator.get_toolkit()
    default_version = default_tk.get_model_version()
    validator_version = validator_tk.get_model_version()
    assert default_version != validator_version
Exemplo n.º 24
0
from ordered_set import OrderedSet

from kgx.config import get_logger
from kgx.graph.base_graph import BaseGraph
from kgx.utils.kgx_utils import (
    get_prefix_prioritization_map,
    get_biolink_element,
    get_biolink_ancestors,
    current_time_in_millis,
    format_biolink_category,
    generate_edge_key,
    get_toolkit,
)

# Logger and toolkit are created once, when this module is imported.
log = get_logger()
toolkit = get_toolkit()
# CURIEs of the two Biolink predicates referenced by name in this module.
SAME_AS = "biolink:same_as"
SUBCLASS_OF = "biolink:subclass_of"
# Property names; presumably written onto nodes/edges during clique
# merging — TODO confirm against the code that uses them.
LEADER_ANNOTATION = "clique_leader"
ORIGINAL_SUBJECT_PROPERTY = "_original_subject"
ORIGINAL_OBJECT_PROPERTY = "_original_object"


def clique_merge(
    target_graph: BaseGraph,
    leader_annotation: str = None,
    prefix_prioritization_map: Optional[Dict[str, List[str]]] = None,
    category_mapping: Optional[Dict[str, str]] = None,
    strict: bool = True,
) -> Tuple[BaseGraph, nx.MultiDiGraph]:
    """
Exemplo n.º 25
0
 def __init__(self, source_graph: nx.MultiDiGraph = None):
     """
     Initialise an empty ontology list, a prefix manager and the toolkit.

     Parameters
     ----------
     source_graph: nx.MultiDiGraph
         Passed through to the parent class constructor
     """
     super().__init__(source_graph)
     # Starts empty; nothing is loaded by the constructor itself.
     self.ontologies = []
     self.prefix_manager = PrefixManager()
     self.toolkit = get_toolkit()
Exemplo n.º 26
0
    def validate_node_property_types(node: str, data: dict) -> list:
        """
        Checks if node properties have the expected value type.

        The node identifier itself must be a string. Every key in ``data``
        that the toolkit resolves to an element is then checked against that
        element's ``typeof`` and ``multivalued`` attributes, when present.
        Keys the toolkit does not resolve are ignored.

        Parameters
        ----------
        node: str
            Node identifier
        data: dict
            Node properties

        Returns
        -------
        list
            A list of errors for a given node

        """
        toolkit = get_toolkit()
        errors = []
        error_type = ErrorType.INVALID_NODE_PROPERTY_VALUE_TYPE
        if not isinstance(node, str):
            message = "Node property 'id' expected to be of type 'string'"
            errors.append(
                ValidationError(node, error_type, message, MessageLevel.ERROR))

        for key, value in data.items():
            element = toolkit.get_element(key)
            if element:
                if hasattr(element, 'typeof'):
                    # Each branch fires only on a mismatch for its own typeof.
                    if element.typeof == 'string' and not isinstance(
                            value, str):
                        message = f"Node property '{key}' expected to be of type '{element.typeof}'"
                        errors.append(
                            ValidationError(node, error_type, message,
                                            MessageLevel.ERROR))
                    # NOTE(review): this is an error only when the value is not
                    # a str AND validators.url(value) is falsy, so any str
                    # passes without a URL check — confirm this is intended.
                    elif (element.typeof == 'uriorcurie'
                          and not isinstance(value, str)
                          and not validators.url(value)):
                        message = f"Node property '{key}' expected to be of type 'uri' or 'CURIE'"
                        errors.append(
                            ValidationError(node, error_type, message,
                                            MessageLevel.ERROR))
                    elif element.typeof == 'double' and not isinstance(
                            value, (int, float)):
                        message = f"Node property '{key}' expected to be of type '{element.typeof}'"
                        errors.append(
                            ValidationError(node, error_type, message,
                                            MessageLevel.ERROR))
                    # NOTE(review): this branch is also reached when the value
                    # matched its expected type, so the warning is logged for
                    # valid values too — confirm this is intended.
                    else:
                        logger.warning(
                            "Skipping validation for Node property '{}'. Expected type '{}' vs Actual type '{}'"
                            .format(key, element.typeof, type(value)))
                if hasattr(element, 'multivalued'):
                    if element.multivalued:
                        # Multi-valued properties must be given as a list.
                        if not isinstance(value, list):
                            message = f"Multi-valued node property '{key}' expected to be of type '{list}'"
                            errors.append(
                                ValidationError(node, error_type, message,
                                                MessageLevel.ERROR))
                    else:
                        # Single-valued properties must not be a list, set or tuple.
                        if isinstance(value, (list, set, tuple)):
                            message = f"Single-valued node property '{key}' expected to be of type '{str}'"
                            errors.append(
                                ValidationError(node, error_type, message,
                                                MessageLevel.ERROR))
        return errors
Exemplo n.º 27
0
def test_get_toolkit():
    """
    Check that ``get_toolkit`` returns a ``Toolkit`` instance.
    """
    assert isinstance(get_toolkit(), Toolkit)
Exemplo n.º 28
0
    def validate_edge_property_types(subject: str, object: str,
                                     data: dict) -> list:
        """
        Checks if edge properties have the expected value type.

        The subject and object identifiers must both be strings. Every key in
        ``data`` that the toolkit resolves to an element is then checked
        against that element's ``typeof`` and ``multivalued`` attributes,
        when present. Keys the toolkit does not resolve are ignored.

        Parameters
        ----------
        subject: str
            Subject identifier
        object: str
            Object identifier
        data: dict
            Edge properties

        Returns
        -------
        list
            A list of errors for a given edge

        """
        toolkit = get_toolkit()
        errors = []
        error_type = ErrorType.INVALID_EDGE_PROPERTY_VALUE_TYPE
        # All errors are recorded against the "<subject>-<object>" key.
        if not isinstance(subject, str):
            message = "'subject' of an edge expected to be of type 'string'"
            errors.append(
                ValidationError(f"{subject}-{object}", error_type, message,
                                MessageLevel.ERROR))
        if not isinstance(object, str):
            message = "'object' of an edge expected to be of type 'string'"
            errors.append(
                ValidationError(f"{subject}-{object}", error_type, message,
                                MessageLevel.ERROR))

        for key, value in data.items():
            element = toolkit.get_element(key)
            if element:
                if hasattr(element, 'typeof'):
                    # Each branch fires only on a mismatch for its own typeof.
                    if element.typeof == 'string' and not isinstance(
                            value, str):
                        message = f"Edge property '{key}' expected to be of type 'string'"
                        errors.append(
                            ValidationError(f"{subject}-{object}", error_type,
                                            message, MessageLevel.ERROR))
                    # NOTE(review): this is an error only when the value is not
                    # a str AND validators.url(value) is falsy, so any str
                    # passes without a URL check — confirm this is intended.
                    elif (element.typeof == 'uriorcurie'
                          and not isinstance(value, str)
                          and not validators.url(value)):
                        message = f"Edge property '{key}' expected to be of type 'uri' or 'CURIE'"
                        errors.append(
                            ValidationError(f"{subject}-{object}", error_type,
                                            message, MessageLevel.ERROR))
                    elif element.typeof == 'double' and not isinstance(
                            value, (int, float)):
                        message = f"Edge property '{key}' expected to be of type 'double'"
                        errors.append(
                            ValidationError(f"{subject}-{object}", error_type,
                                            message, MessageLevel.ERROR))
                    # NOTE(review): this branch is also reached when the value
                    # matched its expected type, so the warning is logged for
                    # valid values too — confirm this is intended.
                    else:
                        logger.warning(
                            "Skipping validation for Edge property '{}'. Expected type '{}' vs Actual type '{}'"
                            .format(key, element.typeof, type(value)))
                if hasattr(element, 'multivalued'):
                    if element.multivalued:
                        # Multi-valued properties must be given as a list.
                        if not isinstance(value, list):
                            message = (
                                f"Multi-valued edge property '{key}' expected to be of type 'list'"
                            )
                            errors.append(
                                ValidationError(f"{subject}-{object}",
                                                error_type, message,
                                                MessageLevel.ERROR))
                    else:
                        # Single-valued properties must not be a list, set or tuple.
                        if isinstance(value, (list, set, tuple)):
                            message = (
                                f"Single-valued edge property '{key}' expected to be of type 'str'"
                            )
                            errors.append(
                                ValidationError(f"{subject}-{object}",
                                                error_type, message,
                                                MessageLevel.ERROR))
        return errors
Exemplo n.º 29
0
def _categories_conflict(u_categories, v_categories) -> bool:
    """
    Tell whether two category collections contain an incompatible pair.

    A pair ``(a, b)`` is incompatible when both are elements known to the
    toolkit, at least one of them has ancestors, and neither appears in the
    other's ancestor list. Categories unknown to the toolkit are ignored.
    """
    if not u_categories or not v_categories:
        return False

    toolkit = get_toolkit()
    for a in u_categories:
        if toolkit.get_element(a) is None:
            continue
        # Invariant for the inner loop, so look it up once per `a`.
        a_ancestors = toolkit.ancestors(a)
        for b in v_categories:
            if toolkit.get_element(b) is None:
                continue
            b_ancestors = toolkit.ancestors(b)
            if a_ancestors == b_ancestors == []:
                continue
            if a not in b_ancestors and b not in a_ancestors:
                return True
    return False


def clique_merge(graph: nx.Graph, report=False) -> nx.Graph:
    """
    Builds up cliques using the `same_as` attribute of each node. Uses those
    cliques to build up a mapping for relabelling nodes. Chooses labels so as
    to preserve the original nodes, rather than taking xrefs that don't appear
    as nodes in the graph.

    This method will also expand the `same_as` attribute of the nodes to
    include the discovered clique.

    Steps:

    1. Build an auxiliary undirected graph linking every node to the entries
       of its `same_as` property, plus every edge labelled `same_as`.
    2. Drop links between nodes whose categories are incompatible (see
       ``_categories_conflict``).
    3. For each connected component, pick the first node after sorting (plain
       sort, then ``build_sort_key`` over the id prefixes of the component's
       categories) and map every other member onto it.
    4. Relabel the graph with that mapping, remove `same_as` edges, and
       refresh the `iri`, `id` and `same_as` node properties.

    Parameters
    ----------
    graph: nx.Graph
        The graph to merge. Its node attribute dicts are mutated: `same_as`
        is overwritten for every node that belongs to a clique.
    report: bool
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    nx.Graph
        The graph returned by ``relabel_nodes`` after clean-up.
    """
    original_size = len(graph)
    print('original graph has {} nodes'.format(original_size))

    clique_graph = nx.Graph()

    with click.progressbar(
            graph.nodes(data=True),
            label='building cliques from same_as node property') as bar:
        for n, attr_dict in bar:
            if 'same_as' in attr_dict:
                # listify so that a scalar `same_as` value is treated as one
                # identifier instead of being iterated character by character.
                for m in listify(attr_dict['same_as']):
                    clique_graph.add_edge(n, m)

    with click.progressbar(graph.edges(data=True),
                           label='building cliques from same_as edges') as bar:
        for u, v, attr_dict in bar:
            if attr_dict.get('edge_label') == 'same_as':
                clique_graph.add_edge(u, v)

    edges = []
    with click.progressbar(clique_graph.edges(),
                           label='Breaking invalid cliques') as bar:
        for u, v in bar:
            try:
                u_attrs = graph.node[u]
                v_attrs = graph.node[v]
            except KeyError:
                # One endpoint is only an xref, not a node of `graph`, so
                # there are no categories to compare.
                continue
            # Same normalisation as the mapping step below: `category` may be
            # a single string rather than a list.
            u_categories = listify(u_attrs.get('category', []))
            v_categories = listify(v_attrs.get('category', []))
            if _categories_conflict(u_categories, v_categories):
                edges.append((u, v))

    print('breaking {} many edges'.format(len(edges)))
    clique_graph.remove_edges_from(edges)

    mapping = {}

    connected_components = list(nx.connected_components(clique_graph))

    print('Discovered {} cliques'.format(len(connected_components)))

    with click.progressbar(connected_components,
                           label='building mapping') as bar:
        for nodes in bar:
            nodes = list(nodes)
            categories = set()
            for n in nodes:
                if not graph.has_node(n):
                    continue

                attr_dict = graph.node[n]

                # Every member of the clique references the same list object,
                # which is sorted in place further down.
                attr_dict['same_as'] = nodes

                if 'category' in attr_dict:
                    categories.update(listify(attr_dict['category']))

                if 'categories' in attr_dict:
                    categories.update(listify(attr_dict['categories']))

            list_of_prefixes = []
            for category in categories:
                try:
                    list_of_prefixes.append(
                        get_toolkit().get_element(category).id_prefixes)
                except AttributeError:
                    # Category unknown to the toolkit (get_element gave back
                    # nothing usable); it contributes no prefix preference.
                    continue

            # Plain sort first so ties under the prefix-based key are
            # resolved deterministically (list.sort is stable).
            nodes.sort()
            nodes.sort(key=build_sort_key(list_of_prefixes))

            canonical = nodes[0]
            for n in nodes:
                if n != canonical:
                    mapping[n] = canonical

    g = relabel_nodes(graph, mapping)

    # After relabelling, `same_as` edges carry no further information.
    edges = [(u, v, key)
             for u, v, key, data in g.edges(keys=True, data=True)
             if data.get('edge_label') == 'same_as']
    g.remove_edges_from(edges)

    for n, data in g.nodes(data=True):
        data['iri'] = expand_uri(n)
        if 'id' in data and data['id'] != n:
            data['id'] = n
        # A node should not list itself among its own equivalents.
        if 'same_as' in data and n in data['same_as']:
            data['same_as'].remove(n)
            if not data['same_as']:
                del data['same_as']

    final_size = len(g)
    print('Resulting graph has {} nodes'.format(final_size))
    print('Eliminated {} nodes'.format(original_size - final_size))

    return g
Exemplo n.º 30
0
 def set_biolink_model(cls, version: Optional[str]):
     """
     Switch the toolkit held by this class to a given Biolink Model release.

     The toolkit is obtained from ``get_toolkit(biolink_release=version)``
     and stored on ``cls._currently_active_toolkit``.

     Parameters
     ----------
     version: Optional[str]
         Biolink Model release identifier forwarded to ``get_toolkit``.
         NOTE(review): ``None`` presumably selects the default release;
         confirm against ``get_toolkit``.
     """
     toolkit = get_toolkit(biolink_release=version)
     cls._currently_active_toolkit = toolkit