Example #1
0
File: __init__.py  Project: deepakunni3/kgx
def fold_predicate(
    graph: BaseGraph, predicate: str, remove_prefix: bool = False
) -> None:
    """
    Fold predicate as node property where every edge with ``predicate``
    will be folded as a node property.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph
    predicate: str
        The predicate to fold
    remove_prefix: bool
        Whether or not to remove prefix from the predicate (``False``, by default)

    """
    node_cache = []
    edge_cache = []
    start = current_time_in_millis()
    # Use the part after the CURIE prefix as the property name.
    # [-1] (instead of [1]) avoids an IndexError when remove_prefix is True
    # but the predicate contains no ':' separator.
    p = predicate.split(":", 1)[-1] if remove_prefix else predicate
    for u, v, k, data in graph.edges(keys=True, data=True):
        # .get avoids a KeyError on edges that lack a 'predicate' property
        if data.get("predicate") == predicate:
            node_cache.append((u, p, v))
            edge_cache.append((u, v, k))
    # Mutate only after iteration so the edge view is not modified
    # while being traversed.
    while node_cache:
        n = node_cache.pop()
        graph.add_node_attribute(*n)
    while edge_cache:
        e = edge_cache.pop()
        graph.remove_edge(*e)
    end = current_time_in_millis()
    log.info(f"Time taken: {end - start} ms")
Example #2
0
    def parse(
        self,
        filename: str,
        format: str = "owl",
        compression: Optional[str] = None,
        **kwargs: Any,
    ) -> Generator:
        """
        This method reads from an OWL and yields records.

        Parameters
        ----------
        filename: str
            The filename to parse
        format: str
            The format (``owl``)
        compression: Optional[str]
            The compression type (``gz``)
        kwargs: Any
            Any additional arguments

        Returns
        -------
        Generator
            A generator for node and edge records read from the file

        """
        rdfgraph = rdflib.Graph()
        if compression:
            # Compressed input is not handled by this source; warn and proceed
            # reading the file as-is rather than failing hard.
            log.warning(
                f"compression mode '{compression}' not supported by OwlSource")
        if format is None:
            format = rdflib.util.guess_format(filename)

        if format == "owl":
            # rdflib has no 'owl' parser; OWL ontologies are RDF/XML underneath
            format = "xml"

        log.info("Parsing {} with '{}' format".format(filename, format))
        rdfgraph.parse(filename, format=format)
        log.info("{} parsed with {} triples".format(filename, len(rdfgraph)))

        self.set_provenance_map(kwargs)

        self.start = current_time_in_millis()
        # Fixed garbled log message: report the parsed filename instead of
        # the scrape artifact "(unknown)".
        log.info(f"Done parsing {filename}")

        # Load all owl:imports (referenced ontologies) before yielding records
        # from the main graph. The triples() call already filters on
        # OWL.imports, so no per-triple predicate re-check is needed.
        triples = rdfgraph.triples((None, OWL.imports, None))
        for s, p, o in triples:
            if o not in self.imported:
                input_format = rdflib.util.guess_format(o)
                imported_rdfgraph = rdflib.Graph()
                log.info(f"Parsing OWL import: {o}")
                # Record the import before parsing it so circular imports
                # cannot recurse forever.
                self.imported.add(o)
                imported_rdfgraph.parse(o, format=input_format)
                self.load_graph(imported_rdfgraph)
            else:
                log.warning(f"Trying to import {o} but its already done")
        yield from self.load_graph(rdfgraph)
Example #3
0
    def parse(self,
              filename: str = None,
              input_format: str = None,
              provided_by: str = None,
              predicates: Set[URIRef] = None) -> None:
        """
        Parse a n-triple file into networkx.MultiDiGraph

        The file must be a *.nt formatted file.

        Parameters
        ----------
        filename : str
            File to read from.
        input_format : str
            The input file format. Must be one of ``['nt', 'nt.gz']``
        provided_by : str
            Define the source providing the input file.
        predicates : Set[URIRef]
            Predicates of interest.
            NOTE(review): not used in this method — confirm whether filtering
            was intended.

        Raises
        ------
        NameError
            If ``input_format`` is not one of the supported formats.

        """
        # Fixed duplicated assignment (was: p = p = NTriplesParser(self))
        parser = NTriplesParser(self)
        self.start = current_time_in_millis()
        # Use context managers so the file handle is closed even if
        # parsing raises (the original leaked the open handle).
        if input_format == INPUT_FORMATS[0]:
            with open(filename, 'rb') as fh:
                parser.parse(fh)
        elif input_format == INPUT_FORMATS[1]:
            with gzip.open(filename, 'rb') as fh:
                parser.parse(fh)
        else:
            raise NameError(
                f"input_format: {input_format} not supported. Must be one of {INPUT_FORMATS}"
            )
        print("Done parsing NT file")
        self.dereify(self.assocs)
Example #4
0
def unfold_node_property(graph: BaseGraph,
                         node_property: str,
                         prefix: Optional[str] = None) -> None:
    """
    Unfold node property as a predicate where every node with ``node_property``
    will be unfolded as an edge.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph
    node_property: str
        The node property to unfold
    prefix: Optional[str]
        The prefix to use

    """
    start = current_time_in_millis()
    predicate = f"{prefix}:{node_property}" if prefix else node_property
    edges_to_add = []
    props_to_remove = []
    # Collect first, mutate afterwards, so the node view is not changed
    # while it is being iterated.
    for node_id, node_data in graph.nodes(data=True):
        if node_property in node_data:
            edges_to_add.append((node_id, node_data[node_property], predicate))
            props_to_remove.append((node_id, node_property))
    # reversed() preserves the original LIFO (pop) application order
    for subject, obj, pred in reversed(edges_to_add):
        graph.add_edge(
            subject, obj, pred, **{
                'subject': subject,
                'object': obj,
                'predicate': pred,
                'relation': pred
            })
    for node_id, prop in reversed(props_to_remove):
        del graph.nodes()[node_id][prop]
    end = current_time_in_millis()
    log.info(f"Time taken: {end - start} ms")
Example #5
0
File: __init__.py  Project: deepakunni3/kgx
def remove_singleton_nodes(graph: BaseGraph) -> None:
    """
    Remove singleton nodes (nodes that have a degree of 0) from the graph.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph

    """
    start = current_time_in_millis()
    # Snapshot the zero-degree nodes first: removing them while iterating
    # graph.degree() would mutate the view being traversed.
    singletons = [node for node, degree in graph.degree() if degree == 0]
    # reversed() preserves the original LIFO (pop) removal order
    for node in reversed(singletons):
        log.debug(f"Removing singleton node {node}")
        graph.remove_node(node)
    end = current_time_in_millis()
    log.info(f"Time taken: {end - start} ms")
Example #6
0
def clique_merge(
    target_graph: BaseGraph,
    leader_annotation: str = None,
    prefix_prioritization_map: Optional[Dict[str, List[str]]] = None,
    category_mapping: Optional[Dict[str, str]] = None,
    strict: bool = True,
) -> Tuple[BaseGraph, nx.MultiDiGraph]:
    """
    Merge cliques of equivalent nodes in ``target_graph``.

    Parameters
    ----------
    target_graph: kgx.graph.base_graph.BaseGraph
        The original graph
    leader_annotation: str
        The field on a node that signifies that the node is the leader of a clique
    prefix_prioritization_map: Optional[Dict[str, List[str]]]
        A map that gives a prefix priority for one or more categories
    category_mapping: Optional[Dict[str, str]]
        Mapping for non-Biolink Model categories to Biolink Model categories
    strict: bool
        Whether or not to merge nodes in a clique that have conflicting node categories

    Returns
    -------
    Tuple[kgx.graph.base_graph.BaseGraph, networkx.MultiDiGraph]
        A tuple containing the updated target graph, and the clique graph

    """
    # Start from the default prioritization map and overlay any user overrides
    merged_ppm = get_prefix_prioritization_map()
    if prefix_prioritization_map:
        merged_ppm.update(prefix_prioritization_map)
    prefix_prioritization_map = merged_ppm

    # Fall back to the project-wide default annotation field
    leader_annotation = leader_annotation or LEADER_ANNOTATION

    # Step 1: build the clique graph of equivalent nodes
    t0 = current_time_in_millis()
    clique_graph = build_cliques(target_graph)
    t1 = current_time_in_millis()
    log.info(f"Total time taken to build cliques: {t1 - t0} ms")

    # Step 2: pick a leader node for every clique
    t0 = current_time_in_millis()
    elect_leader(
        target_graph,
        clique_graph,
        leader_annotation,
        prefix_prioritization_map,
        category_mapping,
        strict,
    )
    t1 = current_time_in_millis()
    log.info(f"Total time taken to elect leaders for all cliques: {t1 - t0} ms")

    # Step 3: move edges from clique members onto their leader
    t0 = current_time_in_millis()
    graph = consolidate_edges(target_graph, clique_graph, leader_annotation)
    t1 = current_time_in_millis()
    log.info(f"Total time taken to consolidate edges in target graph: {t1 - t0} ms")
    return graph, clique_graph
Example #7
0
    def triple(self, s: URIRef, p: URIRef, o: URIRef) -> None:
        """
        Hook for rdflib.plugins.parsers.ntriples.NTriplesParser

        This method will be called by NTriplesParser when reading from a file.

        Parameters
        ----------
        s: rdflib.term.URIRef
            The subject of a triple.
        p: rdflib.term.URIRef
            The predicate of a triple.
        o: rdflib.term.URIRef
            The object of a triple.

        """
        cached = self.cache.get(p)
        if cached is None:
            # First encounter with this predicate: resolve it to a Biolink
            # element and memoize the result for subsequent triples.
            predicate = self.prefix_manager.contract(str(p))
            property_name = self.prefix_manager.get_reference(predicate)
            element = self.get_biolink_element(predicate, property_name)
            cached = {
                'element': element,
                'predicate': predicate,
                'property_name': property_name
            }
            self.cache[p] = cached
        element = cached['element']
        predicate = cached['predicate']
        property_name = cached['property_name']

        if element:
            if element.is_a == 'association slot' or predicate in self.edge_properties:
                logging.debug(
                    f"property {property_name} is an edge property but belongs to a reified node"
                )
                node = self.add_node(s)
                self.add_node_attribute(node, p, o)
                self.assocs.add(node)
            elif element.is_a == 'node property' or predicate in self.node_properties:
                logging.debug(f"property {property_name} is a node property")
                node = self.add_node(s)
                self.add_node_attribute(node, p, o)
            else:
                logging.debug(
                    f"property {property_name} is a related_to property")
                self.add_edge(s, o, p)
        else:
            logging.debug(
                f"property {property_name} is not a biolink model element")
            if predicate in self.node_properties:
                logging.debug(f"treating {predicate} as node property")
                node = self.add_node(s)
                self.add_node_attribute(node, p, o)
            else:
                # treating as an edge
                logging.debug(f"treating {predicate} as edge property")
                self.add_edge(s, o, p)

        self.count += 1
        # Periodic progress report every 1000 triples
        if self.count % 1000 == 0:
            logging.info(
                f"Parsed {self.count} triples; time taken: {current_time_in_millis() - self.start} ms"
            )
            self.start = current_time_in_millis()