示例#1
0
class ObographSource(JsonSource):
    """
    ObographSource is responsible for reading data as records
    from an OBO Graph JSON.
    """

    HAS_OBO_NAMESPACE = 'http://www.geneontology.org/formats/oboInOwl#hasOBONamespace'
    SKOS_EXACT_MATCH = 'http://www.w3.org/2004/02/skos/core#exactMatch'

    def __init__(self):
        super().__init__()
        self.toolkit = Toolkit()
        self.ecache: Dict = {}

    def parse(
        self,
        filename: str,
        format: str = 'json',
        compression: Optional[str] = None,
        provided_by: Optional[str] = None,
        **kwargs: Any,
    ) -> Generator:
        """
        This method reads from JSON and yields records.

        Parameters
        ----------
        filename: str
            The filename to parse
        format: str
            The format (``json``)
        compression: Optional[str]
            The compression type (``gz``)
        provided_by: Optional[str]
            The name of the source providing the input file
        kwargs: Any
            Any additional arguments

        Returns
        -------
        Generator
            A generator for records

        """
        if provided_by:
            self.graph_metadata['provided_by'] = [provided_by]
        n = self.read_nodes(filename, compression)
        e = self.read_edges(filename, compression)
        yield from chain(n, e)

    def read_nodes(self,
                   filename: str,
                   compression: Optional[str] = None) -> Generator:
        """
        Read node records from a JSON.

        Parameters
        ----------
        filename: str
            The filename to read from
        compression: Optional[str]
            The compression type

        Returns
        -------
        Generator
            A generator for node records

        """
        if compression and compression == 'gz':
            FH = gzip.open(filename, 'rb')
        else:
            FH = open(filename, 'rb')
        for n in ijson.items(FH, 'graphs.item.nodes.item'):
            yield self.read_node(n)

    def read_node(self, node: Dict) -> Dict:
        """
        Read and parse a node record.

        Parameters
        ----------
        node: Dict
            The node record

        Returns
        -------
        Dict
            The processed node

        """
        curie = self.prefix_manager.contract(node['id'])
        node_properties = {}
        if 'meta' in node:
            node_properties = self.parse_meta(node['id'], node['meta'])

        fixed_node = dict()
        fixed_node['id'] = curie
        if 'lbl' in node:
            fixed_node['name'] = node['lbl']
        fixed_node['iri'] = node['id']

        if 'description' in node_properties:
            fixed_node['description'] = node_properties['description']
        if 'synonym' in node_properties:
            fixed_node['synonym'] = node_properties['synonym']
        if 'xrefs' in node_properties:
            fixed_node['xref'] = node_properties['xrefs']
        if 'subsets' in node_properties:
            fixed_node['subsets'] = node_properties['subsets']

        if 'category' not in node:
            category = self.get_category(curie, node)
            if category:
                fixed_node['category'] = [category]
            else:
                fixed_node['category'] = ['biolink:OntologyClass']
        if 'equivalent_nodes' in node_properties:
            equivalent_nodes = node_properties['equivalent_nodes']
            fixed_node['same_as'] = equivalent_nodes
            # for n in node_properties['equivalent_nodes']:
            #     data = {'subject': fixed_node['id'], 'predicate': 'biolink:same_as', 'object': n, 'relation': 'owl:sameAs'}
            #     super().load_node({'id': n, 'category': ['biolink:OntologyClass']})
            #     self.graph.add_edge(fixed_node['id'], n, **data)
        return super().read_node(fixed_node)

    def read_edges(self,
                   filename: str,
                   compression: Optional[str] = None) -> Generator:
        """
        Read edge records from a JSON.

        Parameters
        ----------
        filename: str
            The filename to read from
        compression: Optional[str]
            The compression type

        Returns
        -------
        Generator
            A generator for edge records

        """
        if compression == 'gz':
            FH = gzip.open(filename, 'rb')
        else:
            FH = open(filename, 'rb')
        for e in ijson.items(FH, 'graphs.item.edges.item'):
            yield self.read_edge(e)

    def read_edge(self, edge: Dict) -> Dict:
        """
        Read and parse an edge record.

        Parameters
        ----------
        edge: Dict
            The edge record

        Returns
        -------
        Dict
            The processed edge

        """
        fixed_edge = dict()
        fixed_edge['subject'] = self.prefix_manager.contract(edge['sub'])
        if PrefixManager.is_iri(edge['pred']):
            curie = self.prefix_manager.contract(edge['pred'])
            if curie in self.ecache:
                edge_predicate = self.ecache[curie]
            else:
                element = get_biolink_element(curie)
                if not element:
                    try:
                        mapping = self.toolkit.get_element_by_mapping(
                            edge['pred'])
                        if mapping:
                            element = self.toolkit.get_element(mapping)
                    except ValueError as e:
                        log.error(e)

                if element:
                    edge_predicate = format_biolink_slots(
                        element.name.replace(',', ''))
                    fixed_edge['predicate'] = edge_predicate
                else:
                    edge_predicate = 'biolink:related_to'
                self.ecache[curie] = edge_predicate
            fixed_edge['predicate'] = edge_predicate
            fixed_edge['relation'] = curie
        else:
            if edge['pred'] == 'is_a':
                fixed_edge['predicate'] = 'biolink:subclass_of'
                fixed_edge['relation'] = 'rdfs:subClassOf'
            elif edge['pred'] == 'has_part':
                fixed_edge['predicate'] = 'biolink:has_part'
                fixed_edge['relation'] = "BFO:0000051"
            elif edge['pred'] == 'part_of':
                fixed_edge['predicate'] = 'biolink:part_of'
                fixed_edge['relation'] = "BFO:0000050"
            else:
                fixed_edge[
                    'predicate'] = f"biolink:{edge['pred'].replace(' ', '_')}"
                fixed_edge['relation'] = edge['pred']

        fixed_edge['object'] = self.prefix_manager.contract(edge['obj'])
        for x in edge.keys():
            if x not in {'sub', 'pred', 'obj'}:
                fixed_edge[x] = edge[x]
        return super().read_edge(fixed_edge)

    def get_category(self, curie: str, node: dict) -> Optional[str]:
        """
        Get category for a given CURIE.

        Parameters
        ----------
        curie: str
            Curie for node
        node: dict
            Node data

        Returns
        -------
        Optional[str]
            Category for the given node CURIE.

        """
        category = None
        # use meta.basicPropertyValues
        if 'meta' in node and 'basicPropertyValues' in node['meta']:
            for p in node['meta']['basicPropertyValues']:
                if p['pred'] == self.HAS_OBO_NAMESPACE:
                    category = p['val']
                    element = self.toolkit.get_element(category)
                    if element:
                        category = (
                            f"biolink:{stringcase.pascalcase(stringcase.snakecase(element.name))}"
                        )
                    else:
                        element = self.toolkit.get_element_by_mapping(category)
                        if element:
                            category = f"biolink:{stringcase.pascalcase(stringcase.snakecase(element.name))}"
                        else:
                            category = 'biolink:OntologyClass'

        if not category or category == 'biolink:OntologyClass':
            prefix = PrefixManager.get_prefix(curie)
            # TODO: the mapping should be via biolink-model lookups
            if prefix == 'HP':
                category = "biolink:PhenotypicFeature"
            elif prefix == 'CHEBI':
                category = "biolink:ChemicalSubstance"
            elif prefix == 'MONDO':
                category = "biolink:Disease"
            elif prefix == 'UBERON':
                category = "biolink:AnatomicalEntity"
            elif prefix == 'SO':
                category = "biolink:SequenceFeature"
            elif prefix == 'CL':
                category = "biolink:Cell"
            elif prefix == 'PR':
                category = "biolink:Protein"
            elif prefix == 'NCBITaxon':
                category = "biolink:OrganismalEntity"
            else:
                log.debug(
                    f"{curie} Could not find a category mapping for '{category}'; Defaulting to 'biolink:OntologyClass'"
                )
        return category

    def parse_meta(self, node: str, meta: Dict) -> Dict:
        """
        Parse 'meta' field of a node.

        Parameters
        ----------
        node: str
            Node identifier
        meta: Dict
            meta dictionary for the node

        Returns
        -------
        Dict
            A dictionary that contains 'description', 'synonyms',
            'xrefs', and 'equivalent_nodes'.

        """
        # cross species links are in meta; this needs to be parsed properly too
        # do not put assumptions in code; import as much as possible

        properties = {}
        if 'definition' in meta:
            # parse 'definition' as 'description'
            description = meta['definition']['val']
            properties['description'] = description

        if 'subsets' in meta:
            # parse 'subsets'
            subsets = meta['subsets']
            properties['subsets'] = [
                x.split('#')[1] if '#' in x else x for x in subsets
            ]

        if 'synonyms' in meta:
            # parse 'synonyms' as 'synonym'
            synonyms = [s['val'] for s in meta['synonyms']]
            properties['synonym'] = synonyms

        if 'xrefs' in meta:
            # parse 'xrefs' as 'xrefs'
            xrefs = [x['val'] for x in meta['xrefs']]
            properties['xrefs'] = xrefs

        if 'deprecated' in meta:
            # parse 'deprecated' flag
            properties['deprecated'] = meta['deprecated']

        equivalent_nodes = []
        if 'basicPropertyValues' in meta:
            # parse SKOS_EXACT_MATCH entries as 'equivalent_nodes'
            for p in meta['basicPropertyValues']:
                if p['pred'] in {self.SKOS_EXACT_MATCH}:
                    n = self.prefix_manager.contract(p['val'])
                    if not n:
                        n = p['val']
                    equivalent_nodes.append(n)
        properties['equivalent_nodes'] = equivalent_nodes
        return properties
示例#2
0
class ObographSource(JsonSource):
    """
    ObographSource is responsible for reading data as records
    from an OBO Graph JSON.
    """

    HAS_OBO_NAMESPACE = "http://www.geneontology.org/formats/oboInOwl#hasOBONamespace"
    SKOS_EXACT_MATCH = "http://www.w3.org/2004/02/skos/core#exactMatch"

    def __init__(self, owner):
        super().__init__(owner)
        self.toolkit = Toolkit()
        self.ecache: Dict = {}

    def parse(
        self,
        filename: str,
        format: str = "json",
        compression: Optional[str] = None,
        **kwargs: Any,
    ) -> Generator:
        """
        This method reads from JSON and yields records.

        Parameters
        ----------
        filename: str
            The filename to parse
        format: str
            The format (``json``)
        compression: Optional[str]
            The compression type (``gz``)
        kwargs: Any
            Any additional arguments

        Returns
        -------
        Generator
            A generator for records

        """
        self.set_provenance_map(kwargs)

        n = self.read_nodes(filename, compression)
        e = self.read_edges(filename, compression)
        yield from chain(n, e)

    def read_nodes(self,
                   filename: str,
                   compression: Optional[str] = None) -> Generator:
        """
        Read node records from a JSON.

        Parameters
        ----------
        filename: str
            The filename to read from
        compression: Optional[str]
            The compression type

        Returns
        -------
        Generator
            A generator for node records

        """
        if compression and compression == "gz":
            FH = gzip.open(filename, "rb")
        else:
            FH = open(filename, "rb")
        for n in ijson.items(FH, "graphs.item.nodes.item"):
            yield self.read_node(n)

    def read_node(self, node: Dict) -> Optional[Tuple[str, Dict]]:
        """
        Read and parse a node record.

        Parameters
        ----------
        node: Dict
            The node record

        Returns
        -------
        Dict
            The processed node

        """
        curie = self.prefix_manager.contract(node["id"])
        node_properties = {}
        if "meta" in node:
            node_properties = self.parse_meta(node["id"], node["meta"])

        fixed_node = dict()
        fixed_node["id"] = curie
        if "lbl" in node:
            fixed_node["name"] = node["lbl"]
        fixed_node["iri"] = node["id"]

        if "description" in node_properties:
            fixed_node["description"] = node_properties["description"]
        if "synonym" in node_properties:
            fixed_node["synonym"] = node_properties["synonym"]
        if "xrefs" in node_properties:
            fixed_node["xref"] = node_properties["xrefs"]
        if "subsets" in node_properties:
            fixed_node["subsets"] = node_properties["subsets"]

        if "category" not in node:
            category = self.get_category(curie, node)
            if category:
                fixed_node["category"] = [category]
            else:
                fixed_node["category"] = ["biolink:OntologyClass"]
        if "equivalent_nodes" in node_properties:
            equivalent_nodes = node_properties["equivalent_nodes"]
            fixed_node["same_as"] = equivalent_nodes
            # for n in node_properties['equivalent_nodes']:
            #     data = {'subject': fixed_node['id'], 'predicate': 'biolink:same_as',
            #     'object': n, 'relation': 'owl:sameAs'}
            #     super().load_node({'id': n, 'category': ['biolink:OntologyClass']})
            #     self.graph.add_edge(fixed_node['id'], n, **data)
        return super().read_node(fixed_node)

    def read_edges(self,
                   filename: str,
                   compression: Optional[str] = None) -> Generator:
        """
        Read edge records from a JSON.

        Parameters
        ----------
        filename: str
            The filename to read from
        compression: Optional[str]
            The compression type

        Returns
        -------
        Generator
            A generator for edge records

        """
        if compression == "gz":
            FH = gzip.open(filename, "rb")
        else:
            FH = open(filename, "rb")
        for e in ijson.items(FH, "graphs.item.edges.item"):
            yield self.read_edge(e)

    def read_edge(self, edge: Dict) -> Optional[Tuple]:
        """
        Read and parse an edge record.

        Parameters
        ----------
        edge: Dict
            The edge record

        Returns
        -------
        Dict
            The processed edge

        """
        fixed_edge = dict()
        fixed_edge["subject"] = self.prefix_manager.contract(edge["sub"])
        if PrefixManager.is_iri(edge["pred"]):
            curie = self.prefix_manager.contract(edge["pred"])
            if curie in self.ecache:
                edge_predicate = self.ecache[curie]
            else:
                element = get_biolink_element(curie)
                if not element:
                    try:
                        mapping = self.toolkit.get_element_by_mapping(
                            edge["pred"])
                        if mapping:
                            element = self.toolkit.get_element(mapping)

                    #  TODO: not sure how this exception would be thrown here.. under what conditions?
                    except ValueError as e:
                        self.owner.log_error(
                            entity=str(edge["pred"]),
                            error_type=ErrorType.INVALID_EDGE_PREDICATE,
                            message=str(e))
                        element = None

                if element:
                    edge_predicate = format_biolink_slots(
                        element.name.replace(",", ""))
                    fixed_edge["predicate"] = edge_predicate
                else:
                    edge_predicate = "biolink:related_to"
                self.ecache[curie] = edge_predicate
            fixed_edge["predicate"] = edge_predicate
            fixed_edge["relation"] = curie
        else:
            if edge["pred"] == "is_a":
                fixed_edge["predicate"] = "biolink:subclass_of"
                fixed_edge["relation"] = "rdfs:subClassOf"
            elif edge["pred"] == "has_part":
                fixed_edge["predicate"] = "biolink:has_part"
                fixed_edge["relation"] = "BFO:0000051"
            elif edge["pred"] == "part_of":
                fixed_edge["predicate"] = "biolink:part_of"
                fixed_edge["relation"] = "BFO:0000050"
            else:
                fixed_edge[
                    "predicate"] = f"biolink:{edge['pred'].replace(' ', '_')}"
                fixed_edge["relation"] = edge["pred"]

        fixed_edge["object"] = self.prefix_manager.contract(edge["obj"])
        for x in edge.keys():
            if x not in {"sub", "pred", "obj"}:
                fixed_edge[x] = edge[x]
        return super().read_edge(fixed_edge)

    def get_category(self, curie: str, node: dict) -> Optional[str]:
        """
        Get category for a given CURIE.

        Parameters
        ----------
        curie: str
            Curie for node
        node: dict
            Node data

        Returns
        -------
        Optional[str]
            Category for the given node CURIE.

        """
        category = None
        # use meta.basicPropertyValues
        if "meta" in node and "basicPropertyValues" in node["meta"]:
            for p in node["meta"]["basicPropertyValues"]:
                if p["pred"] == self.HAS_OBO_NAMESPACE:
                    category = p["val"]
                    element = self.toolkit.get_element(category)
                    if element:
                        category = f"biolink:{stringcase.pascalcase(stringcase.snakecase(element.name))}"
                    else:
                        element = self.toolkit.get_element_by_mapping(category)
                        if element:
                            category = f"biolink:{stringcase.pascalcase(stringcase.snakecase(element))}"
                        else:
                            category = "biolink:OntologyClass"

        if not category or category == "biolink:OntologyClass":
            prefix = PrefixManager.get_prefix(curie)
            # TODO: the mapping should be via biolink-model lookups
            if prefix == "HP":
                category = "biolink:PhenotypicFeature"
            elif prefix == "CHEBI":
                category = "biolink:ChemicalSubstance"
            elif prefix == "MONDO":
                category = "biolink:Disease"
            elif prefix == "UBERON":
                category = "biolink:AnatomicalEntity"
            elif prefix == "SO":
                category = "biolink:SequenceFeature"
            elif prefix == "CL":
                category = "biolink:Cell"
            elif prefix == "PR":
                category = "biolink:Protein"
            elif prefix == "NCBITaxon":
                category = "biolink:OrganismalEntity"
            else:
                self.owner.log_error(
                    entity=f"{str(category)} for node {curie}",
                    error_type=ErrorType.MISSING_CATEGORY,
                    message=
                    f"Missing category; Defaulting to 'biolink:OntologyClass'",
                    message_level=MessageLevel.WARNING)
        return category

    def parse_meta(self, node: str, meta: Dict) -> Dict:
        """
        Parse 'meta' field of a node.

        Parameters
        ----------
        node: str
            Node identifier
        meta: Dict
            meta dictionary for the node

        Returns
        -------
        Dict
            A dictionary that contains 'description', 'synonyms',
            'xrefs', and 'equivalent_nodes'.

        """
        # cross species links are in meta; this needs to be parsed properly too
        # do not put assumptions in code; import as much as possible

        properties = {}
        if "definition" in meta:
            # parse 'definition' as 'description'
            description = meta["definition"]["val"]
            properties["description"] = description

        if "subsets" in meta:
            # parse 'subsets'
            subsets = meta["subsets"]
            properties["subsets"] = [
                x.split("#")[1] if "#" in x else x for x in subsets
            ]

        if "synonyms" in meta:
            # parse 'synonyms' as 'synonym'
            synonyms = [s["val"] for s in meta["synonyms"]]
            properties["synonym"] = synonyms

        if "xrefs" in meta:
            # parse 'xrefs' as 'xrefs'
            xrefs = [x["val"] for x in meta["xrefs"]]
            properties["xrefs"] = xrefs

        if "deprecated" in meta:
            # parse 'deprecated' flag
            properties["deprecated"] = meta["deprecated"]

        equivalent_nodes = []
        if "basicPropertyValues" in meta:
            # parse SKOS_EXACT_MATCH entries as 'equivalent_nodes'
            for p in meta["basicPropertyValues"]:
                if p["pred"] in {self.SKOS_EXACT_MATCH}:
                    n = self.prefix_manager.contract(p["val"])
                    if not n:
                        n = p["val"]
                    equivalent_nodes.append(n)
        properties["equivalent_nodes"] = equivalent_nodes
        return properties