Example #1
0
class BiolinkModel:
    """ Programmatic model of Biolink.

    Wraps a biolink-model Toolkit built from a pinned version of the
    biolink-model YAML and provides class-derivation and leaf-concept
    helpers on top of it.
    """

    # Root of the Biolink class hierarchy.
    root_type = 'biolink:NamedThing'

    def __init__(self, bl_version='1.5.0'):
        """ Load the biolink-model YAML for the given version into a Toolkit.

        :param bl_version: biolink-model release tag to fetch, e.g. '1.5.0'.
        """
        self.bl_url = f'https://raw.githubusercontent.com/biolink/biolink-model/{bl_version}/biolink-model.yaml'
        self.toolkit = Toolkit(self.bl_url)

    def to_camel_case(self, snake_str):
        """ Convert a snake case string to camel case. """
        return ''.join(part.title() for part in snake_str.split('_'))

    def get_class(self, name):
        """ Get a Python class from a string name.

        Looks the name up on the already-imported ``biolink.model`` module.
        """
        return getattr(sys.modules["biolink.model"], name)

    def is_derived(self, a_class_name, classes):
        """ Return True if the named class derives from any of the provided classes.

        Bug fix: the original used ``isinstance()`` on the class object
        itself, which tests whether the class is an *instance* of the
        targets (it is an instance of ``type``), so the check was
        effectively always False. Subclass derivation is tested with
        ``issubclass()``.

        :param a_class_name: snake_case biolink class name.
        :param classes: iterable of Python classes to test derivation against.
        """
        a_class = self.get_class(self.to_camel_case(a_class_name))
        return any(issubclass(a_class, c) for c in classes)

    def find_biolink_leaves(self, biolink_concepts):
        """
        Given a list of biolink concepts, returns the leaves removing any parent concepts.
        :param biolink_concepts: list of biolink concepts
        :return: leaf concepts (set).
        """
        ancestry_set = set()
        all_mixins_in_tree = set()
        all_concepts = set(biolink_concepts)
        # Keep track of things like "MacromolecularMachine" in current datasets
        # @TODO remove this and make nodes as errors
        unknown_elements = set()
        for concept in all_concepts:
            current_element = self.toolkit.get_element(concept)
            mixins = set()
            if current_element:
                if 'mixins' in current_element and len(
                        current_element['mixins']):
                    for m in current_element['mixins']:
                        mixins.add(self.toolkit.get_element(m).class_uri)
            else:
                unknown_elements.add(concept)
            ancestors = set(
                self.toolkit.get_ancestors(concept, reflexive=False,
                                           formatted=True))
            ancestry_set = ancestry_set.union(ancestors)
            all_mixins_in_tree = all_mixins_in_tree.union(mixins)
        # A leaf is a concept that is neither an ancestor nor a mixin of
        # any other requested concept, and is known to the model.
        return all_concepts - ancestry_set - all_mixins_in_tree - unknown_elements

    def get_leaf_class(self, names):
        """ Return one leaf class from the provided list of names.

        Raises IndexError when no leaf survives filtering.
        NOTE(review): the leaves come from a set, so with multiple leaves
        the element returned is not deterministic — confirm callers only
        pass concept lists with a single expected leaf.
        """
        leaves = list(self.find_biolink_leaves(names))
        return leaves[0]
Example #2
0
def test_get_element():
    """ Toolkit.get_element resolves aliases and case variants to canonical elements. """
    toolkit = Toolkit()

    # 'gene' and 'locus' are the same element under different names.
    assert toolkit.get_element('gene') == toolkit.get_element('locus')

    # Alternate / historical names resolve to their canonical element name.
    expected = [
        ('drug intake', 'drug exposure'),
        ('molecular function', 'molecular activity'),
        ('RNA Product', 'RNA product'),
        ('rna product', 'RNA product'),
    ]
    for query, canonical in expected:
        element = toolkit.get_element(query)
        assert element and element.name == canonical
Example #3
0
class NodeFactory:
    """ Normalize sets of equivalent identifiers into typed graph nodes.

    Uses a biolink-model Toolkit for ancestor (type) lookups, a remote
    bl-lookup service for per-type preferred identifier prefixes, and
    optional on-disk label files for identifier labels.
    """

    def __init__(self, label_dir):
        #self.url_base = 'http://arrival.edc.renci.org:32511/bl'
        self.url_base = 'https://bl-lookup-sri.renci.org/bl'
        self.toolkit = Toolkit(
            'https://raw.githubusercontent.com/biolink/biolink-model/1.6.1/biolink-model.yaml'
        )
        # input_type -> cached list of ancestor class URIs
        self.ancestor_map = {}
        # input_type -> cached list of preferred prefixes
        self.prefix_map = {}
        # (prefix, node_type) pairs we have already warned about
        self.ignored_prefixes = set()
        # prefix -> {identifier: label} loaded lazily from label_dir
        self.extra_labels = {}
        self.label_dir = label_dir

    def get_ancestors(self, input_type):
        """ Return [input_type] + its biolink ancestor class URIs, cached per type. """
        if input_type in self.ancestor_map:
            return self.ancestor_map[input_type]
        ancestor_names = self.toolkit.get_ancestors(input_type)
        ancs = [
            self.toolkit.get_element(a)['class_uri'] for a in ancestor_names
        ]
        if input_type not in ancs:
            ancs = [input_type] + ancs
        self.ancestor_map[input_type] = ancs
        return ancs

    def get_prefixes(self, input_type):
        """ Return the preferred identifier prefixes for a node type, cached.

        Queries the bl-lookup service; preserves the service's preference
        order while dropping consecutive duplicate prefixes.
        """
        if input_type in self.prefix_map:
            return self.prefix_map[input_type]
        url = f'{self.url_base}/{input_type}'
        response = requests.get(url)
        try:
            prefs = response.json()['id_prefixes']
        except (ValueError, KeyError):
            # Bug fix: was a bare `except:` that would also swallow
            # KeyboardInterrupt/SystemExit; only JSON-decode and
            # missing-key failures are expected here.
            # This is a mega hack to deal with the taxon change.
            prefs = ['NCBITaxon', 'MESH']
        # The prefs are in a particular order, but apparently can contain
        # (consecutive) dups (ugh) -- drop them while preserving order.
        deduped = []
        last = ''
        for pref in prefs:
            if pref != last:
                deduped.append(pref)
                last = pref
        self.prefix_map[input_type] = deduped
        return deduped

    def make_json_id(self, input):
        """ Convert an identifier (string or LabeledID) into a JSON-ready dict. """
        if isinstance(input, LabeledID):
            if input.label is not None and input.label != '':
                return {'identifier': input.identifier, 'label': input.label}
            return {'identifier': input.identifier}
        return {'identifier': input}

    def clean_list(self, input_identifiers):
        """ De-duplicate a mixed list of LabeledIDs and plain curie strings.

        Sometimes we end up with something like [(HP:123,'name'),HP:123,UMLS:3445];
        for each identifier, prefer a LabeledID form when one exists.
        """
        cleanup = defaultdict(list)
        for x in list(input_identifiers):
            key = x.identifier if isinstance(x, LabeledID) else x
            cleanup[key].append(x)
        cleaned = []
        for variants in cleanup.values():
            if len(variants) == 1:
                cleaned.append(variants[0])
                continue
            # Originally, we were just trying to get the LabeledID.  But
            # sometimes we get more than one, so len(variants) can be more
            # than two -- take the first LabeledID found.
            labeled = next(
                (v for v in variants if isinstance(v, LabeledID)), None)
            if labeled is None:
                # Hard stop: duplicates with no LabeledID are unexpected.
                print(input_identifiers)
                exit()
            cleaned.append(labeled)
        return cleaned

    def load_extra_labels(self, prefix):
        """ Load tab-separated identifier->label pairs for a prefix from label_dir. """
        labelfname = os.path.join(self.label_dir, prefix, 'labels')
        lbs = {}
        if os.path.exists(labelfname):
            with open(labelfname, 'r') as inf:
                for line in inf:
                    parts = line.strip().split('\t')
                    lbs[parts[0]] = parts[1]
        self.extra_labels[prefix] = lbs

    def apply_labels(self, input_identifiers, labels):
        """ Turn plain identifiers into LabeledIDs where a label is known.

        Originally we needed to clean up the identifier lists, because there
        would be both LabeledIDs and string ids and we had to reconcile them.
        But now, we only allow regular ids in the list, and turn some of
        them into LabeledIDs for output.
        """
        labeled_list = []
        for iid in input_identifiers:
            if isinstance(iid, LabeledID):
                # Hard stop: labels must be passed separately now.
                print('LabeledID dont belong here, pass in labels seperately',
                      iid)
                exit()
            if iid in labels:
                labeled_list.append(
                    LabeledID(identifier=iid, label=labels[iid]))
                continue
            prefix = Text.get_prefix(iid)
            if prefix not in self.extra_labels:
                self.load_extra_labels(prefix)
            if iid in self.extra_labels[prefix]:
                labeled_list.append(
                    LabeledID(identifier=iid,
                              label=self.extra_labels[prefix][iid]))
            else:
                labeled_list.append(iid)
        return labeled_list

    def create_node(self, input_identifiers, node_type, labels=None):
        """ Normalize a set of equivalent identifiers into a node dict.

        This is where we normalize, i.e. choose the best id, and add types
        in accord with Biolink. We should also include provenance and
        version information for the node set build.

        Bug fix: `labels` previously defaulted to a shared mutable dict
        ({}); it now defaults to None and is replaced with a fresh dict.

        :return: node dict with 'id', 'equivalent_identifiers', 'type',
                 or None when no identifier survives prefix filtering.
        """
        if labels is None:
            labels = {}
        ancestors = self.get_ancestors(node_type)
        #ancestors.reverse()
        prefixes = self.get_prefixes(node_type)
        if len(input_identifiers) == 0:
            return None
        if len(input_identifiers) > 1000:
            print('this seems like a lot')
            print(len(input_identifiers))
        cleaned = self.apply_labels(input_identifiers, labels)
        try:
            idmap = defaultdict(list)
            for i in list(cleaned):
                idmap[Text.get_curie(i).upper()].append(i)
        except AttributeError:
            # Diagnostic dump before hard stop: an identifier had no curie.
            print('something very bad')
            print(input_identifiers)
            print(len(input_identifiers))
            for i in list(input_identifiers):
                print(i)
                print(type(i))
                print(Text.get_curie(i))
                print(Text.get_curie(i).upper())
            exit()
        identifiers = []
        accepted_ids = set()
        #Converting identifiers from LabeledID to dicts
        #In order to be consistent from run to run, we need to worry about the
        # case where e.g. there are 2 UMLS id's and UMLS is the preferred pref.
        # We're going to choose the canonical ID here just by sorting the N .
        for p in prefixes:
            pupper = p.upper()
            if pupper in idmap:
                newids = []
                for v in idmap[pupper]:
                    newid = Text.recurie(v, p)
                    jid = self.make_json_id(newid)
                    newids.append((jid['identifier'], jid))
                    accepted_ids.add(v)
                newids.sort()
                identifiers += [nid[1] for nid in newids]
        #Warn (once per (prefix, type)) about prefixes that we're ignoring
        for k, vals in idmap.items():
            for v in vals:
                if v not in accepted_ids and (
                        k, node_type) not in self.ignored_prefixes:
                    print(
                        f'Ignoring prefix {k} for type {node_type}, identifier {v}'
                    )
                    self.ignored_prefixes.add((k, node_type))
        if len(identifiers) == 0:
            return None
        best_id = identifiers[0]['identifier']
        # identifiers is in preferred order, so choose the first non-empty
        # label to be the node label
        candidate_labels = list(
            filter(lambda x: len(x) > 0,
                   [l['label'] for l in identifiers if 'label' in l]))
        label = None
        if len(candidate_labels) > 0:
            label = candidate_labels[0]

        node = {
            'id': {
                'identifier': best_id,
            },
            'equivalent_identifiers': identifiers,
            'type': ancestors
        }
        if label is not None:
            node['id']['label'] = label
        return node
Example #4
0
class ObographSource(JsonSource):
    """
    ObographSource is responsible for reading data as records
    from an OBO Graph JSON.
    """

    HAS_OBO_NAMESPACE = 'http://www.geneontology.org/formats/oboInOwl#hasOBONamespace'
    SKOS_EXACT_MATCH = 'http://www.w3.org/2004/02/skos/core#exactMatch'

    def __init__(self):
        super().__init__()
        self.toolkit = Toolkit()
        # Cache of predicate CURIE -> biolink predicate, so repeated edge
        # predicates don't trigger repeated toolkit lookups.
        self.ecache: Dict = {}

    def parse(
        self,
        filename: str,
        format: str = 'json',
        compression: Optional[str] = None,
        provided_by: Optional[str] = None,
        **kwargs: Any,
    ) -> Generator:
        """
        This method reads from JSON and yields records.

        Parameters
        ----------
        filename: str
            The filename to parse
        format: str
            The format (``json``)
        compression: Optional[str]
            The compression type (``gz``)
        provided_by: Optional[str]
            The name of the source providing the input file
        kwargs: Any
            Any additional arguments

        Returns
        -------
        Generator
            A generator for records

        """
        if provided_by:
            self.graph_metadata['provided_by'] = [provided_by]
        # Nodes first, then edges, as a single record stream.
        yield from chain(self.read_nodes(filename, compression),
                         self.read_edges(filename, compression))

    def read_nodes(self,
                   filename: str,
                   compression: Optional[str] = None) -> Generator:
        """
        Read node records from a JSON.

        Bug fix: the file handle is now closed via a context manager
        instead of being leaked after iteration.

        Parameters
        ----------
        filename: str
            The filename to read from
        compression: Optional[str]
            The compression type

        Returns
        -------
        Generator
            A generator for node records

        """
        opener = gzip.open if compression == 'gz' else open
        with opener(filename, 'rb') as FH:
            for n in ijson.items(FH, 'graphs.item.nodes.item'):
                yield self.read_node(n)

    def read_node(self, node: Dict) -> Dict:
        """
        Read and parse a node record.

        Parameters
        ----------
        node: Dict
            The node record

        Returns
        -------
        Dict
            The processed node

        """
        curie = self.prefix_manager.contract(node['id'])
        node_properties = {}
        if 'meta' in node:
            node_properties = self.parse_meta(node['id'], node['meta'])

        fixed_node = dict()
        fixed_node['id'] = curie
        if 'lbl' in node:
            fixed_node['name'] = node['lbl']
        fixed_node['iri'] = node['id']

        # Copy over selected meta-derived properties, renaming where the
        # target schema differs ('xrefs' -> 'xref').
        if 'description' in node_properties:
            fixed_node['description'] = node_properties['description']
        if 'synonym' in node_properties:
            fixed_node['synonym'] = node_properties['synonym']
        if 'xrefs' in node_properties:
            fixed_node['xref'] = node_properties['xrefs']
        if 'subsets' in node_properties:
            fixed_node['subsets'] = node_properties['subsets']

        if 'category' not in node:
            category = self.get_category(curie, node)
            fixed_node['category'] = [category] if category else [
                'biolink:OntologyClass'
            ]
        if 'equivalent_nodes' in node_properties:
            fixed_node['same_as'] = node_properties['equivalent_nodes']
            # for n in node_properties['equivalent_nodes']:
            #     data = {'subject': fixed_node['id'], 'predicate': 'biolink:same_as', 'object': n, 'relation': 'owl:sameAs'}
            #     super().load_node({'id': n, 'category': ['biolink:OntologyClass']})
            #     self.graph.add_edge(fixed_node['id'], n, **data)
        return super().read_node(fixed_node)

    def read_edges(self,
                   filename: str,
                   compression: Optional[str] = None) -> Generator:
        """
        Read edge records from a JSON.

        Bug fix: the file handle is now closed via a context manager
        instead of being leaked after iteration.

        Parameters
        ----------
        filename: str
            The filename to read from
        compression: Optional[str]
            The compression type

        Returns
        -------
        Generator
            A generator for edge records

        """
        opener = gzip.open if compression == 'gz' else open
        with opener(filename, 'rb') as FH:
            for e in ijson.items(FH, 'graphs.item.edges.item'):
                yield self.read_edge(e)

    def read_edge(self, edge: Dict) -> Dict:
        """
        Read and parse an edge record.

        Parameters
        ----------
        edge: Dict
            The edge record

        Returns
        -------
        Dict
            The processed edge

        """
        fixed_edge = dict()
        fixed_edge['subject'] = self.prefix_manager.contract(edge['sub'])
        if PrefixManager.is_iri(edge['pred']):
            # IRI predicate: contract to a CURIE, then map to a biolink
            # predicate (cached in self.ecache).
            curie = self.prefix_manager.contract(edge['pred'])
            if curie in self.ecache:
                edge_predicate = self.ecache[curie]
            else:
                element = get_biolink_element(curie)
                if not element:
                    try:
                        mapping = self.toolkit.get_element_by_mapping(
                            edge['pred'])
                        if mapping:
                            element = self.toolkit.get_element(mapping)
                    except ValueError as e:
                        log.error(e)

                if element:
                    edge_predicate = format_biolink_slots(
                        element.name.replace(',', ''))
                else:
                    # Fall back to the most general predicate.
                    edge_predicate = 'biolink:related_to'
                self.ecache[curie] = edge_predicate
            fixed_edge['predicate'] = edge_predicate
            fixed_edge['relation'] = curie
        else:
            # Well-known non-IRI predicates get canonical biolink mappings;
            # anything else is passed through with a biolink: prefix.
            if edge['pred'] == 'is_a':
                fixed_edge['predicate'] = 'biolink:subclass_of'
                fixed_edge['relation'] = 'rdfs:subClassOf'
            elif edge['pred'] == 'has_part':
                fixed_edge['predicate'] = 'biolink:has_part'
                fixed_edge['relation'] = "BFO:0000051"
            elif edge['pred'] == 'part_of':
                fixed_edge['predicate'] = 'biolink:part_of'
                fixed_edge['relation'] = "BFO:0000050"
            else:
                fixed_edge[
                    'predicate'] = f"biolink:{edge['pred'].replace(' ', '_')}"
                fixed_edge['relation'] = edge['pred']

        fixed_edge['object'] = self.prefix_manager.contract(edge['obj'])
        # Preserve any extra edge properties verbatim.
        for x in edge.keys():
            if x not in {'sub', 'pred', 'obj'}:
                fixed_edge[x] = edge[x]
        return super().read_edge(fixed_edge)

    def get_category(self, curie: str, node: dict) -> Optional[str]:
        """
        Get category for a given CURIE.

        Parameters
        ----------
        curie: str
            Curie for node
        node: dict
            Node data

        Returns
        -------
        Optional[str]
            Category for the given node CURIE.

        """
        category = None
        # use meta.basicPropertyValues: map the OBO namespace to a biolink
        # class name via the toolkit (direct lookup, then mapping lookup).
        if 'meta' in node and 'basicPropertyValues' in node['meta']:
            for p in node['meta']['basicPropertyValues']:
                if p['pred'] == self.HAS_OBO_NAMESPACE:
                    category = p['val']
                    element = self.toolkit.get_element(category)
                    if not element:
                        element = self.toolkit.get_element_by_mapping(category)
                    if element:
                        category = (
                            f"biolink:{stringcase.pascalcase(stringcase.snakecase(element.name))}"
                        )
                    else:
                        category = 'biolink:OntologyClass'

        if not category or category == 'biolink:OntologyClass':
            prefix = PrefixManager.get_prefix(curie)
            # TODO: the mapping should be via biolink-model lookups
            prefix_to_category = {
                'HP': "biolink:PhenotypicFeature",
                'CHEBI': "biolink:ChemicalSubstance",
                'MONDO': "biolink:Disease",
                'UBERON': "biolink:AnatomicalEntity",
                'SO': "biolink:SequenceFeature",
                'CL': "biolink:Cell",
                'PR': "biolink:Protein",
                'NCBITaxon': "biolink:OrganismalEntity",
            }
            if prefix in prefix_to_category:
                category = prefix_to_category[prefix]
            else:
                log.debug(
                    f"{curie} Could not find a category mapping for '{category}'; Defaulting to 'biolink:OntologyClass'"
                )
        return category

    def parse_meta(self, node: str, meta: Dict) -> Dict:
        """
        Parse 'meta' field of a node.

        Parameters
        ----------
        node: str
            Node identifier
        meta: Dict
            meta dictionary for the node

        Returns
        -------
        Dict
            A dictionary that contains 'description', 'synonyms',
            'xrefs', and 'equivalent_nodes'.

        """
        # cross species links are in meta; this needs to be parsed properly too
        # do not put assumptions in code; import as much as possible

        properties = {}
        if 'definition' in meta:
            # parse 'definition' as 'description'
            properties['description'] = meta['definition']['val']

        if 'subsets' in meta:
            # parse 'subsets', keeping only the fragment after '#' when present
            properties['subsets'] = [
                x.split('#')[1] if '#' in x else x for x in meta['subsets']
            ]

        if 'synonyms' in meta:
            # parse 'synonyms' as 'synonym'
            properties['synonym'] = [s['val'] for s in meta['synonyms']]

        if 'xrefs' in meta:
            # parse 'xrefs' as 'xrefs'
            properties['xrefs'] = [x['val'] for x in meta['xrefs']]

        if 'deprecated' in meta:
            # parse 'deprecated' flag
            properties['deprecated'] = meta['deprecated']

        equivalent_nodes = []
        if 'basicPropertyValues' in meta:
            # parse SKOS_EXACT_MATCH entries as 'equivalent_nodes',
            # contracting IRIs to CURIEs where possible
            for p in meta['basicPropertyValues']:
                if p['pred'] in {self.SKOS_EXACT_MATCH}:
                    n = self.prefix_manager.contract(p['val'])
                    if not n:
                        n = p['val']
                    equivalent_nodes.append(n)
        properties['equivalent_nodes'] = equivalent_nodes
        return properties
Example #5
0
class ObographSource(JsonSource):
    """
    ObographSource is responsible for reading data as records
    from an OBO Graph JSON.
    """

    HAS_OBO_NAMESPACE = "http://www.geneontology.org/formats/oboInOwl#hasOBONamespace"
    SKOS_EXACT_MATCH = "http://www.w3.org/2004/02/skos/core#exactMatch"

    def __init__(self, owner):
        super().__init__(owner)
        self.toolkit = Toolkit()
        # Cache of predicate CURIE -> biolink predicate, so repeated edge
        # predicates don't trigger repeated toolkit lookups.
        self.ecache: Dict = {}

    def parse(
        self,
        filename: str,
        format: str = "json",
        compression: Optional[str] = None,
        **kwargs: Any,
    ) -> Generator:
        """
        This method reads from JSON and yields records.

        Parameters
        ----------
        filename: str
            The filename to parse
        format: str
            The format (``json``)
        compression: Optional[str]
            The compression type (``gz``)
        kwargs: Any
            Any additional arguments

        Returns
        -------
        Generator
            A generator for records

        """
        self.set_provenance_map(kwargs)

        # Nodes first, then edges, as a single record stream.
        yield from chain(self.read_nodes(filename, compression),
                         self.read_edges(filename, compression))

    def read_nodes(self,
                   filename: str,
                   compression: Optional[str] = None) -> Generator:
        """
        Read node records from a JSON.

        Bug fix: the file handle is now closed via a context manager
        instead of being leaked after iteration.

        Parameters
        ----------
        filename: str
            The filename to read from
        compression: Optional[str]
            The compression type

        Returns
        -------
        Generator
            A generator for node records

        """
        opener = gzip.open if compression == "gz" else open
        with opener(filename, "rb") as FH:
            for n in ijson.items(FH, "graphs.item.nodes.item"):
                yield self.read_node(n)

    def read_node(self, node: Dict) -> Optional[Tuple[str, Dict]]:
        """
        Read and parse a node record.

        Parameters
        ----------
        node: Dict
            The node record

        Returns
        -------
        Optional[Tuple[str, Dict]]
            The processed node

        """
        curie = self.prefix_manager.contract(node["id"])
        node_properties = {}
        if "meta" in node:
            node_properties = self.parse_meta(node["id"], node["meta"])

        fixed_node = dict()
        fixed_node["id"] = curie
        if "lbl" in node:
            fixed_node["name"] = node["lbl"]
        fixed_node["iri"] = node["id"]

        # Copy over selected meta-derived properties, renaming where the
        # target schema differs ('xrefs' -> 'xref').
        if "description" in node_properties:
            fixed_node["description"] = node_properties["description"]
        if "synonym" in node_properties:
            fixed_node["synonym"] = node_properties["synonym"]
        if "xrefs" in node_properties:
            fixed_node["xref"] = node_properties["xrefs"]
        if "subsets" in node_properties:
            fixed_node["subsets"] = node_properties["subsets"]

        if "category" not in node:
            category = self.get_category(curie, node)
            fixed_node["category"] = [category] if category else [
                "biolink:OntologyClass"
            ]
        if "equivalent_nodes" in node_properties:
            fixed_node["same_as"] = node_properties["equivalent_nodes"]
            # for n in node_properties['equivalent_nodes']:
            #     data = {'subject': fixed_node['id'], 'predicate': 'biolink:same_as',
            #     'object': n, 'relation': 'owl:sameAs'}
            #     super().load_node({'id': n, 'category': ['biolink:OntologyClass']})
            #     self.graph.add_edge(fixed_node['id'], n, **data)
        return super().read_node(fixed_node)

    def read_edges(self,
                   filename: str,
                   compression: Optional[str] = None) -> Generator:
        """
        Read edge records from a JSON.

        Bug fix: the file handle is now closed via a context manager
        instead of being leaked after iteration.

        Parameters
        ----------
        filename: str
            The filename to read from
        compression: Optional[str]
            The compression type

        Returns
        -------
        Generator
            A generator for edge records

        """
        opener = gzip.open if compression == "gz" else open
        with opener(filename, "rb") as FH:
            for e in ijson.items(FH, "graphs.item.edges.item"):
                yield self.read_edge(e)

    def read_edge(self, edge: Dict) -> Optional[Tuple]:
        """
        Read and parse an edge record.

        Parameters
        ----------
        edge: Dict
            The edge record

        Returns
        -------
        Optional[Tuple]
            The processed edge

        """
        fixed_edge = dict()
        fixed_edge["subject"] = self.prefix_manager.contract(edge["sub"])
        if PrefixManager.is_iri(edge["pred"]):
            # IRI predicate: contract to a CURIE, then map to a biolink
            # predicate (cached in self.ecache).
            curie = self.prefix_manager.contract(edge["pred"])
            if curie in self.ecache:
                edge_predicate = self.ecache[curie]
            else:
                element = get_biolink_element(curie)
                if not element:
                    try:
                        mapping = self.toolkit.get_element_by_mapping(
                            edge["pred"])
                        if mapping:
                            element = self.toolkit.get_element(mapping)

                    #  TODO: not sure how this exception would be thrown here.. under what conditions?
                    except ValueError as e:
                        self.owner.log_error(
                            entity=str(edge["pred"]),
                            error_type=ErrorType.INVALID_EDGE_PREDICATE,
                            message=str(e))
                        element = None

                if element:
                    edge_predicate = format_biolink_slots(
                        element.name.replace(",", ""))
                else:
                    # Fall back to the most general predicate.
                    edge_predicate = "biolink:related_to"
                self.ecache[curie] = edge_predicate
            fixed_edge["predicate"] = edge_predicate
            fixed_edge["relation"] = curie
        else:
            # Well-known non-IRI predicates get canonical biolink mappings;
            # anything else is passed through with a biolink: prefix.
            if edge["pred"] == "is_a":
                fixed_edge["predicate"] = "biolink:subclass_of"
                fixed_edge["relation"] = "rdfs:subClassOf"
            elif edge["pred"] == "has_part":
                fixed_edge["predicate"] = "biolink:has_part"
                fixed_edge["relation"] = "BFO:0000051"
            elif edge["pred"] == "part_of":
                fixed_edge["predicate"] = "biolink:part_of"
                fixed_edge["relation"] = "BFO:0000050"
            else:
                fixed_edge[
                    "predicate"] = f"biolink:{edge['pred'].replace(' ', '_')}"
                fixed_edge["relation"] = edge["pred"]

        fixed_edge["object"] = self.prefix_manager.contract(edge["obj"])
        # Preserve any extra edge properties verbatim.
        for x in edge.keys():
            if x not in {"sub", "pred", "obj"}:
                fixed_edge[x] = edge[x]
        return super().read_edge(fixed_edge)

    def get_category(self, curie: str, node: dict) -> Optional[str]:
        """
        Get category for a given CURIE.

        Bug fix: the mapping-lookup branch previously formatted the
        element object itself (``stringcase.snakecase(element)``) instead
        of ``element.name``, producing a garbled category string; it now
        uses ``element.name`` consistently with the direct-lookup branch.

        Parameters
        ----------
        curie: str
            Curie for node
        node: dict
            Node data

        Returns
        -------
        Optional[str]
            Category for the given node CURIE.

        """
        category = None
        # use meta.basicPropertyValues: map the OBO namespace to a biolink
        # class name via the toolkit (direct lookup, then mapping lookup).
        if "meta" in node and "basicPropertyValues" in node["meta"]:
            for p in node["meta"]["basicPropertyValues"]:
                if p["pred"] == self.HAS_OBO_NAMESPACE:
                    category = p["val"]
                    element = self.toolkit.get_element(category)
                    if not element:
                        element = self.toolkit.get_element_by_mapping(category)
                    if element:
                        category = f"biolink:{stringcase.pascalcase(stringcase.snakecase(element.name))}"
                    else:
                        category = "biolink:OntologyClass"

        if not category or category == "biolink:OntologyClass":
            prefix = PrefixManager.get_prefix(curie)
            # TODO: the mapping should be via biolink-model lookups
            prefix_to_category = {
                "HP": "biolink:PhenotypicFeature",
                "CHEBI": "biolink:ChemicalSubstance",
                "MONDO": "biolink:Disease",
                "UBERON": "biolink:AnatomicalEntity",
                "SO": "biolink:SequenceFeature",
                "CL": "biolink:Cell",
                "PR": "biolink:Protein",
                "NCBITaxon": "biolink:OrganismalEntity",
            }
            if prefix in prefix_to_category:
                category = prefix_to_category[prefix]
            else:
                self.owner.log_error(
                    entity=f"{str(category)} for node {curie}",
                    error_type=ErrorType.MISSING_CATEGORY,
                    message=
                    f"Missing category; Defaulting to 'biolink:OntologyClass'",
                    message_level=MessageLevel.WARNING)
        return category

    def parse_meta(self, node: str, meta: Dict) -> Dict:
        """
        Parse 'meta' field of a node.

        Parameters
        ----------
        node: str
            Node identifier
        meta: Dict
            meta dictionary for the node

        Returns
        -------
        Dict
            A dictionary that contains 'description', 'synonyms',
            'xrefs', and 'equivalent_nodes'.

        """
        # cross species links are in meta; this needs to be parsed properly too
        # do not put assumptions in code; import as much as possible

        properties = {}
        if "definition" in meta:
            # parse 'definition' as 'description'
            properties["description"] = meta["definition"]["val"]

        if "subsets" in meta:
            # parse 'subsets', keeping only the fragment after '#' when present
            properties["subsets"] = [
                x.split("#")[1] if "#" in x else x for x in meta["subsets"]
            ]

        if "synonyms" in meta:
            # parse 'synonyms' as 'synonym'
            properties["synonym"] = [s["val"] for s in meta["synonyms"]]

        if "xrefs" in meta:
            # parse 'xrefs' as 'xrefs'
            properties["xrefs"] = [x["val"] for x in meta["xrefs"]]

        if "deprecated" in meta:
            # parse 'deprecated' flag
            properties["deprecated"] = meta["deprecated"]

        equivalent_nodes = []
        if "basicPropertyValues" in meta:
            # parse SKOS_EXACT_MATCH entries as 'equivalent_nodes',
            # contracting IRIs to CURIEs where possible
            for p in meta["basicPropertyValues"]:
                if p["pred"] in {self.SKOS_EXACT_MATCH}:
                    n = self.prefix_manager.contract(p["val"])
                    if not n:
                        n = p["val"]
                    equivalent_nodes.append(n)
        properties["equivalent_nodes"] = equivalent_nodes
        return properties
Пример #6
0
    class _GraphInterface:
        """
        Interface to a Neo4j-backed biolink knowledge graph.

        Wraps a Neo4jHTTPDriver for Cypher execution and a biolink-model
        Toolkit for concept/predicate lookups.  The graph schema, a node
        summary and the TRAPI meta knowledge graph are computed lazily on
        first request and memoized on the instance.
        """

        def __init__(self, host, port, auth):
            # HTTP driver used for all Cypher execution (sync and async).
            self.driver = Neo4jHTTPDriver(host=host, port=port, auth=auth)
            # Memoized artifacts, populated lazily by get_schema() /
            # get_meta_kg().
            self.schema = None
            self.summary = None
            self.meta_kg = None
            # Biolink model version is configurable via BL_VERSION.
            self.bl_version = config.get('BL_VERSION', '1.5.0')
            self.bl_url = f'https://raw.githubusercontent.com/biolink/biolink-model/{self.bl_version}/biolink-model.yaml'
            self.toolkit = Toolkit(self.bl_url)

        def find_biolink_leaves(self, biolink_concepts: list):
            """
            Given a list of biolink concepts, returns the leaves removing any parent concepts.
            :param biolink_concepts: list of biolink concepts
            :return: leaf concepts.
            """
            ancestry_set = set()
            all_mixins_in_tree = set()
            all_concepts = set(biolink_concepts)
            # Keep track of things like "MacromolecularMachine" in current datasets.
            unknown_elements = set()

            for x in all_concepts:
                current_element = self.toolkit.get_element(x)
                mixins = set()
                if current_element:
                    # Collect the class URIs of any mixins of this concept.
                    if 'mixins' in current_element and len(
                            current_element['mixins']):
                        for m in current_element['mixins']:
                            mixins.add(self.toolkit.get_element(m).class_uri)
                else:
                    # Concept not known to the biolink model toolkit.
                    unknown_elements.add(x)
                ancestors = set(
                    self.toolkit.get_ancestors(x,
                                               reflexive=False,
                                               formatted=True))
                ancestry_set = ancestry_set.union(ancestors)
                all_mixins_in_tree = all_mixins_in_tree.union(mixins)
            # Leaves = concepts that are neither an ancestor nor a mixin of
            # any other concept in the input, and are known to the model.
            leaf_set = all_concepts - ancestry_set - all_mixins_in_tree - unknown_elements
            return leaf_set

        def invert_predicate(self, biolink_predicate):
            """Given a biolink predicate, find its inverse"""
            element = self.toolkit.get_element(biolink_predicate)
            if element is None:
                # Predicate unknown to the biolink model.
                return None
            # If its symmetric, it is its own inverse.
            if 'symmetric' in element and element.symmetric:
                return biolink_predicate
            # if neither symmetric nor an inverse is found
            if 'inverse' not in element or not element['inverse']:
                return None
            # if an inverse is found
            return self.toolkit.get_element(element['inverse']).slot_uri

        def get_schema(self):
            """
            Gets the schema of the graph, memoizing it on first call.
            Also generates and memoizes the graph summary.
            :return: Dict of structure source label as outer most keys, target labels as inner keys and list of predicates
            as value.
            :rtype: dict
            """
            self.schema_raw_result = {}
            if self.schema is None:
                query = """
                           MATCH (a)-[x]->(b)
                           WHERE not a:Concept and not b:Concept                                                          
                           RETURN DISTINCT labels(a) as source_labels, type(x) as predicate, labels(b) as target_labels
                           """
                logger.info(
                    f"starting query {query} on graph... this might take a few"
                )
                result = self.driver.run_sync(query)
                logger.info(f"completed query, preparing initial schema")
                structured = self.convert_to_dict(result)
                self.schema_raw_result = structured
                schema_bag = {}
                # permute source labels and target labels array
                # replacement for unwind for previous cypher
                structured_expanded = []
                for triplet in structured:
                    # Since there are some nodes in data currently just one label ['biolink:NamedThing']
                    # This filter is to avoid that scenario.
                    # @TODO need to remove this filter when data build
                    #  avoids adding nodes with single ['biolink:NamedThing'] labels.
                    filter_named_thing = lambda x: list(
                        filter(lambda y: y != 'biolink:NamedThing', x))
                    source_labels, predicate, target_labels =\
                        self.find_biolink_leaves(filter_named_thing(triplet['source_labels'])), triplet['predicate'], \
                        self.find_biolink_leaves(filter_named_thing(triplet['target_labels']))
                    # Cartesian product of the (leaf) source and target labels.
                    for source_label in source_labels:
                        for target_label in target_labels:
                            structured_expanded.append({
                                'source_label': source_label,
                                'target_label': target_label,
                                'predicate': predicate
                            })
                structured = structured_expanded
                for triplet in structured:
                    subject = triplet['source_label']
                    predicate = triplet['predicate']
                    objct = triplet['target_label']
                    if subject not in schema_bag:
                        schema_bag[subject] = {}
                    if objct not in schema_bag[subject]:
                        schema_bag[subject][objct] = []
                    if predicate not in schema_bag[subject][objct]:
                        schema_bag[subject][objct].append(predicate)

                    # If we invert the order of the nodes we also have to invert the predicate
                    inverse_predicate = self.invert_predicate(predicate)
                    if inverse_predicate is not None and \
                            inverse_predicate not in schema_bag.get(objct,{}).get(subject,[]):
                        # create the list if empty
                        if objct not in schema_bag:
                            schema_bag[objct] = {}
                        if subject not in schema_bag[objct]:
                            schema_bag[objct][subject] = []
                        schema_bag[objct][subject].append(inverse_predicate)
                self.schema = schema_bag
                logger.info("schema done.")
                if not self.summary:
                    query = """
                    MATCH (c) RETURN DISTINCT labels(c) as types, count(c) as count                
                    """
                    logger.info(f'generating graph summary: {query}')
                    raw = self.convert_to_dict(self.driver.run_sync(query))
                    summary = {}
                    # NOTE(review): 'raw' is rebound inside this loop body
                    # while the for-loop iterates the original list; the
                    # iteration is unaffected, but the name reuse is
                    # confusing — consider renaming the inner variable.
                    for node in raw:
                        labels = node['types']
                        count = node['count']
                        query = f"""
                        MATCH (:{':'.join(labels)})-[e]->(b) WITH DISTINCT e , b 
                        RETURN 
                            type(e) as edge_types, 
                            count(e) as edge_counts,
                            labels(b) as target_labels 
                        """
                        raw = self.convert_to_dict(self.driver.run_sync(query))
                        summary_key = ':'.join(labels)
                        summary[summary_key] = {'nodes_count': count}
                        for row in raw:
                            target_key = ':'.join(row['target_labels'])
                            edge_name = row['edge_types']
                            edge_count = row['edge_counts']
                            summary[summary_key][target_key] = summary[
                                summary_key].get(target_key, {})
                            summary[summary_key][target_key][
                                edge_name] = edge_count
                    self.summary = summary
                    logger.info(
                        f'generated summary for {len(summary)} node types.')
            return self.schema

        async def get_mini_schema(self, source_id, target_id):
            """
            Given either id of source and/or target returns predicates that relate them. And their
            possible labels.
            :param source_id:
            :param target_id:
            :return:
            """
            source_id_syntaxed = f"{{id: \"{source_id}\"}}" if source_id else ''
            target_id_syntaxed = f"{{id: \"{target_id}\"}}" if target_id else ''
            query = f"""
                            MATCH (a{source_id_syntaxed})-[x]->(b{target_id_syntaxed}) WITH
                                [la in labels(a) where la <> 'Concept'] as source_label,
                                [lb in labels(b) where lb <> 'Concept'] as target_label,
                                type(x) as predicate
                            RETURN DISTINCT source_label, predicate, target_label
                        """
            response = await self.driver.run(query)
            response = self.convert_to_dict(response)
            return response

        async def get_node(self, node_type: str, curie: str) -> list:
            """
            Returns a node that matches curie as its ID.
            :param node_type: Type of the node.
            :type node_type:str
            :param curie: Curie.
            :type curie: str
            :return: value of the node in neo4j.
            :rtype: list
            """
            # NOTE(review): curie/node_type are interpolated directly into
            # Cypher — assumes trusted input; confirm upstream sanitization.
            query = f"MATCH (c:`{node_type}`{{id: '{curie}'}}) return c"
            response = await self.driver.run(query)

            data = response.get('results', [{}])[0].get('data', [])
            '''
            data looks like 
            [
            {'row': [{...node data..}], 'meta': [{...}]},
            {'row': [{...node data..}], 'meta': [{...}]},
            {'row': [{...node data..}], 'meta': [{...}]}
            ]            
            '''
            # Flatten the per-result 'row' lists into one list of nodes.
            rows = []
            if len(data):
                from functools import reduce
                rows = reduce(lambda x, y: x + y.get('row', []), data, [])
            return rows

        async def get_single_hops(self, source_type: str, target_type: str,
                                  curie: str) -> list:
            """
            Returns a triplets of source to target where source id is curie.
            :param source_type: Type of the source node.
            :type source_type: str
            :param target_type: Type of target node.
            :type target_type: str
            :param curie: Curie of source node.
            :type curie: str
            :return: list of triplets where each item contains source node, edge, target.
            :rtype: list
            """

            # Outgoing edges from the curie node...
            query = f'MATCH (c:`{source_type}`{{id: \'{curie}\'}})-[e]->(b:`{target_type}`) return distinct c , e, b'
            response = await self.driver.run(query)
            rows = list(
                map(lambda data: data['row'], response['results'][0]['data']))
            # ...plus incoming edges to the curie node.
            query = f'MATCH (c:`{source_type}`{{id: \'{curie}\'}})<-[e]-(b:`{target_type}`) return distinct b , e, c'
            response = await self.driver.run(query)
            rows += list(
                map(lambda data: data['row'], response['results'][0]['data']))

            return rows

        async def run_cypher(self, cypher: str, **kwargs) -> list:
            """
            Runs cypher directly.
            :param cypher: cypher query.
            :type cypher: str
            :return: unprocessed neo4j response.
            :rtype: list
            """
            return await self.driver.run(cypher, **kwargs)

        async def get_sample(self, node_type):
            """
            Returns a few nodes.
            :param node_type: Type of nodes.
            :type node_type: str
            :return: Node dict values.
            :rtype: dict
            """
            query = f"MATCH (c:{node_type}) return c limit 5"
            response = await self.driver.run(query)
            rows = response['results'][0]['data'][0]['row']
            return rows

        async def get_examples(self, source, target=None):
            """
            Returns an example for source node only if target is not specified, if target is specified a sample one hop
            is returned.
            :param source: Node type of the source node.
            :type source: str
            :param target: Node type of the target node.
            :type target: str
            :return: A single source node value if target is not provided. If target is provided too, a triplet.
            :rtype:
            """
            if target:
                query = f"MATCH (source:{source})-[edge]->(target:{target}) return source, edge, target limit 1"
                response = await self.run_cypher(query)
                final = list(
                    map(lambda data: data['row'],
                        response['results'][0]['data']))
                return final
            else:
                query = f"MATCH ({source}:{source}) return {source} limit 1"
                response = await self.run_cypher(query)
                final = list(
                    map(lambda node: node[source],
                        self.driver.convert_to_dict(response)))
                return final

        def get_curie_prefix_by_node_type(self, node_type):
            """
            Collect the distinct CURIE prefixes of all nodes of a type,
            ordered by the biolink model's preferred id_prefixes first,
            with any remaining prefixes appended after.
            :param node_type: biolink node type to inspect.
            :return: ordered list of CURIE prefixes.
            """
            query = f"""
            MATCH (n:`{node_type}`) return collect(n.id) as ids
            """
            logger.info(
                f"starting query {query} on graph... this might take a few")
            result = self.driver.run_sync(query)
            logger.info(f"completed query, collecting node curie prefixes")
            result = self.convert_to_dict(result)
            curie_prefixes = set()
            for i in result[0]['ids']:
                # The prefix is everything before the first ':' of the CURIE.
                curie_prefixes.add(i.split(':')[0])
            # sort according to bl model
            node_bl_def = self.toolkit.get_element(node_type)
            id_prefixes = node_bl_def.id_prefixes
            sorted_curie_prefixes = [
                i for i in id_prefixes if i in curie_prefixes
            ]  # gives precedence to what's in BL
            # add other ids even if not in BL next
            sorted_curie_prefixes += [
                i for i in curie_prefixes if i not in sorted_curie_prefixes
            ]
            return sorted_curie_prefixes

        async def get_meta_kg(self):
            """
            Build (and memoize) the TRAPI meta knowledge graph: every
            subject/predicate/object triple from the schema as 'edges',
            and per-type id_prefixes as 'nodes'.
            :return: dict with 'nodes' and 'edges' keys.
            """
            if self.meta_kg:
                return self.meta_kg
            schema = self.get_schema()
            nodes = {}
            predicates = []
            for subject in schema:
                for object in schema[subject]:
                    for edge_type in schema[subject][object]:
                        predicates.append({
                            'subject': subject,
                            'object': object,
                            'predicate': edge_type
                        })
                    if object not in nodes:
                        nodes[object] = {
                            'id_prefixes':
                            list(self.get_curie_prefix_by_node_type(object))
                        }
                if subject not in nodes:
                    nodes[subject] = {
                        'id_prefixes':
                        list(self.get_curie_prefix_by_node_type(subject))
                    }
            self.meta_kg = {'nodes': nodes, 'edges': predicates}
            return self.meta_kg

        def supports_apoc(self):
            """
            Returns true if apoc is supported by backend database.
            :return: bool true if neo4j supports apoc.
            """
            return self.driver.check_apoc_support()

        async def run_apoc_cover(self, ids: list):
            """
            Runs apoc.algo.cover on list of ids
            :param ids:
            :return: dictionary of edges and source and target nodes ids
            """
            query = f"""
                        MATCH (node:`biolink:NamedThing`)
                        USING INDEX node:`biolink:NamedThing`(id)
                        WHERE node.id in {ids}
                        WITH collect(node) as nodes
                        CALL apoc.algo.cover(nodes) yield rel
                        WITH {{subject: startNode(rel).id ,
                               object: endNode(rel).id,
                               predicate: type(rel),
                               edge: rel }} as row
                        return collect(row) as result                                        
                        """
            result = self.convert_to_dict(self.driver.run_sync(query))
            return result

        def convert_to_dict(self, result):
            """Delegate raw-response-to-dict conversion to the driver."""
            return self.driver.convert_to_dict(result)
Пример #7
0
class WrappedBMT:
    """
    Wrapping around some of the BMT Toolkit functions
    to provide case conversions to the new format.

    New format: CURIE-style, e.g. ``biolink:GeneOrGeneProduct`` /
    ``biolink:related_to``.  Old format: natural language, e.g.
    ``gene or gene product`` / ``related to`` (what BMT lookups expect).
    """
    def __init__(self):
        self.bmt = BMToolkit()
        # All slot (predicate) names in old format, e.g. "related to".
        self.all_slots = self.bmt.get_all_slots()
        # The same slots in new format, e.g. "biolink:related_to".
        self.all_slots_formatted = [
            "biolink:" + s.replace(" ", "_") for s in self.all_slots
        ]
        self.prefix = "biolink:"

        # PascalCase class name -> id_prefixes, for biolink classes
        # that declare any id_prefixes.
        self.entity_prefix_mapping = {
            bmt.util.format(el_name, case="pascal"): id_prefixes
            for el_name in self.bmt.get_all_classes()
            if (el := self.bmt.get_element(el_name)) is not None
            if (id_prefixes := getattr(el, "id_prefixes", []))
        }

    def new_case_to_old_case(self, s):
        """
        Convert new biolink case format (biolink:GeneOrGeneProduct)
        to old case format (gene or gene product)

        Also works with slots (biolink:related_to -> related to)
        """
        # BUGFIX: test slot membership *before* stripping the prefix.
        # Every entry of all_slots_formatted carries the "biolink:"
        # prefix, so the previous order (strip first, then test) could
        # never match; slots incorrectly fell through to camel_to_snake
        # and came back as "related_to" instead of "related to".
        if s in self.all_slots_formatted:
            return s.replace(self.prefix, "").replace("_", " ")
        return camel_to_snake(s.replace(self.prefix, ""))

    def old_case_to_new_case(self, s):
        """
        Convert old case format (gene or gene product)
        to new biolink case format (biolink:GeneOrGeneProduct)

        Also works with slots (related to -> biolink:related_to)
        """
        if s in self.all_slots:
            # Known slot: underscore-join the words.
            return self.prefix + s.replace(" ", "_")
        # Otherwise assume a class name and PascalCase it.
        return self.prefix + snake_to_camel(s)

    def get_descendants(self, concept):
        """Wrapped BMT descendants function that does case conversions"""
        descendants = self.bmt.get_descendants(concept, formatted=True)
        if len(descendants) == 0:
            # A concept unknown to BMT is treated as its own descendant.
            descendants.append(concept)
        return descendants

    def get_ancestors(self, concept, reflexive=True):
        """Wrapped BMT ancestors function that does case conversions"""
        concept_old_format = self.new_case_to_old_case(concept)
        ancestors_old_format = self.bmt.get_ancestors(concept_old_format,
                                                      reflexive=reflexive)
        return [self.old_case_to_new_case(a) for a in ancestors_old_format]

    def predicate_is_symmetric(self, predicate):
        """Get whether a given predicate is symmetric"""
        predicate_old_format = self.new_case_to_old_case(predicate)
        predicate_element = self.bmt.get_element(predicate_old_format)
        if not predicate_element:
            # Not in the biolink model
            return False
        # Coerce to bool: the model may store None for "not symmetric",
        # and callers expect a boolean answer (truthiness unchanged).
        return bool(predicate_element.symmetric)

    def predicate_inverse(self, predicate):
        """Get the inverse of a predicate if it has one"""
        predicate_old_format = self.new_case_to_old_case(predicate)
        predicate_element = self.bmt.get_element(predicate_old_format)
        if not predicate_element:
            # Not in the biolink model
            return None
        if predicate_element.symmetric:
            # Symmetric predicates are their own inverse.
            return predicate
        predicate_inverse_old_format = predicate_element.inverse
        if not predicate_inverse_old_format:
            # No inverse
            return None
        return self.old_case_to_new_case(predicate_inverse_old_format)