Example #1
def get_all_common_ancestors(node_ids: List[str],
                             ontology: Ontology,
                             min_distance_from_root: int = 0,
                             nodeids_blacklist: List[str] = None):
    # check if all ids are connected to the same root node
    common_root = None
    for node_id in node_ids:
        onto_node = ontology.node(node_id)
        if "meta" in onto_node and "basicPropertyValues" in onto_node["meta"]:
            for basic_prop_val in onto_node["meta"]["basicPropertyValues"]:
                if basic_prop_val["pred"] == "OIO:hasOBONamespace":
                    if common_root and common_root != basic_prop_val["val"]:
                        raise ValueError(
                            "Cannot get common ancestors of nodes connected to different roots"
                        )
                    common_root = basic_prop_val["val"]
    ancestors = defaultdict(list)
    for node_id in node_ids:
        for ancestor in ontology.ancestors(node=node_id, reflexive=True):
            onto_anc = ontology.node(ancestor)
            onto_anc_root = None
            if "meta" in onto_anc and "basicPropertyValues" in onto_anc["meta"]:
                for basic_prop_val in onto_anc["meta"]["basicPropertyValues"]:
                    if basic_prop_val["pred"] == "OIO:hasOBONamespace":
                        onto_anc_root = basic_prop_val["val"]
            if onto_anc["depth"] >= min_distance_from_root and (not onto_anc_root or onto_anc_root == common_root) \
                and (not nodeids_blacklist or ancestor not in nodeids_blacklist):
                ancestors[ancestor].append(node_id)
    return [(ancestor, ontology.label(ancestor), set(covered_nodes))
            for ancestor, covered_nodes in ancestors.items()
            if len(covered_nodes) > 1 or ancestor == covered_nodes[0]]
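The heart of this helper is the inversion step: ancestors are first collected per starting node, then the map is flipped so that each ancestor lists the starting nodes it covers, and only ancestors covering more than one node (or just themselves, via the reflexive walk) are kept. A minimal, self-contained sketch of that aggregation, with a hard-coded ancestor map standing in for a real Ontology:

from collections import defaultdict

# Toy data (hypothetical GO-style ids): each starting node mapped to its reflexive ancestors.
toy_ancestors = {
    "GO:1": ["GO:1", "GO:10", "GO:100"],
    "GO:2": ["GO:2", "GO:10", "GO:100"],
    "GO:3": ["GO:3", "GO:100"],
}

coverage = defaultdict(set)
for node_id, ancs in toy_ancestors.items():
    for anc in ancs:
        coverage[anc].add(node_id)

# Keep ancestors covering more than one starting node, or covering only themselves.
common = {anc: nodes for anc, nodes in coverage.items()
          if len(nodes) > 1 or anc in nodes}
print(common)  # GO:10 -> {GO:1, GO:2}; GO:100 -> {GO:1, GO:2, GO:3}; each node also covers itself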
Example #2
def materialize_inferences(ontology_graph: ontol.Ontology, annotation):
    materialized_annotations = []  #(gp, new_mf)

    mf = annotation["object"]["id"]
    gp = annotation["subject"]["id"]
    global __ancestors_cache
    mf_ancestors = ancestors(mf, ontology_graph, __ancestors_cache)

    # if mf_ancestors:
    #     logger.info("For {term} \"{termdef}\":".format(term=mf, termdef=ontology_graph.label(mf)))
    messages = []

    for mf_anc in mf_ancestors:
        has_part_mfs = neighbor_by_relation(ontology_graph, mf_anc, HAS_PART)

        # if has_part_mfs:
        #     logger.info("\tHas Parent --> {parent} \"{parentdef}\"".format(parent=mf_anc, parentdef=ontology_graph.label(mf_anc)))
        if has_part_mfs:
            messages.append((gp, mf, mf_anc, has_part_mfs))

        for new_mf in has_part_mfs:
            # logger.info("\t\thas_part --> {part} \"{partdef}\"".format(part=new_mf, partdef=ontology_graph.label(new_mf)))

            new_annotation = transform_relation(annotation, new_mf,
                                                ontology_graph)
            materialized_annotations.append(new_annotation)

    messages = [message for message in messages
                if message[3]]  # Filter out empty has_parts
    for message in messages:
        logger.info("\nFor {gp} -> {term} \"{termdef}\":".format(
            gp=message[0],
            term=message[1],
            termdef=ontology_graph.label(message[1])))
        logger.info("\tHas Parent --> {parent} \"{parentdef}\"".format(
            parent=message[1], parentdef=ontology_graph.label(message[1])))
        for part in message[3]:
            logger.info("\t\t has_part --> {part} \"{partdef}\"".format(
                part=part, partdef=ontology_graph.label(part)))

    return materialized_annotations
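Read top to bottom, the routine expands one annotation: for every ancestor of the annotated molecular-function term it collects the ancestor's has_part neighbours and emits a new annotation per part. The sketch below mimics that expansion with hard-coded toy mappings; in the real code, ancestors, neighbor_by_relation and transform_relation (assumed to be defined elsewhere in the same module) do this against the ontology graph:

# Hypothetical toy data standing in for the module helpers used above.
annotation = {"subject": {"id": "GeneA"}, "object": {"id": "GO:0003674"}}
toy_ancestors = {"GO:0003674": ["GO:0099999"]}    # mf -> ancestor terms
toy_has_part = {"GO:0099999": ["GO:0005215"]}     # ancestor -> has_part terms

materialized = []
for anc in toy_ancestors.get(annotation["object"]["id"], []):
    for part in toy_has_part.get(anc, []):
        new_ann = dict(annotation)                # transform_relation would also rewrite qualifiers/evidence
        new_ann["object"] = {"id": part}
        materialized.append(new_ann)
print(materialized)  # one inferred annotation: GeneA -> GO:0005215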
Example #3
def expand_tsv(input: str,
               ontology: Ontology = None,
               outfile=None,
               sep='\t',
               cols: List[str] = None) -> None:
    """
    Adds additional columns to a TSV by performing ontology lookups

    For example, given a TSV with a column `term`, this can add a column `term_label`.
    In future it may also add closures.

    :param input: filename of a TSV (must have column headers)
    :param ontology: used for lookup
    :param outfile: pathname for output file
    :param sep: delimiter
    :param cols: names of the columns containing ontology identifiers to look up labels for
    :return:
    """
    with open(input, newline='') as io:
        reader = csv.DictReader(io, delimiter=sep)
        outwriter = csv.writer(outfile, delimiter=sep)
        first = True
        for row in reader:
            if first:
                first = False
                hdr = []
                for k in row.keys():
                    hdr.append(k)
                    if k in cols:
                        hdr.append(f'{k}_label')
                outwriter.writerow(hdr)
            vals = []
            for k, v in row.items():
                vals.append(v)
                if k in cols:
                    id = row[k]
                    label = ontology.label(id)
                    if label is None:
                        logging.warning(f"No label found for id: {id}")
                    vals.append(label)
                    #item[f'{k}_label'] = label
            outwriter.writerow(vals)
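A short, hedged usage sketch for the function above (it assumes expand_tsv and its csv/logging imports are in scope; the stub ontology and file name are purely illustrative, whereas a real call would pass an ontobio Ontology):

import sys

class StubOntology:
    def label(self, id):
        # Tiny hard-coded lookup standing in for a real ontology.
        return {"GO:0008150": "biological_process"}.get(id)

with open("terms.tsv", "w") as f:
    f.write("term\tvalue\nGO:0008150\t42\n")

expand_tsv("terms.tsv", ontology=StubOntology(), outfile=sys.stdout, cols=["term"])
# writes: term / term_label / value, then GO:0008150 / biological_process / 42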
Example #4
def get_all_common_ancestors(node_ids: List[str],
                             ontology: Ontology,
                             min_distance_from_root: int = 0,
                             nodeids_blacklist: List[str] = None):
    """
    Retrieve all common ancestors for the provided list of nodes

    Args:
        node_ids (List[str]): list of starting nodes
        ontology (Ontology): the ontology to which the provided nodes belong
        min_distance_from_root (int): minimum distance from root node
        nodeids_blacklist (List[str]): node ids to be excluded from the result

    Returns:
        List[CommonAncestor]: list of common ancestors
    """
    common_root = nodes_have_same_root(node_ids=node_ids, ontology=ontology)
    if common_root is False:
        raise ValueError(
            "Cannot get common ancestors of nodes connected to different roots"
        )
    ancestors = defaultdict(list)
    for node_id in node_ids:
        for ancestor in ontology.ancestors(node=node_id, reflexive=True):
            onto_anc = ontology.node(ancestor)
            onto_anc_root = None
            if "meta" in onto_anc and "basicPropertyValues" in onto_anc["meta"]:
                for basic_prop_val in onto_anc["meta"]["basicPropertyValues"]:
                    if basic_prop_val["pred"] == "OIO:hasOBONamespace":
                        onto_anc_root = basic_prop_val["val"]
            if (ancestor in node_ids
                    or onto_anc["depth"] >= min_distance_from_root) and (
                        not onto_anc_root or onto_anc_root
                        == common_root) and (not nodeids_blacklist or ancestor
                                             not in nodeids_blacklist):
                ancestors[ancestor].append(node_id)
    return [
        CommonAncestor(node_id=ancestor,
                       node_label=ontology.label(ancestor),
                       covered_starting_nodes=set(covered_nodes))
        for ancestor, covered_nodes in ancestors.items()
        if len(covered_nodes) > 1 or ancestor == covered_nodes[0]
    ]
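This variant returns CommonAncestor objects rather than bare tuples; the type itself is not shown in the snippet. A plausible minimal definition, purely illustrative of the fields the list comprehension fills in:

from collections import namedtuple

# Illustrative only: the real CommonAncestor is defined elsewhere in the package.
CommonAncestor = namedtuple("CommonAncestor",
                            ["node_id", "node_label", "covered_starting_nodes"])

ca = CommonAncestor(node_id="GO:0003674",
                    node_label="molecular_function",
                    covered_starting_nodes={"GO:0016301", "GO:0003824"})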
Example #5
class LexicalMapEngine():
    """
    generates lexical matches between pairs of ontology classes
    """

    SCORE = 'score'
    LEXSCORE = 'lexscore'
    SIMSCORES = 'simscores'
    CONDITIONAL_PR = 'cpr'

    def __init__(self, wsmap=default_wsmap(), config=None):
        """
        Arguments
        ---------
        wsmap: dict
            maps words to normalized synonyms.
        config: dict
            A configuration conforming to LexicalMapConfigSchema
        """
        # maps label or syn value to Synonym object
        self.lmap = {}
        # maps node id to synonym objects
        self.smap = {}
        self.wsmap = wsmap
        self.npattern = re.compile(r'[\W_]+')
        self.exclude_obsolete = True
        self.ontology_pairs = None
        self.id_to_ontology_map = defaultdict(list)
        self.merged_ontology = Ontology()
        self.config = config if config is not None else {}
        self.stats = {}

    def index_ontologies(self, onts):
        logging.info('Indexing: {}'.format(onts))
        for ont in onts:
            self.index_ontology(ont)

    def index_ontology(self, ont):
        """
        Adds an ontology to the index

        This iterates through all labels and synonyms in the ontology, creating an index
        """
        self.merged_ontology.merge([ont])
        syns = ont.all_synonyms(include_label=True)

        include_id = self._is_meaningful_ids()
        logging.info("Include IDs as synonyms: {}".format(include_id))
        if include_id:
            for n in ont.nodes():
                v = n
                # Get fragment
                if v.startswith('http'):
                    v = re.sub('.*/', '', v)
                    v = re.sub('.*#', '', v)
                syns.append(Synonym(n, val=v, pred='label'))

        logging.info("Indexing {} syns in {}".format(len(syns), ont))
        logging.info("Distinct lexical values: {}".format(len(
            self.lmap.keys())))
        for syn in syns:
            self.index_synonym(syn, ont)
        for nid in ont.nodes():
            self.id_to_ontology_map[nid].append(ont)

    def label(self, nid):
        return self.merged_ontology.label(nid)

    def index_synonym(self, syn, ont):
        """
        Index a synonym

        Typically not called from outside this object; called by `index_ontology`
        """
        if not syn.val:
            if syn.pred == 'label':
                if not self._is_meaningful_ids():
                    if not ont.is_obsolete(syn.class_id):
                        pass
                        #logging.error('Use meaningful ids if label not present: {}'.format(syn))
            else:
                logging.warning("Incomplete syn: {}".format(syn))
            return
        if self.exclude_obsolete and ont.is_obsolete(syn.class_id):
            return

        syn.ontology = ont
        prefix, _ = ont.prefix_fragment(syn.class_id)

        v = syn.val

        caps_match = re.match('[A-Z]+', v)
        if caps_match:
            # if the leading caps run covers at least a third of the label, assume abbreviation
            if caps_match.span()[1] >= len(v) / 3:
                syn.is_abbreviation(True)

        # chebi 'synonyms' are often not real synonyms
        # https://github.com/ebi-chebi/ChEBI/issues/3294
        if not re.match('.*[a-zA-Z]', v):
            if prefix != 'CHEBI':
                logging.warning('Ignoring suspicious synonym: {}'.format(syn))
            return

        v = self._standardize_label(v)

        # TODO: do this once ahead of time
        wsmap = {}
        for w, s in self.wsmap.items():
            wsmap[w] = s
        for ss in self._get_config_val(prefix, 'synsets', []):
            # TODO: weights
            wsmap[ss['synonym']] = ss['word']
        nv = self._normalize_label(v, wsmap)

        self._index_synonym_val(syn, v)
        nweight = self._get_config_val(prefix, 'normalized_form_confidence',
                                       0.8)
        if nweight > 0 and not syn.is_abbreviation():
            if nv != v:
                nsyn = Synonym(syn.class_id,
                               val=syn.val,
                               pred=syn.pred,
                               lextype=syn.lextype,
                               ontology=ont,
                               confidence=syn.confidence * nweight)
                self._index_synonym_val(nsyn, nv)

    def _index_synonym_val(self, syn, v):
        lmap = self.lmap
        smap = self.smap
        cid = syn.class_id
        if v not in lmap:
            lmap[v] = []
        lmap[v].append(syn)
        if cid not in smap:
            smap[cid] = []
        smap[cid].append(syn)

    def _standardize_label(self, v):
        # Add spaces separating camelcased strings
        v = re.sub('([a-z])([A-Z])', r'\1 \2', v)

        # always use lowercase when comparing
        # we may want to make this configurable in future
        v = v.lower()
        return v

    def _normalize_label(self, s, wsmap):
        """
        normalized form of a synonym
        """
        toks = []
        for tok in list(set(self.npattern.sub(' ', s).split(' '))):
            if tok in wsmap:
                tok = wsmap[tok]
            if tok != "":
                toks.append(tok)
        toks.sort()
        return " ".join(toks)

    def _get_config_val(self, prefix, k, default=None):
        v = None
        for oc in self.config.get('ontology_configurations', []):
            if prefix == oc.get('prefix', ''):
                v = oc.get(k, None)
        if v is None:
            v = self.config.get(k, None)
        if v is None:
            v = default
        return v

    def _is_meaningful_ids(self):
        return self.config.get('meaningful_ids', False)

    def find_equiv_sets(self):
        return self.lmap

    def get_xref_graph(self):
        """

        Generate mappings based on lexical properties and return as nx graph.

        Algorithm
        ~~~~~~~~~

        - A dictionary is stored between ref:`Synonym` values and synonyms. See ref:`index_synonym`.
          Note that Synonyms include the primary label

        - Each key in the dictionary is examined to determine if there exist two Synonyms from
          different ontology classes

        This avoids N^2 pairwise comparisons: instead the time taken is linear

        After initial mapping is made, additional scoring is performed on each mapping

        Edge properties
        ~~~~~~~~~~~~~~~
        The return object is a nx graph, connecting pairs of ontology classes.

        Edges are annotated with metadata about how the match was found:

        syns: pair
            pair of `Synonym` objects, corresponding to the synonyms for the two nodes
        score: int
            score indicating strength of mapping, between 0 and 100

        Returns
        -------
        Graph
            nx graph (bidirectional)
        """

        # initial graph; all matches
        g = nx.MultiDiGraph()

        # lmap collects all syns by token
        items = self.lmap.items()
        logging.info("collecting initial xref graph, items={}".format(
            len(items)))
        i = 0
        sum_nsyns = 0
        n_skipped = 0
        has_self_comparison = False
        if self.ontology_pairs:
            for (o1id, o2id) in self.ontology_pairs:
                if o1id == o2id:
                    has_self_comparison = True

        for (v, syns) in items:
            sum_nsyns += len(syns)
            i += 1
            if i % 1000 == 1:
                logging.info(
                    '{}/{}  lexical items avgSyns={}, skipped={}'.format(
                        i, len(items), sum_nsyns / len(items), n_skipped))
            if len(syns) < 2:
                n_skipped += 1
                continue
            if len(syns) > 10:
                logging.info('Syns for {} = {}'.format(v, len(syns)))
            for s1 in syns:
                s1oid = s1.ontology.id
                s1cid = s1.class_id
                for s2 in syns:
                    # optimization step: although this is redundant with _is_comparable,
                    # we avoid inefficient additional calls
                    if s1oid == s2.ontology.id and not has_self_comparison:
                        continue
                    if s1cid != s2.class_id:
                        if self._is_comparable(s1, s2):
                            g.add_edge(s1.class_id, s2.class_id, syns=(s1, s2))

        logging.info("getting best supporting synonym pair for each match")
        # graph of best matches
        xg = nx.Graph()
        for i in g.nodes():
            for j in g.neighbors(i):
                best = 0
                bestm = None
                for m in g.get_edge_data(i, j).values():
                    (s1, s2) = m['syns']
                    score = self._combine_syns(s1, s2)
                    if score > best:
                        best = score
                        bestm = m
                syns = bestm['syns']
                xg.add_edge(i,
                            j,
                            score=best,
                            lexscore=best,
                            syns=syns,
                            idpair=(i, j))

        self.score_xrefs_by_semsim(xg)
        self.assign_best_matches(xg)
        if self.merged_ontology.xref_graph is not None:
            self.compare_to_xrefs(xg, self.merged_ontology.xref_graph)
        else:
            logging.error("No xref graph for merged ontology")
        logging.info("finished xref graph")
        return xg
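    # Worked illustration of the linear-time step described in the docstring above
    # (a comment-only sketch; the class ids are hypothetical): if two indexed
    # ontologies each contain a class whose normalized label is "heart", then
    #     self.lmap["heart"] -> [Synonym(ONTA:0001, ...), Synonym(ONTB:0007, ...)]
    # and candidate xref edges fall out of a single pass over lmap.items(),
    # rather than an O(N^2) comparison of every class against every other class.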

    # true if syns s1 and s2 should be compared.
    #  - if ontology_pairs is set, then only consider (s1,s2) if their respective source ontologies are in the list of pairs
    #  - otherwise compare all classes, but only in one direction
    def _is_comparable(self, s1, s2):
        if s1.class_id == s2.class_id:
            return False
        if self.ontology_pairs is not None:
            #logging.debug('TEST: {}{} in {}'.format(s1.ontology.id, s2.ontology.id, self.ontology_pairs))
            return (s1.ontology.id, s2.ontology.id) in self.ontology_pairs
        else:
            return s1.class_id < s2.class_id

    def _blanket(self, nid):
        nodes = set()
        for ont in self.id_to_ontology_map[nid]:
            nodes.update(ont.ancestors(nid))
            nodes.update(ont.descendants(nid))
        return list(nodes)

    def score_xrefs_by_semsim(self, xg, ont=None):
        """
        Given an xref graph (see ref:`get_xref_graph`), this will adjust scores based on
        the semantic similarity of matches.
        """
        logging.info(
            "scoring xrefs by semantic similarity for {} nodes in {}".format(
                len(xg.nodes()), ont))
        for (i, j, d) in xg.edges(data=True):
            pfx1 = self._id_to_ontology(i)
            pfx2 = self._id_to_ontology(j)
            ancs1 = self._blanket(i)
            ancs2 = self._blanket(j)
            s1, _, _ = self._sim(xg, ancs1, ancs2, pfx1, pfx2)
            s2, _, _ = self._sim(xg, ancs2, ancs1, pfx2, pfx1)
            s = 1 - ((1 - s1) * (1 - s2))
            logging.debug("Score {} x {} = {} x {} = {} // {}".format(
                i, j, s1, s2, s, d))
            xg[i][j][self.SIMSCORES] = (s1, s2)
            xg[i][j][self.SCORE] *= s

    def _sim(self, xg, ancs1, ancs2, pfx1, pfx2):
        """
        Compare two lineages
        """
        xancs1 = set()
        for a in ancs1:
            if a in xg:
                # TODO: restrict this to neighbors in single ontology
                for n in xg.neighbors(a):
                    pfx = self._id_to_ontology(n)
                    if pfx == pfx2:
                        xancs1.add(n)
        logging.debug('SIM={}/{} ## {} // {}'.format(len(xancs1.intersection(ancs2)),
                                                     len(xancs1),
                                                     xancs1.intersection(ancs2),
                                                     xancs1))
        n_shared = len(xancs1.intersection(ancs2))
        n_total = len(xancs1)
        return (1 + n_shared) / (1 + n_total), n_shared, n_total

    # given an ontology class id,
    # return map keyed by ontology id, value is a list of (score, ext_class_id) pairs
    def _neighborscores_by_ontology(self, xg, nid):
        xrefmap = defaultdict(list)
        for x in xg.neighbors(nid):
            score = xg[nid][x][self.SCORE]
            for ont in self.id_to_ontology_map[x]:
                xrefmap[ont.id].append((score, x))
        return xrefmap

    # normalize direction
    def _dirn(self, edge, i, j):
        if edge['idpair'] == (i, j):
            return 'fwd'
        elif edge['idpair'] == (j, i):
            return 'rev'
        else:
            return None

    def _id_to_ontology(self, id):
        return self.merged_ontology.prefix(id)
        #onts = self.id_to_ontology_map[id]
        #if len(onts) > 1:
        #    logging.warning(">1 ontology for {}".format(id))

    def compare_to_xrefs(self, xg1, xg2):
        """
        Compares a base xref graph with another one
        """
        ont = self.merged_ontology
        for (i, j, d) in xg1.edges(data=True):
            ont_left = self._id_to_ontology(i)
            ont_right = self._id_to_ontology(j)
            unique_lr = True
            num_xrefs_left = 0
            same_left = False
            if i in xg2:
                for j2 in xg2.neighbors(i):
                    ont_right2 = self._id_to_ontology(j2)
                    if ont_right2 == ont_right:
                        unique_lr = False
                        num_xrefs_left += 1
                        if j2 == j:
                            same_left = True
            unique_rl = True
            num_xrefs_right = 0
            same_right = False
            if j in xg2:
                for i2 in xg2.neighbors(j):
                    ont_left2 = self._id_to_ontology(i2)
                    if ont_left2 == ont_left:
                        unique_rl = False
                        num_xrefs_right += 1
                        if i2 == i:
                            same_right = True

            (x, y) = d['idpair']
            xg1[x][y]['left_novel'] = num_xrefs_left == 0
            xg1[x][y]['right_novel'] = num_xrefs_right == 0
            xg1[x][y]['left_consistent'] = same_left
            xg1[x][y]['right_consistent'] = same_right

    def assign_best_matches(self, xg):
        """
        For each node in the xref graph, tag best match edges
        """
        logging.info("assigning best matches for {} nodes".format(
            len(xg.nodes())))
        for i in xg.nodes():
            xrefmap = self._neighborscores_by_ontology(xg, i)
            for (ontid, score_node_pairs) in xrefmap.items():
                score_node_pairs.sort(reverse=True)
                (best_score, best_node) = score_node_pairs[0]
                logging.info("BEST for {}: {} in {} from {}".format(
                    i, best_node, ontid, score_node_pairs))
                edge = xg[i][best_node]
                dirn = self._dirn(edge, i, best_node)
                best_kwd = 'best_' + dirn
                if len(score_node_pairs
                       ) == 1 or score_node_pairs[0] > score_node_pairs[1]:
                    edge[best_kwd] = 2
                else:
                    edge[best_kwd] = 1
                for (score, j) in score_node_pairs:
                    edge_ij = xg[i][j]
                    dirn_ij = self._dirn(edge_ij, i, j)
                    edge_ij['cpr_' + dirn_ij] = score / sum(
                        [s for s, _ in score_node_pairs])
        for (i, j, edge) in xg.edges(data=True):
            # reciprocal score is set if (A) i is best for j, and (B) j is best for i
            rs = 0
            if 'best_fwd' in edge and 'best_rev' in edge:
                rs = edge['best_fwd'] * edge['best_rev']
            edge['reciprocal_score'] = rs
            edge['cpr'] = edge['cpr_fwd'] * edge['cpr_rev']

    def _best_match_syn(self, sx, sys, scope_map):
        """
        The best match is determined by the highest magnitude weight
        """
        SUBSTRING_WEIGHT = 0.2
        WBEST = None
        sbest = None
        sxv = self._standardize_label(sx.val)
        sxp = self._id_to_ontology(sx.class_id)
        for sy in sys:
            syv = self._standardize_label(sy.val)
            syp = self._id_to_ontology(sy.class_id)
            W = None
            if sxv == syv:
                confidence = sx.confidence * sy.confidence
                if sx.is_abbreviation() or sy.is_abbreviation():
                    confidence *= self._get_config_val(
                        sxp, 'abbreviation_confidence', 0.5)
                    confidence *= self._get_config_val(
                        syp, 'abbreviation_confidence', 0.5)
                W = scope_map[sx.scope()][sy.scope()] + logit(confidence / 2)
            elif sxv in syv:
                W = np.array((-SUBSTRING_WEIGHT, SUBSTRING_WEIGHT, 0, 0))
            elif syv in sxv:
                W = np.array((SUBSTRING_WEIGHT, -SUBSTRING_WEIGHT, 0, 0))
            if W is not None:
                # The best match is determined by the highest magnitude weight
                if WBEST is None or max(abs(W)) > max(abs(WBEST)):
                    WBEST = W
                    sbest = sy
        return WBEST, sbest

    def weighted_axioms(self, x, y, xg):
        """
        return a tuple (sub,sup,equiv,other) indicating estimated prior probabilities for an interpretation of a mapping
        between x and y.

        See kboom paper
        """
        # TODO: allow additional weighting
        # weights are log odds w=log(p/(1-p))
        # (Sub,Sup,Eq,Other)
        scope_pairs = [('label', 'label', 0.0, 0.0, 3.0, -0.8),
                       ('label', 'exact', 0.0, 0.0, 2.5, -0.5),
                       ('label', 'broad', -1.0, 1.0, 0.0, 0.0),
                       ('label', 'narrow', 1.0, -1.0, 0.0, 0.0),
                       ('label', 'related', 0.0, 0.0, 0.0, 0.0),
                       ('exact', 'exact', 0.0, 0.0, 2.5, -0.5),
                       ('exact', 'broad', -1.0, 1.0, 0.0, 0.0),
                       ('exact', 'narrow', 1.0, -1.0, 0.0, 0.0),
                       ('exact', 'related', 0.0, 0.0, 0.0, 0.0),
                       ('related', 'broad', -0.5, 0.5, 0.0, 0.0),
                       ('related', 'narrow', 0.5, -0.5, 0.0, 0.0),
                       ('related', 'related', 0.0, 0.0, 0.0, 0.0),
                       ('broad', 'broad', 0.0, 0.0, 0.0, 1.0),
                       ('broad', 'narrow', -0.5, 0.5, 0.0, 0.2),
                       ('narrow', 'narrow', 0.0, 0.0, 0.0, 0.0)]
        # populate symmetric lookup matrix
        scope_map = defaultdict(dict)
        for (l, r, w1, w2, w3, w4) in scope_pairs:
            l = l.upper()
            r = r.upper()
            scope_map[l][r] = np.array((w1, w2, w3, w4))
            scope_map[r][l] = np.array((w2, w1, w3, w4))

        # TODO: get prior based on ontology pair
        # cumulative sum of weights
        WS = None
        pfx1 = self._id_to_ontology(x)
        pfx2 = self._id_to_ontology(y)
        for mw in self.config.get('match_weights', []):
            mpfx1 = mw.get('prefix1', '')
            mpfx2 = mw.get('prefix2', '')
            X = np.array(mw['weights'])
            if mpfx1 == pfx1 and mpfx2 == pfx2:
                WS = X
            elif mpfx2 == pfx1 and mpfx1 == pfx2:
                WS = self._flipweights(X)
            elif mpfx1 == pfx1 and mpfx2 == '' and WS is None:
                WS = X
            elif mpfx2 == pfx1 and mpfx1 == '' and WS is None:
                WS = self._flipweights(X)

        if WS is None:
            WS = np.array((0.0, 0.0, 0.0, 0.0))
        # defaults
        WS += np.array(
            self.config.get('default_weights', [0.0, 0.0, 1.5, -0.1]))
        logging.info('WS defaults={}'.format(WS))

        for xw in self.config.get('xref_weights', []):
            left = xw.get('left', '')
            right = xw.get('right', '')
            X = np.array(xw['weights'])
            if x == left and y == right:
                WS += X
                logging.info('MATCH: {} for {}-{}'.format(X, x, y))
            elif y == left and x == right:
                WS += self._flipweights(X)
                logging.info('IMATCH: {}'.format(X))

        smap = self.smap
        # TODO: symmetrical
        WT = np.array((0.0, 0.0, 0.0, 0.0))
        WBESTMAX = np.array((0.0, 0.0, 0.0, 0.0))
        n = 0
        for sx in smap[x]:
            WBEST, _ = self._best_match_syn(sx, smap[y], scope_map)
            if WBEST is not None:
                WT += WBEST
                n += 1
                if max(abs(WBEST)) > max(abs(WBESTMAX)):
                    WBESTMAX = WBEST
        for sy in smap[y]:
            WBEST, _ = self._best_match_syn(sy, smap[x], scope_map)
            if WBEST is not None:
                WT += WBEST
                n += 1
        # average best match
        if n > 0:
            logging.info('Adding BESTMAX={}'.format(WBESTMAX))
            WS += WBESTMAX

        # TODO: xref, many to many
        WS += self._graph_weights(x, y, xg)
        # TODO: include additional defined weights, eg ORDO
        logging.info('Adding WS, gw={}'.format(WS))

        # jaccard similarity
        (ss1, ss2) = xg[x][y][self.SIMSCORES]
        WS[3] += ((1 - ss1) + (1 - ss2)) / 2

        # reciprocal best hits are higher confidence of equiv
        rs = xg[x][y]['reciprocal_score']
        if rs == 4:
            WS[2] += 0.5
        if rs == 0:
            WS[2] -= 0.2

        #P = np.expit(WS)
        P = 1 / (1 + np.exp(-WS))
        logging.info('Final WS={}, init P={}'.format(WS, P))
        # probs should sum to 1.0
        P = P / np.sum(P)
        return P

    def _graph_weights(self, x, y, xg):
        ont = self.merged_ontology
        xancs = ont.ancestors(x)
        yancs = ont.ancestors(y)
        pfx = self._id_to_ontology(x)
        pfy = self._id_to_ontology(y)
        xns = [
            n for n in xg.neighbors(y)
            if n != x and pfx == self._id_to_ontology(n)
        ]
        yns = [
            n for n in xg.neighbors(x)
            if n != y and pfy == self._id_to_ontology(n)
        ]
        pweight = 1.0
        W = np.array((0, 0, 0, 0))
        card = '11'
        if len(xns) > 0:
            card = 'm1'
            for x2 in xns:
                if x2 in xancs:
                    W[0] += pweight
                if x in ont.ancestors(x2):
                    W[1] += pweight
        if len(yns) > 0:
            if card == '11':
                card = '1m'
            else:
                card = 'mm'
            for y2 in yns:
                if y2 in yancs:
                    W[1] += pweight
                if y in ont.ancestors(y2):
                    W[0] += pweight

        logging.debug('CARD: {}/{} <-> {}/{} = {} // X={} Y={} // W={}'.format(
            x, pfx, y, pfy, card, xns, yns, W))
        invcard = card
        if card == '1m':
            invcard = 'm1'
        elif card == 'm1':
            invcard = '1m'

        CW = None
        DEFAULT_CW = None
        for cw in self.config.get('cardinality_weights', []):
            if 'prefix1' not in cw and 'prefix2' not in cw:
                if card == cw['cardinality']:
                    DEFAULT_CW = np.array(cw['weights'])
                if invcard == cw['cardinality']:
                    DEFAULT_CW = self._flipweights(np.array(cw['weights']))
            if 'prefix1' in cw and 'prefix2' in cw:
                if pfx == cw['prefix1'] and pfy == cw[
                        'prefix2'] and card == cw['cardinality']:
                    CW = np.array(cw['weights'])

                if pfx == cw['prefix2'] and pfy == cw[
                        'prefix1'] and invcard == cw['cardinality']:
                    CW = self._flipweights(np.array(cw['weights']))

        if CW is None:
            if DEFAULT_CW is not None:
                CW = DEFAULT_CW
            else:
                if card == '11':
                    CW = np.array((0.0, 0.0, 1.0, 0.0))
                elif card == '1m':
                    CW = np.array((0.6, 0.4, 0.0, 0.0))
                elif card == 'm1':
                    CW = np.array((0.4, 0.6, 0.0, 0.0))
                elif card == 'mm':
                    CW = np.array((0.2, 0.2, 0.0, 0.5))
        return W + CW

    def _flipweights(self, W):
        return np.array((W[1], W[0], W[2], W[3]))

    def grouped_mappings(self, id):
        """
        return all mappings for a node, grouped by ID prefix
        """
        g = self.get_xref_graph()
        m = {}
        for n in g.neighbors(id):
            [prefix, local] = n.split(':')
            if prefix not in m:
                m[prefix] = []
            m[prefix].append(n)
        return m

    def unmapped_nodes(self, xg, rs_threshold=0):
        unmapped_set = set()
        for nid in self.merged_ontology.nodes():
            if nid in xg:
                for (j, edge) in xg[nid].items():
                    rs = edge.get('reciprocal_score', 0)
                    if rs < rs_threshold:
                        unmapped_set.add(nid)
            else:
                unmapped_set.add(nid)
        return unmapped_set

    def unmapped_dataframe(self, xg, **args):
        unodes = self.unmapped_nodes(xg, **args)
        ont = self.merged_ontology
        eg = ont.equiv_graph()
        items = []
        for n in unodes:
            mapped_equivs = ''
            if n in eg:
                equivs = set(eg.neighbors(n))
                mapped_equivs = list(equivs - unodes)
            items.append(
                dict(id=n, label=ont.label(n), mapped_equivs=mapped_equivs))
        df = pd.DataFrame(items, columns=['id', 'label', 'mapped_equivs'])
        df = df.sort_values(["id"])
        return df

    # scores a pairwise combination of synonyms. This will be a mix of
    #  * individual confidence in the synonyms themselves
    #  * confidence of equivalence based on scopes
    # TODO: unify this with probabilistic calculation
    def _combine_syns(self, s1, s2):
        cpred = self._combine_preds(s1.pred, s2.pred)
        s = self._pred_score(cpred)
        s *= s1.confidence * s2.confidence
        if s1.is_abbreviation() or s2.is_abbreviation():
            s *= self._get_config_val(self._id_to_ontology(s1.class_id),
                                      'abbreviation_confidence', 0.5)
            s *= self._get_config_val(self._id_to_ontology(s2.class_id),
                                      'abbreviation_confidence', 0.5)
        logging.debug("COMBINED: {} + {} = {}/{}".format(s1, s2, cpred, s))
        return round(s)

    def _rollup(self, p):
        if p == 'label':
            return LABEL_OR_EXACT
        if p == 'hasExactSynonym':
            return LABEL_OR_EXACT
        return p

    def _combine_preds(self, p1, p2):
        if p1 == p2:
            return p1
        if self._rollup(p1) == self._rollup(p2):
            return self._rollup(p1)
        return p1 + p2

    ## TODO: allow this to be weighted by ontology
    def _pred_score(self, p):
        if p == 'label':
            return 100
        if p == LABEL_OR_EXACT:
            return 90
        if p == 'hasExactSynonym':
            return 90
        return 50

    def _in_clique(self, x, cliques):
        for s in cliques:
            if x in s:
                return s
        return set()

    def as_dataframe(self, xg):
        cliques = self.cliques(xg)
        ont = self.merged_ontology
        items = []
        for (x, y, d) in xg.edges(data=True):
            # xg is a non-directional Graph object.
            # to get a deterministic ordering we use the idpair key
            (x, y) = d['idpair']
            (s1, s2) = d['syns']
            (ss1, ss2) = d['simscores']
            clique = self._in_clique(x, cliques)
            #ancs = nx.ancestors(g,x)
            left_label = ont.label(x)
            right_label = ont.label(y)
            if ont.is_obsolete(x) and not left_label.startswith('obsolete'):
                left_label = "obsolete " + left_label
            if ont.is_obsolete(y) and not right_label.startswith('obsolete'):
                right_label = "obsolete " + right_label

            P = self.weighted_axioms(x, y, xg)
            item = {
                'left': x,
                'left_label': left_label,
                'right': y,
                'right_label': right_label,
                'score': d['score'],
                'left_match_type': s1.pred,
                'right_match_type': s2.pred,
                'left_match_val': s1.val,
                'right_match_val': s2.val,
                'left_simscore': ss1,
                'right_simscore': ss2,
                'reciprocal_score': d.get('reciprocal_score', 0),
                'conditional_pr_equiv': d.get('cpr'),
                'pr_subClassOf': P[0],
                'pr_superClassOf': P[1],
                'pr_equivalentTo': P[2],
                'pr_other': P[3],
                'left_novel': d.get('left_novel'),
                'right_novel': d.get('right_novel'),
                'left_consistent': d.get('left_consistent'),
                'right_consistent': d.get('right_consistent'),
                'equiv_clique_size': len(clique)
            }

            items.append(item)

        ix = [
            'left', 'left_label', 'right', 'right_label', 'left_match_type',
            'right_match_type', 'left_match_val', 'right_match_val', 'score',
            'left_simscore', 'right_simscore', 'reciprocal_score',
            'conditional_pr_equiv', 'pr_subClassOf', 'pr_superClassOf',
            'pr_equivalentTo', 'pr_other', 'left_novel', 'right_novel',
            'left_consistent', 'right_consistent', 'equiv_clique_size'
        ]
        df = pd.DataFrame(items, columns=ix)
        df = df.sort_values(["left", "score", "right"])
        return df

    def cliques(self, xg):
        """
        Return all equivalence set cliques, assuming each edge in the xref graph is treated as equivalent,
        and all edges in ontology are subClassOf

        Arguments
        ---------
        xg : Graph
            an xref graph

        Returns
        -------
        list of sets
        """
        g = nx.DiGraph()
        for (x, y) in self.merged_ontology.get_graph().edges():
            g.add_edge(x, y)
        for (x, y) in xg.edges():
            g.add_edge(x, y)
            g.add_edge(y, x)
        return list(strongly_connected_components(g))
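An end-to-end usage sketch of the engine above (a hedged sketch, assuming this is ontobio's lexmap engine and that the ontologies are loaded via ontobio's OntologyFactory; the file paths are hypothetical):

from ontobio.ontol_factory import OntologyFactory

factory = OntologyFactory()
ont1 = factory.create("hp-subset.json")   # hypothetical ontology files
ont2 = factory.create("mp-subset.json")

lexmap = LexicalMapEngine()
lexmap.index_ontologies([ont1, ont2])
xg = lexmap.get_xref_graph()     # nx graph of scored candidate mappings
df = lexmap.as_dataframe(xg)     # one row per mapping, with scores and probabilities
print(df.head())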
Example #6
def get_best_nodes_naive(
    node_ids: List[str],
    ontology: Ontology,
    min_distance_from_root: int = 3,
    max_num_nodes: int = 3,
    nodeids_blacklist: List[str] = None
) -> Tuple[bool, List[Tuple[str, Set[str]]]]:
    """remove terms with common ancestor and keep the ancestor term instead

    Args:
        node_ids (List[str]): the list of nodes to merge by common ancestor
        min_distance_from_root (int): set a minimum distance from root terms for ancestors that can group children terms
        max_num_nodes (int): maximum number of nodes to be returned by the trimming algorithm
        nodeids_blacklist (List[str]): a list of node ids to be excluded from common ancestors list
        ontology (Ontology): the ontology
    Returns:
        Tuple[bool, List[Tuple[str, Set[str]]]]: a flag that is True when the merged terms do not cover all the original terms, plus the list of merged terms, each paired with the set of original terms it covers
    """
    logger.debug("applying trimming through naive algorithm")
    final_terms_set = {}
    ancestor_paths = defaultdict(list)
    term_paths = defaultdict(set)
    # step 1: get all path for each term and populate data structures
    for node_id in node_ids:
        node_root = None
        node_ont = ontology.node(node_id)
        if "meta" in node_ont and "basicPropertyValues" in node_ont["meta"]:
            for basic_prop_val in node_ont["meta"]["basicPropertyValues"]:
                if basic_prop_val["pred"] == "OIO:hasOBONamespace":
                    node_root = basic_prop_val["val"]
        paths = get_all_paths_to_root(
            node_id=node_id,
            ontology=ontology,
            min_distance_from_root=min_distance_from_root,
            relations=None,
            nodeids_blacklist=nodeids_blacklist,
            root_node=node_root)
        for path in paths:
            term_paths[node_id].add(path)
            ancestor_paths[path[-1]].append(path)
    # step 2: merge terms and keep common ancestors
    for node_id in sorted(node_ids):
        term_paths_copy = sorted(term_paths[node_id].copy(),
                                 key=lambda x: len(x))
        while len(term_paths_copy) > 0:
            curr_path = list(term_paths_copy.pop())
            selected_highest_ancestor = curr_path.pop()
            related_paths = ancestor_paths[selected_highest_ancestor]
            if not related_paths:
                break
            covered_nodes_set = set(
                [related_path[0] for related_path in related_paths])
            del ancestor_paths[selected_highest_ancestor]
            if curr_path:
                if all(map(lambda x: x[0] == curr_path[0], related_paths)):
                    selected_highest_ancestor = curr_path[0]
                else:
                    i = -1
                    while len(curr_path) > 1:
                        i -= 1
                        curr_highest_ancestor = curr_path.pop()
                        if not all(map(lambda x: len(x) >= -i, related_paths)):
                            break
                        if all(
                                map(lambda x: x[i] == curr_highest_ancestor,
                                    related_paths)):
                            selected_highest_ancestor = curr_highest_ancestor
                            if selected_highest_ancestor in ancestor_paths:
                                del ancestor_paths[selected_highest_ancestor]
                            for path in related_paths:
                                term_paths[path[0]].discard(path)
            final_terms_set[selected_highest_ancestor] = covered_nodes_set
            for path in related_paths:
                term_paths[path[0]].discard(path)
            if len(term_paths[node_id]) > 0:
                term_paths_copy = term_paths[node_id].copy()
            else:
                break
    if len(list(final_terms_set.keys())) <= max_num_nodes:
        return False, [
            (term_label, covered_terms)
            for term_label, covered_terms in final_terms_set.items()
        ]

    else:
        best_terms = find_set_covering(
            [(k, ontology.label(k, id_if_null=True), v)
             for k, v in final_terms_set.items()],
            max_num_subsets=max_num_nodes)
        covered_terms = set([
            e for best_term_label, covered_terms in best_terms
            for e in covered_terms
        ])
        return covered_terms != set(node_ids), best_terms
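When more than max_num_nodes merged terms survive, the function falls back to find_set_covering, which is not shown in this snippet. A plain greedy set-cover sketch of the kind of helper this relies on (the name, signature and tie-breaking are assumptions):

from typing import List, Set, Tuple

def greedy_set_covering(subsets: List[Tuple[str, str, Set[str]]],
                        max_num_subsets: int = None) -> List[Tuple[str, Set[str]]]:
    """Greedily pick subsets covering the most still-uncovered elements."""
    to_cover = {e for _, _, elems in subsets for e in elems}
    chosen, remaining = [], list(subsets)
    while to_cover and remaining and (max_num_subsets is None
                                      or len(chosen) < max_num_subsets):
        # Pick the subset that covers the largest number of uncovered elements.
        best = max(remaining, key=lambda s: len(s[2] & to_cover))
        if not best[2] & to_cover:
            break
        chosen.append((best[0], best[2]))
        to_cover -= best[2]
        remaining.remove(best)
    return chosen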
Example #7
def get_best_nodes_lca(
    node_ids: List[str],
    ontology: Ontology,
    min_distance_from_root: int = 3,
    max_num_nodes: int = 3,
    nodeids_blacklist: List[str] = None
) -> Tuple[bool, List[Tuple[str, Set[str]]]]:
    candidates = {
        node_id: (node_label, covered_nodes)
        for node_id, node_label, covered_nodes in get_all_common_ancestors(
            node_ids=node_ids,
            ontology=ontology,
            min_distance_from_root=min_distance_from_root,
            nodeids_blacklist=nodeids_blacklist)
    }
    cands_ids_to_process = set(candidates.keys())
    selected_cands_ids = []
    node_to_cands_map = defaultdict(list)
    for cand in cands_ids_to_process:
        for node in candidates[cand][1]:
            node_to_cands_map[node].append(cand)
    while len(cands_ids_to_process) > 0:
        cand_id = cands_ids_to_process.pop()
        comparable_cands = [
            (cid, cval[1]) for cid, cval in candidates.items()
            if cid != cand_id and all(
                [child_id in cval[1] for child_id in candidates[cand_id][1]])
        ]
        if len(comparable_cands) > 0:
            max_len = max(map(lambda x: len(x[1]), comparable_cands))
            best_cands = [
                candidate for candidate in comparable_cands
                if len(candidate[1]) == max_len
            ]
            if len(best_cands) > 1:
                weighted_best_cands = sorted(
                    [(ontology.node(cand[0])["depth"], cand)
                     for cand in best_cands],
                    key=lambda x: x[0],
                    reverse=True)
                max_weight = max(map(lambda x: x[0], weighted_best_cands))
                best_cands = [
                    wcand[1] for wcand in weighted_best_cands
                    if wcand[0] == max_weight
                ]
            else:
                max_weight = ontology.node(best_cands[0][0])["depth"]
            if len(candidates[cand_id][1]) > len(best_cands[0][1]) or \
                    (len(candidates[cand_id][1]) == len(best_cands[0][1]) and
                     ontology.node(cand_id)["depth"] > max_weight):
                best_cands = [(cand_id, candidates[cand_id][1])]
            for best_cand in best_cands:
                selected_cands_ids.append(best_cand[0])
                for node_id in candidates[best_cand[0]][1]:
                    cands_ids_to_process -= set(node_to_cands_map[node_id])
        else:
            selected_cands_ids.append(cand_id)
    if len(selected_cands_ids) <= max_num_nodes:
        return False, [(node_id, candidates[node_id][1])
                       for node_id in selected_cands_ids]

    else:
        best_terms = find_set_covering(
            [(node_id, ontology.label(node_id,
                                      id_if_null=True), candidates[node_id][1])
             for node_id in selected_cands_ids],
            max_num_subsets=max_num_nodes)
        covered_terms = set([
            e for best_term_label, covered_terms in best_terms
            for e in covered_terms
        ])
        return covered_terms != set(node_ids), best_terms