Example #1
    def get_related(self,
                    stmt,
                    possibly_related=None,
                    direction='less_specific'):
        # Corner case: if this is a new statement that wasn't part of the
        # initialization, it is possible that it has a type that we've not
        # seen during initialization at all. In this case, we can assume
        # there are no refinements for it.
        stmt_type = indra_stmt_type(stmt)
        if stmt_type not in self.shared_data:
            return {}

        # Step 1. Recover relevant parts of the initialized data
        hash_to_agent_key = self.shared_data[stmt_type]['hash_to_agent_key']
        agent_key_to_hash = self.shared_data[stmt_type]['agent_key_to_hash']
        all_keys_by_role = self.shared_data[stmt_type]['all_keys_by_role']

        # Step 2. We iterate over all statements and find ones that this one
        # can refine
        stmt_hash = stmt.get_hash()
        relevants = possibly_related
        # We now iterate over all the agent roles in the given statement
        # type
        for role, hash_to_agent_key_for_role in hash_to_agent_key.items():
            # If we have seen this statement before during initialization then
            # we can use its precalculated agent keys, otherwise we
            # calculate new agent keys for it.
            if stmt_hash in hash_to_agent_key_for_role:
                agent_keys = hash_to_agent_key_for_role[stmt_hash]
            else:
                agent_keys = self._agent_keys_for_stmt_role(stmt, role)

            # We get all the agent keys across all other statements that
            # the agent in this role of the given statement can refine.
            for agent_key in agent_keys:
                relevant_keys = get_relevant_keys(agent_key,
                                                  all_keys_by_role[role],
                                                  self.ontology,
                                                  direction=direction)
                # We now get the actual statement hashes in which these
                # other potentially refined agent keys appear in the given role
                role_relevant_stmt_hashes = set.union(
                    *[agent_key_to_hash[role][rel]
                      for rel in relevant_keys]) - {stmt_hash}
                # In the first iteration, we initialize the set with the
                # relevant statement hashes
                if relevants is None:
                    relevants = role_relevant_stmt_hashes
                # In subsequent iterations, we take the intersection of
                # the relevant sets per role
                else:
                    relevants &= role_relevant_stmt_hashes

        # These hashes are now the ones that this statement needs
        # to be compared against. Importantly, the relationship is in
        # a well-defined direction so we don't need to test both ways.
        return relevants
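
A minimal, self-contained sketch of the per-role intersection performed above, using made-up agent keys and statement hashes. The dict shape mirrors the agent_key_to_hash index built in Example #6, but every identifier and value below is an illustrative assumption, not INDRA data.

# Toy per-role index from agent keys to statement hashes (values made up).
agent_key_to_hash = {
    'subj': {('HGNC', '6840'): {111, 222}, ('FPLX', 'MAP2K'): {333}},
    'obj': {('HGNC', '6871'): {111, 333}, ('HGNC', '6877'): {222}},
}
# Pretend get_relevant_keys returned these per-role relevant keys for a
# statement whose hash is 111.
relevant_keys_by_role = {
    'subj': [('HGNC', '6840'), ('FPLX', 'MAP2K')],
    'obj': [('HGNC', '6871')],
}
stmt_hash = 111
relevants = None
for role, relevant_keys in relevant_keys_by_role.items():
    role_hashes = set.union(
        *[agent_key_to_hash[role][key] for key in relevant_keys]) - {stmt_hash}
    # The first role initializes the set; later roles intersect with it
    relevants = role_hashes if relevants is None else relevants & role_hashes
print(relevants)  # {333}: the only hash that is relevant in every role
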
Example #2
def ontology_refinement_filter(stmts_by_hash, stmts_to_compare, ontology):
    """Return possible refinement relationships based on an ontology.

    Parameters
    ----------
    stmts_by_hash : dict
        A dict mapping statement hashes to the corresponding
        (deduplicated) statements.
    stmts_to_compare : dict or None
        A dict of existing statements to compare that will be further
        filtered down in this function and then returned.
    ontology : indra.ontology.IndraOntology
        An IndraOntology instance with respect to which this
        filter is applied.

    Returns
    -------
    dict
        A dict whose keys are statement hashes and values are sets
        of statement hashes that can potentially be refined by the
        statement identified by the key.
    """
    ts = time.time()
    stmts_by_type = collections.defaultdict(set)
    for stmt_hash, stmt in stmts_by_hash.items():
        stmts_by_type[indra_stmt_type(stmt)].add(stmt_hash)
    stmts_by_type = dict(stmts_by_type)

    first_filter = stmts_to_compare is None
    if first_filter:
        stmts_to_compare = collections.defaultdict(set)
    for stmt_type, stmt_hashes in stmts_by_type.items():
        logger.info('Finding ontology-based refinements for %d %s statements' %
                    (len(stmts_by_type[stmt_type]), stmt_type.__name__))
        stmts_by_hash_this_type = {
            stmt_hash: stmts_by_hash[stmt_hash]
            for stmt_hash in stmt_hashes
        }
        stmts_to_compare_by_type = \
            ontology_refinement_filter_by_stmt_type(stmts_by_hash_this_type,
                                                    ontology)
        if first_filter:
            stmts_to_compare.update(stmts_to_compare_by_type)
        else:
            for k, v in stmts_to_compare_by_type.items():
                stmts_to_compare[k] = stmts_to_compare[k] & v

    te = time.time()
    logger.debug('Identified ontology-based possible refinements in %.2fs' %
                 (te - ts))
    # Make an empty dict to make sure we don't return None
    if stmts_to_compare is None:
        stmts_to_compare = {}
    return stmts_to_compare
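
The two modes of this function (first pass vs. later intersection passes) can be illustrated with a small sketch using toy hashes instead of INDRA statements; the helper name apply_filter and all values below are assumptions for illustration only.

import collections

def apply_filter(stmts_to_compare, stmts_to_compare_by_type):
    # Mirrors the first_filter logic above: take per-type results as-is on
    # the first pass, intersect with existing candidate sets afterwards.
    first_filter = stmts_to_compare is None
    if first_filter:
        stmts_to_compare = collections.defaultdict(set)
    for k, v in stmts_to_compare_by_type.items():
        if first_filter:
            stmts_to_compare[k] = v
        else:
            stmts_to_compare[k] = stmts_to_compare[k] & v
    return stmts_to_compare

first = apply_filter(None, {1: {2, 3}, 4: {5}})
second = apply_filter(first, {1: {3}, 4: {5, 6}})
print(dict(second))  # {1: {3}, 4: {5}}
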
Example #3
    def find_contradicts(self):
        """Return pairs of contradicting Statements.

        Returns
        -------
        contradicts : list(tuple(Statement, Statement))
            A list of Statement pairs that are contradicting.
        """
        # Make a dict of Statements by type
        stmts_by_type = collections.defaultdict(list)
        for idx, stmt in enumerate(self.stmts):
            stmts_by_type[indra_stmt_type(stmt)].append((idx, stmt))

        # Handle Statements with polarity first
        pos_stmts = AddModification.__subclasses__()
        neg_stmts = [modclass_to_inverse[c] for c in pos_stmts]

        pos_stmts += [Activation, IncreaseAmount]
        neg_stmts += [Inhibition, DecreaseAmount]

        contradicts = []
        for pst, nst in zip(pos_stmts, neg_stmts):
            poss = stmts_by_type.get(pst, [])
            negs = stmts_by_type.get(nst, [])

            pos_stmt_by_group = self._get_stmt_by_group(
                pst, poss, self.ontology)
            neg_stmt_by_group = self._get_stmt_by_group(
                nst, negs, self.ontology)
            for key, pg in pos_stmt_by_group.items():
                ng = neg_stmt_by_group.get(key, [])
                for (_, st1), (_, st2) in itertools.product(pg, ng):
                    if st1.contradicts(st2, self.ontology):
                        contradicts.append((st1, st2))

        # Handle neutral Statements next
        neu_stmts = [Influence, ActiveForm]
        for stt in neu_stmts:
            stmts = stmts_by_type.get(stt, [])
            for (_, st1), (_, st2) in itertools.combinations(stmts, 2):
                if st1.contradicts(st2, self.ontology):
                    contradicts.append((st1, st2))

        return contradicts
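
A small sketch (toy data, no INDRA objects) of the grouped comparison above: rather than taking a full product over all positive and negative statements, pairs are only formed within matching groups. The real code groups by an entity key derived from the ontology and then calls contradicts on each pair; the group keys and statement labels below are made up.

import itertools

pos_by_group = {'MAPK1': [('p1', 'phos A'), ('p2', 'phos B')],
                'AKT1': [('p3', 'phos C')]}
neg_by_group = {'MAPK1': [('n1', 'dephos A')]}

pairs = []
for key, pg in pos_by_group.items():
    # Only positives and negatives sharing a group key are compared
    ng = neg_by_group.get(key, [])
    for (_, st1), (_, st2) in itertools.product(pg, ng):
        pairs.append((st1, st2))
print(pairs)  # [('phos A', 'dephos A'), ('phos B', 'dephos A')]
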
Example #4
    def find_contradicts(self):
        """Return pairs of contradicting Statements.

        Returns
        -------
        contradicts : list(tuple(Statement, Statement))
            A list of Statement pairs that are contradicting.
        """
        eh = self.hierarchies['entity']

        # Make a dict of Statements by type
        stmts_by_type = collections.defaultdict(list)
        for idx, stmt in enumerate(self.stmts):
            stmts_by_type[indra_stmt_type(stmt)].append((idx, stmt))

        # Handle Statements with polarity first
        pos_stmts = AddModification.__subclasses__()
        neg_stmts = [modclass_to_inverse[c] for c in pos_stmts]

        pos_stmts += [Activation, IncreaseAmount]
        neg_stmts += [Inhibition, DecreaseAmount]

        contradicts = []
        for pst, nst in zip(pos_stmts, neg_stmts):
            poss = stmts_by_type.get(pst, [])
            negs = stmts_by_type.get(nst, [])

            pos_stmt_by_group = self._get_stmt_by_group(pst, poss, eh)
            neg_stmt_by_group = self._get_stmt_by_group(nst, negs, eh)
            for key, pg in pos_stmt_by_group.items():
                ng = neg_stmt_by_group.get(key, [])
                for (_, st1), (_, st2) in itertools.product(pg, ng):
                    if st1.contradicts(st2, self.hierarchies):
                        contradicts.append((st1, st2))

        # Handle neutral Statements next
        neu_stmts = [Influence, ActiveForm]
        for stt in neu_stmts:
            stmts = stmts_by_type.get(stt, [])
            for (_, st1), (_, st2) in itertools.combinations(stmts, 2):
                if st1.contradicts(st2, self.hierarchies):
                    contradicts.append((st1, st2))

        return contradicts
Example #5
    def find_contradicts(self):
        """Return pairs of contradicting Statements.

        Returns
        -------
        contradicts : list(tuple(Statement, Statement))
            A list of Statement pairs that are contradicting.
        """
        # Make a dict of Statements by type
        stmts_by_type = collections.defaultdict(list)
        for stmt in self.stmts:
            stmts_by_type[indra_stmt_type(stmt)].append(stmt)
        stmts_by_type = dict(stmts_by_type)

        # Handle Statements with polarity first
        pos_stmts = AddModification.__subclasses__()
        neg_stmts = [modclass_to_inverse[c] for c in pos_stmts]

        pos_stmts += [Activation, IncreaseAmount]
        neg_stmts += [Inhibition, DecreaseAmount]

        contradicts = []
        # TODO: we could probably do some optimization here
        # to not have to check statements combinatorially
        for pst, nst in zip(pos_stmts, neg_stmts):
            poss = stmts_by_type.get(pst, [])
            negs = stmts_by_type.get(nst, [])

            for ps, ns in itertools.product(poss, negs):
                if ps.contradicts(ns, self.ontology):
                    contradicts.append((ps, ns))

        # Handle neutral Statements next
        neu_stmts = [Influence, ActiveForm]
        for stt in neu_stmts:
            stmts = stmts_by_type.get(stt, [])
            for st1, st2 in itertools.combinations(stmts, 2):
                if st1.contradicts(st2, self.ontology):
                    contradicts.append((st1, st2))

        return contradicts
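
For contrast with the grouped version in Example #3, here is a toy sketch of the two pairing strategies used here: polar types take a product of positives against negatives, while neutral types use combinations so each unordered pair is checked exactly once (contradicts is symmetric, so both orders need not be tested). The labels are placeholders for Statement objects.

import itertools

poss, negs = ['p1', 'p2'], ['n1']
print(list(itertools.product(poss, negs)))
# [('p1', 'n1'), ('p2', 'n1')]

neutrals = ['s1', 's2', 's3']
print(list(itertools.combinations(neutrals, 2)))
# [('s1', 's2'), ('s1', 's3'), ('s2', 's3')]
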
Example #6
    def extend(self, stmts_by_hash):
        self.shared_data['stmts_by_hash'].update(stmts_by_hash)
        # Build up data structure of statement hashes by
        # statement type
        stmts_by_type = collections.defaultdict(set)
        for stmt_hash, stmt in stmts_by_hash.items():
            stmts_by_type[indra_stmt_type(stmt)].add(stmt_hash)
        stmts_by_type = dict(stmts_by_type)

        # Now iterate over each statement type and build up
        # data structures for quick filtering
        for stmt_type, stmts_this_type in stmts_by_type.items():
            # Step 1. initialize data structures
            # noinspection PyProtectedMember
            roles = stmts_by_hash[next(iter(stmts_this_type))]._agent_order
            if stmt_type not in self.shared_data:
                self.shared_data[stmt_type] = {}
                # Mapping agent keys to statement hashes
                self.shared_data[stmt_type]['agent_key_to_hash'] = \
                    {role: collections.defaultdict(set) for role in roles}
                # Mapping statement hashes to agent keys
                self.shared_data[stmt_type]['hash_to_agent_key'] = \
                    {role: collections.defaultdict(set) for role in roles}
                # All agent keys for a given agent role
                self.shared_data[stmt_type]['all_keys_by_role'] = {}

            # Step 2. Fill up the initial data structures in preparation
            # for identifying potential refinements
            for sh in stmts_this_type:
                for role in roles:
                    agent_keys = self._agent_keys_for_stmt_role(
                        stmts_by_hash[sh], role)
                    for agent_key in agent_keys:
                        self.shared_data[stmt_type]['agent_key_to_hash'][
                            role][agent_key].add(sh)
                        self.shared_data[stmt_type]['hash_to_agent_key'][
                            role][sh].add(agent_key)

            for role in roles:
                self.shared_data[stmt_type]['all_keys_by_role'][role] = \
                    set(self.shared_data[stmt_type]['agent_key_to_hash'][role])
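
A self-contained sketch of the index structures built by extend for one statement type, with toy hashes, roles, and agent keys. In the real code the agent keys come from _agent_keys_for_stmt_role; here they are hard-coded assumptions.

import collections

roles = ('subj', 'obj')
stmt_agent_keys = {
    111: {'subj': [('HGNC', '6840')], 'obj': [('HGNC', '6871')]},
    222: {'subj': [('FPLX', 'MAP2K')], 'obj': [('HGNC', '6871')]},
}

agent_key_to_hash = {role: collections.defaultdict(set) for role in roles}
hash_to_agent_key = {role: collections.defaultdict(set) for role in roles}
for sh, keys_by_role in stmt_agent_keys.items():
    for role in roles:
        for agent_key in keys_by_role[role]:
            # Forward and reverse indexes, as in Step 2 above
            agent_key_to_hash[role][agent_key].add(sh)
            hash_to_agent_key[role][sh].add(agent_key)
all_keys_by_role = {role: set(agent_key_to_hash[role]) for role in roles}
print(sorted(all_keys_by_role['subj']))
# [('FPLX', 'MAP2K'), ('HGNC', '6840')]
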
Example #7
    def _generate_id_maps(self, unique_stmts, *args, **kwargs):
        """Connect statements using their refinement relationship."""
        # Map each statement's hash to its index and group the
        # statements by type
        stmt_to_idx = {
            stmt.get_hash(matches_fun=self.matches_fun): idx
            for idx, stmt in enumerate(unique_stmts)
        }
        stmts_by_type = collections.defaultdict(list)
        for stmt in unique_stmts:
            stmts_by_type[indra_stmt_type(stmt)].append(stmt)
        stmts_by_type = dict(stmts_by_type)

        # Here we handle split_idx to allow finding refinements between
        two distinct groups of statements (identified by an index at which we
        # split the unique_statements list) rather than globally across
        # all unique statements.
        split_idx = kwargs.pop('split_idx', None)
        if split_idx is not None:
            # This dict maps statement hashes to a bool value based on which
            # of the two groups the statement belongs to.
            hash_to_split_group = {
                sh: (idx <= split_idx)
                for sh, idx in stmt_to_idx.items()
            }
        else:
            hash_to_split_group = None

        maps = []
        for stmt_type, stmts in stmts_by_type.items():
            logger.info('Finding refinements for %d %s statements' %
                        (len(stmts), stmt_type.__name__))
            maps += self._generate_hash_maps_by_stmt_type(
                stmts, stmts[0]._agent_order, split_groups=hash_to_split_group)
        idx_maps = [(stmt_to_idx[refinement], stmt_to_idx[refined])
                    for refinement, refined in maps]
        return idx_maps
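
A toy sketch of the split_idx bookkeeping above: each statement hash is mapped to a bool according to which side of the split index it falls on, so downstream code can restrict refinement checks to pairs drawn from different groups. The hashes and indices below are made up.

stmt_to_idx = {1001: 0, 1002: 1, 1003: 2, 1004: 3}
split_idx = 1
hash_to_split_group = {sh: (idx <= split_idx)
                       for sh, idx in stmt_to_idx.items()}
print(hash_to_split_group)
# {1001: True, 1002: True, 1003: False, 1004: False}
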
Example #8
    def _generate_id_maps(self,
                          unique_stmts,
                          poolsize=None,
                          size_cutoff=100,
                          split_idx=None):
        """Connect statements using their refinement relationships."""
        if not self.ontology._initialized:
            self.ontology.initialize()
        if len(unique_stmts) > 10000:
            self.ontology._build_transitive_closure()
        # Check arguments relating to multiprocessing
        if poolsize is None:
            logger.debug('combine_related: poolsize not set, '
                         'not using multiprocessing.')
            use_mp = False
        elif sys.version_info >= (3, 4):
            use_mp = True
            logger.info('combine_related: Python >= 3.4 detected, '
                        'using multiprocessing with poolsize %d, '
                        'size_cutoff %d' % (poolsize, size_cutoff))
        else:
            use_mp = False
            logger.info('combine_related: Python < 3.4 detected, '
                        'not using multiprocessing.')
        # Make a dict of Statements by type
        stmts_by_type = collections.defaultdict(list)
        for idx, stmt in enumerate(unique_stmts):
            stmts_by_type[indra_stmt_type(stmt)].append((idx, stmt))

        child_proc_groups = []
        parent_proc_groups = []
        skipped_groups = 0
        # Each Statement type can be preassembled independently
        for stmt_type, stmts_this_type in stmts_by_type.items():
            logger.info('Grouping %s (%s)' %
                        (stmt_type.__name__, len(stmts_this_type)))
            stmt_by_group = self._get_stmt_by_group(stmt_type, stmts_this_type,
                                                    self.ontology)

            # Divide statements by group size
            # If we're not using multiprocessing, then all groups are local
            for g_name, g in stmt_by_group.items():
                if len(g) < 2:
                    skipped_groups += 1
                    continue
                if use_mp and len(g) >= size_cutoff:
                    child_proc_groups.append(g)
                else:
                    parent_proc_groups.append(g)

        # Now run preassembly!
        logger.debug(
            "Groups: %d parent, %d worker, %d skipped." %
            (len(parent_proc_groups), len(child_proc_groups), skipped_groups))

        supports_func = functools.partial(_set_supports_stmt_pairs,
                                          ontology=self.ontology,
                                          split_idx=split_idx,
                                          check_entities_match=False,
                                          refinement_fun=self.refinement_fun)

        # Check if we are running any groups in child processes; note that if
        # use_mp is False, child_proc_groups will be empty
        if child_proc_groups:
            # Get a multiprocessing context
            ctx = mp.get_context('spawn')
            pool = ctx.Pool(poolsize)
            # Run the large groups remotely
            logger.debug("Running %d groups in child processes" %
                         len(child_proc_groups))
            res = pool.map_async(supports_func, child_proc_groups)
            workers_ready = False
        else:
            workers_ready = True

        # Run the small groups locally
        logger.debug("Running %d groups in parent process" %
                     len(parent_proc_groups))
        stmt_ix_map = [
            supports_func(stmt_tuples) for stmt_tuples in parent_proc_groups
        ]
        logger.debug("Done running parent process groups")

        while not workers_ready:
            logger.debug("Checking child processes")
            if res.ready():
                workers_ready = True
                logger.debug('Child process group comparisons successful? %s' %
                             res.successful())
                if not res.successful():
                    # The get method re-raises the underlying error that we can
                    # now catch and print.
                    try:
                        res.get()
                    except Exception as e:
                        raise Exception(
                            "Sorry, there was a problem with "
                            "preassembly in the child processes: %s" % e)
                else:
                    stmt_ix_map += res.get()
                logger.debug("Closing pool...")
                pool.close()
                logger.debug("Joining pool...")
                pool.join()
                logger.debug("Pool closed and joined.")
            time.sleep(1)
        logger.debug("Done.")
        # Combine all redundant map edges
        stmt_ix_map_set = set()
        for group_ix_map in stmt_ix_map:
            for ix_pair in group_ix_map:
                stmt_ix_map_set.add(ix_pair)
        return stmt_ix_map_set
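
A minimal, runnable sketch (not INDRA) of the parent/child split used above: a partial function is mapped asynchronously over the large groups in a spawned pool while the small groups run in the parent process, and the per-group results are merged at the end. The function pair_up is a made-up stand-in for _set_supports_stmt_pairs, and the group contents are toy integers.

import functools
import multiprocessing as mp

def pair_up(group, offset=0):
    # Stand-in worker: return consecutive index pairs within a group
    return [(x + offset, y + offset) for x, y in zip(group, group[1:])]

if __name__ == '__main__':
    work_func = functools.partial(pair_up, offset=0)
    child_groups = [[1, 2, 3, 4]]   # large groups, sent to the pool
    parent_groups = [[10, 11]]      # small groups, handled locally
    ctx = mp.get_context('spawn')
    with ctx.Pool(2) as pool:
        res = pool.map_async(work_func, child_groups)
        ix_map = [work_func(g) for g in parent_groups]
        ix_map += res.get()
    # Deduplicate pairs across groups, as in the final loop above
    merged = {pair for group_map in ix_map for pair in group_map}
    print(sorted(merged))  # [(1, 2), (2, 3), (3, 4), (10, 11)]
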
Example #9
    def _generate_id_maps(self, unique_stmts, poolsize=None,
                          size_cutoff=100, split_idx=None):
        """Connect statements using their refinement relationships."""
        # Check arguments relating to multiprocessing
        if poolsize is None:
            logger.debug('combine_related: poolsize not set, '
                         'not using multiprocessing.')
            use_mp = False
        elif sys.version_info >= (3, 4):
            use_mp = True
            logger.info('combine_related: Python >= 3.4 detected, '
                        'using multiprocessing with poolsize %d, '
                        'size_cutoff %d' % (poolsize, size_cutoff))
        else:
            use_mp = False
            logger.info('combine_related: Python < 3.4 detected, '
                        'not using multiprocessing.')
        eh = self.hierarchies['entity']
        # Make a dict of Statements by type
        stmts_by_type = collections.defaultdict(list)
        for idx, stmt in enumerate(unique_stmts):
            stmts_by_type[indra_stmt_type(stmt)].append((idx, stmt))

        child_proc_groups = []
        parent_proc_groups = []
        skipped_groups = 0
        # Each Statement type can be preassembled independently
        for stmt_type, stmts_this_type in stmts_by_type.items():
            logger.info('Grouping %s (%s)' %
                        (stmt_type.__name__, len(stmts_this_type)))
            stmt_by_group = self._get_stmt_by_group(stmt_type, stmts_this_type,
                                                    eh)

            # Divide statements by group size
            # If we're not using multiprocessing, then all groups are local
            for g_name, g in stmt_by_group.items():
                if len(g) < 2:
                    skipped_groups += 1
                    continue
                if use_mp and len(g) >= size_cutoff:
                    child_proc_groups.append(g)
                else:
                    parent_proc_groups.append(g)

        # Now run preassembly!
        logger.debug("Groups: %d parent, %d worker, %d skipped." %
                     (len(parent_proc_groups), len(child_proc_groups),
                      skipped_groups))

        supports_func = functools.partial(_set_supports_stmt_pairs,
                                          hierarchies=self.hierarchies,
                                          split_idx=split_idx,
                                          check_entities_match=False)

        # Check if we are running any groups in child processes; note that if
        # use_mp is False, child_proc_groups will be empty
        if child_proc_groups:
            # Get a multiprocessing context
            ctx = mp.get_context('spawn')
            pool = ctx.Pool(poolsize)
            # Run the large groups remotely
            logger.debug("Running %d groups in child processes" %
                         len(child_proc_groups))
            res = pool.map_async(supports_func, child_proc_groups)
            workers_ready = False
        else:
            workers_ready = True

        # Run the small groups locally
        logger.debug("Running %d groups in parent process" %
                     len(parent_proc_groups))
        stmt_ix_map = [supports_func(stmt_tuples)
                       for stmt_tuples in parent_proc_groups]
        logger.debug("Done running parent process groups")

        while not workers_ready:
            logger.debug("Checking child processes")
            if res.ready():
                workers_ready = True
                logger.debug('Child process group comparisons successful? %s' %
                             res.successful())
                if not res.successful():
                    raise Exception("Sorry, there was a problem with "
                                    "preassembly in the child processes.")
                else:
                    stmt_ix_map += res.get()
                logger.debug("Closing pool...")
                pool.close()
                logger.debug("Joining pool...")
                pool.join()
                logger.debug("Pool closed and joined.")
            time.sleep(1)
        logger.debug("Done.")
        # Combine all redundant map edges
        stmt_ix_map_set = set()
        for group_ix_map in stmt_ix_map:
            for ix_pair in group_ix_map:
                stmt_ix_map_set.add(ix_pair)
        return stmt_ix_map_set