Example #1
def get_famplex_links_from_lists(genes_appearing, fplx_appearing):
    links = []
    for gene in genes_appearing:
        parent_ids = [p[1] for p in bio_ontology.get_parents('HGNC', gene)]
        parents_appearing = fplx_appearing & set(parent_ids)
        links += [(gene, parent) for parent in parents_appearing]
    for fplx_child in fplx_appearing:
        parent_ids = [
            p[1] for p in bio_ontology.get_parents('FPLX', fplx_child)
        ]
        parents_appearing = fplx_appearing & set(parent_ids)
        links += [(fplx_child, parent) for parent in parents_appearing]
    return links
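A minimal usage sketch for the function above; the ID sets below are hypothetical examples, and bio_ontology is the INDRA ontology object used throughout these snippets.

from indra.ontology.bio import bio_ontology

# Hypothetical inputs: HGNC IDs and FamPlex names appearing in some corpus
genes_appearing = {'9376', '9377'}          # e.g. PRKAA1, PRKAA2
fplx_appearing = {'AMPK', 'AMPK_alpha'}
links = get_famplex_links_from_lists(genes_appearing, fplx_appearing)
# Each link is a (child, FamPlex parent) pair, e.g. ('9376', 'AMPK_alpha')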
Example #2
def add_ido_parents(bio_ontology: BioOntology):
    ido_root = bio_ontology.label('IDO', '0')
    bio_ontology.add_node(ido_root, name='infectious disease concept')
    edges_to_add = []
    for node in bio_ontology.nodes():
        if bio_ontology.get_ns(node) == 'IDO' and not \
                bio_ontology.get_parents(*bio_ontology.get_ns_id(node)):
            edges_to_add.append((node, ido_root, {'type': 'isa'}))
    bio_ontology.add_edges_from(edges_to_add)
Example #3
def add_efo_parents(bio_ontology):
    edges_to_add = []
    efo_root = 'EFO:0000001'
    for node in bio_ontology.nodes():
        if bio_ontology.get_ns(node) == 'EFO' and \
                not bio_ontology.get_parents(*bio_ontology.get_ns_id(node)):
            edges_to_add.append((node, efo_root, {'type': 'isa'}))
    print('Adding %d EFO isa edges.' % len(edges_to_add))
    bio_ontology.add_edges_from(edges_to_add)
Example #4
def get_famplex_terms(genes):
    """Get a list of associated FamPlex IDs from a list of gene IDs."""
    all_parents = set()
    for hgnc_id in genes:
        parent_ids = {p[1] for p in bio_ontology.get_parents('HGNC', hgnc_id)}
        all_parents |= parent_ids
    fplx_terms = sorted(list(all_parents))
    logger.info('Found %d relevant FamPlex terms.' % (len(fplx_terms)))
    return fplx_terms
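A usage sketch for get_famplex_terms with illustrative HGNC IDs (not taken from the original code).

hgnc_ids = ['391', '6840', '6871']   # illustrative IDs: AKT1, MAP2K1, MAPK1
fplx_terms = get_famplex_terms(hgnc_ids)
# fplx_terms is a sorted list of FamPlex IDs, e.g. ['AKT', 'ERK', 'MEK']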
Example #5
def get_family(agents):
    """Get a FamPlex family if all of its members are given."""
    family_sets = []
    ag_groundings = []
    for ag in agents:
        gr = ag.get_grounding()
        ag_groundings.append(gr)
        parents = bio_ontology.get_parents(*gr)
        families = {p for p in parents if p[0] == 'FPLX'}
        family_sets.append(families)
    common_families = family_sets[0].intersection(*family_sets)
    if not common_families:
        return
    for fam in common_families:
        children = bio_ontology.get_children(*fam)
        # Check if all family members are present
        if set(children) == set(ag_groundings):
            return fam[1]
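A usage sketch for get_family, assuming INDRA Agent objects grounded to HGNC; the expected result assumes PRKAA1 and PRKAA2 are the only members of the FamPlex AMPK_alpha family in the ontology.

from indra.statements import Agent

agents = [Agent('PRKAA1', db_refs={'HGNC': '9376'}),
          Agent('PRKAA2', db_refs={'HGNC': '9377'})]
family = get_family(agents)
# Expected to be 'AMPK_alpha' when all members are present, otherwise None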
Example #6
def unify_lspci(stmts):
    # Import the module itself to allow temporarily overriding default_ns_order
    import indra.statements.agent
    from indra.statements.agent import default_ns_order
    from indra.ontology.bio import bio_ontology
    logger.info('Unifying by LSPCI with %d statements' % len(stmts))
    orig_ns_order = indra.statements.agent.default_ns_order[:]
    indra.statements.agent.default_ns_order = ['LSPCI'] + \
        indra.statements.agent.default_ns_order
    agents_by_lspci = defaultdict(list)
    ns_order = default_ns_order + ['CHEMBL', 'DRUGBANK', 'HMS-LINCS', 'CAS']
    for stmt in stmts:
        for agent in stmt.real_agent_list():
            if 'LSPCI' in agent.db_refs:
                agents_by_lspci[agent.db_refs['LSPCI']].append(agent)
            else:
                agent_gr = agent.get_grounding(ns_order=ns_order)
                if agent_gr[0] is None:
                    continue
                else:
                    parents = bio_ontology.get_parents(*agent_gr)
                    lspci_parents = [p[1] for p in parents if p[0] == 'LSPCI']
                    if len(lspci_parents) != 1:
                        continue
                    lspci_parent = lspci_parents[0]
                    agents_by_lspci[lspci_parent].append(agent)

    for lspci, agents in agents_by_lspci.items():
        lspci_name = bio_ontology.get_name('LSPCI', lspci)
        standard_name = lspci_name if lspci_name else agents[0].name
        for agent in agents:
            agent.db_refs['LSPCI'] = lspci
            agent.name = standard_name

    unique_stmts = ac.run_preassembly(stmts, run_refinement=False)
    indra.statements.agent.default_ns_order = orig_ns_order
    logger.info('Finished unification with %d statements' % len(unique_stmts))
    return unique_stmts
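A usage sketch for unify_lspci, assuming a list of INDRA Statements loaded from a hypothetical pickle file.

import pickle

with open('drug_stmts.pkl', 'rb') as fh:   # hypothetical input file
    stmts = pickle.load(fh)
unique_stmts = unify_lspci(stmts)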
Example #7
def sif_dump_df_to_digraph(df: Union[pd.DataFrame, str],
                           date: str,
                           mesh_id_dict: Optional[Dict] = None,
                           graph_type: GraphTypes = 'digraph',
                           include_entity_hierarchies: bool = True,
                           sign_dict: Optional[Dict[str, int]] = None,
                           stmt_types: Optional[List[str]] = None,
                           z_sc_path: Optional[Union[str, pd.DataFrame]] = None,
                           verbosity: int = 0) \
        -> Union[DiGraph, MultiDiGraph, Tuple[MultiDiGraph, DiGraph]]:
    """Return a NetworkX digraph from a pandas dataframe of a db dump

    Parameters
    ----------
    df : Union[str, pd.DataFrame]
        The dataframe, provided either as a file path to a .pkl or .csv
        file or as a pandas DataFrame object.
    date : str
        A date string specifying when the data was dumped from the database.
    mesh_id_dict : dict
        A dict mapping statement hashes to all MeSH IDs sharing a common
        PMID.
    graph_type : str
        Return type for the returned graph. Currently supports:
            - 'digraph': DiGraph (Default)
            - 'multidigraph': MultiDiGraph
            - 'signed': Tuple[MultiDiGraph, DiGraph]
            - 'signed-expanded': Tuple[MultiDiGraph, DiGraph]
            - 'digraph-signed-types': DiGraph
    include_entity_hierarchies : bool
        If True, add edges between nodes if they are related ontologically
        with stmt type 'fplx': e.g. BRCA1 is in the BRCA family, so an edge
        is added between the nodes BRCA and BRCA1. Default: True. Note that
        this option is only available for the 'digraph' and 'multidigraph'
        graph types.
    sign_dict : Dict[str, int]
        A dictionary mapping a Statement type to a sign to be used for the
        edge. By default only Activation and IncreaseAmount are added as
        positive edges and Inhibition and DecreaseAmount are added as
        negative edges, but a user can pass any other Statement types in a
        dictionary.
    stmt_types : List[str]
        A list of statement types to expand out to other signs.
    z_sc_path : Optional[Union[str, pd.DataFrame]]
        If provided, must be a square dataframe, or a path to one, with HGNC
        symbols as names on the axes and floats as entries.
    verbosity: int
        Output various messages if > 0. For all messages, set to 4.

    Returns
    -------
    Union[DiGraph, MultiDiGraph, Tuple[MultiDiGraph, DiGraph]]
        The return type is determined by the graph_type argument.
    """
    graph_options = ('digraph', 'multidigraph', 'signed', 'signed-expanded',
                     'digraph-signed-types')
    if graph_type.lower() not in graph_options:
        raise ValueError(f'Graph type {graph_type} not supported. Can only '
                         f'choose between {graph_options}')
    sign_dict = sign_dict if sign_dict else default_sign_dict

    graph_type = graph_type.lower()
    date = date if date else datetime.now().strftime('%Y-%m-%d')

    if isinstance(df, str):
        sif_df = file_opener(df)
    else:
        sif_df = df

    if z_sc_path is not None:
        if isinstance(z_sc_path, str):
            if z_sc_path.endswith('h5'):
                logger.info(f'Loading z-scores from {z_sc_path}')
                z_sc_df = pd.read_hdf(z_sc_path)
            elif z_sc_path.endswith('pkl'):
                logger.info(f'Loading z-scores from {z_sc_path}')
                z_sc_df: pd.DataFrame = file_opener(z_sc_path)
            else:
                raise ValueError(f'Unrecognized file: {z_sc_path}')
        elif isinstance(z_sc_path, pd.DataFrame):
            z_sc_df = z_sc_path
        else:
            raise ValueError('Only file paths and data frames allowed as '
                             'arguments to z_sc_path')
    else:
        z_sc_df = None

    # If digraph-signed-types: filter out rows with unsigned statement types
    if graph_type == 'digraph-signed-types':
        sif_df = sif_df[sif_df.stmt_type.isin(sign_dict.keys())]

    sif_df = sif_dump_df_merger(sif_df,
                                graph_type,
                                sign_dict,
                                stmt_types,
                                mesh_id_dict,
                                verbosity=verbosity)

    # Map ns:id to node name
    logger.info('Creating dictionary mapping (ns,id) to node name')
    ns_id_name_tups = set(zip(
        sif_df.agA_ns, sif_df.agA_id, sif_df.agA_name)).union(
            set(zip(sif_df.agB_ns, sif_df.agB_id, sif_df.agB_name)))
    ns_id_to_nodename = {(ns, _id): name for ns, _id, name in ns_id_name_tups}

    # Map hashes to edge for non-signed graphs
    if graph_type in {'multidigraph', 'digraph', 'digraph-signed-types'}:
        logger.info('Creating dictionary mapping hashes to edges for '
                    'unsigned graph')
        hash_edge_dict = {
            h: (a, b)
            for a, b, h in zip(sif_df.agA_name, sif_df.agB_name,
                               sif_df.stmt_hash)
        }

    # Create graph from df
    if graph_type == 'multidigraph':
        indranet_graph = IndraNet.from_df(sif_df)
    elif graph_type in ('digraph', 'digraph-signed-types'):
        # Flatten
        indranet_graph = IndraNet.digraph_from_df(sif_df,
                                                  'complementary_belief',
                                                  _weight_mapping)
    elif graph_type in ('signed', 'signed-expanded'):
        signed_edge_graph: MultiDiGraph = IndraNet.signed_from_df(
            df=sif_df,
            flattening_method='complementary_belief',
            weight_mapping=_weight_mapping)
        signed_node_graph: DiGraph = signed_edges_to_signed_nodes(
            graph=signed_edge_graph, copy_edge_data=True)
        signed_edge_graph.graph['date'] = date
        signed_node_graph.graph['date'] = date
        signed_edge_graph.graph['node_by_ns_id'] = ns_id_to_nodename
        signed_node_graph.graph['node_by_ns_id'] = ns_id_to_nodename

        # Get hash to signed edge mapping
        logger.info('Creating dictionary mapping hashes to edges for '
                    'signed edge graph')
        seg_hash_edge_dict = {} if graph_type == 'signed' else defaultdict(set)
        for edge in signed_edge_graph.edges:
            for es in signed_edge_graph.edges[edge]['statements']:
                if graph_type == 'signed':
                    seg_hash_edge_dict[es['stmt_hash']] = edge
                else:
                    seg_hash_edge_dict[es['stmt_hash']].add(edge)
        signed_edge_graph.graph['edge_by_hash'] = seg_hash_edge_dict

        sng_hash_edge_dict = {} if graph_type == 'signed' else defaultdict(set)
        for edge in signed_node_graph.edges:
            for es in signed_node_graph.edges[edge]['statements']:
                if graph_type == 'signed':
                    sng_hash_edge_dict[es['stmt_hash']] = edge
                else:
                    sng_hash_edge_dict[es['stmt_hash']].add(edge)
        signed_node_graph.graph['edge_by_hash'] = sng_hash_edge_dict
        if z_sc_df is not None:
            # Set z-score attributes
            add_corr_to_edges(graph=signed_edge_graph, z_corr=z_sc_df)
            add_corr_to_edges(graph=signed_node_graph, z_corr=z_sc_df)

        return signed_edge_graph, signed_node_graph
    else:
        raise ValueError(f'Unrecognized graph type {graph_type}. Must be one '
                         f'of: {", ".join(graph_options)}')

    if z_sc_df is not None:
        # Set z-score attributes
        add_corr_to_edges(graph=indranet_graph, z_corr=z_sc_df)

    # Add hierarchy relations to graph (not applicable for signed graphs)
    if include_entity_hierarchies and graph_type in ('multidigraph',
                                                     'digraph'):
        from depmap_analysis.network_functions.famplex_functions import \
            get_all_entities
        logger.info('Fetching entity hierarchy relationships')
        full_entity_list = get_all_entities()
        logger.info('Adding entity hierarchy manager as graph attribute')
        node_by_uri = {uri: _id for (ns, _id, uri) in full_entity_list}
        added_pairs = set()  # Save (A, B, URI)
        logger.info('Building entity relations to add to the graph')
        entities = 0
        non_corr_weight = None
        if z_sc_df is not None:
            # Get non-corr weight
            for edge in indranet_graph.edges:
                if indranet_graph.edges[edge]['z_score'] == 0:
                    non_corr_weight = indranet_graph.edges[edge]['corr_weight']
                    break
            assert non_corr_weight is not None
            z_sc_attrs = {'z_score': 0, 'corr_weight': non_corr_weight}
        else:
            z_sc_attrs = {}

        for ns, _id, uri in full_entity_list:
            node = _id
            # Get name in case it's different than id
            if ns_id_to_nodename.get((ns, _id), None):
                node = ns_id_to_nodename[(ns, _id)]
            else:
                ns_id_to_nodename[(ns, _id)] = node

            # Add famplex edge
            for pns, pid in bio_ontology.get_parents(ns, _id):
                puri = get_identifiers_url(pns, pid)
                pnode = pid
                if ns_id_to_nodename.get((pns, pid), None):
                    pnode = ns_id_to_nodename[(pns, pid)]
                else:
                    ns_id_to_nodename[(pns, pid)] = pnode
                # Check if edge already exists
                if (node, pnode, puri) not in added_pairs:
                    entities += 1
                    # Belief and evidence are conditional
                    added_pairs.add((node, pnode, puri))  # A, B, uri of B
                    ed = {
                        'agA_name': node,
                        'agA_ns': ns,
                        'agA_id': _id,
                        'agB_name': pnode,
                        'agB_ns': pns,
                        'agB_id': pid,
                        'stmt_type': 'fplx',
                        'evidence_count': 1,
                        'source_counts': {
                            'fplx': 1
                        },
                        'stmt_hash': puri,
                        'belief': 1.0,
                        'weight': MIN_WEIGHT,
                        'curated': True,
                        'english': f'{pns}:{pid} is an ontological parent '
                        f'of {ns}:{_id}',
                        'z_score': 0,
                        'corr_weight': 1
                    }
                    # Add non-existing nodes
                    if ed['agA_name'] not in indranet_graph.nodes:
                        indranet_graph.add_node(ed['agA_name'],
                                                ns=ed['agA_ns'],
                                                id=ed['agA_id'])
                    if ed['agB_name'] not in indranet_graph.nodes:
                        indranet_graph.add_node(ed['agB_name'],
                                                ns=ed['agB_ns'],
                                                id=ed['agB_id'])
                    # Add edges
                    ed.pop('agA_id')
                    ed.pop('agA_ns')
                    ed.pop('agB_id')
                    ed.pop('agB_ns')
                    if indranet_graph.is_multigraph():
                        # MultiDiGraph
                        indranet_graph.add_edge(ed['agA_name'], ed['agB_name'],
                                                **ed)
                    else:
                        # DiGraph
                        u = ed.pop('agA_name')
                        v = ed.pop('agB_name')

                        # Check edge
                        if indranet_graph.has_edge(u, v):
                            indranet_graph.edges[(u,
                                                  v)]['statements'].append(ed)
                        else:
                            indranet_graph.add_edge(u,
                                                    v,
                                                    belief=1.0,
                                                    weight=1.0,
                                                    statements=[ed],
                                                    **z_sc_attrs)

        logger.info('Added %d entity relations to the graph' % entities)
        indranet_graph.graph['node_by_uri'] = node_by_uri
    indranet_graph.graph['node_by_ns_id'] = ns_id_to_nodename
    indranet_graph.graph['edge_by_hash'] = hash_edge_dict
    indranet_graph.graph['date'] = date
    return indranet_graph
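A usage sketch for sif_dump_df_to_digraph with a hypothetical SIF dump path; the keyword arguments follow the signature documented above. For graph_type='signed', the call would instead return a (signed edge graph, signed node graph) tuple.

indranet = sif_dump_df_to_digraph(
    df='sif_dump.pkl',               # hypothetical path to a SIF dump pickle
    date='2021-01-01',
    graph_type='digraph',
    include_entity_hierarchies=True,
)
print(indranet.number_of_nodes(), indranet.number_of_edges())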
Example #8
def test_mtorc_get_parents():
    p = bio_ontology.get_parents('HGNC', hgnc_client.get_hgnc_id('RICTOR'))
    assert len(p) == 1
    assert p == [('FPLX', 'mTORC2')]
Example #9
def test_ido_parents():
    parents = bio_ontology.get_parents('IDO', '0000514')
    assert ('IDO', '0000509') in parents
Example #10
def test_efo_bfo_relations():
    assert set(bio_ontology.get_parents('EFO', '0004542')) == \
        {('BFO', '0000015'), ('EFO', '0000001')}
Example #11
def test_get_parents():
    prkaa1 = ('HGNC', '9376')
    ampk = ('FPLX', 'AMPK')
    p1 = bio_ontology.get_parents(*prkaa1)
    assert len(p1) == 8, p1
    assert ampk in p1
Example #12
def has_fplx_parents(bio_ontology, node):
    """Return True if the given ontology node has FamPlex parents."""
    parents = bio_ontology.get_parents(*bio_ontology.get_ns_id(node))
    return any(p[0] == 'FPLX' for p in parents)
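A sketch that applies has_fplx_parents across the ontology to collect HGNC nodes with at least one FamPlex parent, using only the accessors seen in the snippets above.

from indra.ontology.bio import bio_ontology

# Assumes bio_ontology has already been initialized, as in the examples above
hgnc_with_fplx = [node for node in bio_ontology.nodes()
                  if bio_ontology.get_ns(node) == 'HGNC'
                  and has_fplx_parents(bio_ontology, node)]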
Example #13
def test_eccode_isa():
    parents = set(bio_ontology.get_parents('ECCODE', '1.1.1.1'))
    assert parents == {('ECCODE', '1.1.1'), ('ECCODE', '1.1'),
                       ('ECCODE', '1')}, parents
    assert bio_ontology.isa('ECCODE', '1.1.1.1', 'ECCODE', '1.1.1')


def get_fplx_stmts(fplx_id):
    """Get statements for a FamPlex agent and filter them."""
    ip = indra_db_rest.get_statements(agents=['%s@FPLX' % fplx_id],
                                      ev_limit=10000)
    stmts = filter_out_medscan(ip.statements)
    stmts = ac.filter_human_only(stmts)
    return stmts


if __name__ == '__main__':
    with open(kinase_pkl, 'rb') as fh:
        kinase_stmts = pickle.load(fh)
    fplx_by_kinase = defaultdict(set)
    kinase_by_fplx = defaultdict(set)
    kinase_counts = {}
    for kinase, stmts in kinase_stmts.items():
        hgnc_id = hgnc_client.get_hgnc_id(kinase)
        parents = bio_ontology.get_parents('HGNC', hgnc_id)
        fplx_by_kinase[kinase] |= {fplx_id for _, fplx_id in parents}
        for _, fplx_id in parents:
            kinase_by_fplx[fplx_id].add(kinase)
        kinase_counts[kinase] = len(stmts)
    kinase_by_fplx = dict(kinase_by_fplx)
    fplx_by_kinase = dict(fplx_by_kinase)

    fplx_stmts = {}
    fplx_counts = {}
    for fplx_id in tqdm.tqdm(kinase_by_fplx):
        fplx_stmts[fplx_id] = get_fplx_stmts(fplx_id)
        fplx_counts[fplx_id] = len(fplx_stmts[fplx_id])
    with open('fplx_stmts.pkl', 'wb') as fh:
        pickle.dump(fplx_stmts, fh)
    with open('fplx_counts.json', 'w') as fh: