Exemplo n.º 1
0
def _setup_signed_graph():
    """Build a signed edge graph and the signed node graph derived from it

    The signed edges, edge beliefs and `all_ns` come from _digraph_setup().

    Returns
    -------
    tuple
        (seg, sng, all_ns) where `seg` is the MultiDiGraph of signed edges
        built here, `sng` is the graph returned by
        signed_edges_to_signed_nodes for `seg` and `all_ns` is passed
        through unchanged from _digraph_setup().
    """
    _, signed_edges, edge_beliefs, all_ns = _digraph_setup()

    seg = nx.MultiDiGraph()
    seg.add_edges_from(signed_edges)

    # NOTE: iterating `seg.edges` (not called) yields (u, v, key) triples,
    # while `seg.edges()` yields (u, v) pairs. The third element of each
    # triple is stored as the edge's sign.
    for edge_key in seg.edges:
        source, target, sign = edge_key
        edge_attrs = seg.edges[edge_key]
        edge_attrs['sign'] = sign
        edge_attrs['belief'] = edge_beliefs[(source, target)]

    # The first two items of each node are stored as its 'ns' and 'id'
    for node in seg.nodes:
        node_attrs = seg.nodes[node]
        node_attrs['ns'] = node[0]
        node_attrs['id'] = node[1]

    sng = signed_edges_to_signed_nodes(graph=seg,
                                       prune_nodes=True,
                                       copy_edge_data=True)

    # Edge weight is the negative log of the edge belief
    for u, v in sng.edges:
        sng_attrs = sng.edges[(u, v)]
        sng_attrs['weight'] = -np.log(sng_attrs['belief'])

    return seg, sng, all_ns
Exemplo n.º 2
0
def db_dump_to_pybel_sg(stmts_list=None,
                        pybel_model=None,
                        belief_dump=None,
                        default_belief=0.1,
                        sign_in_edges=False):
    """Create signed PyBEL graphs from an evidenceless dump from the db

    Parameters
    ----------
    stmts_list : list[indra.statements.Statement]
        Provide a list of statements if they are already loaded. Only used
        when `pybel_model` is None, in which case it is passed on to
        _custom_pb_assembly. According to the original documentation, the
        latest available pa statements dump is downloaded from s3 when
        this is None (that happens outside this function). Default: None.
    pybel_model : pybel.BELGraph
        If provided, skip generating a new pybel model from scratch
    belief_dump : dict
        If provided (and non-empty), reset the belief scores of the edges
        in the pybel model that carry a 'stmt_hash', using the hash as key
        into this dict.
    default_belief : float
        Only used if belief_dump is provided. When no belief score is
        available for an edge's hash, reset to this belief score.
        Default: 0.1.
    sign_in_edges : bool
        If True, re-key every edge of the signed edge graph whose edge key
        differs from its 'sign' attribute so that the key equals the sign.
        Default: False.

    Returns
    -------
    tuple(MultiDiGraph, DiGraph)
        The signed edge graph followed by the signed node graph. Both
        graphs get a `graph['edge_by_hash']` dict mapping statement hashes
        to edges.
    """
    # Get statement dump:
    # Look for latest file on S3 and pickle.loads it
    if pybel_model is None:
        pb_model = _custom_pb_assembly(stmts_list)
    else:
        logger.info('Pybel model provided')
        pb_model = pybel_model

    # If belief dump is provided, reset beliefs to the entries in it
    if belief_dump:
        logger.info('Belief dump provided, resetting belief scores')
        missing_hash = 0
        changed_belief = 0
        no_hash = 0
        logger.info(f'Looking for belief scores among {len(pb_model.edges)} '
                    f'edges')
        for edge in pb_model.edges:
            ed = pb_model.edges[edge]
            if ed and ed.get('stmt_hash'):
                h = ed['stmt_hash']
                if h in belief_dump:
                    ed['belief'] = belief_dump[h]
                    changed_belief += 1
                else:
                    logger.warning(f'No belief found for {h}')
                    ed['belief'] = default_belief
                    missing_hash += 1
            else:
                no_hash += 1
        logger.info(f'{no_hash} edges did not have hashes')
        logger.info(f'{changed_belief} belief scores were changed')
        logger.info(f'{missing_hash} edges did not have a belief entry')

    # Get a signed edge graph
    logger.info('Getting a PyBEL signed edge graph')
    pb_signed_edge_graph = belgraph_to_signed_graph(
        pb_model,
        symmetric_variant_links=True,
        symmetric_component_links=True,
        propagate_annotations=True)

    if sign_in_edges:
        # Iterate over a snapshot of the edge keys: edges are added and
        # removed inside the loop, and mutating the graph while iterating
        # its live edge view can raise RuntimeError or skip/revisit edges.
        for u, v, ix in list(pb_signed_edge_graph.edges):
            ed = pb_signed_edge_graph.edges[(u, v, ix)]
            if 'sign' in ed and ix != ed['sign']:
                # Re-insert the edge keyed by its sign, then drop the old key
                pb_signed_edge_graph.add_edge(u, v, ed['sign'], **ed)
                pb_signed_edge_graph.remove_edge(u, v, ix)

    # Map hashes to edges; if several edges share a hash, the last one
    # iterated is the one kept
    logger.info('Getting hash to signed edge mapping')
    seg_hash_edge_dict = {}
    for edge in pb_signed_edge_graph.edges:
        stmt_hash = pb_signed_edge_graph.edges[edge].get('stmt_hash')
        if stmt_hash:
            seg_hash_edge_dict[stmt_hash] = edge
    pb_signed_edge_graph.graph['edge_by_hash'] = seg_hash_edge_dict

    # Get the signed node graph
    logger.info('Getting a signed node graph from signed edge graph')
    pb_signed_node_graph = signed_edges_to_signed_nodes(pb_signed_edge_graph,
                                                        copy_edge_data=True)

    # Map hashes to edges for signed nodes
    logger.info('Getting hash to edge mapping')
    sng_hash_edge_dict = {}
    for edge in pb_signed_node_graph.edges:
        stmt_hash = pb_signed_node_graph.edges[edge].get('stmt_hash')
        if stmt_hash:
            sng_hash_edge_dict[stmt_hash] = edge
    pb_signed_node_graph.graph['edge_by_hash'] = sng_hash_edge_dict

    logger.info('Done assembling signed edge and signed node PyBEL graphs')
    return pb_signed_edge_graph, pb_signed_node_graph
Exemplo n.º 3
0
def _load_z_score_df(z_sc_path: Optional[Union[str, pd.DataFrame]]) \
        -> Optional[pd.DataFrame]:
    """Return the z-score data frame given as a frame or a .h5/.pkl path"""
    if z_sc_path is None:
        return None
    if isinstance(z_sc_path, str):
        if z_sc_path.endswith('h5'):
            logger.info(f'Loading z-scores from {z_sc_path}')
            return pd.read_hdf(z_sc_path)
        if z_sc_path.endswith('pkl'):
            logger.info(f'Loading z-scores from {z_sc_path}')
            return file_opener(z_sc_path)
        raise ValueError(f'Unrecognized file: {z_sc_path}')
    if isinstance(z_sc_path, pd.DataFrame):
        return z_sc_path
    raise ValueError('Only file paths and data frames allowed as '
                     'arguments to z_sc_path')


def _signed_hash_edge_map(graph, as_sets: bool):
    """Map the stmt_hash of every statement on every edge to its edge(s)

    With as_sets=False each hash maps to a single edge (the last one
    iterated wins); with as_sets=True each hash maps to the set of all
    edges carrying it.
    """
    hash_edge_dict = defaultdict(set) if as_sets else {}
    for edge in graph.edges:
        for es in graph.edges[edge]['statements']:
            if as_sets:
                hash_edge_dict[es['stmt_hash']].add(edge)
            else:
                hash_edge_dict[es['stmt_hash']] = edge
    return hash_edge_dict


def sif_dump_df_to_digraph(df: Union[pd.DataFrame, str],
                           date: str,
                           mesh_id_dict: Optional[Dict] = None,
                           graph_type: GraphTypes = 'digraph',
                           include_entity_hierarchies: bool = True,
                           sign_dict: Optional[Dict[str, int]] = None,
                           stmt_types: Optional[List[str]] = None,
                           z_sc_path: Optional[Union[str, pd.DataFrame]] = None,
                           verbosity: int = 0) \
        -> Union[DiGraph, MultiDiGraph, Tuple[MultiDiGraph, DiGraph]]:
    """Return a NetworkX digraph from a pandas dataframe of a db dump

    Parameters
    ----------
    df : Union[str, pd.DataFrame]
        A dataframe, either as a file path to a file (.pkl or .csv) or a
        pandas DataFrame object.
    date : str
        A date string specifying when the data was dumped from the database.
        If empty/None, today's date (YYYY-MM-DD) is used.
    mesh_id_dict : dict
        A dict object mapping statement hashes to all mesh ids sharing a
        common PMID
    graph_type : str
        Return type for the returned graph (case insensitive). Currently
        supports:
            - 'digraph': DiGraph (Default)
            - 'multidigraph': MultiDiGraph
            - 'signed': Tuple[MultiDiGraph, DiGraph]
            - 'signed-expanded': Tuple[MultiDiGraph, DiGraph]
            - 'digraph-signed-types':  DiGraph
    include_entity_hierarchies : bool
        If True, add edges between nodes if they are related ontologically
        with stmt type 'fplx': e.g. BRCA1 is in the BRCA family, so an edge
        is added between the nodes BRCA and BRCA1. Default: True. Note that
        this option only is available for the graph types 'digraph' and
        'multidigraph'.
    sign_dict : Dict[str, int]
        A dictionary mapping a Statement type to a sign to be used for the
        edge. By default only Activation and IncreaseAmount are added as
        positive edges and Inhibition and DecreaseAmount are added as
        negative edges, but a user can pass any other Statement types in a
        dictionary.
    stmt_types : List[str]
        A list of statement types to expand out to other signs
    z_sc_path:
        If provided, must be or be path to a square dataframe with HGNC symbols
        as names on the axes and floats as entries. File paths must end in
        'h5' or 'pkl'.
    verbosity: int
        Output various messages if > 0. For all messages, set to 4.

    Returns
    -------
    Union[DiGraph, MultiDiGraph, Tuple[MultiDiGraph, DiGraph]]
        The type is determined by the graph_type argument. For the signed
        graph types the tuple is (signed edge graph, signed node graph).

    Raises
    ------
    ValueError
        If graph_type is not supported or if z_sc_path is neither a data
        frame nor a path ending in 'h5' or 'pkl'.
    AssertionError
        If z-scores are provided, entity hierarchies are added and no edge
        with z_score == 0 exists to read the non-correlation weight from.
    """
    graph_options = ('digraph', 'multidigraph', 'signed', 'signed-expanded',
                     'digraph-signed-types')
    if graph_type.lower() not in graph_options:
        raise ValueError(f'Graph type {graph_type} not supported. Can only '
                         f'chose between {graph_options}')
    sign_dict = sign_dict if sign_dict else default_sign_dict

    graph_type = graph_type.lower()
    date = date if date else datetime.now().strftime('%Y-%m-%d')

    if isinstance(df, str):
        sif_df = file_opener(df)
    else:
        sif_df = df

    z_sc_df = _load_z_score_df(z_sc_path)

    # If signed types: filter out rows that are of unsigned types
    if graph_type == 'digraph-signed-types':
        sif_df = sif_df[sif_df.stmt_type.isin(sign_dict.keys())]

    sif_df = sif_dump_df_merger(sif_df,
                                graph_type,
                                sign_dict,
                                stmt_types,
                                mesh_id_dict,
                                verbosity=verbosity)

    # Map ns:id to node name
    logger.info('Creating dictionary mapping (ns,id) to node name')
    ns_id_name_tups = set(zip(
        sif_df.agA_ns, sif_df.agA_id, sif_df.agA_name)).union(
            set(zip(sif_df.agB_ns, sif_df.agB_id, sif_df.agB_name)))
    ns_id_to_nodename = {(ns, _id): name for ns, _id, name in ns_id_name_tups}

    # Map hashes to edge for non-signed graphs
    if graph_type in {'multidigraph', 'digraph', 'digraph-signed-types'}:
        logger.info('Creating dictionary mapping hashes to edges for '
                    'unsigned graph')
        hash_edge_dict = {
            h: (a, b)
            for a, b, h in zip(sif_df.agA_name, sif_df.agB_name,
                               sif_df.stmt_hash)
        }

    # Create graph from df
    if graph_type == 'multidigraph':
        indranet_graph = IndraNet.from_df(sif_df)
    elif graph_type in ('digraph', 'digraph-signed-types'):
        # Flatten
        indranet_graph = IndraNet.digraph_from_df(sif_df,
                                                  'complementary_belief',
                                                  _weight_mapping)
    elif graph_type in ('signed', 'signed-expanded'):
        signed_edge_graph: MultiDiGraph = IndraNet.signed_from_df(
            df=sif_df,
            flattening_method='complementary_belief',
            weight_mapping=_weight_mapping)
        signed_node_graph: DiGraph = signed_edges_to_signed_nodes(
            graph=signed_edge_graph, copy_edge_data=True)
        signed_edge_graph.graph['date'] = date
        signed_node_graph.graph['date'] = date
        # Both graphs share the same mapping object
        signed_edge_graph.graph['node_by_ns_id'] = ns_id_to_nodename
        signed_node_graph.graph['node_by_ns_id'] = ns_id_to_nodename

        # Get hash to signed edge mapping: one edge per hash for 'signed',
        # a set of edges per hash for 'signed-expanded'
        logger.info('Creating dictionary mapping hashes to edges for '
                    'signed graphs')
        expanded = graph_type != 'signed'
        signed_edge_graph.graph['edge_by_hash'] = _signed_hash_edge_map(
            signed_edge_graph, as_sets=expanded)
        signed_node_graph.graph['edge_by_hash'] = _signed_hash_edge_map(
            signed_node_graph, as_sets=expanded)

        if z_sc_df is not None:
            # Set z-score attributes
            add_corr_to_edges(graph=signed_edge_graph, z_corr=z_sc_df)
            add_corr_to_edges(graph=signed_node_graph, z_corr=z_sc_df)

        return signed_edge_graph, signed_node_graph
    else:
        # Defensive: graph_type was already validated above
        raise ValueError(f'Unrecognized graph type {graph_type}. Must be one '
                         f'of: {", ".join(graph_options)}')

    if z_sc_df is not None:
        # Set z-score attributes
        add_corr_to_edges(graph=indranet_graph, z_corr=z_sc_df)

    # Add hierarchy relations to graph (not applicable for signed graphs)
    if include_entity_hierarchies and graph_type in ('multidigraph',
                                                     'digraph'):
        from depmap_analysis.network_functions.famplex_functions import \
            get_all_entities
        logger.info('Fetching entity hierarchy relationships')
        full_entity_list = get_all_entities()
        logger.info('Adding entity hierarchy manager as graph attribute')
        node_by_uri = {uri: _id for (ns, _id, uri) in full_entity_list}
        added_pairs = set()  # Save (A, B, URI)
        logger.info('Building entity relations to be added to data frame')
        entities = 0
        non_corr_weight = None
        if z_sc_df is not None:
            # Get non-corr weight: read it off the first edge found with a
            # z-score of zero
            for edge in indranet_graph.edges:
                if indranet_graph.edges[edge]['z_score'] == 0:
                    non_corr_weight = indranet_graph.edges[edge]['corr_weight']
                    break
            assert non_corr_weight is not None
            z_sc_attrs = {'z_score': 0, 'corr_weight': non_corr_weight}
        else:
            z_sc_attrs = {}

        for ns, _id, uri in full_entity_list:
            node = _id
            # Get name in case it's different than id
            if ns_id_to_nodename.get((ns, _id), None):
                node = ns_id_to_nodename[(ns, _id)]
            else:
                ns_id_to_nodename[(ns, _id)] = node

            # Add famplex edge
            for pns, pid in bio_ontology.get_parents(ns, _id):
                puri = get_identifiers_url(pns, pid)
                pnode = pid
                if ns_id_to_nodename.get((pns, pid), None):
                    pnode = ns_id_to_nodename[(pns, pid)]
                else:
                    ns_id_to_nodename[(pns, pid)] = pnode
                # Check if edge already exists
                if (node, pnode, puri) not in added_pairs:
                    entities += 1
                    # Belief and evidence are conditional
                    added_pairs.add((node, pnode, puri))  # A, B, uri of B
                    # NOTE(review): 'z_score' and 'corr_weight' are
                    # hard-coded here regardless of z_sc_attrs/z_sc_df -
                    # confirm this is intended for the statement data (and
                    # for the multidigraph edge attributes set from it).
                    ed = {
                        'agA_name': node,
                        'agA_ns': ns,
                        'agA_id': _id,
                        'agB_name': pnode,
                        'agB_ns': pns,
                        'agB_id': pid,
                        'stmt_type': 'fplx',
                        'evidence_count': 1,
                        'source_counts': {
                            'fplx': 1
                        },
                        'stmt_hash': puri,
                        'belief': 1.0,
                        'weight': MIN_WEIGHT,
                        'curated': True,
                        'english': f'{pns}:{pid} is an ontological parent '
                        f'of {ns}:{_id}',
                        'z_score': 0,
                        'corr_weight': 1
                    }
                    # Add non-existing nodes
                    if ed['agA_name'] not in indranet_graph.nodes:
                        indranet_graph.add_node(ed['agA_name'],
                                                ns=ed['agA_ns'],
                                                id=ed['agA_id'])
                    if ed['agB_name'] not in indranet_graph.nodes:
                        indranet_graph.add_node(ed['agB_name'],
                                                ns=ed['agB_ns'],
                                                id=ed['agB_id'])
                    # Add edges; ns/id live on the nodes, not on the edges
                    ed.pop('agA_id')
                    ed.pop('agA_ns')
                    ed.pop('agB_id')
                    ed.pop('agB_ns')
                    if indranet_graph.is_multigraph():
                        # MultiDiGraph
                        indranet_graph.add_edge(ed['agA_name'], ed['agB_name'],
                                                **ed)
                    else:
                        # DiGraph
                        u = ed.pop('agA_name')
                        v = ed.pop('agB_name')

                        # Check edge: append to an existing edge's
                        # statements, otherwise create the edge
                        if indranet_graph.has_edge(u, v):
                            indranet_graph.edges[(u,
                                                  v)]['statements'].append(ed)
                        else:
                            indranet_graph.add_edge(u,
                                                    v,
                                                    belief=1.0,
                                                    weight=1.0,
                                                    statements=[ed],
                                                    **z_sc_attrs)

        logger.info('Loaded %d entity relations into dataframe' % entities)
        indranet_graph.graph['node_by_uri'] = node_by_uri
    indranet_graph.graph['node_by_ns_id'] = ns_id_to_nodename
    indranet_graph.graph['edge_by_hash'] = hash_edge_dict
    indranet_graph.graph['date'] = date
    return indranet_graph