Exemplo n.º 1
0
def test_modularity():
    source = np.matrix(test_matrices[4][0])
    target = test_matrices[4][1]
    clusters = mc.get_clusters(mc.run_mcl(source))

    quality = mc.modularity(source, clusters)
    assert np.isclose(quality, target)
Exemplo n.º 2
0
def MCL(cdr3, edgelist=None, mcl_hyper=[1.2, 2], outfile=None):
    """
    Perform clustering on a network of CDR3 amino acid sequences with
    a known hamming distance, using the Markov clustering (MCL) algorithm.
    For more info about the inflation and expansion parameters,
    visit: https://micans.org/mcl/


    Parameters
    ----------
    edgelist : set, optional
        Tab-separated edgelist. The default is None.
    mcl_hyper : list, optional
        MCL hyperparameters: inflation and expansion.
        The default is [1.2,2].
    outfile : str, optional
        Name of outfile. The default is None.

    Returns
    -------
    clusters : pd.DataFrame
        pd.DataFrame containing two columns: 'CDR3' and 'cluster'.
        The first column contains CDR3 sequences, the second column
        contains the corresponding cluster ids.
    """
    if edgelist is None:
        edgelist = create_edgelist(cdr3)

    try:
        G = nx.parse_adjlist(edgelist, nodetype=str)
        m = nx.to_scipy_sparse_array(G)
    
        # Run MCL
        result = mcl.run_mcl(m, inflation=mcl_hyper[0], expansion=mcl_hyper[1])
        mcl_output = mcl.get_clusters(result)
        identifiers = list(G.nodes())
    
        # Map cluster ids back to seqs
        cluster_ids = dict()
        for i in range(len(mcl_output)):
            cluster_ids[i] = list(identifiers[i] for i in mcl_output[i])
    
        # Generate nodelist
        clusters = {"CDR3": [], "cluster": []}
        for c in cluster_ids:
            for seq in cluster_ids[c]:
                clusters["CDR3"].append(seq)
                clusters["cluster"].append(c)
        clusters = pd.DataFrame(data=clusters)
    
        # Write to file
        if outfile is not None:
            clusters.to_csv(outfile, sep="\t", index=False)
    except nx.NetworkXError:
        clusters = pd.DataFrame({"CDR3": [], "cluster": []})

    return clusters