def test_modularity():
    """Check that mc.modularity reproduces the expected value of fixture 4.

    The fixture entry at index 4 of ``test_matrices`` supplies the input
    matrix (element 0) and the expected modularity (element 1). The matrix
    is clustered with ``mc.run_mcl`` using its default settings, and the
    modularity of the resulting clustering is compared to the expected value
    with ``np.isclose``.
    """
    fixture = test_matrices[4]
    matrix = np.matrix(fixture[0])
    expected_quality = fixture[1]

    # Cluster the fixture matrix, then score that clustering.
    mcl_result = mc.run_mcl(matrix)
    found_clusters = mc.get_clusters(mcl_result)
    measured_quality = mc.modularity(matrix, found_clusters)

    assert np.isclose(measured_quality, expected_quality)
def MCL(cdr3, edgelist=None, mcl_hyper=(1.2, 2), outfile=None):
    """
    Perform clustering on a network of CDR3 amino acid sequences with
    a known hamming distance, using the Markov clustering (MCL) algorithm.
    For more info about the inflation and expansion parameters,
    visit: https://micans.org/mcl/

    Parameters
    ----------
    cdr3 : collection of CDR3 sequences
        Only used when `edgelist` is None, in which case it is passed to
        `create_edgelist` to build the edgelist.
        NOTE(review): exact expected type is not visible here -- confirm
        against `create_edgelist`.
    edgelist : set, optional
        Tab-separated edgelist. The default is None, meaning the edgelist
        is built from `cdr3`.
    mcl_hyper : sequence of two numbers, optional
        MCL hyperparameters: inflation (index 0) and expansion (index 1).
        The default is (1.2, 2).
    outfile : str, optional
        Name of outfile. When given, the clusters are written to it as a
        tab-separated file without the index. The default is None.

    Returns
    -------
    clusters : pd.DataFrame
        pd.DataFrame containing two columns: 'CDR3' and 'cluster'.
        The first column contains CDR3 sequences, the second column
        contains the corresponding cluster ids. If networkx raises a
        NetworkXError while building the graph or its sparse matrix, an
        empty DataFrame with the same two columns is returned and nothing
        is written to `outfile`.
    """
    if edgelist is None:
        edgelist = create_edgelist(cdr3)

    # Only the networkx calls can raise NetworkXError, so the try block is
    # limited to them; the empty-result fallback is unchanged.
    try:
        G = nx.parse_adjlist(edgelist, nodetype=str)
        m = nx.to_scipy_sparse_array(G)
    except nx.NetworkXError:
        return pd.DataFrame({"CDR3": [], "cluster": []})

    # Run MCL
    result = mcl.run_mcl(m, inflation=mcl_hyper[0], expansion=mcl_hyper[1])
    mcl_output = mcl.get_clusters(result)

    # MCL reports clusters as collections of node indices; these index into
    # the graph's node list, which maps them back to sequences.
    identifiers = list(G.nodes())

    # Generate nodelist: one row per (sequence, cluster id) pair, with
    # cluster ids assigned in the order MCL returned the clusters.
    sequences = []
    cluster_labels = []
    for cluster_id, members in enumerate(mcl_output):
        for node_index in members:
            sequences.append(identifiers[node_index])
            cluster_labels.append(cluster_id)
    clusters = pd.DataFrame(data={"CDR3": sequences, "cluster": cluster_labels})

    # Write to file
    if outfile is not None:
        clusters.to_csv(outfile, sep="\t", index=False)

    return clusters