def test_msa2str(self):
    aranger = '{body}{meta}'

    # read msa traditionally into an object
    msa_a = MSA(test_data('harry.msa'))

    # read msa from dictionary
    msa_b = qlc.read_msa(test_data('harry.msa'))

    # read msa with IDs
    msa_c = qlc.read_msa(
        test_data('harry_with_ids.msa'), ids=True, header=False)

    # we adjust the dataset and the seq_id since otherwise we won't have
    # similar output
    msa_c['seq_id'] = 'test'
    msa_c['dataset'] = 'file'

    # when converting these different objects to string with the same body
    # and the like, they should be identical, so we check this here
    str_a = msa2str(msa_a, _arange=aranger)
    str_b = msa2str(msa_b, _arange=aranger)
    str_c = msa2str(msa_c, _arange=aranger, wordlist=False)

    assert str_a == str_b == str_c

    # we next test for converting with the merging attribute
    str_d = msa2str(msa_c, _arange=aranger, wordlist=True, merge=True)
    str_e = msa2str(msa_c, _arange=aranger, wordlist=True, merge=False)

    # remove tabstops for checking similar strings
    str_d_st = str_d.replace('\t', '')
    str_e_st = str_e.replace('\t', '')

    # get index up to 'COLUMNID'
    idx = str_d_st.index('COLUMNID')
    assert str_d != str_e and str_d_st[:idx] == str_e_st[:idx]

    # add a consensus string to all msa objects
    consensus_a = get_consensus(MSA(msa_b), gaps=True)
    consensus_b = get_consensus(MSA(msa_c), gaps=True)

    msa_b['consensus'] = consensus_a
    msa_c['consensus'] = consensus_b

    assert msa2str(msa_b) == msa2str(msa_c, wordlist=False)
import networkx as nx
from collections import defaultdict
from itertools import combinations

from lingpy.align.sca import get_consensus
from lingpy.sequence.sound_classes import prosodic_string, tokens2class

# Note: this function uses the networkx 1.x attribute API (G.node, G.edge);
# `color` is assumed to be a sound-class model, and `compatible_columns` and
# `pattern_consensus` module-level helpers, all defined elsewhere.


def strict_compatibility_graph(
        wordlist,
        ref='partial_ids',
        pos='T',
        mintax=3,
        verbose=False,
        use_taxa=[
            "Old_Burmese", "Burmese", "Written_Burmese", "Rangoon",
            "Achang_Longchuan", "Xiandao", "Lashi", "Atsi", "Bola", "Maru"
        ]):
    if [x for x in use_taxa if x not in wordlist.taxa]:
        raise ValueError(
            "Your list of taxa contains taxa not in the wordlist.")

    G = nx.Graph()
    stats = [0, 0]
    alignments, cogids, cstrings = [], [], []
    for cogid, msa in wordlist.msa[ref].items():
        taxa = msa['taxa']
        if len(set(taxa)) >= mintax:
            stats[0] += 1
            consensus = get_consensus(msa['alignment'], gaps=True)
            prostring = prosodic_string(consensus)
            pidx = prostring.find(pos)
            if pidx != -1:
                stats[1] += 1
                reflexes = []
                for t in use_taxa:
                    if t not in taxa:
                        reflexes += ['Ø']
                    else:
                        reflexes += [msa['alignment'][taxa.index(t)][pidx]]
                alignments += [reflexes]
                cogids += [cogid]
                cstrings += [consensus[pidx]]
                G.add_node(
                    str(cogid),
                    column=' '.join(alignments[-1]),
                    consensus=consensus[pidx],
                    clique=0,
                    cliquesize=0,
                    color=tokens2class(consensus, color)[0],
                    fuzzy=[])
    if verbose:
        print('Patterns in total: {0}\nPatterns with condition: {1}'.format(
            stats[0], stats[1]))
        input('<OK>')

    # link all pairs of cognate sets whose columns are compatible
    for (cogA, colA, consA), (cogB, colB, consB) in combinations(
            zip(cogids, alignments, cstrings), r=2):
        cc = compatible_columns(colA, colB, gap="Ø")
        if cc > 0:
            G.add_edge(str(cogA), str(cogB), weight=cc)

    # find cliques
    cliques = [
        x for x in sorted(nx.find_cliques(G), key=lambda x: len(x))
        if len(x) > 1
    ]

    # score each clique by its average internal edge weight
    clique_dict = {}
    for i, clique in enumerate(cliques):
        weight = 0
        for nA, nB in combinations(clique, r=2):
            weight += G.edge[nA][nB]['weight']
        clique_dict[i + 1] = weight / len(clique)

    # assemble fuzzy nodes (a node may belong to more than one clique)
    for i, clique in enumerate(cliques):
        for node in clique:
            G.node[node]['fuzzy'] += [i + 1]

    # assign each node to the clique with the highest compatibility
    for n, d in sorted(G.nodes(data=True)):
        if d['fuzzy']:
            cliques = sorted(
                d['fuzzy'], reverse=True, key=lambda x: clique_dict[x])
            G.node[n]['clique'] = cliques[0]
            G.node[n]['cliquesize'] = clique_dict[cliques[0]]
            G.node[n]['fuzzy'] = cliques

    # recount the cliques
    current_cliques = defaultdict(list)
    for n, d in G.nodes(data=True):
        if d['clique']:
            current_cliques[d['clique']] += [n]

    # recalculate the weights
    nclique_dict = {}
    for clique, nodes in current_cliques.items():
        weight = 0
        for nA, nB in combinations(nodes, r=2):
            weight += G.edge[nA][nB]['weight']
        nclique_dict[clique] = weight / len(nodes)

    for n, d in G.nodes(data=True):
        if d['clique']:
            fuzzies = sorted(
                d['fuzzy'], key=lambda x: nclique_dict.get(x, 0),
                reverse=True)
            d['clique'] = fuzzies[0]
            d['cliquesize'] = nclique_dict[fuzzies[0]]

    # make a compatibility check again for all cliques with each other,
    # starting from a fresh count of the cliques
    current_cliques = defaultdict(list)
    for n, d in G.nodes(data=True):
        if d['clique']:
            current_cliques[d['clique']] += [n]

    new_nodes = {}
    visited = []
    for (c1, nodes1), (c2, nodes2) in sorted(
            combinations(current_cliques.items(), r=2),
            key=lambda x: (len(x[0][1]), len(x[1][1]))):
        if c1 not in visited and c2 not in visited:
            nnodes1 = new_nodes.get(c1, nodes1)
            nnodes2 = new_nodes.get(c2, nodes2)

            # pattern consensus for both cliques
            cons1 = pattern_consensus(
                [G.node[n]['column'].split(' ') for n in nnodes1])
            cons2 = pattern_consensus(
                [G.node[n]['column'].split(' ') for n in nnodes2])
            comp = compatible_columns(cons1, cons2, gap='Ø')
            if comp > 0:
                # merge the smaller clique into the larger one
                if len(nnodes1) > len(nnodes2) and len(nnodes1) >= 1:
                    for n in nnodes2:
                        G.node[n]['clique'] = c1
                    new_nodes[c1] = nnodes1 + nnodes2
                    new_nodes[c2] = nnodes1 + nnodes2
                    visited += [c1, c2]
                elif len(nnodes2) > len(nnodes1) and len(nnodes1) >= 1:
                    for n in nnodes1:
                        G.node[n]['clique'] = c2
                    new_nodes[c1] = nnodes1 + nnodes2
                    new_nodes[c2] = nnodes1 + nnodes2
                    visited += [c1, c2]

    # re-calculate cliques and weights
    current_cliques = defaultdict(list)
    for n, d in G.nodes(data=True):
        if d['clique']:
            current_cliques[d['clique']] += [n]

    # recalculate the weights, skipping node pairs without an edge
    nclique_dict = {}
    for clique, nodes in current_cliques.items():
        weight = 0
        for nA, nB in combinations(nodes, r=2):
            weight += G.edge[nA][nB]['weight'] if nB in G.edge[nA] else 0
        nclique_dict[clique] = weight / len(nodes)

    # determine clique sizes
    for node, data in G.nodes(data=True):
        data['fuzzy'] = '/'.join(sorted([str(x) for x in data['fuzzy']]))
        if data['clique']:
            data['cliquesize'] = nclique_dict[data['clique']]

    for node, data in G.nodes(data=True):
        data['commons'] = '{0}-{1}'.format(
            data['cliquesize'], data['clique'])

    return G, nclique_dict
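# A minimal usage sketch for strict_compatibility_graph, assuming a lingpy
# Alignments object whose cognate sets are aligned under the reference
# column 'partial_ids'; the input file 'burmish.tsv' and the output path
# are hypothetical placeholders.
if __name__ == '__main__':
    from lingpy.align.sca import Alignments

    alms = Alignments('burmish.tsv', ref='partial_ids')  # hypothetical file
    alms.align()  # populates alms.msa['partial_ids']
    graph, clique_weights = strict_compatibility_graph(
        alms, ref='partial_ids', pos='T', mintax=3)
    # each node now carries 'clique', 'cliquesize', and 'fuzzy' annotations,
    # which can be exported for inspection
    nx.write_gml(graph, 'compatibility-graph.gml')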
# Excerpt from a larger alignment script; numCognates, phylName, familyName,
# conceptID, nameToID, langs, familyGuideTree, nameTable, internal_asjp,
# rep_weights, subGuideTree, and convert are assumed to be defined earlier.
for newCognateID in range(1, numCognates + 1):
    multi = MSA(
        "cognates/" + phylName + "/" + familyName + "/"
        + str(conceptID - 3) + "." + str(newCognateID) + ".msq",
        merge_vowels=False,
        unique_seqs=False)
    cognateLangs = [nameToID[taxon] - langs[0] for taxon in multi.taxa]
    if len(cognateLangs) > 1:  # cognate sets of size 1 are useless
        cognateGuideTree = subGuideTree(familyGuideTree, cognateLangs)
        cognateNameTable = [nameTable[lang] for lang in cognateLangs]
        tree_mtx = convert.newick.nwk2guidetree(str(cognateGuideTree))
        multi.prog_align(
            model=internal_asjp, gop=-4, scale=0.9, guide_tree=tree_mtx)
        cons = get_consensus(
            multi,
            cognateGuideTree,
            recon_alg="sankoff_parsimony",
            gaps=True,
            classes=False,
            rep_weights=rep_weights,
            local="gap")

        # collect correction estimates based on the assumption that the
        # reconstruction is correct
        for node in cognateGuideTree.postorder():
            if not node.isTip():
                for i in range(len(node.reconstructed)):
                    # OLD VERSION: this led to locally suboptimal values
                    # because it built on the most parsimonious
                    # reconstruction:
                    #   minValue = node.sankoffTable[i][node.reconstructed[i]]
                    #   optimalChar = node.reconstructed[i]
                    minValue = min(node.sankoffTable[i].values())
                    optimalChar = [
                        key for key in node.sankoffTable[i].keys()
                        if node.sankoffTable[i][key] == minValue][0]
                    if len(optimalChar) == 1:
                        if len(node.sankoffTable[i].keys()) == 1:
                            secondPlaceValue = 0
                        else:
                            secondPlaceValue = 65000  # simulating Integer.maxInt
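# Illustration only, with made-up costs: how the best and second-best values
# in a single Sankoff cost table relate. The table is a dict mapping
# candidate characters to parsimony costs, as node.sankoffTable[i] is in the
# loop above.
sankoff_costs = {'a': 2, 'e': 5, 'i': 5}
min_value = min(sankoff_costs.values())                   # 2, the best cost
optimal_char = min(sankoff_costs, key=sankoff_costs.get)  # 'a'
other_costs = [v for k, v in sankoff_costs.items() if k != optimal_char]
second_place = min(other_costs) if other_costs else 0     # 5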