Example #1
    def test_msa2str(self):
        aranger = '{body}{meta}'

        # read msa traditionally into an object
        msa_a = MSA(test_data('harry.msa'))

        # read msa from dictionary
        msa_b = qlc.read_msa(test_data('harry.msa'))

        # read msa with IDs
        msa_c = qlc.read_msa(test_data('harry_with_ids.msa'),
                             ids=True,
                             header=False)

        # adjust the dataset and the seq_id, since otherwise the output
        # strings would differ
        msa_c['seq_id'] = 'test'
        msa_c['dataset'] = 'file'

        # converting these different objects to strings with the same
        # template should yield identical results, which we check here
        str_a = msa2str(msa_a, _arange=aranger)
        str_b = msa2str(msa_b, _arange=aranger)
        str_c = msa2str(msa_c, _arange=aranger, wordlist=False)

        assert str_a == str_b == str_c

        # next, test conversion with and without the merge option
        str_d = msa2str(msa_c, _arange=aranger, wordlist=True, merge=True)
        str_e = msa2str(msa_c, _arange=aranger, wordlist=True, merge=False)

        # remove tabs before comparing the strings
        str_d_st = str_d.replace('\t', '')
        str_e_st = str_e.replace('\t', '')

        # compare everything up to the 'COLUMNID' marker
        idx = str_d_st.index('COLUMNID')
        assert str_d != str_e and str_d_st[:idx] == str_e_st[:idx]

        # add a consensus string to all msa objects
        consensus_a = get_consensus(MSA(msa_b), gaps=True)
        consensus_b = get_consensus(MSA(msa_c), gaps=True)

        msa_b['consensus'] = consensus_a
        msa_c['consensus'] = consensus_b

        assert msa2str(msa_b) == msa2str(msa_c, wordlist=False)
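
Worth noting: besides MSA objects, get_consensus also accepts a plain alignment matrix (a list of token lists), which Example #3 below relies on. A minimal sketch with hypothetical toy rows (assuming lingpy is installed and the import path below is current):

# Hedged sketch: majority consensus over a plain alignment matrix
# (toy rows, not data from the test above).
from lingpy.align.sca import get_consensus

alignment = [
    ['h', 'a', 'n', 't'],
    ['h', 'a', 'n', 'd'],
    ['h', 'a', '-', 't'],
]
print(get_consensus(alignment, gaps=True))  # most frequent token per column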
Example #2
                    #printTree(cognateGuideTree,0,names=[germanicNameTable[lang] for lang in cognateLangs])
                    cognateNameTable = [
                        nameTable[lang] for lang in cognateLangs
                    ]
                    tree_mtx = convert.newick.nwk2guidetree(
                        str(cognateGuideTree))
                    multi.prog_align(model=internal_asjp,
                                     gop=-4,
                                     scale=0.9,
                                     guide_tree=tree_mtx)
                    #print(multi)

                    cons = get_consensus(multi,
                                         cognateGuideTree,
                                         recon_alg="sankoff_parsimony",
                                         gaps=True,
                                         classes=False,
                                         rep_weights=rep_weights,
                                         local="gap")

                    # collect correction estimates, assuming the
                    # reconstruction is correct
                    for node in cognateGuideTree.postorder():
                        if not node.isTip():
                            for i in range(len(node.reconstructed)):
                                # OLD VERSION: this led to locally suboptimal
                                # values because it built on the most
                                # parsimonious reconstruction!
                                #minValue = node.sankoffTable[i][node.reconstructed[i]]
                                #optimalChar = node.reconstructed[i]
                                minValue = min(node.sankoffTable[i].values())
                                optimalChar = [
                                    key for key in node.sankoffTable[i]
                                    if node.sankoffTable[i][key] == minValue
                                ][0]
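
For clarity: the selection step at the end picks, for each alignment position, the character(s) with minimal cost in that node's Sankoff table. A self-contained toy illustration (the cost values are hypothetical):

# Toy illustration of the min-cost selection above (hypothetical costs).
sankoff_table = {'p': 2.0, 'b': 2.0, 'f': 3.5}  # costs for one position
min_value = min(sankoff_table.values())
optimal_chars = [c for c, v in sankoff_table.items() if v == min_value]
print(optimal_chars)  # ['p', 'b']; the code above keeps only the first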
Example #3
def strict_compatibility_graph(wordlist,
                               ref='partial_ids',
                               pos='T',
                               mintax=3,
                               verbose=False,
                               use_taxa=[
                                   "Old_Burmese", "Burmese", "Written_Burmese",
                                   "Rangoon", "Achang_Longchuan", "Xiandao",
                                   "Lashi", "Atsi", "Bola", "Maru"
                               ]):
    if [x for x in use_taxa if x not in wordlist.taxa]:
        raise ValueError(
            "Your list of taxa contains taxa not in the wordlist.")
    G = nx.Graph()
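    # note: G.node and G.edge below use the networkx 1.x API (removed in 2.0)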
    stats = [0, 0]
    alignments, cogids, cstrings = [], [], []
    for cogid, msa in wordlist.msa[ref].items():
        taxa = msa['taxa']
        if len(set(taxa)) >= mintax:
            stats[0] += 1
            consensus = get_consensus(msa['alignment'], gaps=True)
            prostring = prosodic_string(consensus)
            pidx = prostring.find(pos)
            if pidx != -1:
                stats[1] += 1
                reflexes = []
                for t in use_taxa:
                    if t not in taxa:
                        reflexes += ['Ø']
                    else:
                        reflexes += [msa['alignment'][taxa.index(t)][pidx]]
                alignments += [reflexes]
                cogids += [cogid]
                cstrings += [consensus[pidx]]
                G.add_node(str(cogid),
                           column=' '.join(alignments[-1]),
                           consensus=consensus[pidx],
                           clique=0,
                           cliquesize=0,
                           color=tokens2class(consensus, color)[0],
                           fuzzy=[])
    if verbose:
        print('Patterns in total: {0}\nPatterns with condition: {1}'.format(
            stats[0], stats[1]))
        input('<OK>')

    for (cogA, colA, consA), (cogB, colB, consB) in combinations(
            zip(cogids, alignments, cstrings), r=2):
        cc = compatible_columns(colA, colB, gap="Ø")
        if cc > 0:
            G.add_edge(str(cogA), str(cogB), weight=cc)

    # find cliques
    cliques = [
        x for x in sorted(
            nx.find_cliques(G), key=lambda x: len(x), reverse=False)
        if len(x) > 1
    ]

    # compute the average pairwise edge weight of each clique
    clique_dict = {}
    for i, clique in enumerate(cliques):
        weight = 0
        for nA, nB in combinations(clique, r=2):
            weight += G.edge[nA][nB]['weight']
        clique_dict[i + 1] = weight / len(clique)

    # record, for each node, all cliques it belongs to ('fuzzy' membership)
    for i, clique in enumerate(cliques):
        for node in clique:
            G.node[node]['fuzzy'] += [i + 1]

    # assign to clique with highest compatibility
    for n, d in sorted(G.nodes(data=True)):
        if d['fuzzy']:
            cliques = sorted(d['fuzzy'],
                             reverse=True,
                             key=lambda x: clique_dict[x])
            G.node[n]['clique'] = cliques[0]
            G.node[n]['cliquesize'] = clique_dict[cliques[0]]
            G.node[n]['fuzzy'] = cliques

    # recount number of cliques
    current_cliques = defaultdict(list)
    for n, d in G.nodes(data=True):
        if d['clique']:
            current_cliques[d['clique']] += [n]

    # recalculate weights
    nclique_dict = {}
    for clique, nodes in current_cliques.items():
        weight = 0
        for nA, nB in combinations(nodes, r=2):
            weight += G.edge[nA][nB]['weight']
        nclique_dict[clique] = weight / len(nodes)
    for n, d in G.nodes(data=True):
        if d['clique']:
            fuzzies = sorted(d['fuzzy'],
                             key=lambda x: nclique_dict.get(x, 0),
                             reverse=True)
            d['clique'] = fuzzies[0]
            d['cliquesize'] = nclique_dict[fuzzies[0]]

    # check the compatibility of all cliques with each other;
    # first, recount clique membership
    current_cliques = defaultdict(list)
    for n, d in G.nodes(data=True):
        if d['clique']:
            current_cliques[d['clique']] += [n]
    new_nodes = {}
    visited = []
    for (c1, nodes1), (c2, nodes2) in sorted(
            combinations(current_cliques.items(), r=2),
            key=lambda x: (len(x[0][1]), len(x[1][1]))):
        if c1 not in visited and c2 not in visited:
            nnodes1 = new_nodes.get(c1, nodes1)
            nnodes2 = new_nodes.get(c2, nodes2)
            # pattern consensus for each of the two cliques
            cons1 = pattern_consensus(
                [G.node[n]['column'].split(' ') for n in nnodes1])
            cons2 = pattern_consensus(
                [G.node[n]['column'].split(' ') for n in nnodes2])
            comp = compatible_columns(cons1, cons2, gap='Ø')
            if comp > 0:
                if len(nnodes1) > len(nnodes2) and len(nnodes1) >= 1:
                    for n in nnodes2:
                        G.node[n]['clique'] = c1
                    new_nodes[c1] = nnodes1 + nnodes2
                    new_nodes[c2] = nnodes1 + nnodes2
                    visited += [c1, c2]
                    #print('merged', c1, c2)
                    #for n in new_nodes[c1]:
                    #    print(G.node[n]['column'])
                    #input()
                elif len(nnodes2) > len(nnodes1) and len(nnodes1) >= 1:
                    for n in nodes1:
                        G.node[n]['clique'] = c2
                    new_nodes[c1] = nnodes1 + nnodes2
                    new_nodes[c2] = nnodes1 + nnodes2
                    visited += [c1, c2]
                    #print(':merged', c2, c1)
                    #for n in new_nodes[c1]:
                    #    print(G.node[n]['column'])
                    #input()
    # re-calculate cliques and weights
    current_cliques = defaultdict(list)
    for n, d in G.nodes(data=True):
        if d['clique']:
            current_cliques[d['clique']] += [n]
    # recalculate weights
    nclique_dict = {}
    for clique, nodes in current_cliques.items():
        weight = 0
        for nA, nB in combinations(nodes, r=2):
            weight += G.edge[nA][nB]['weight'] if nB in G.edge[nA] else 0
        nclique_dict[clique] = weight / len(nodes)
    # determine clique sizes
    for node, data in G.nodes(data=True):
        data['fuzzy'] = '/'.join(sorted([str(x) for x in data['fuzzy']]))
        if data['clique']:
            data['cliquesize'] = nclique_dict[data['clique']]
    for node, data in G.nodes(data=True):
        data['commons'] = '{0}-{1}'.format(data['cliquesize'], data['clique'])

    return G, nclique_dict
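
A hypothetical usage sketch follows; it assumes a lingpy Alignments object whose msa dictionary is keyed by the 'partial_ids' reference column, as the function expects, and the input file name is made up:

# Hypothetical usage sketch; 'burmish.tsv' is a made-up file name.
from lingpy import Alignments

alms = Alignments('burmish.tsv', ref='partial_ids')
G, clique_weights = strict_compatibility_graph(alms, pos='T', mintax=3)
print(len(G.nodes()), 'patterns;', len(G.edges()), 'compatibility edges')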
Example #4
for newCognateID in range(1, numCognates + 1):
    multi = MSA("cognates/" + phylName + "/" + familyName + "/" +
                str(conceptID - 3) + "." + str(newCognateID) + ".msq",
                merge_vowels=False, unique_seqs=False)
    cognateLangs = [nameToID[taxon] - langs[0] for taxon in multi.taxa]
    #cognateLangs = [int(lexdict[IDList[0]][0]) - langs[0] for IDList in etym_dict[cognateID] if IDList != 0]
    #print("cognate set " + str(cognateID) + " - cognate langs: " + str(cognateLangs))
    if len(cognateLangs) > 1:  # cognate sets of size 1 are useless
        cognateGuideTree = subGuideTree(familyGuideTree, cognateLangs)
        #print("\nAligning cognate " + str(cognateID) + ":")
        #print("  cognate langs = " + str(cognateLangs))
        #printTree(cognateGuideTree, 0, names=[germanicNameTable[lang] for lang in cognateLangs])
        cognateNameTable = [nameTable[lang] for lang in cognateLangs]
        tree_mtx = convert.newick.nwk2guidetree(str(cognateGuideTree))
        multi.prog_align(model=internal_asjp, gop=-4, scale=0.9,
                         guide_tree=tree_mtx)
        #print(multi)

        cons = get_consensus(multi, cognateGuideTree,
                             recon_alg="sankoff_parsimony", gaps=True,
                             classes=False, rep_weights=rep_weights,
                             local="gap")

        # collect correction estimates, assuming the reconstruction is correct
        for node in cognateGuideTree.postorder():
            if not node.isTip():
                for i in range(len(node.reconstructed)):
                    # OLD VERSION: this led to locally suboptimal values
                    # because it built on the most parsimonious reconstruction!
                    #minValue = node.sankoffTable[i][node.reconstructed[i]]
                    #optimalChar = node.reconstructed[i]
                    minValue = min(node.sankoffTable[i].values())
                    optimalChar = [key for key in node.sankoffTable[i]
                                   if node.sankoffTable[i][key] == minValue][0]
                    if len(optimalChar) == 1:
                        if len(node.sankoffTable[i].keys()) == 1:
                            secondPlaceValue = 0
                        else:
                            secondPlaceValue = 65000  # simulating Integer.maxInt
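
The alignment step above relies on lingpy's progressive alignment with a custom internal_asjp model; a minimal standalone sketch with toy sequences and the default sound-class model (same gop and scale as above):

# Minimal sketch of Multiple.prog_align (toy sequences, default model;
# internal_asjp above is a custom model and is not reproduced here).
from lingpy import Multiple

msa = Multiple(['woldemort', 'waldemar', 'vladimir'])
msa.prog_align(gop=-4, scale=0.9)
print(msa)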