Exemplo n.º 1
0
def retrieve_alignment(tre, alnpath, taxonset=range(0, 101), delimiter='_'):
    """
    Build the MSA that corresponds to the leaves of a single-copy gene tree.

    Parameters
    ----------------
    tre : single-copy treeswift tree generated from James's code
    alnpath : path to the phylip formatted alignment of the genes. The row labels should be a superset of the leafset of 'tre'
    taxonset : iterable, the taxon set of the entire dataset. Entries are compared
        by their string form, so integer taxa (the default) match row labels
        such as "17_3" once the label is cut at 'delimiter'
    delimiter : str, separator inside a row label; only the part before the first
        occurrence is kept as the record id

    Returns the MSA that corresponds to the input tree: one record per alignment
    row whose label is in the tree (renamed to its taxon prefix), plus an all-gap
    record for every taxon in 'taxonset' that did not appear, sorted by id.
    The full alignment width is used.
    """
    # Context manager so the file handle is released even if parsing fails.
    with open(alnpath) as handle:
        aln = AlignIO.read(handle, "phylip")
    blank = "-" * aln.get_alignment_length()
    # labels(True, False): presumably leaves=True, internal=False -- verify against treeswift.
    whitelist = set(tre.labels(True, False))
    # Record ids are strings; normalise taxa to str so integer taxon sets can be
    # matched against them (a str id can never be removed from a set of ints).
    rest = {str(taxon) for taxon in taxonset}
    res = MultipleSeqAlignment([])
    for r in aln:
        if r.id in whitelist:
            rid = r.id.split(delimiter)[0]
            res.append(SeqRecord(r.seq, id=rid))
            # discard, not remove: a taxon outside 'taxonset' must not abort the run.
            rest.discard(rid)
    # Pad every taxon that has no sequence in this gene with an all-gap row.
    for rst in rest:
        res.append(SeqRecord(Seq(blank), id=rst))
    res.sort()
    return res
Exemplo n.º 2
0
 def dict_to_bioalignment(d, alphabet='generic_alphabet', sorted=True):
     """
     Convert a mapping of ``id -> sequence string`` into a BioPython
     MultipleSeqAlignment.

     d : dict whose keys become record ids and whose values become sequences
     alphabet : name of an attribute of ``Bio.Alphabet`` to attach to every Seq
     sorted : when true, the alignment is sorted in place (``alignment.sort()``)
         before being returned; otherwise records keep the dict's iteration order

     NOTE(review): relies on ``Bio.Alphabet`` and the ``alphabet=`` keyword of
     ``Seq``; these appear to be unavailable in recent Biopython releases --
     confirm the pinned version.
     """
     seq_alphabet = getattr(Bio.Alphabet, alphabet)
     records = MultipleSeqAlignment([])
     for name, residues in d.items():
         records.append(SeqRecord(Seq(residues, alphabet=seq_alphabet), id=name))
     if not sorted:
         return records
     records.sort()
     return records