# Materialise everything fasta_reader yields for the current handle, append
# it to the running ltr_seqs list, then print the total collected so far.
# NOTE(review): `handle`, `fasta_reader` and the initial `ltr_seqs` are
# defined above this excerpt -- confirm the enclosing context.
ltr_seqs += list(fasta_reader(handle))
print len(ltr_seqs)

# <codecell>

# Reference LTR nucleotide sequence used as the alignment target.  It is
# assembled from adjacent string literals so that no newline characters ever
# enter the value.
# NOTE(review): the name suggests a consensus-B LTR -- confirm the source.
conb_ltr = (
    "TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAA"
    "GGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTACAAGC"
    "TAGTACCAGTTGAGCCAGAGAAGTTAGAAGAAGCCAACAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCA"
    "TGGAATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACATGGCCCGAGAG"
    "CTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGC"
    "GTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATCCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTC"
    "TCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCC"
    "TTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTG"
    "TGGAAAATCTCT"
)

# Align every collected sequence against the reference and keep the results
# in a list so they can be iterated more than once.
ltr_align = list(
    seq_align_to_ref(ltr_seqs, conb_ltr, max_workers=20)
)

# <codecell>

# Turn each aligned sequence into a per-column match vector against conb_ltr:
#   NaN  for gap columns that precede the first aligned (non-gap) character,
#   1.0  where the aligned character equals the reference character,
#   0.0  everywhere else (including gaps after the first aligned character).
# NOTE(review): within the visible lines `bins` is never stored in
# ltr_bin_align -- presumably that happens just past this excerpt; confirm.
ltr_bin_align = []
align_inds = []
for key, seq in ltr_align:
    # The alignment key is split on '-' and kept as a tuple.
    align_inds.append(tuple(key.split('-')))
    hit_first = False
    bins = []
    for g, cb in zip(seq, conb_ltr):
        # Must be the logical `not`: bitwise `~` on a bool yields -1 or -2,
        # both truthy, which would flag *every* gap as NaN instead of only
        # the leading ones.
        if not hit_first and g == '-':
            bins.append(np.nan)
            continue
        hit_first = True
        bins.append(1.0 if g == cb else 0.0)
示例#2
0
# Build `oseqs`, a list of {'Accession', 'Seq', 'Prot'} records.  If a cached
# pickle exists it is loaded; otherwise every path in `files` is read,
# aligned against its reference and appended.
oseqs = []
if os.path.exists('aligned_seq.pkl'):
    # Pickle streams are binary: open with 'rb' so loading works on every
    # platform and for every pickle protocol.
    # NOTE: pickle.load can execute arbitrary code -- only load a cache file
    # that this analysis itself produced.
    with open('aligned_seq.pkl', 'rb') as handle:
        oseqs = pickle.load(handle)
else:
    for f in files:
        # File stem: last path component, up to the first '.'.
        fname = f.rsplit(os.sep, 1)[-1].split('.')[0]
        parts = fname.split('-')
        # The second '-'-separated field names the region; underscores in it
        # stand in for dashes.
        prot_name = parts[1].replace('_', '-')
        if prot_name == 'V3':
            # V3 files are deliberately left out of this table.
            continue
        with open(f) as handle:
            # Only accessions present in trop_data.index are requested; the
            # `trans` flag is switched off for the 'LTR' files.
            prot_seqs = list(trans_seq(handle,
                                       set(trop_data.index),
                                       trans=prot_name != 'LTR'))
            # Single-argument print form: same output on Python 2 and 3.
            print('%s %s' % (prot_name, len(prot_seqs)))
            aligned_seqs = seq_align_to_ref(prot_seqs,
                                            ref_seqs[prot_name],
                                            max_workers=20)
            for name, seq in aligned_seqs:
                oseqs.append({
                              'Accession': name,
                              'Seq': seq,
                              'Prot': prot_name
                              })
            

# <codecell>

# Reshape the list of records into a wide table: one row per 'Accession',
# one column per 'Prot', each cell holding the 'Seq' value.  aggfunc='first'
# keeps the first value when an (Accession, Prot) pair occurs more than once.
# `index`/`columns` are the current keyword names; the older `rows`/`cols`
# spellings are no longer accepted by pandas.
aligned_seqs = pd.pivot_table(pd.DataFrame(oseqs),
                              index='Accession',
                              columns='Prot',
                              values='Seq',
                              aggfunc='first')
示例#3
0
------------AGCTGCATCCGGAGTACTTC---------AAGAACTGCT-----------------------------
-------GACATCGA------------------------GCTTG---CT----------------------ACAA---GG
GACTTTCCGCTGGGGACTTTCCAG-------------GGAGGCGTGGCCTGGGCGGGACT---GGGGAGTGGCGA---GC
CCTCAGATCCTGCATATAAGCAGC---TGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAG
CTCTCTGGCT---AACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCT---TGAGTGCTTC---AAGTAGTGTG
TGC---CCGTCTG---TTGTGTGACTCTGGT---AACTAGAGATCCC---TCAGAC---CCT---TTTAGTCAGTGTGG-
--AAAATCTCT""".replace(
    "\n", ""
).replace(
    "-", ""
)

# <codecell>

# Pair each aligned sequence with the value stored for its id in gi2date.
# Ids that have no entry in gi2date are silently skipped.
aln_seqs = []
for gi, aln_seq in seq_align_to_ref(ltr_seqs, hxb2, max_workers=50):
    try:
        gi_date_value = gi2date[gi]
    except KeyError:
        # No entry for this id: leave it out of the result.
        continue
    aln_seqs.append((gi_date_value, aln_seq))

# <codecell>

# Sorted copy of aln_seqs, ordered by each entry's first element.  The sort
# is stable, so entries with equal first elements keep their relative order;
# aln_seqs itself is left untouched.
sorted_alns = list(aln_seqs)
sorted_alns.sort(key=lambda pair: pair[0])

# <codecell>

from itertools import groupby
from collections import defaultdict

found_seqs = set()