Exemplo n.º 1
0
def construct_hg38_map(n2nl_aln, hg38_bam):
    """Build a map from hg38 position to column of the multiple sequence alignment.

    The map is obtained by chaining two per-sequence maps:
    hg38 position -> (ungapped) sequence position, taken from the BAM, and
    sequence position -> MSA column, taken from the alignment FASTA.

    :param n2nl_aln: path of the MSA in FASTA format ('-' marks a gap); opened with ``Fasta``.
    :param hg38_bam: path of a BAM holding the same sequences aligned to hg38;
        records are matched to MSA sequences by query name.
    :return: dict of hg38 position -> MSA column (columns are 0-based). Keys carry
        no reference name, so positions from all records share one namespace.
    :raises AssertionError: if two records claim the same hg38 position.
    :raises KeyError: if a BAM record refers to a sequence or sequence position
        that is absent from the MSA.
    """
    # sequence position -> MSA column, per sequence. Only residue columns are
    # recorded: writing on gap columns as well would leave a phantom position
    # (one past the last residue) whenever a sequence ends in gaps, which in
    # turn would shift every negative-strand coordinate below by one.
    aln_f = Fasta(n2nl_aln)
    seq_aln_map = defaultdict(dict)
    for name, seq in aln_f.iteritems():
        seq_pos = 0
        for aln_pos, x in enumerate(str(seq)):
            if x == '-':
                continue
            seq_aln_map[name][seq_pos] = aln_pos
            seq_pos += 1

    # index of the last residue of every sequence, the pivot for flipping
    # negative-strand coordinates
    max_pos = {seq_name: max(cols) for seq_name, cols in seq_aln_map.iteritems()}

    # records with these query names are treated as negative-strand genes
    negative_strand = ('NOTCH2', 'NOTCH2NL-A', 'NOTCH2NL-B')

    # hg38 position -> sequence position, per record name
    hg38_map = {}
    bam = pysam.Samfile(hg38_bam)
    try:
        for rec in bam:
            m = {}
            # aligned_pairs yields (query position, reference position); either
            # side is None where the other has no partner (indel / clipping),
            # and such columns have no hg38 <-> sequence correspondence.
            for query_pos, ref_pos in rec.aligned_pairs:
                if query_pos is None or ref_pos is None:
                    continue
                m[ref_pos] = query_pos
            # invert positions for negative strand genes
            if rec.qname in negative_strand:
                m = {ref_pos: max_pos[rec.qname] - query_pos
                     for ref_pos, query_pos in m.iteritems()}
            # a later record with the same name replaces an earlier one
            hg38_map[rec.qname] = m
    finally:
        bam.close()

    # Chain the two maps. Within a record every hg38 position pairs with a
    # distinct residue and every residue has its own column, so no intermediate
    # column -> hg38 table is needed; only collisions between records can occur.
    final_map = {}
    for name, pos_map in hg38_map.iteritems():
        col_of = seq_aln_map[name]
        for hg38_pos, seq_pos in pos_map.iteritems():
            assert hg38_pos not in final_map, \
                'hg38 position %s is covered by more than one record' % hg38_pos
            final_map[hg38_pos] = col_of[seq_pos]

    return final_map
Exemplo n.º 2
0
from collections import *
from tools.intervals import *
from tools.misc import *
from tools.procOps import *
from tools.fileOps import *
from tools.bio import *
from tools.psl import *
from itertools import *
import bisect

# In[5]:

# first, construct a map of sequence positions to alignment positions
# (per sequence: index of the n-th residue -> 0-based alignment column).
# Gap columns are skipped entirely: recording them too would leave an extra
# position, one past the last residue, for any sequence that ends in gaps,
# and a maximum taken over these keys would then be one too large.
aln_f = Fasta('notch2nl_alignment.fa')
seq_aln_map = defaultdict(dict)
for name, seq in aln_f.iteritems():
    seq_pos = 0
    for aln_pos, x in enumerate(str(seq)):
        if x == '-':
            continue
        seq_aln_map[name][seq_pos] = aln_pos
        seq_pos += 1

# In[182]:

# largest recorded sequence position of every sequence in seq_aln_map;
# needed for reversing negative strand coordinates
max_pos = {seq_name: max(positions) for seq_name, positions in seq_aln_map.iteritems()}

# In[193]:

# next, construct a map of hg38 positions to sequence positions using the alignment
hg38_map = {}