Python get_consensus_filename 예제들, hivwholeseq.sequencing.filenames.get_consensus_filename Python 예제들

예제 #1

0

파일 보기

파일: check_overlaps.py 프로젝트: iosonofabio/hivwholeseq

def get_overlap(data_folder, adaID, frag1, frag2, VERBOSE=0):
    """Find the overlap coordinates for the two fragments"""
    from hivwholeseq.utils.mapping import align_muscle

    seq1 = SeqIO.read(get_consensus_filename(data_folder, adaID, frag1), "fasta")
    seq2 = SeqIO.read(get_consensus_filename(data_folder, adaID, frag2), "fasta")
    sm1 = np.array(seq1)
    sm2 = np.array(seq2)

    # Find the beginning of s2 in s1
    seed_len = 20
    matches_min = 16
    seed = sm2[:seed_len]
    found = False
    trials = 0
    while (not found) and (trials < 3):
        for pos in xrange(len(seq1) - 700, len(seq1) - seed_len):
            if (sm1[pos : pos + seed_len] == seed).sum() >= matches_min - trials:
                found = True
                start_s2 = pos
                break
        if not found:
            trials += 1

    if not found:
        return None

    if VERBOSE >= 3:
        print "Beginning of " + frag2 + " found in " + frag1

    # In an ideal world, the overlap is a holy place in which no indels happen.
    # We cannot assume that, sadly. However, we can search from the other side
    # and align: find the end of s1 in s2
    found = False
    seed = sm1[-seed_len:]
    trials = 0
    while (not found) and (trials < 3):
        for pos in xrange(700):
            if (sm2[pos : pos + seed_len] == seed).sum() >= matches_min - trials:
                found = True
                end_s1 = pos + seed_len
                break
        if not found:
            trials += 1
    if not found:
        return None

    if VERBOSE >= 3:
        print "End of " + frag1 + " found in " + frag2

    # Align
    ali = align_muscle(seq1[start_s2:], seq2[:end_s1])
    return (start_s2, end_s1, ali)

예제 #2

0

파일 보기

def get_overlap(data_folder, adaID, frag1, frag2, VERBOSE=0):
    '''Find the overlap coordinates for the two fragments'''
    from hivwholeseq.utils.mapping import align_muscle

    seq1 = SeqIO.read(get_consensus_filename(data_folder, adaID, frag1), 'fasta')
    seq2 = SeqIO.read(get_consensus_filename(data_folder, adaID, frag2), 'fasta')
    sm1 = np.array(seq1)
    sm2 = np.array(seq2)

    # Find the beginning of s2 in s1
    seed_len = 20
    matches_min = 16
    seed = sm2[:seed_len]
    found = False
    trials = 0
    while (not found) and (trials < 3):
        for pos in xrange(len(seq1) - 700, len(seq1) - seed_len):
            if (sm1[pos: pos + seed_len] == seed).sum() >= matches_min - trials:
                found = True
                start_s2 = pos
                break
        if not found:
            trials += 1

    if not found:
        return None

    if VERBOSE >= 3:
        print 'Beginning of '+frag2+' found in '+frag1

    # In an ideal world, the overlap is a holy place in which no indels happen.
    # We cannot assume that, sadly. However, we can search from the other side
    # and align: find the end of s1 in s2
    found = False
    seed = sm1[-seed_len:]
    trials = 0
    while (not found) and (trials < 3):
        for pos in xrange(700):
            if (sm2[pos: pos + seed_len] == seed).sum() >= matches_min - trials:
                found = True
                end_s1 = pos + seed_len
                break
        if not found:
            trials += 1
    if not found:
        return None

    if VERBOSE >= 3:
        print 'End of '+frag1+' found in '+frag2

    # Align
    ali = align_muscle(seq1[start_s2:], seq2[:end_s1])
    return (start_s2, end_s1, ali)

예제 #3

0

파일 보기

파일: get_allele_counts.py 프로젝트: 5l1v3r1/hivwholeseq

def get_allele_counts(data_folder, adaID, fragment, VERBOSE=0, maxreads=1e10):
    '''Count alleles and insertions in the filtered BAM file of a fragment.

    The primer-trimmed consensus is read only to obtain the reference
    length. If the filtered BAM file does not exist, convert_sam_to_bam is
    called on its path first. Note that `qual_min` is not a parameter of
    this function: it is looked up in the enclosing scope.

    Returns the result of get_allele_counts_insertions_from_file.
    '''
    # Reference length comes from the primer-trimmed consensus
    consensus_path = get_consensus_filename(data_folder, adaID, fragment,
                                            trim_primers=True)
    ref_length = len(SeqIO.read(consensus_path, 'fasta'))

    # Note: the reads should already be filtered of unmapped stuff at this point
    bam_path = get_mapped_filename(data_folder, adaID, fragment,
                                   type='bam', filtered=True)
    bam_missing = not os.path.isfile(bam_path)
    if bam_missing:
        convert_sam_to_bam(bam_path)

    # Delegate the actual counting to the lower-level function
    count_options = dict(qual_min=qual_min,
                         maxreads=maxreads,
                         VERBOSE=VERBOSE)
    return get_allele_counts_insertions_from_file(bam_path, ref_length,
                                                  **count_options)

예제 #4

0

파일 보기

파일: check_mapped_coverage.py 프로젝트: 5l1v3r1/hivwholeseq

def check_coverage(data_folder,
                   adaID,
                   fragment,
                   seq_run,
                   qual_min=35,
                   reference='HXB2',
                   maxreads=-1,
                   VERBOSE=0,
                   rescue=False,
                   minor_allele=False):
    '''Plot the coverage of the reads mapped to one fragment.

    Allele counts are collected from the mapped BAM file with
    get_allele_counts_insertions_from_file_unfiltered and handed to
    plot_coverage, with a title listing run, adaID, fragment and maxreads.
    The insertions returned alongside the counts are discarded, and the
    `qual_min` and `reference` arguments are accepted but not used here.
    '''
    # Only the length of the consensus is needed
    consensus_path = get_consensus_filename(data_folder, adaID, fragment)
    ref_length = len(SeqIO.read(consensus_path, 'fasta'))

    bam_path = get_mapped_filename(data_folder, adaID, fragment,
                                   type='bam', rescue=rescue)

    counts, _ = get_allele_counts_insertions_from_file_unfiltered(
        bam_path, ref_length, maxreads=maxreads, VERBOSE=VERBOSE)

    # Plot results
    labelled_values = (
        ('run', seq_run),
        ('adaID', adaID),
        ('fragment', fragment),
        ('maxreads', maxreads),
    )
    title = ', '.join(label + ' ' + str(value)
                      for label, value in labelled_values)
    plot_coverage(counts, suptitle=title, minor_allele=minor_allele)

예제 #5

0

파일 보기

파일: store_initial_reference.py 프로젝트: iosonofabio/hivwholeseq

def complement_consensus_PCR2(cons_rec, patient, fragment, samplen, VERBOSE=0):
    '''Complement consensus from PCR2 with wings from later PCR1 sample'''
    from hivwholeseq.utils.sequence import find_seed_imperfect, rfind_seed_imperfect

    found = False
    for _, sampletmp in patient.samples.iloc[samplen + 1:].iterrows():
        for _, sampleseqtmp in sampletmp['samples seq'].iterrows():
            sampleseqtmp = SampleSeq(sampleseqtmp)
            if int(sampleseqtmp.PCR) == 1:
                sampleseq_later = sampleseqtmp
                found = True
                break
        if found:
            break

    adaID_later = sampleseq_later['adapter']
    data_folder_later = sampleseq_later.sequencing_run.folder
    cons_rec_later = SeqIO.read(get_consensus_filename(data_folder_later, adaID_later, fragment), 'fasta')
    conss_later = str(cons_rec_later.seq)

    start = find_seed_imperfect(cons_rec_later, cons_rec[:20])
    end = rfind_seed_imperfect(cons_rec_later, cons_rec[-20:]) + 20

    if VERBOSE >= 1:
        print 'Complementing PCR2 consensus with later PCR1:',
        print sampleseq_later.name, sampleseq_later['seq run'], sampleseq_later.adapter

    frag_spec = sampleseq_later.regions_complete[sampleseq_later.regions_generic.index(fragment)]

    return (frag_spec, conss_later[:start]+cons_rec+conss_later[end:])

예제 #6

0

파일 보기

def get_distance_histogram(data_folder, adaID, fragment, maxreads=1000, VERBOSE=0,
                           filtered=False):
    '''Histogram the distances of read pairs from the fragment consensus.

    Read pairs are taken from the mapped BAM file in order; pairs whose first
    read is not a proper pair are skipped, and collection stops once
    `maxreads` pairs have been kept. The kept pairs are passed to
    get_distance_from_reference (threshold=30) together with the consensus
    as a numpy array.

    Returns np.bincount of the distances.
    '''
    consensus_path = get_consensus_filename(data_folder, adaID, fragment)
    ref = np.array(SeqIO.read(consensus_path, 'fasta'))

    bam_path = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                   filtered=filtered)

    with pysam.Samfile(bam_path, 'rb') as bamfile:
        proper_pairs = []
        for pair in pair_generator(bamfile):
            # Enough pairs collected
            if len(proper_pairs) >= maxreads:
                break

            if pair[0].is_proper_pair:
                proper_pairs.append(pair)

        distances = get_distance_from_reference(ref, proper_pairs, threshold=30)

    return np.bincount(distances)

예제 #7

0

파일 보기

파일: map_to_consensus.py 프로젝트: iosonofabio/hivwholeseq

def make_index_and_hash(data_folder, adaID, fragment, VERBOSE=0, summary=True):
    '''Make index and hash files for consensus'''
    frag_gen = fragment[:2]

    # NOTE: we can use --overwrite here, because there is no concurrency (every
    # job has its own hash)
    # 1. Make genome index file
    sp.call([stampy_bin,
             '--species="HIV fragment '+frag_gen+'"',
             '--overwrite',
             '-G', get_index_file(data_folder, adaID, frag_gen, ext=False),
             get_consensus_filename(data_folder, adaID, frag_gen, trim_primers=True),
             ])
    if VERBOSE:
        print 'Built index: '+adaID+' '+frag_gen
    
    # 2. Build a hash file
    sp.call([stampy_bin,
             '--overwrite',
             '-g', get_index_file(data_folder, adaID, frag_gen, ext=False),
             '-H', get_hash_file(data_folder, adaID, frag_gen, ext=False),
             ])
    if VERBOSE:
        print 'Built hash: '+adaID+' '+frag_gen

    if summary:
        with open(get_map_summary_filename(data_folder, adaID, frag_gen), 'a') as f:
            f.write('\n')
            f.write('Stampy index and hash written.')
            f.write('\n')

예제 #8

0

파일 보기

파일: store_initial_reference.py 프로젝트: 5l1v3r1/hivwholeseq

def complement_consensus_PCR2(cons_rec, patient, fragment, samplen, VERBOSE=0):
    '''Complement consensus from PCR2 with wings from later PCR1 sample'''
    from hivwholeseq.utils.sequence import find_seed_imperfect, rfind_seed_imperfect

    found = False
    for _, sampletmp in patient.samples.iloc[samplen + 1:].iterrows():
        for _, sampleseqtmp in sampletmp['samples seq'].iterrows():
            sampleseqtmp = SampleSeq(sampleseqtmp)
            if int(sampleseqtmp.PCR) == 1:
                sampleseq_later = sampleseqtmp
                found = True
                break
        if found:
            break

    adaID_later = sampleseq_later['adapter']
    data_folder_later = sampleseq_later.sequencing_run.folder
    cons_rec_later = SeqIO.read(
        get_consensus_filename(data_folder_later, adaID_later, fragment),
        'fasta')
    conss_later = str(cons_rec_later.seq)

    start = find_seed_imperfect(cons_rec_later, cons_rec[:20])
    end = rfind_seed_imperfect(cons_rec_later, cons_rec[-20:]) + 20

    if VERBOSE >= 1:
        print 'Complementing PCR2 consensus with later PCR1:',
        print sampleseq_later.name, sampleseq_later[
            'seq run'], sampleseq_later.adapter

    frag_spec = sampleseq_later.regions_complete[
        sampleseq_later.regions_generic.index(fragment)]

    return (frag_spec, conss_later[:start] + cons_rec + conss_later[end:])

예제 #9

0

파일 보기

파일: build_consensus_iterative.py 프로젝트: iosonofabio/hivwholeseq

def write_consensus_final(seq_run, adaID, fragment, consensus):
    '''Store the final consensus of a fragment and realign all references.

    The consensus is written as FASTA under the two-character fragment name
    (fragments are now called F5 instead of F5ai), with a record name built
    from sample name, sequencing run, adaID and fragment. Afterwards all
    sequences in the reference-all file of the fragment are aligned with
    muscle and the alignment is written back to that same file.
    '''
    dataset = MiSeq_runs[seq_run]
    data_folder = dataset['folder']
    sample_index = dataset['adapters'].index(adaID)
    samplename = dataset['samples'][sample_index]

    frag_out = fragment[:2]
    name = ''.join([samplename, '_seqrun_', seq_run, '_adaID_', adaID,
                    '_', frag_out, '_consensus'])
    record = SeqRecord(Seq(consensus), id=name, name=name)

    consensus_path = get_consensus_filename(data_folder, adaID, frag_out,
                                            trim_primers=True)
    SeqIO.write(record, consensus_path, 'fasta')

    # Align all consensi via muscle and store
    all_refs_path = get_reference_all_filename(data_folder, adaID, fragment)
    all_refs = list(SeqIO.parse(all_refs_path, 'fasta'))
    AlignIO.write(align_muscle(*all_refs), all_refs_path, 'fasta')

예제 #10

0

파일 보기

파일: get_allele_counts.py 프로젝트: iosonofabio/hivwholeseq

def get_allele_counts(data_folder, adaID, fragment, VERBOSE=0, maxreads=1e10):
    """Extract allele and insert counts from the filtered BAM file.

    The length of the primer-trimmed consensus is used as reference length.
    A missing BAM file triggers convert_sam_to_bam on its path. `qual_min`
    is not an argument: it is resolved from the enclosing scope.

    Returns the result of get_allele_counts_insertions_from_file.
    """
    # Read reference
    refseq = SeqIO.read(
        get_consensus_filename(data_folder, adaID, fragment, trim_primers=True),
        "fasta",
    )

    # Locate the BAM file, creating it if needed
    # Note: the reads should already be filtered of unmapped stuff at this point
    bam_path = get_mapped_filename(
        data_folder, adaID, fragment, type="bam", filtered=True
    )
    if os.path.isfile(bam_path) is False:
        convert_sam_to_bam(bam_path)

    # Call lower-level function
    result = get_allele_counts_insertions_from_file(
        bam_path,
        len(refseq),
        qual_min=qual_min,
        maxreads=maxreads,
        VERBOSE=VERBOSE,
    )
    return result

예제 #11

0

파일 보기

파일: build_genomewide_consensus.py 프로젝트: 5l1v3r1/hivwholeseq

def merge_consensi(data_folder, adaID, fragments, VERBOSE=0):
    '''Join the consensi of neighbouring fragments at their overlaps.

    For every pair returned by get_overlapping_fragments the overlap is
    computed with get_overlap, and a RuntimeWarning is issued if
    check_overlap_consensus reports a difference. The sorted fragments are
    then chained: a fragment with a recorded pair to its predecessor extends
    the current chunk, starting after the overlap, or after ten 'N' if the
    overlap could not be found (None). Any other fragment starts a new chunk.

    Returns a list of (fragment names, SeqRecord) tuples, one per chunk.
    '''
    import warnings

    consensi = {}
    for frag in fragments:
        consensus_path = get_consensus_filename(data_folder, adaID, frag,
                                                trim_primers=True)
        consensi[frag] = SeqIO.read(consensus_path, 'fasta')

    overlaps = {}
    for (frag1, frag2) in get_overlapping_fragments(fragments):
        overlap = get_overlap(data_folder, adaID, frag1, frag2, VERBOSE=VERBOSE)
        if check_overlap_consensus(data_folder, adaID, frag1, frag2,
                                   overlap, VERBOSE=VERBOSE):
            warnings.warn(frag1+' and '+frag2+' have different consensi.', RuntimeWarning)
        overlaps[(frag1, frag2)] = overlap

    chunks = []
    fragments = sorted(fragments)
    for i, frag in enumerate(fragments):
        # Only consulted for i > 0, where fragments[i - 1] is the predecessor
        pair = (fragments[i - 1], frag)

        if i and (pair in overlaps):
            # Extend the running chunk from the end of the overlap on
            chunk = chunks[-1]
            chunk[0].append(frag)
            overlap = overlaps[pair]
            if overlap is None:
                chunk[1] = chunk[1]+('N' * 10)+str(consensi[frag].seq)
            else:
                (_, start, _) = overlap
                chunk[1] = chunk[1]+str(consensi[frag][start:].seq)
        else:
            # No overlap at the start: open a new chunk with the whole fragment
            chunks.append([[frag], str(consensi[frag].seq)])

    # Make SeqRecords out of consensi
    merged = []
    for (frags, cons) in chunks:
        name = 'adaID_'+str(adaID)+'_'+'-'.join(frags)
        record = SeqRecord(Seq(cons, IUPAC.ambiguous_dna), id=name, name=name)
        merged.append((frags, record))

    return merged

예제 #12

0

파일 보기

파일: build_genomewide_consensus.py 프로젝트: 5l1v3r1/hivwholeseq

def merge_allele_frequencies(data_folder, adaID, fragments, VERBOSE=0):
    '''Merge allele frequencies at overlapping pairs.

    The allele frequency matrix of every fragment is loaded with np.load.
    For every pair returned by get_overlapping_fragments the overlap is
    computed with get_overlap, and a RuntimeWarning is issued if
    check_overlap_consensus reports a difference. The sorted fragments are
    then chained along axis 1:

    - a fragment without a recorded pair to its predecessor starts a new
      chunk with its whole matrix;
    - a fragment with a known overlap contributes its columns from the end
      of the overlap on;
    - a fragment whose overlap could not be found (None) contributes its
      whole matrix, separated from the chunk by 10 filler columns that are
      0 everywhere except for 1 in the last row.

    Returns a list of [fragment names, matrix] pairs, one per chunk.
    '''
    import warnings
    import numpy as np

    nus = {frag: np.load(get_allele_frequencies_filename(data_folder, adaID, frag))
           for frag in fragments}

    pairs = get_overlapping_fragments(fragments)
    overlaps = {}
    for (frag1, frag2) in pairs:
        overlap = get_overlap(data_folder, adaID, frag1, frag2, VERBOSE=VERBOSE)
        is_diff = check_overlap_consensus(data_folder, adaID, frag1, frag2,
                                          overlap, VERBOSE=VERBOSE)
        if is_diff:
            warnings.warn(frag1+' and '+frag2+' have different consensi.', RuntimeWarning)
        overlaps[(frag1, frag2)] = overlap

    nu = []
    fragments = sorted(fragments)
    for i, frag in enumerate(fragments):
        # If the start is not an overlap, start a new chunk and copy all
        if (i == 0) or (fragments[i-1], frag) not in overlaps:
            nu.append([[frag], nus[frag]])
            continue

        # else, copy from the end of the overlap on
        # FIXME: we could average the consensus zone out of indels...
        nuf = nu[-1]
        nuf[0].append(frag)
        overlap = overlaps[(fragments[i-1], frag)]
        if overlap is not None:
            (_, start, _) = overlap
            #(recursion is not the most efficient but -- oh, well)
            nuf[1] = np.concatenate([nuf[1], nus[frag][:, start:]], axis=1)
        else:
            # The overlap is unknown, so there is no start coordinate to cut
            # at: append the whole fragment after the filler columns.
            # NOTE(review): the last row presumably is the ambiguous/'N'
            # symbol of the alphabet -- confirm
            filler = np.zeros((nuf[1].shape[0], 10), float)
            filler[-1] = 1
            nuf[1] = np.concatenate([nuf[1], filler, nus[frag]], axis=1)

    return nu

예제 #13

0

파일 보기

파일: align_consensi_dataset.py 프로젝트: 5l1v3r1/hivwholeseq

def align_consensi_dataset(dataset, adaIDs, fragments, VERBOSE=0):
    '''Align consensi from different samples in a dataset'''

    data_folder = dataset['folder']

    # Collect consensi
    if VERBOSE >= 1:
        print 'Collecting consensi...',
    consensi = defaultdict(dict)
    for adaID in adaIDs:
        samplename = dataset['samples'][dataset['adapters'].index(adaID)]
        fragments_sample = samples[samplename]['fragments']
        for frag in fragments_sample:
            frag_gen = frag[:2]
            if frag_gen not in fragments:
                continue
            con_fn = get_consensus_filename(data_folder, adaID, frag_gen)
            if os.path.isfile(con_fn):
                con = SeqIO.read(con_fn, 'fasta')
                consensi[frag_gen][adaID] = con

        if 'genomewide' in fragments:
            frag_gens = [frag[:2] for frag in fragments_sample]
            con_gw_fn = get_merged_consensus_filename(data_folder, adaID,
                                                      frag_gens)
            if os.path.isfile(con_gw_fn):
                con = SeqIO.read(con_gw_fn, 'fasta')
                consensi['genomewide'][adaID] = con

    if VERBOSE >= 1:
        print 'done.'
        print 'Aligning...',

    # Align
    alis = {}
    for (frag, con_dict) in consensi.iteritems():
        if VERBOSE >= 2:
            print frag,
        ali_frag = align_muscle(*(con_dict.values()))
        alis[frag] = ali_frag

    if VERBOSE >= 1:
        print 'done.'

    return alis

예제 #14

0

파일 보기

파일: align_consensi_dataset.py 프로젝트: iosonofabio/hivwholeseq

def align_consensi_dataset(dataset, adaIDs, fragments, VERBOSE=0):
    '''Align consensi from different samples in a dataset'''

    data_folder = dataset['folder']

    # Collect consensi
    if VERBOSE >= 1:
        print 'Collecting consensi...',
    consensi = defaultdict(dict)
    for adaID in adaIDs:
        samplename = dataset['samples'][dataset['adapters'].index(adaID)]
        fragments_sample = samples[samplename]['fragments']
        for frag in fragments_sample:
            frag_gen = frag[:2]
            if frag_gen not in fragments:
                continue
            con_fn = get_consensus_filename(data_folder, adaID, frag_gen)
            if os.path.isfile(con_fn):
                con = SeqIO.read(con_fn, 'fasta')
                consensi[frag_gen][adaID] = con

        if 'genomewide' in fragments:
            frag_gens = [frag[:2] for frag in fragments_sample]
            con_gw_fn = get_merged_consensus_filename(data_folder, adaID, frag_gens)
            if os.path.isfile(con_gw_fn):
                con = SeqIO.read(con_gw_fn, 'fasta')
                consensi['genomewide'][adaID] = con
    
    if VERBOSE >= 1:
        print 'done.'
        print 'Aligning...',

    # Align
    alis = {}
    for (frag, con_dict) in consensi.iteritems():
        if VERBOSE >= 2:
            print frag,
        ali_frag = align_muscle(*(con_dict.values()))
        alis[frag] = ali_frag

    if VERBOSE >= 1:
        print 'done.'

    return alis

예제 #15

0

파일 보기

파일: print_consensi.py 프로젝트: 5l1v3r1/hivwholeseq

def score_consensus(sample, VERBOSE=0):
    '''Score a consensus based on completeness and quality.

    NOTE: `fragment` is not a parameter; it is read from the enclosing
    scope. The first entry of sample.regions_complete that contains it is
    used as the specific fragment name.

    Parameters:
        sample: object providing sequencing_run.folder, adapter and
            regions_complete.
        VERBOSE: accepted but not used in this function.

    Returns a tuple (ok, label):
        (True, '')         the sample has no region containing the fragment
        (False, 'MISS')    the consensus file does not exist
        (False, 'MISSREF') the premap reference file does not exist (for
                           names starting with 'F3a', the name with every
                           'a' removed is tried as well)
        (False, 'SHORT')   the consensus is over 200 shorter than the reference
        (False, 'LONG')    the consensus is over 200 longer than the reference
        (True, 'OK')       otherwise
    '''
    data_folder = sample.sequencing_run.folder
    adaID = sample.adapter

    # A list (rather than filter()) so that emptiness and indexing work on
    # any iterable of region names
    frag_specs = [x for x in sample.regions_complete if fragment in x]
    if not frag_specs:
        return (True, '')

    fn = get_consensus_filename(data_folder, adaID, fragment)
    if not os.path.isfile(fn):
        return (False, 'MISS')

    frag_spec = frag_specs[0]
    fn_ref = get_reference_premap_filename(data_folder, adaID, frag_spec)
    # Fall back to the name without 'a' for F3a-type fragments
    if (not os.path.isfile(fn_ref)) and (frag_spec[:3] == 'F3a'):
        frag_spec = frag_spec.replace('a', '')
        fn_ref = get_reference_premap_filename(data_folder, adaID, frag_spec)
    if not os.path.isfile(fn_ref):
        return (False, 'MISSREF')

    ref = SeqIO.read(fn_ref, 'fasta')
    cons = SeqIO.read(fn, 'fasta')
    if len(cons) < len(ref) - 200:
        return (False, 'SHORT')
    elif len(cons) > len(ref) + 200:
        return (False, 'LONG')

    # TODO: also score the number of differences between reference and
    # consensus from a global alignment (not implemented)
    return (True, 'OK')

예제 #16

0

파일 보기

def write_consensus_final(seq_run, adaID, fragment, consensus):
    '''Write the final consensus and realign the reference-all file.

    The output uses the two-character fragment name (fragments are now
    called F5 instead of F5ai). After the consensus has been written as
    FASTA, every sequence in the reference-all file of the fragment is
    aligned via muscle and the alignment replaces the content of that file.
    '''
    dataset = MiSeq_runs[seq_run]
    data_folder = dataset['folder']
    position = dataset['adapters'].index(adaID)
    samplename = dataset['samples'][position]

    frag_out = fragment[:2]
    name_parts = (samplename, '_seqrun_', seq_run, '_adaID_', adaID, '_',
                  frag_out, '_consensus')
    name = ''.join(name_parts)

    SeqIO.write(
        SeqRecord(Seq(consensus), id=name, name=name),
        get_consensus_filename(data_folder, adaID, frag_out,
                               trim_primers=True),
        'fasta')

    # Align all consensi via muscle and store
    refs_filename = get_reference_all_filename(data_folder, adaID, fragment)
    seqs = [record for record in SeqIO.parse(refs_filename, 'fasta')]
    ali = align_muscle(*seqs)
    AlignIO.write(ali, refs_filename, 'fasta')

예제 #17

0

파일 보기

def make_index_and_hash(data_folder, adaID, fragment, VERBOSE=0, summary=True):
    '''Make index and hash files for consensus'''
    frag_gen = fragment[:2]

    # NOTE: we can use --overwrite here, because there is no concurrency (every
    # job has its own hash)
    # 1. Make genome index file
    sp.call([
        stampy_bin,
        '--species="HIV fragment ' + frag_gen + '"',
        '--overwrite',
        '-G',
        get_index_file(data_folder, adaID, frag_gen, ext=False),
        get_consensus_filename(data_folder, adaID, frag_gen,
                               trim_primers=True),
    ])
    if VERBOSE:
        print 'Built index: ' + adaID + ' ' + frag_gen

    # 2. Build a hash file
    sp.call([
        stampy_bin,
        '--overwrite',
        '-g',
        get_index_file(data_folder, adaID, frag_gen, ext=False),
        '-H',
        get_hash_file(data_folder, adaID, frag_gen, ext=False),
    ])
    if VERBOSE:
        print 'Built hash: ' + adaID + ' ' + frag_gen

    if summary:
        with open(get_map_summary_filename(data_folder, adaID, frag_gen),
                  'a') as f:
            f.write('\n')
            f.write('Stampy index and hash written.')
            f.write('\n')

예제 #18

0

파일 보기

파일: print_consensi.py 프로젝트: iosonofabio/hivwholeseq

def score_consensus(sample, VERBOSE=0):
    '''Score a consensus based on completeness and quality.

    NOTE: `fragment` is resolved from the enclosing scope, it is not an
    argument. The specific fragment name is the first entry of
    sample.regions_complete that contains it.

    Parameters:
        sample: object providing sequencing_run.folder, adapter and
            regions_complete.
        VERBOSE: accepted but not used in this function.

    Returns:
        A tuple (ok, label). label is '' when the sample has no region
        containing the fragment (ok is True), 'MISS' when the consensus file
        is missing, 'MISSREF' when the premap reference file is missing,
        'SHORT' or 'LONG' when the consensus length differs from the
        reference length by more than 200, and 'OK' otherwise.
    '''
    data_folder = sample.sequencing_run.folder
    adaID = sample.adapter

    # Materialize the matches so that the emptiness test and the indexing
    # below do not depend on filter() returning a list
    matching = [region for region in sample.regions_complete
                if fragment in region]
    if not matching:
        return (True, '')

    fn = get_consensus_filename(data_folder, adaID, fragment)
    if not os.path.isfile(fn):
        return (False, 'MISS')

    # Candidate names for the premap reference: the specific fragment name
    # and, for F3a-type names, the same name with every 'a' removed
    frag_spec = matching[0]
    candidates = [frag_spec]
    if frag_spec[:3] == 'F3a':
        candidates.append(frag_spec.replace('a', ''))

    fn_ref = None
    for candidate in candidates:
        fn_candidate = get_reference_premap_filename(data_folder, adaID, candidate)
        if os.path.isfile(fn_candidate):
            fn_ref = fn_candidate
            break
    if fn_ref is None:
        return (False, 'MISSREF')

    ref = SeqIO.read(fn_ref, 'fasta')
    cons = SeqIO.read(fn, 'fasta')
    if len(cons) < len(ref) - 200:
        return (False, 'SHORT')
    if len(cons) > len(ref) + 200:
        return (False, 'LONG')

    # TODO: compare reference and consensus through a global alignment and
    # score the number of differences (not implemented)
    return (True, 'OK')

예제 #19

0

파일 보기

파일: check_mapped_coverage.py 프로젝트: iosonofabio/hivwholeseq

def check_coverage(data_folder, adaID, fragment, seq_run, qual_min=35,
                   reference='HXB2', maxreads=-1, VERBOSE=0,
                   rescue=False,
                   minor_allele=False):
    '''Check the coverage of a fragment and plot it.

    The allele counts of the mapped BAM file are obtained from
    get_allele_counts_insertions_from_file_unfiltered and passed to
    plot_coverage. The plot title lists run, adaID, fragment and maxreads.
    The insertions are not used, and neither are the `qual_min` and
    `reference` arguments.
    '''
    refseq = SeqIO.read(get_consensus_filename(data_folder, adaID, fragment),
                        'fasta')

    bam_filename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                       rescue=rescue)

    counts_and_inserts = get_allele_counts_insertions_from_file_unfiltered(
        bam_filename,
        len(refseq),
        maxreads=maxreads,
        VERBOSE=VERBOSE)
    counts, inserts = counts_and_inserts

    # Plot results
    title_parts = []
    for (label, value) in [['run', seq_run],
                           ['adaID', adaID],
                           ['fragment', fragment],
                           ['maxreads', maxreads]]:
        title_parts.append(' '.join((label, str(value))))
    plot_coverage(counts, suptitle=', '.join(title_parts),
                  minor_allele=minor_allele)

예제 #20

0

파일 보기

파일: build_consensus.py 프로젝트: 5l1v3r1/hivwholeseq

            if VERBOSE >= 2:
                print ali[:, :30]
                print ali[:, -30:]
                print 'Lenghts: ref', len(refseq), 'consensus', len(
                    consensusseq)
                len_ali = ali.get_alignment_length()
                n_diff = sum(ali[0, i] != ali[1, i] for i in xrange(len_ali))
                print 'Differences from ref:', n_diff, '(' + '{:3.1f}'.format(
                    100.0 * n_diff / len_ali) + '%)'

            # Ungap consensus
            consensusseq = SeqRecord(ali[1].seq, id=name, name=name)
            if '-' in consensusseq:
                consensusseq.seq = consensusseq.seq.ungap('-')

            # Write output
            outfile = get_consensus_filename(data_folder,
                                             adaID,
                                             frag_out,
                                             trim_primers=True)
            SeqIO.write(consensusseq, outfile, 'fasta')

            AlignIO.write(
                ali,
                get_reference_consensus_ali_filename(data_folder, adaID,
                                                     fragment), 'fasta')

            if store_allele_counts:
                allele_counts.dump(
                    get_allele_counts_filename(data_folder, adaID, frag_out))

예제 #21

0

파일 보기

파일: correct_consensus_mismatches.py 프로젝트: iosonofabio/hivwholeseq

    # If the script is called with no adaID, iterate over all
    if not adaIDs:
        adaIDs = MiSeq_runs[seq_run]['adapters']
    if VERBOSE >= 3:
        print 'adaIDs', adaIDs

    # If the script is called with no fragment, iterate over all
    if not fragments:
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments

    # Iterate over samples and fragments
    for adaID in adaIDs:
        for fragment in fragments:
            consensus = SeqIO.read(get_consensus_filename(data_folder, adaID, fragment),
                                   'fasta')
            cmat = np.array(consensus)

            counts = np.load(get_allele_counts_filename(data_folder, adaID, fragment))
            coverage = np.load(get_coverage_filename(data_folder, adaID, fragment))
            nu = filter_nus(counts, coverage, VERBOSE=VERBOSE)
            cmat_af = alpha[nu.argmax(axis=0)]

            if len(cmat) != len(cmat_af):
                raise ValueError('The two consensi have a different length!')

            pos_diff = (cmat != cmat_af).nonzero()[0]

            # If they are the same, do nothing (we do not want useless backup files)
            if len(pos_diff) == 0:

예제 #22

0

파일 보기

    if not adaIDs:
        adaIDs = load_adapter_table(data_folder)['ID']
    if VERBOSE >= 3:
        print 'adaIDs', adaIDs

    # Select fragment and primers
    fragment = 'F3'
    # Look for the F3 rev primer (already reversed)
    primer_old = 'GATTGTGTGGCAAGTAGACAGG'
    primer_new = 'TATGGAAAACAGATGGCAGGTG'

    # Iterate over all requested samples
    for adaID in adaIDs:

        # Read reference (fragmented)
        reffilename = get_consensus_filename(data_folder, adaID, fragment)
        refseq = SeqIO.read(reffilename, 'fasta')
        ref = np.array(refseq)

        # read file
        bamfilename = get_mapped_filename(data_folder,
                                          adaID,
                                          fragment,
                                          type='bam',
                                          filtered=True)

        if not os.path.isfile(bamfilename):
            convert_sam_to_bam(bamfilename)
        bamfile = pysam.Samfile(bamfilename, 'rb')

        # Get the coverage for reads which have long insert sizes

예제 #23

0

파일 보기

파일: store_initial_reference.py 프로젝트: iosonofabio/hivwholeseq

    else:
        sample = load_sample_sequenced(samplename)

    for fragment in fragments:
        sample_seq = SampleSeq(sample.samples_seq.iloc[repn])

        seq_run = sample_seq['seq run']
        adaID = sample_seq['adapter']
        dataset = sample_seq.sequencing_run
        data_folder = dataset.folder

        if VERBOSE:
            print 'Initial sample:', sample_seq.name, sample_seq['seq run'],
            print sample_seq.adapter

        cons_rec = SeqIO.read(get_consensus_filename(data_folder, adaID, fragment),
                              'fasta')
        frag_spec = sample_seq.regions_complete[\
                            sample_seq.regions_generic.index(fragment)]

        # Complement PCR2 initial reference with tails from a later sample
        if int(sample_seq.PCR) == 2:
            (frag_spec, cons_rec) = complement_consensus_PCR2(cons_rec, patient,
                                                              fragment,
                                                              samplen,
                                                              VERBOSE=VERBOSE)

        conss = str(cons_rec.seq)
        output_filename = get_initial_reference_filename(pname, fragment)

        seq_in = SeqRecord(Seq(conss, unambiguous_dna),

예제 #24

0

파일 보기

파일: histogram_insert_sizes.py 프로젝트: iosonofabio/hivwholeseq

        adaIDs = load_adapter_table(data_folder)['ID']
    if VERBOSE >= 3:
        print 'adaIDs', adaIDs

    # If the script is called with no fragment, iterate over all
    if not fragments:
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments

    # Iterate over all requested samples
    for adaID in adaIDs:
        for fragment in fragments:

            # Read reference
            reffilename = get_consensus_filename(data_folder, adaID, fragment)
            refseq = SeqIO.read(reffilename, 'fasta')
            ref = np.array(refseq)
        
            # Open BAM
            bamfilename = get_mapped_filename(data_folder, adaID, fragment,
                                              filtered=False)
            if not os.path.isfile(bamfilename):
                convert_sam_to_bam(bamfilename)
            with pysam.Samfile(bamfilename, 'rb') as bamfile:
        
                # Iterate through reads
                for i, read in enumerate(bamfile):
                
                    # Limit to the first reads
                    if i >= maxreads: break

예제 #25

0

파일 보기

파일: check_consensus.py 프로젝트: 5l1v3r1/hivwholeseq

    if not adaIDs:
        adaIDs = MiSeq_runs[seq_run]['adapters']
    if VERBOSE >= 3:
        print 'adaIDs', adaIDs

    # If the script is called with no fragment, iterate over all
    if not fragments:
        fragments = ['F' + str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments

    # Iterate over samples and fragments
    for adaID in adaIDs:
        for fragment in fragments:
            consensus = SeqIO.read(
                get_consensus_filename(data_folder, adaID, fragment), 'fasta')
            cmat = np.array(consensus)

            counts = np.load(
                get_allele_counts_filename(data_folder, adaID, fragment))
            coverage = np.load(
                get_coverage_filename(data_folder, adaID, fragment))
            nu = filter_nus(counts, coverage, VERBOSE=VERBOSE)

            # Note: not-covered positions are filtered, but argmax cannot work
            # with masked arrays
            cmat_af = alpha[nu.argmax(axis=0)]
            if hasattr(nu, 'mask'):
                cmat_af[nu.mask.all(axis=0)] = 'N'

            # Check for consistency first

예제 #26

0

파일 보기

파일: build_consensus.py 프로젝트: iosonofabio/hivwholeseq

            ali = align_muscle(refseq, consensusseq, sort=True)

            if ali[0][-1] == '-':
                start_nongap = len(ali[0]) - len(ali[0].seq.lstrip('-'))
                end_nongap = len(ali[0].seq.rstrip('-'))
                ali = ali[:, start_nongap: end_nongap]

            if VERBOSE >= 2:
                print ali[:, :30]
                print ali[:, -30:]
                print 'Lenghts: ref', len(refseq), 'consensus', len(consensusseq)
                len_ali = ali.get_alignment_length()
                n_diff = sum(ali[0, i] != ali[1, i] for i in xrange(len_ali))
                print 'Differences from ref:', n_diff, '('+'{:3.1f}'.format(100.0 * n_diff / len_ali)+'%)'

            # Ungap consensus
            consensusseq = SeqRecord(ali[1].seq, id=name, name=name)
            if '-' in consensusseq:
                consensusseq.seq = consensusseq.seq.ungap('-')

            # Write output
            outfile = get_consensus_filename(data_folder, adaID, frag_out, trim_primers=True)
            SeqIO.write(consensusseq, outfile, 'fasta')

            AlignIO.write(ali, get_reference_consensus_ali_filename(data_folder, adaID, fragment), 'fasta')

            if store_allele_counts:
                allele_counts.dump(get_allele_counts_filename(data_folder, adaID, frag_out))

예제 #27

0

파일 보기

파일: get_coallele_counts.py 프로젝트: iosonofabio/hivwholeseq

def get_coallele_counts(data_folder, adaID, fragment, VERBOSE=0):
    '''Count alleles that are observed together within the same read pair.

    The primer-trimmed consensus of the fragment is used as the reference.
    For every read pair in the filtered BAM file, the (position, allele) of
    each base inside a CIGAR match block is collected; then every combination
    of two such observations is recorded in the output matrix.

    Parameters:
       data_folder, adaID, fragment: forwarded to the filename helpers to
           locate the consensus and the filtered BAM file.
       VERBOSE (int): verbosity. Any nonzero value reports when the read
           limit is hit, >= 3 prints a progress counter every 10 pairs,
           >= 4 prints every collected (position, allele index).

    Returns:
       counts: integer array of shape (len(read_pair_types), len(alpha),
           len(alpha), len(refseq), len(refseq)). counts[t, a1, a2, p1, p2]
           grows by one for each read pair in which allele a1 is seen at
           position p1 and allele a2 at position p2; t is 1 if the first
           read of the pair is reverse, 0 otherwise.

    Raises:
       ValueError: if a CIGAR block starts beyond the end of the reference,
           or has a type other than match (0), insertion (1), deletion (2).
    '''

    # Read reference
    reffilename = get_consensus_filename(data_folder, adaID, fragment,
                                         trim_primers=True)
    refseq = SeqIO.read(reffilename, 'fasta')

    # Allele counts and inserts (TODO: compress this data?)
    # Note: the pair is of 2 types only, while the single reads usually are of 4
    counts = np.zeros((len(read_pair_types),
                       len(alpha), len(alpha),
                       len(refseq), len(refseq)), int)

    # Scratch buffers, reset to -1 and refilled for every read pair.
    # NOTE(review): 501 slots presumably covers two reads of at most 250
    # aligned bases each -- confirm against the sequencing read length.
    positions = np.zeros(501, int)
    ais = np.zeros_like(positions)
    # TODO: no inserts for now

    # Open BAM file
    # Note: the reads should already be filtered of unmapped stuff at this point
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                      filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        # Iterate over read pairs
        for i, reads in enumerate(pair_generator(bamfile)):

            # Limit to some reads for testing
            # NOTE(review): maxreads is not defined in this function; it is
            # presumably a module-level setting -- confirm.
            if i > maxreads:
                if VERBOSE:
                    print('Max read number reached: ' + str(maxreads))
                break

            # Print output
            if (VERBOSE >= 3) and (not ((i + 1) % 10)):
                print(i + 1)

            # Divide by read 1/2 and forward/reverse.
            # Cast to int: indexing a numpy array with a bool is treated as a
            # boolean mask by recent numpy and returns a copy, which would
            # silently discard every increment made below.
            js = int(reads[0].is_reverse)
            count = counts[js]

            # List of mutations
            positions[:] = -1
            ais[:] = -1
            imut = 0

            # Collect from the pair of reads
            for read in reads:

                # Sequence and position
                # Note: stampy takes the reverse complement already
                seq = read.seq
                pos = read.pos

                # Iterate over CIGARs
                len_cig = len(read.cigar)
                for ic, (block_type, block_len) in enumerate(read.cigar):

                    # Check for pos: it should never exceed the length of the fragment
                    if (block_type in [0, 1, 2]) and (pos > len(refseq)):
                        raise ValueError('Pos exceeded the length of the fragment')

                    # Inline block
                    if block_type == 0:

                        # Only the first block_len bases belong to this block;
                        # the rest of seq is consumed by the following blocks.
                        indb = [alphal.index(nuc) for nuc in seq[:block_len]]
                        positions[imut: imut + len(indb)] = \
                                pos + np.arange(len(indb))
                        ais[imut: imut + len(indb)] = indb
                        imut += len(indb)

                        # Chop off this block
                        if ic != len_cig - 1:
                            seq = seq[block_len:]
                            pos += block_len

                    # Deletion
                    elif block_type == 2:
                        # Chop off pos, but not sequence
                        pos += block_len

                    # Insertion
                    # an insert @ pos 391 means that seq[:391] is BEFORE the insert,
                    # THEN the insert, FINALLY comes seq[391:]
                    elif block_type == 1:
                        # Chop off seq, but not pos
                        if ic != len_cig - 1:
                            seq = seq[block_len:]

                    # Other types of cigar?
                    else:
                        raise ValueError('CIGAR type '+str(block_type)+' not recognized')

            if VERBOSE >= 4:
                for pos, ai in zip(positions, ais):
                    # Unused slots are still at -1: nothing more to show
                    if pos == -1:
                        break
                    print(str(pos) + ' ' + str(ai))

            # Put the mutations into the matrix
            for ai1 in range(len(alpha)):
                pos1 = positions[ais == ai1]
                for ai2 in range(len(alpha)):
                    coun = count[ai1, ai2]
                    if ai1 == ai2:
                        pos2 = pos1
                    else:
                        pos2 = positions[ais == ai2]
                    # All (pos1, pos2) combinations for this pair of alleles.
                    # Fancy-indexed += adds one per distinct cell, so a
                    # combination seen by both mates is counted once per pair.
                    coords = np.meshgrid(pos1, pos2)
                    coun[coords[0].ravel(), coords[1].ravel()] += 1

    return counts

예제 #28

0

파일 보기

def filter_reads(data_folder,
                 adaID,
                 fragment,
                 VERBOSE=0,
                 maxreads=-1,
                 contaminants=None,
                 n_cycles=600,
                 max_mismatches=30,
                 susp_mismatches=20,
                 summary=True,
                 plot=False):
    '''Sort the mapped read pairs of a fragment into good/suspicious/trashed.

    Every read pair of the unfiltered BAM file is written to exactly one of
    three BAM files: the filtered output, the "suspicious" file, or a
    "_trashed" file next to the filtered output.

    Parameters:
       data_folder, adaID: forwarded to the filename helpers.
       fragment: only its first two characters are used to locate the
           consensus and the mapped files; the full string is used for the
           summary filename and the summary header.
       VERBOSE (int): >= 1 prints the final tallies (and a missing-file
           error), >= 2 also prints a line per discarded pair.
       maxreads (int): stop after this many read pairs; the default -1 never
           matches a pair index, i.e. no limit.
       contaminants: forwarded to check_suspect; if None, every pair above
           susp_mismatches is treated as suspicious.
       n_cycles (int): the mismatch histograms have n_cycles + 1 bins.
       max_mismatches (int): pairs with more mismatches than this are trashed.
       susp_mismatches (int): pairs with more mismatches than this (but not
           more than max_mismatches) are candidates for the suspicious file.
       summary (bool): append the tallies to the filter summary file.
       plot (bool): plot the two mismatch histograms (saved to file).

    Returns:
       None. Returns early, writing nothing, if neither the BAM nor the SAM
       mapped file exists.

    Raises:
       ValueError: if the two reads of a pair have different names.
    '''
    frag_gen = fragment[:2]

    reffilename = get_consensus_filename(data_folder, adaID, frag_gen)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    bamfilename = get_mapped_filename(data_folder,
                                      adaID,
                                      frag_gen,
                                      type='bam',
                                      filtered=False)
    if not os.path.isfile(bamfilename):
        samfilename = get_mapped_filename(data_folder,
                                          adaID,
                                          frag_gen,
                                          type='sam',
                                          filtered=False)
        if os.path.isfile(samfilename):
            convert_sam_to_bam(bamfilename)
        else:
            if VERBOSE >= 1:
                print('ERROR: ' + adaID + ', mapped file not found.')
            return

    outfilename = get_mapped_filename(data_folder,
                                      adaID,
                                      frag_gen,
                                      type='bam',
                                      filtered=True)
    suspiciousfilename = get_mapped_suspicious_filename(
        data_folder, adaID, frag_gen)
    trashfilename = outfilename[:-4] + '_trashed.bam'

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(suspiciousfilename, 'wb', template=bamfile) as suspfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            # Tallies. n_total is counted explicitly rather than derived from
            # the loop index, which is undefined when the file has no pairs
            # and is one too large when the loop stops at maxreads.
            n_total = 0
            n_good = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_suspect = 0
            n_mismapped_edge = 0
            n_badcigar = 0
            histogram_distance_from_consensus = np.zeros(n_cycles + 1, int)
            binsize = 200
            histogram_dist_along = np.zeros(
                (len(ref) // binsize + 1, n_cycles + 1), int)

            # Iterate over all pairs
            for irp, reads in enumerate(pair_generator(bamfile)):

                # Limit to the first reads
                if irp == maxreads:
                    break
                n_total += 1

                # Assign names
                (read1, read2) = reads
                # Index of the forward read: 1 if the first read is reverse
                # (presumably its mate is then forward -- the pair is checked
                # for proper pairing below), 0 otherwise.
                i_fwd = int(reads[0].is_reverse)

                # Check a few things to make sure we are looking at paired reads
                if read1.qname != read2.qname:
                    raise ValueError('Read pair ' + str(irp) +
                                     ': reads have different names!')

                # Ignore unmapped reads
                if read1.is_unmapped or read2.is_unmapped:
                    if VERBOSE >= 2:
                        print('Read pair ' + read1.qname + ': unmapped')
                    n_unmapped += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Ignore not properly paired reads (this includes mates sitting on
                # different fragments)
                if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                    if VERBOSE >= 2:
                        print('Read pair ' + read1.qname + ': not properly paired')
                    n_unpaired += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Mismappings are sometimes at fragment edges:
                # Check for overhangs beyond the edge
                skip = check_overhanging_reads(reads, len(ref))
                if skip:
                    n_mismapped_edge += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Mismappings are often characterized by many mutations:
                # check the number of mismatches of the whole pair and skip reads with too many
                dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
                n_mismatches = dc.sum()
                histogram_distance_from_consensus[n_mismatches] += 1
                # Bin by the midpoint of the insert along the fragment
                hbin = (reads[i_fwd].pos + reads[i_fwd].isize // 2) // binsize
                histogram_dist_along[hbin, n_mismatches] += 1
                if n_mismatches > max_mismatches:
                    if VERBOSE >= 2:
                        print(' '.join([
                            str(n_mutator + 1),
                            str(irp),
                            '{:2.1f}'.format(100.0 * (n_mutator + 1) / (irp + 1)) + '%',
                            'Read pair ' + read1.qname + ': too many mismatches ' +
                            '(' + str(dc[0]) + ' + ' + str(dc[1]) + ')']))
                    n_mutator += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Check for contamination from other PCR plates. Typically,
                # contamination happens for only one fragment, whereas superinfection
                # happens for all. At this stage, we can only give clues about
                # cross-contamination, the rest will be done in a script downstream
                # (here we could TAG suspicious reads for contamination)
                if n_mismatches > susp_mismatches:
                    if contaminants is not None:
                        skip = check_suspect(reads,
                                             contaminants,
                                             VERBOSE=VERBOSE)
                    else:
                        skip = True
                    if skip:
                        n_suspect += 1
                        for read in reads:
                            suspfile.write(read)
                        continue

                # Trim the bad CIGARs from the sides, if there are any good ones
                # NOTE(review): match_len_min and trim_bad_cigars are not
                # defined in this function; presumably module-level settings
                # -- confirm.
                skip = trim_bad_cigar(reads,
                                      match_len_min=match_len_min,
                                      trim_left=trim_bad_cigars,
                                      trim_right=trim_bad_cigars)
                if skip:
                    n_badcigar += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # TODO: we might want to incorporate some more stringent
                # criterion here, to avoid short reads, cross-overhang, etc.

                # Write the output
                n_good += 1
                for read in reads:
                    outfile.write(read)

    if VERBOSE >= 1:
        print('Read pairs: ')
        print('Good: ' + str(n_good))
        print('Unmapped: ' + str(n_unmapped))
        print('Unpaired: ' + str(n_unpaired))
        print('Mispapped at edge: ' + str(n_mismapped_edge))
        print('Many-mutations: ' + str(n_mutator))
        print('Suspect contaminations: ' + str(n_suspect))
        print('Bad CIGARs: ' + str(n_badcigar))

    if summary:
        summary_filename = get_filter_mapped_summary_filename(
            data_folder, adaID, fragment)
        with open(summary_filename, 'a') as f:
            f.write('Filter results: adaID ' + adaID + fragment + '\n')
            f.write('Total:\t\t\t' + str(n_total) + '\n')
            f.write('Good:\t\t\t' + str(n_good) + '\n')
            f.write('Unmapped:\t\t' + str(n_unmapped) + '\n')
            f.write('Unpaired:\t\t' + str(n_unpaired) + '\n')
            f.write('Mismapped at edge:\t' + str(n_mismapped_edge) + '\n')
            f.write('Many-mutations:\t\t' + str(n_mutator) + '\n')
            f.write('Suspect contaminations:\t' + str(n_suspect) + '\n')
            f.write('Bad CIGARs:\t\t' + str(n_badcigar) + '\n')

    if plot:
        plot_distance_histogram(data_folder,
                                adaID,
                                frag_gen,
                                histogram_distance_from_consensus,
                                savefig=True)

        plot_distance_histogram_sliding_window(data_folder,
                                               adaID,
                                               frag_gen,
                                               len(ref),
                                               histogram_dist_along,
                                               binsize=binsize,
                                               savefig=True)

예제 #29

0

파일 보기

파일: filter_mapped_reads.py 프로젝트: iosonofabio/hivwholeseq

def filter_reads(data_folder,
                 adaID,
                 fragment,
                 VERBOSE=0,
                 maxreads=-1,
                 contaminants=None,
                 n_cycles=600,
                 max_mismatches=30,
                 susp_mismatches=20,
                 summary=True, plot=False):
    '''Filter the mapped read pairs of a fragment into three BAM files.

    Each read pair of the unfiltered BAM file ends up in exactly one of:
    the filtered output, the "suspicious" file, or a "_trashed" file whose
    name is derived from the filtered output.

    Parameters:
       data_folder, adaID: passed on to the filename helpers.
       fragment: its first two characters select the consensus and mapped
           files; the whole string names the summary file and appears in
           the summary header.
       VERBOSE (int): >= 1 prints the final tallies (and a missing-file
           error), >= 2 additionally reports each discarded pair.
       maxreads (int): number of read pairs after which to stop; the
           default -1 never equals a pair index, so nothing is limited.
       contaminants: passed on to check_suspect; when None, all pairs above
           susp_mismatches count as suspicious.
       n_cycles (int): both mismatch histograms get n_cycles + 1 bins.
       max_mismatches (int): pairs exceeding this are trashed.
       susp_mismatches (int): pairs exceeding this, but not
           max_mismatches, may go to the suspicious file.
       summary (bool): append the tallies to the filter summary file.
       plot (bool): plot the two mismatch histograms (saved to file).

    Returns:
       None. If neither the BAM nor the SAM mapped file is found, the
       function returns before writing anything.

    Raises:
       ValueError: if the names of the two reads in a pair differ.
    '''
    frag_gen = fragment[:2]

    reffilename = get_consensus_filename(data_folder, adaID, frag_gen)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    bamfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                      filtered=False)
    if not os.path.isfile(bamfilename):
        samfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='sam',
                                          filtered=False)
        if os.path.isfile(samfilename):
            convert_sam_to_bam(bamfilename)
        else:
            if VERBOSE >= 1:
                print('ERROR: '+adaID+', mapped file not found.')
            return

    outfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                     filtered=True)
    suspiciousfilename = get_mapped_suspicious_filename(data_folder, adaID, frag_gen)
    trashfilename = outfilename[:-4]+'_trashed.bam'

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(suspiciousfilename, 'wb', template=bamfile) as suspfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            # Counters. The total is tracked on its own: the loop index does
            # not exist if the file holds no pairs, and overshoots by one
            # when the loop is cut short by maxreads.
            n_total = 0
            n_good = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_suspect = 0
            n_mismapped_edge = 0
            n_badcigar = 0
            histogram_distance_from_consensus = np.zeros(n_cycles + 1, int)
            binsize = 200
            histogram_dist_along = np.zeros((len(ref) // binsize + 1,
                                             n_cycles + 1), int)

            # Iterate over all pairs
            for irp, reads in enumerate(pair_generator(bamfile)):

                # Limit to the first reads
                if irp == maxreads:
                    break
                n_total += 1

                # Assign names
                (read1, read2) = reads
                # Position of the forward read within the pair: 1 when the
                # first read is reverse (its mate is then presumably the
                # forward one -- proper pairing is checked below), else 0.
                i_fwd = int(reads[0].is_reverse)

                # Check a few things to make sure we are looking at paired reads
                if read1.qname != read2.qname:
                    raise ValueError('Read pair '+str(irp)+': reads have different names!')

                # Ignore unmapped reads
                if read1.is_unmapped or read2.is_unmapped:
                    if VERBOSE >= 2:
                        print('Read pair '+read1.qname+': unmapped')
                    n_unmapped += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Ignore not properly paired reads (this includes mates sitting on
                # different fragments)
                if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                    if VERBOSE >= 2:
                        print('Read pair '+read1.qname+': not properly paired')
                    n_unpaired += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Mismappings are sometimes at fragment edges:
                # Check for overhangs beyond the edge
                skip = check_overhanging_reads(reads, len(ref))
                if skip:
                    n_mismapped_edge += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Mismappings are often characterized by many mutations:
                # check the number of mismatches of the whole pair and skip reads with too many
                dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
                dist = dc.sum()
                histogram_distance_from_consensus[dist] += 1
                # Bin along the fragment by the midpoint of the insert
                hbin = (reads[i_fwd].pos + reads[i_fwd].isize // 2) // binsize
                histogram_dist_along[hbin, dist] += 1
                if dist > max_mismatches:
                    if VERBOSE >= 2:
                        fraction = '{:2.1f}'.format(100.0 * (n_mutator + 1) / (irp + 1))+'%'
                        message = 'Read pair '+read1.qname+': too many mismatches '+\
                                  '('+str(dc[0])+' + '+str(dc[1])+')'
                        print(' '.join([str(n_mutator+1), str(irp), fraction, message]))
                    n_mutator += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Check for contamination from other PCR plates. Typically,
                # contamination happens for only one fragment, whereas superinfection
                # happens for all. At this stage, we can only give clues about
                # cross-contamination, the rest will be done in a script downstream
                # (here we could TAG suspicious reads for contamination)
                if dist > susp_mismatches:
                    if contaminants is not None:
                        skip = check_suspect(reads, contaminants, VERBOSE=VERBOSE)
                    else:
                        skip = True
                    if skip:
                        n_suspect += 1
                        for read in reads:
                            suspfile.write(read)
                        continue

                # Trim the bad CIGARs from the sides, if there are any good ones
                # NOTE(review): match_len_min and trim_bad_cigars are not set
                # anywhere in this function; presumably module-level
                # settings -- confirm.
                skip = trim_bad_cigar(reads, match_len_min=match_len_min,
                                       trim_left=trim_bad_cigars,
                                       trim_right=trim_bad_cigars)
                if skip:
                    n_badcigar += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # TODO: we might want to incorporate some more stringent
                # criterion here, to avoid short reads, cross-overhang, etc.

                # Write the output
                n_good += 1
                for read in reads:
                    outfile.write(read)

    if VERBOSE >= 1:
        print('Read pairs: ')
        print('Good: '+str(n_good))
        print('Unmapped: '+str(n_unmapped))
        print('Unpaired: '+str(n_unpaired))
        print('Mispapped at edge: '+str(n_mismapped_edge))
        print('Many-mutations: '+str(n_mutator))
        print('Suspect contaminations: '+str(n_suspect))
        print('Bad CIGARs: '+str(n_badcigar))

    if summary:
        summary_filename = get_filter_mapped_summary_filename(data_folder, adaID, fragment)
        with open(summary_filename, 'a') as f:
            f.write('Filter results: adaID '+adaID+fragment+'\n')
            f.write('Total:\t\t\t'+str(n_total)+'\n')
            f.write('Good:\t\t\t'+str(n_good)+'\n')
            f.write('Unmapped:\t\t'+str(n_unmapped)+'\n')
            f.write('Unpaired:\t\t'+str(n_unpaired)+'\n')
            f.write('Mismapped at edge:\t'+str(n_mismapped_edge)+'\n')
            f.write('Many-mutations:\t\t'+str(n_mutator)+'\n')
            f.write('Suspect contaminations:\t'+str(n_suspect)+'\n')
            f.write('Bad CIGARs:\t\t'+str(n_badcigar)+'\n')


    if plot:
        plot_distance_histogram(data_folder, adaID, frag_gen,
                                histogram_distance_from_consensus,
                                savefig=True)

        plot_distance_histogram_sliding_window(data_folder, adaID, frag_gen,
                                               len(ref),
                                               histogram_dist_along,
                                               binsize=binsize,
                                               savefig=True)