def check_coverage(data_folder, adaID, fragment, seq_run, qual_min=35,
                   reference='HXB2', maxreads=-1, VERBOSE=0, rescue=False,
                   minor_allele=False):
    '''Check division into fragments: coverage, etc.'''
    ref_fn = get_consensus_filename(data_folder, adaID, fragment)
    refseq = SeqIO.read(ref_fn, 'fasta')

    input_filename = get_mapped_filename(data_folder, adaID, fragment,
                                         type='bam', rescue=rescue)

    counts, inserts = get_allele_counts_insertions_from_file_unfiltered(
        input_filename, len(refseq), maxreads=maxreads, VERBOSE=VERBOSE)

    # Plot results
    title = ', '.join([' '.join([label, str(value)])
                       for (label, value) in [('run', seq_run),
                                              ('adaID', adaID),
                                              ('fragment', fragment),
                                              ('maxreads', maxreads)]])
    plot_coverage(counts, suptitle=title, minor_allele=minor_allele)
def get_distance_histogram(data_folder, adaID, fragment, maxreads=1000,
                           VERBOSE=0, filtered=False):
    '''Get the distance of reads from their consensus'''
    reffilename = get_consensus_filename(data_folder, adaID, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    bamfilename = get_mapped_filename(data_folder, adaID, fragment,
                                      type='bam', filtered=filtered)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        n_pairs = 0
        read_pairs = []
        for (i, rp) in enumerate(pair_generator(bamfile)):
            if n_pairs >= maxreads:
                break

            r1 = rp[0]
            if not r1.is_proper_pair:
                continue

            read_pairs.append(rp)
            n_pairs += 1

        ds = get_distance_from_reference(ref, read_pairs, threshold=30)

    h = np.bincount(ds)
    return h
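# A minimal usage sketch for the function above. The data_folder and adaID
# values are hypothetical placeholders, and matplotlib is assumed available;
# h[d] counts read pairs at total distance d from the consensus.
def _example_distance_histogram():
    '''Sketch: plot the distance histogram of the first 500 read pairs'''
    import matplotlib.pyplot as plt

    h = get_distance_histogram('/path/to/run/', 'TS2', 'F1', maxreads=500)

    plt.bar(np.arange(len(h)), h)
    plt.xlabel('Distance from consensus [mismatches]')
    plt.ylabel('Number of read pairs')
    plt.show()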
def get_allele_counts(data_folder, adaID, fragment, qual_min=30,
                      VERBOSE=0, maxreads=1e10):
    '''Extract allele and insert counts from a bamfile'''
    # NOTE: qual_min was referenced but undefined here; it is exposed as a
    # keyword argument (the default of 30 is an assumption)

    # Read reference
    reffilename = get_consensus_filename(data_folder, adaID, fragment,
                                         trim_primers=True)
    refseq = SeqIO.read(reffilename, 'fasta')

    # Open BAM file
    # Note: the reads should already be filtered of unmapped stuff at this point
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                      filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)

    # Call lower-level function
    return get_allele_counts_insertions_from_file(bamfilename, len(refseq),
                                                  qual_min=qual_min,
                                                  maxreads=maxreads,
                                                  VERBOSE=VERBOSE)
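# A hypothetical call of the wrapper above (paths/adaID are placeholders;
# assumes the lower-level function returns a (counts, inserts) pair, as its
# unfiltered counterpart does in check_coverage, with counts indexed by
# read type, allele, and position).
def _example_allele_counts():
    '''Sketch: per-position coverage from the allele counts'''
    counts, inserts = get_allele_counts('/path/to/run/', 'TS2', 'F1',
                                        maxreads=10000)
    coverage = counts.sum(axis=0).sum(axis=0)  # sum over read types and alleles
    print 'mean coverage:', coverage.mean()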
def get_insert_size_distribution(data_folder, adaID, fragment, bins=None,
                                 maxreads=-1, VERBOSE=0, density=True):
    '''Get the distribution of insert sizes'''
    if maxreads <= 0:
        maxreads = int(1e6)

    insert_sizes = np.zeros(maxreads, np.int16)

    # Open BAM file
    if fragment == 'premapped':
        bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
    else:
        bamfilename = get_mapped_filename(data_folder, adaID, fragment,
                                          type='bam', filtered=True)

    # Convert from SAM if necessary
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)

    # Open file
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        # Iterate over read pairs (the insert size is a property of the pair)
        n_written = 0
        for i, reads in enumerate(pair_generator(bamfile)):
            if i == maxreads:
                if VERBOSE >= 2:
                    print 'Max reads reached:', maxreads
                break

            # Print output
            if (VERBOSE >= 3) and (not ((i + 1) % 10000)):
                print i + 1

            # Discard unmapped or improperly paired reads
            if reads[0].is_unmapped or (not reads[0].is_proper_pair) or \
               reads[1].is_unmapped or (not reads[1].is_proper_pair):
                continue

            # Store insert size, taken from the forward read (positive isize)
            # Note: store at n_written, not i, so skipped pairs leave no zeros
            i_fwd = reads[0].is_reverse
            insert_sizes[n_written] = reads[i_fwd].isize
            n_written += 1

    insert_sizes = insert_sizes[:n_written]
    insert_sizes.sort()

    # Bin it
    if bins is None:
        h = np.histogram(insert_sizes, density=density)
    else:
        h = np.histogram(insert_sizes, bins=bins, density=density)

    return insert_sizes, h
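# A minimal usage sketch for the function above (paths/adaID are hypothetical
# placeholders): explicit 10 bp bins up to 700 bp, raw counts rather than a
# density.
def _example_insert_sizes():
    '''Sketch: histogram of insert sizes with explicit 10 bp bins'''
    bins = np.arange(0, 700, 10)
    isz, (counts, edges) = get_insert_size_distribution(
        '/path/to/run/', 'TS2', 'F1', bins=bins, maxreads=10000, density=False)
    print 'median insert size:', isz[len(isz) // 2]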
def get_coverage_tuples(data_folder, adaID, fragment, mtuples,
                        maxreads=-1, VERBOSE=0):
    '''Get the joint coverage of a list of positions'''
    # Prepare data structures
    mtuples = [np.asarray(tup, int) for tup in mtuples]
    coverage = np.zeros(len(mtuples), int)

    # TODO: what to do if it is covered multiple times? or only some sites?
    covs_pair = [np.zeros(len(tup), bool) for tup in mtuples]

    # Open BAM
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                      filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        # Iterate over all pairs
        for irp, reads in enumerate(pair_generator(bamfile)):
            # Limit to the first reads
            if irp == maxreads:
                if VERBOSE:
                    print 'Max reads reached:', maxreads
                break

            if VERBOSE >= 3:
                if not ((irp + 1) % 10000):
                    print irp + 1

            # Reinitialize temporary structure
            for cov_pair in covs_pair:
                cov_pair[:] = False

            # Look in both reads
            for read in reads:
                # NOTE: deletions count as covered, because in principle
                # we see that part of the reference
                cigar = read.cigar
                ref_start = read.pos
                ref_end = ref_start + sum(bl for (bt, bl) in cigar
                                          if bt in (0, 2))

                # NOTE: a numba-accelerated version (add_read) was tried and
                # disabled; the plain numpy masking below is fast enough
                for cov_pair, mtuple in izip(covs_pair, mtuples):
                    cov_pair[(mtuple >= ref_start) & (mtuple < ref_end)] = True

            # Check which tuples are fully covered
            for i, cov_pair in enumerate(covs_pair):
                if cov_pair.all():
                    coverage[i] += 1

    return coverage
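# A minimal usage sketch for the function above (paths/adaID are hypothetical
# placeholders). Each tuple lists reference positions that must ALL fall
# within the same read pair for that pair to count towards joint coverage.
def _example_coverage_tuples():
    '''Sketch: joint coverage of two position triplets'''
    mtuples = [(100, 200, 300), (150, 450, 460)]
    cov = get_coverage_tuples('/path/to/run/', 'TS2', 'F1', mtuples,
                              maxreads=10000)
    for tup, n in zip(mtuples, cov):
        print tup, 'covered jointly by', n, 'read pairs'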
def make_output_folders(data_folder, adaID, VERBOSE=0):
    '''Make the output folders if necessary for hash and map'''
    hash_foldername = os.path.dirname(get_hash_file(data_folder, adaID, 'F0'))
    map_foldername = os.path.dirname(get_mapped_filename(data_folder, adaID, 'F0'))
    foldernames = [hash_foldername, map_foldername]

    # Make the folders
    for dirname in foldernames:
        if not os.path.isdir(dirname):
            os.mkdir(dirname)
            if VERBOSE:
                print 'Folder created:', dirname
def get_input_filename(data_folder, adaID, frag_spec, type='bam',
                       only_chunk=None, filtered=True):
    '''Get filename of input for mapping to initial reference'''
    # We should take reads filtered after mapping to the auto-consensus
    if filtered:
        from hivwholeseq.sequencing.filenames import get_mapped_filename
        frag_gen = frag_spec[:2]
        fn = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                 filtered=True)
    else:
        from hivwholeseq.sequencing.filenames import get_divided_filename
        fn = get_divided_filename(data_folder, adaID, frag_spec, type='bam',
                                  chunk=only_chunk)
    return fn
def remove_mapped_tempfiles(data_folder, adaID, fragment='F', VERBOSE=0,
                            rescue=False):
    '''Remove the part files of multi-threaded mapping'''
    from hivwholeseq.sequencing.filenames import get_mapped_filename

    dirname = os.path.dirname(get_mapped_filename(data_folder, adaID, 'F1')) + '/'
    fns = glob.glob(dirname + fragment + '*_part*') + \
          glob.glob(dirname + fragment + '*_unsorted*') + \
          glob.glob(dirname + fragment + '*.00*.bam')
    fns.append(dirname + fragment + '.sam')
    if rescue:
        fns.append(dirname + fragment + '_rescue.sam')

    for fn in fns:
        os.remove(fn)
        if VERBOSE >= 3:
            print 'File removed:', fn
def get_read_lengths(data_folder, adaID, fragment, VERBOSE=0, maxreads=-1):
    '''Get the read lengths'''
    # Lengths from 1 to 250
    lengths = np.zeros((len(read_types), 250), int)

    # Open BAM file
    # Note: the reads should already be filtered of unmapped stuff at this point
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                      filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        # Iterate over single reads (no linkage info needed)
        for i, read in enumerate(bamfile):
            # Max number of reads
            if i == maxreads:
                if VERBOSE >= 2:
                    print 'Max reads reached:', maxreads
                break

            # Print output
            if (VERBOSE >= 3) and (not ((i + 1) % 10000)):
                print i + 1

            # Divide by read 1/2 and forward/reverse
            js = 2 * read.is_read2 + read.is_reverse

            # Increment counter
            lengths[js, read.rlen - 1] += 1

    # Note: we do not delve into CIGARs because the reads are trimmed
    return lengths
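# A sketch of the read-type indexing used above, plus a hypothetical call
# (paths/adaID are placeholders; read_types is assumed to be the usual
# 4-element list). js = 2 * is_read2 + is_reverse maps to:
# 0 = read1 fwd, 1 = read1 rev, 2 = read2 fwd, 3 = read2 rev.
def _example_read_lengths():
    '''Sketch: total reads and modal length per read type'''
    lengths = get_read_lengths('/path/to/run/', 'TS2', 'F1', maxreads=100000)
    for js, row in enumerate(lengths):
        # Index 0 of the row corresponds to length 1
        print 'read type', js, 'n =', row.sum(), \
              'modal length =', row.argmax() + 1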
def map_stampy(data_folder, adaID, fragment, VERBOSE=0, threads=1,
               cluster_time='23:59:59', maxreads=-1, summary=True,
               rescue=False, dry=False):
    '''Map using stampy'''
    frag_gen = fragment[:2]

    if summary:
        summary_filename = get_map_summary_filename(data_folder, adaID,
                                                    frag_gen, rescue=rescue)

    # Set mapping penalty scores: softer for rescues and F3 and F5
    global subsrate
    if rescue:
        subsrate = '0.2'
        stampy_gapopen = 5          # Default: 40
        stampy_gapextend = 1        # Default: 3
    elif frag_gen not in ('F3', 'F5'):
        stampy_gapopen = 60         # Default: 40
        stampy_gapextend = 5        # Default: 3
    else:
        stampy_gapopen = 30         # Default: 40
        stampy_gapextend = 2        # Default: 3

    if VERBOSE:
        print 'Map via stampy: '+adaID+' '+frag_gen

    if not rescue:
        input_filename = get_divided_filename(data_folder, adaID, fragment,
                                              type='bam')
        # NOTE: we introduced fragment nomenclature late, e.g. F3a. Check for that
        if not os.path.isfile(input_filename):
            if frag_gen == 'F3':
                input_filename = input_filename.replace('F3a', 'F3')
    else:
        input_filename = get_divided_filename(data_folder, adaID, 'unmapped',
                                              type='bam')

    # Check existence of input file, because stampy creates output anyway
    if not os.path.isfile(input_filename):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (input file for mapping not found).\n')
        raise ValueError(adaID+', fragment '+fragment+': input file not found.')

    # Parallelize if requested
    if threads == 1:
        output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='sam', rescue=rescue)

        # Map
        call_list = [stampy_bin,
                     '-g', get_index_file(data_folder, adaID, frag_gen, ext=False),
                     '-h', get_hash_file(data_folder, adaID, frag_gen, ext=False),
                     '-o', output_filename,
                     '--overwrite',
                     '--substitutionrate='+subsrate,
                     '--gapopen', stampy_gapopen,
                     '--gapextend', stampy_gapextend]
        if stampy_sensitive:
            call_list.append('--sensitive')

        # Take only a (random) subsample: stampy uses the fraction of reads
        # instead of the number
        if maxreads > 0:
            # FIXME: figure out the -s option and the --numrecords option
            call_list.extend(['--numrecords', maxreads])
            #n_pairs_tot = get_number_reads(input_filename, 'bam') / 2
            #frac_pairs = 1.0 * maxreads / n_pairs_tot
            #random_seed = np.random.randint(1e5)
            #call_list.extend(['-s', frac_pairs + random_seed])

        call_list = call_list + ['-M', input_filename]
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)

        if not dry:
            sp.call(call_list)
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Stampy mapped (single thread).\n')

            output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                                  type='bam', rescue=rescue)
            convert_sam_to_bam(output_filename)
        else:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (single thread).\n')
            if VERBOSE >= 1:
                print 'Dry run works (single thread)'
            return

    else:
        # Submit map scripts to the cluster
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')

        for j in xrange(threads):
            # Get output filename
            output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                                  type='sam', part=(j+1),
                                                  rescue=rescue)
            # Map
            call_list = ['qsub', '-cwd',
                         '-b', 'y',
                         '-S', '/bin/bash',
                         '-o', JOBLOGOUT,
                         '-e', JOBLOGERR,
                         '-N', 'm'+adaID.replace('-', '')+frag_gen+str(j+1),
                         '-l', 'h_rt='+cluster_time,
                         '-l', 'h_vmem='+vmem,
                         stampy_bin,
                         '-g', get_index_file(data_folder, adaID, frag_gen, ext=False),
                         '-h', get_hash_file(data_folder, adaID, frag_gen, ext=False),
                         '-o', output_filename,
                         '--overwrite',
                         '--processpart='+str(j+1)+'/'+str(threads),
                         '--substitutionrate='+subsrate,
                         '--gapopen', stampy_gapopen,
                         '--gapextend', stampy_gapextend]
            if stampy_sensitive:
                call_list.append('--sensitive')
            call_list = call_list + ['-M', input_filename]
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)
            if not dry:
                job_ID = sp.check_output(call_list)
                job_ID = job_ID.split()[2]
                job_IDs[j] = job_ID

        if dry:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (multi thread).\n')
            if VERBOSE >= 1:
                print 'Dry run works (multi thread)'
            return

        # Monitor output
        output_file_parts = [get_mapped_filename(data_folder, adaID, frag_gen,
                                                 type='bam', part=(j+1),
                                                 rescue=rescue)
                             for j in xrange(threads)]
        time_wait = 10  # secs
        while not jobs_done.all():
            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split('\n')[:-1]  # The last is an empty line
            if len(qstat_output) < 3:
                jobs_done[:] = True
                break
            else:
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10  # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert mapped reads to BAM for merging: adaID '+\
                              adaID+', fragment '+frag_gen+', part '+str(j+1)+\
                              ' of '+str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait if we did the conversion (it takes
                    # longer than some secs)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy mapped ('+str(threads)+' threads).\n')

        # Concatenate output files
        output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='bam', unsorted=True,
                                              rescue=rescue)
        if VERBOSE >= 1:
            print 'Concatenate mapped reads: adaID '+adaID+', fragment '+frag_gen
        pysam.cat('-o', output_filename, *output_file_parts)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (to ensure the pair_generator)
        output_filename_sorted = get_mapped_filename(data_folder, adaID, frag_gen,
                                                     type='bam', unsorted=False,
                                                     rescue=rescue)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort mapped reads: adaID '+adaID+', fragment '+frag_gen
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file without going BAM -> SAM -> BAM
        if VERBOSE >= 1:
            print 'Reheader mapped reads: adaID '+adaID+', fragment '+frag_gen
        header_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='sam', part=1, rescue=rescue)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

    # FIXME: check whether temp files are all deleted
    if VERBOSE >= 1:
        print 'Remove temporary files: adaID '+adaID+', fragment '+frag_gen
    remove_mapped_tempfiles(data_folder, adaID, frag_gen, VERBOSE=VERBOSE,
                            rescue=rescue)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')
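# A hypothetical invocation of the mapper above (paths/adaID are placeholders
# and the divided BAM for the fragment is assumed to exist). A dry run only
# assembles and prints the stampy command without executing it.
def _example_map_stampy():
    '''Sketch: single-threaded dry run of the stampy mapping'''
    map_stampy('/path/to/run/', 'TS2', 'F1', VERBOSE=2, threads=1,
               summary=False, dry=True)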
    # Script excerpt: inside the per-sample loop, collect distance histograms
    # for the requested fragments of each sample
            if VERBOSE >= 1:
                print 'PCR type not found, skipping'
            continue

        if not fragments:
            fragments_sample = sample.regions_generic
        else:
            fragments_sample = [fr for fr in fragments
                                if fr in sample.regions_generic]

        if VERBOSE >= 3:
            print 'adaID '+adaID+': fragments '+' '.join(fragments_sample)

        for fragment in fragments_sample:
            if VERBOSE >= 1:
                print fragment,

            bamfilename = get_mapped_filename(data_folder, adaID, fragment,
                                              type='bam', filtered=use_filtered)
            if not os.path.isfile(bamfilename):
                if VERBOSE >= 1:
                    print 'missing mapped file, skipping'
                continue

            dist_hist = get_distance_histogram(data_folder, adaID, fragment,
                                               VERBOSE=VERBOSE,
                                               maxreads=maxreads,
                                               filtered=use_filtered)
            label = [seq_run, adaID, samplename, fragment, dist_hist.sum()]
            hists.append((dist_hist, label))
            if VERBOSE >= 1:
                print ''

    if len(hists) == 1:
    # Script excerpt: inside the per-fragment loop, fish reads that are far
    # from the consensus and inspect them
        if not found:
            if VERBOSE >= 1:
                print 'not filtered (probably no HIV reads)'
            continue

        frac_dist = 1.0 * n_distant / n_good
        if frac_dist < 0.01:
            if VERBOSE >= 1:
                print '< 1% of reads are distant'
        else:
            if VERBOSE >= 1:
                print '{:3.0%}'.format(frac_dist), 'of reads are distant'

        consrec = sample.get_consensus(fragment)
        bamfilename = get_mapped_filename(data_folder, adaID, fragment,
                                          filtered=False)
        (ds, edges, seqs) = fish_distant_reads(bamfilename, consrec,
                                               VERBOSE=VERBOSE,
                                               min_mismatches=min_mismatches,
                                               max_mismatches=max_mismatches,
                                               maxseqs=maxseqs)

        # Shuffle the distant pairs to look at an unbiased sample
        indrandom = np.arange(len(ds))
        np.random.shuffle(indrandom)
        ds = ds[indrandom]
        edges = np.array(edges)[indrandom]
        seqs = [seqs[i] for i in indrandom]

        for irp, (dpair, edgepair, seqpair) in enumerate(izip(ds, edges, seqs)):
            # NOTE: Take only the most distant read of a pair
            print irp, dpair
def get_coallele_counts(data_folder, adaID, fragment, VERBOSE=0, maxreads=1e10):
    '''Extract counts of allele pairs (cocounts) from a bamfile'''
    # NOTE: maxreads was referenced but undefined here; it is exposed as a
    # keyword argument (the default is an assumption)

    # Read reference
    reffilename = get_consensus_filename(data_folder, adaID, fragment,
                                         trim_primers=True)
    refseq = SeqIO.read(reffilename, 'fasta')

    # Allele counts and inserts (TODO: compress this data?)
    # Note: the pair is of 2 types only, while the single reads usually are of 4
    counts = np.zeros((len(read_pair_types),
                       len(alpha), len(alpha),
                       len(refseq), len(refseq)), int)
    # Buffers for positions/alleles of one read pair (reads are <= 250 bp each)
    positions = np.zeros(501, int)
    ais = np.zeros_like(positions)
    # TODO: no inserts for now

    # Open BAM file
    # Note: the reads should already be filtered of unmapped stuff at this point
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                      filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        # Iterate over read pairs
        for i, reads in enumerate(pair_generator(bamfile)):
            # Limit to some reads for testing
            if i > maxreads:
                if VERBOSE:
                    print 'Max read number reached:', maxreads
                break

            # Print output
            if (VERBOSE >= 3) and (not ((i + 1) % 10)):
                print i + 1

            # Divide by pair orientation (2 read pair types)
            js = reads[0].is_reverse
            count = counts[js]

            # List of mutations
            positions[:] = -1
            ais[:] = -1
            imut = 0

            # Collect from the pair of reads
            for read in reads:
                # Sequence and position
                # Note: stampy takes the reverse complement already
                seq = read.seq
                pos = read.pos

                # Iterate over CIGARs
                len_cig = len(read.cigar)
                for ic, (block_type, block_len) in enumerate(read.cigar):
                    # Check for pos: it should never exceed the length of the fragment
                    if (block_type in [0, 1, 2]) and (pos > len(refseq)):
                        raise ValueError('Pos exceeded the length of the fragment')

                    # Inline block
                    if block_type == 0:
                        # Get the alleles of this block and add them
                        # (only the first block_len bases belong to this block)
                        indb = map(alphal.index, seq[:block_len])
                        positions[imut: imut + len(indb)] = pos + np.arange(len(indb))
                        ais[imut: imut + len(indb)] = indb
                        imut += len(indb)

                        # Chop off this block
                        if ic != len_cig - 1:
                            seq = seq[block_len:]
                            pos += block_len

                    # Deletion
                    elif block_type == 2:
                        # Chop off pos, but not sequence
                        pos += block_len

                    # Insertion
                    # an insert @ pos 391 means that seq[:391] is BEFORE the insert,
                    # THEN the insert, FINALLY comes seq[391:]
                    elif block_type == 1:
                        # Chop off seq, but not pos
                        if ic != len_cig - 1:
                            seq = seq[block_len:]

                    # Other types of cigar?
                    else:
                        raise ValueError('CIGAR type '+str(block_type)+' not recognized')

            if VERBOSE >= 4:
                for pos, ai in izip(positions, ais):
                    if pos == -1:
                        break
                    print pos, ai

            # Put the mutations into the matrix
            for ai1 in xrange(len(alpha)):
                for ai2 in xrange(len(alpha)):
                    coun = count[ai1, ai2]
                    pos1 = positions[ais == ai1]
                    if ai1 == ai2:
                        pos2 = pos1
                    else:
                        pos2 = positions[ais == ai2]
                    coords = np.meshgrid(pos1, pos2)
                    ind = coords[0].ravel() * coun.shape[0] + coords[1].ravel()
                    coun.ravel()[ind] += 1

    return counts
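# A sketch of how to read the cocounts tensor returned above: axis 0 is the
# read-pair type (fwd/rev), axes 1-2 the two alleles, axes 3-4 the two
# reference positions. Paths/adaID are hypothetical placeholders.
def _example_coallele_counts():
    '''Sketch: joint counts of A at position 100 with G at position 150'''
    counts = get_coallele_counts('/path/to/run/', 'TS2', 'F1', maxreads=1000)
    ia, ig = alphal.index('A'), alphal.index('G')
    n_AG = counts[:, ia, ig, 100, 150].sum()  # sum over pair orientations
    print 'pairs with A@100 and G@150:', n_AG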
def filter_reads(data_folder, adaID, fragment, VERBOSE=0, maxreads=-1,
                 contaminants=None, n_cycles=600, max_mismatches=30,
                 susp_mismatches=20, summary=True, plot=False):
    '''Filter the reads to good chunks'''
    frag_gen = fragment[:2]

    reffilename = get_consensus_filename(data_folder, adaID, frag_gen)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    bamfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                      filtered=False)
    if not os.path.isfile(bamfilename):
        samfilename = get_mapped_filename(data_folder, adaID, frag_gen,
                                          type='sam', filtered=False)
        if os.path.isfile(samfilename):
            convert_sam_to_bam(bamfilename)
        else:
            if VERBOSE >= 1:
                print 'ERROR: '+adaID+', mapped file not found.'
            return

    outfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                      filtered=True)
    suspiciousfilename = get_mapped_suspicious_filename(data_folder, adaID,
                                                        frag_gen)
    trashfilename = outfilename[:-4]+'_trashed.bam'

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(suspiciousfilename, 'wb', template=bamfile) as suspfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            # Iterate over all pairs
            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_suspect = 0
            n_mismapped_edge = 0
            n_badcigar = 0
            histogram_distance_from_consensus = np.zeros(n_cycles + 1, int)
            binsize = 200
            histogram_dist_along = np.zeros((len(ref) // binsize + 1,
                                             n_cycles + 1), int)
            for irp, reads in enumerate(pair_generator(bamfile)):
                # Limit to the first reads
                if irp == maxreads:
                    break

                # Assign names
                (read1, read2) = reads
                i_fwd = reads[0].is_reverse

                # Check a few things to make sure we are looking at paired reads
                if read1.qname != read2.qname:
                    n_wrongname += 1
                    raise ValueError('Read pair '+str(irp)+': reads have different names!')

                # Ignore unmapped reads
                if read1.is_unmapped or read2.is_unmapped:
                    if VERBOSE >= 2:
                        print 'Read pair '+read1.qname+': unmapped'
                    n_unmapped += 1
                    map(trashfile.write, reads)
                    continue

                # Ignore not properly paired reads (this includes mates sitting
                # on different fragments)
                if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                    if VERBOSE >= 2:
                        print 'Read pair '+read1.qname+': not properly paired'
                    n_unpaired += 1
                    map(trashfile.write, reads)
                    continue

                # Mismappings are sometimes at fragment edges:
                # Check for overhangs beyond the edge
                skip = check_overhanging_reads(reads, len(ref))
                if skip:
                    n_mismapped_edge += 1
                    map(trashfile.write, reads)
                    continue

                # Mismappings are often characterized by many mutations:
                # check the number of mismatches of the whole pair and skip
                # reads with too many
                dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
                histogram_distance_from_consensus[dc.sum()] += 1
                hbin = (reads[i_fwd].pos + reads[i_fwd].isize / 2) // binsize
                histogram_dist_along[hbin, dc.sum()] += 1
                if dc.sum() > max_mismatches:
                    if VERBOSE >= 2:
                        print n_mutator+1, irp,\
                              '{:2.1f}'.format(100.0 * (n_mutator + 1) / (irp + 1))+'%',\
                              'Read pair '+read1.qname+': too many mismatches '+\
                              '('+str(dc[0])+' + '+str(dc[1])+')'
                    n_mutator += 1
                    map(trashfile.write, reads)
                    continue

                # Check for contamination from other PCR plates. Typically,
                # contamination happens for only one fragment, whereas
                # superinfection happens for all. At this stage, we can only
                # give clues about cross-contamination; the rest is done in a
                # script downstream (here we could TAG suspicious reads for
                # contamination)
                elif dc.sum() > susp_mismatches:
                    if contaminants is not None:
                        skip = check_suspect(reads, contaminants, VERBOSE=VERBOSE)
                    else:
                        skip = True
                    if skip:
                        n_suspect += 1
                        map(suspfile.write, reads)
                        continue

                # Trim the bad CIGARs from the sides, if there are any good ones
                skip = trim_bad_cigar(reads, match_len_min=match_len_min,
                                      trim_left=trim_bad_cigars,
                                      trim_right=trim_bad_cigars)
                if skip:
                    n_badcigar += 1
                    map(trashfile.write, reads)
                    continue

                # TODO: we might want to incorporate some more stringent
                # criterion here, to avoid short reads, cross-overhang, etc.

                # Write the output
                n_good += 1
                map(outfile.write, reads)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Mismapped at edge:', n_mismapped_edge
        print 'Many-mutations:', n_mutator
        print 'Suspect contaminations:', n_suspect
        print 'Bad CIGARs:', n_badcigar

    if summary:
        summary_filename = get_filter_mapped_summary_filename(data_folder, adaID,
                                                              fragment)
        with open(summary_filename, 'a') as f:
            f.write('Filter results: adaID '+adaID+fragment+'\n')
            f.write('Total:\t\t\t'+str(irp + 1)+'\n')
            f.write('Good:\t\t\t'+str(n_good)+'\n')
            f.write('Unmapped:\t\t'+str(n_unmapped)+'\n')
            f.write('Unpaired:\t\t'+str(n_unpaired)+'\n')
            f.write('Mismapped at edge:\t'+str(n_mismapped_edge)+'\n')
            f.write('Many-mutations:\t\t'+str(n_mutator)+'\n')
            f.write('Suspect contaminations:\t'+str(n_suspect)+'\n')
            f.write('Bad CIGARs:\t\t'+str(n_badcigar)+'\n')

    if plot:
        plot_distance_histogram(data_folder, adaID, frag_gen,
                                histogram_distance_from_consensus,
                                savefig=True)
        plot_distance_histogram_sliding_window(data_folder, adaID, frag_gen,
                                               len(ref),
                                               histogram_dist_along,
                                               binsize=binsize,
                                               savefig=True)
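# A hypothetical invocation of the filter above (paths/adaID are placeholders
# and the unfiltered mapped BAM is assumed to exist). Good pairs go to the
# filtered BAM, suspect pairs to the suspicious BAM, the rest to the trash BAM.
def _example_filter_reads():
    '''Sketch: filter the first 1000 read pairs of one fragment'''
    filter_reads('/path/to/run/', 'TS2', 'F5a', VERBOSE=1, maxreads=1000,
                 summary=False, plot=False)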