def make_consensus(data_folder, adaID, fragment, n_iter, qual_min=20, VERBOSE=0,
                   coverage_min=10, summary=True):
    '''Make consensus sequence from the mapped reads'''
    if VERBOSE:
        print 'Build consensus: '+adaID+' '+fragment+' iteration '+str(n_iter)
    
    # Read reference
    reffilename = get_reference_filename(data_folder, adaID, fragment, n_iter)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    # Open BAM file
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, n_iter)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)

    # NOTE: match_len_min is assumed to be a module-level constant
    (counts, inserts) = get_allele_counts_insertions_from_file_unfiltered(
                                bamfilename, len(refseq), qual_min=qual_min,
                                match_len_min=match_len_min)

    consensus_final = build_consensus(counts, inserts,
                                      coverage_min=coverage_min,
                                      VERBOSE=VERBOSE)

    if summary:
        with open(get_summary_fn(data_folder, adaID, fragment), 'a') as f:
            f.write('Consensus built for iteration '+str(n_iter))
            f.write('\n')

    return refseq, consensus_final
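
A hedged usage sketch for the function above: in this pipeline the consensus is refined over several mapping iterations, so a driver loop like the following is the typical call site. Everything except make_consensus itself is an assumption here; map_to_consensus is a hypothetical stand-in for the real mapping step.

# Sketch: rebuild the consensus until it stops changing
def refine_consensus(data_folder, adaID, fragment, n_iter_max=5, VERBOSE=0):
    consensus_old = None
    for n_iter in xrange(1, n_iter_max + 1):
        # map_to_consensus(data_folder, adaID, fragment, n_iter)  # hypothetical
        refseq, consensus = make_consensus(data_folder, adaID, fragment,
                                           n_iter, VERBOSE=VERBOSE)
        if consensus == consensus_old:
            break
        consensus_old = consensus
    return consensus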
Example #2
def get_allele_counts(data_folder, adaID, fragment, VERBOSE=0, maxreads=1e10):
    '''Extract allele and insert counts from a bamfile'''

    # Read reference
    reffilename = get_consensus_filename(data_folder,
                                         adaID,
                                         fragment,
                                         trim_primers=True)
    refseq = SeqIO.read(reffilename, 'fasta')

    # Open BAM file
    # Note: the reads should already be filtered of unmapped stuff at this point
    bamfilename = get_mapped_filename(data_folder,
                                      adaID,
                                      fragment,
                                      type='bam',
                                      filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)

    # Call lower-level function
    # NOTE: qual_min is assumed to be a module-level constant
    return get_allele_counts_insertions_from_file(bamfilename,
                                                  len(refseq),
                                                  qual_min=qual_min,
                                                  maxreads=maxreads,
                                                  VERBOSE=VERBOSE)
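
The counts array returned above stacks one allele-count matrix per read type. A minimal sketch of collapsing it into allele frequencies, assuming the shape (read types, alleles, positions) used elsewhere in hivwholeseq:

import numpy as np

def get_allele_frequencies(counts, coverage_min=10):
    '''Sketch: collapse read types and normalize counts to frequencies'''
    counts_sum = counts.sum(axis=0)                  # (alleles, positions)
    coverage = counts_sum.sum(axis=0)                # (positions,)
    nu = 1.0 * counts_sum / np.maximum(coverage, 1)  # avoid division by zero
    nu[:, coverage < coverage_min] = np.nan          # mask poorly covered sites
    return nu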
Example #3
def get_insert_size_distribution(data_folder, adaID, fragment, bins=None,
                                 maxreads=-1, VERBOSE=0, density=True):
    '''Get the distribution of insert sizes'''

    if maxreads <= 0:
        maxreads = int(1e6)

    insert_sizes = np.zeros(maxreads, np.int16)

    # Open BAM file
    if fragment == 'premapped':
        bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
    else:
        bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                          filtered=True)

    # Convert from SAM if necessary
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)

    # Open file
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        # Iterate over read pairs (pair_generator needs a name-sorted file)
        n_written = 0
        for i, reads in enumerate(pair_generator(bamfile)):

            if i == maxreads:
                if VERBOSE >= 2:
                    print 'Max reads reached:', maxreads
                break
        
            # Print output
            if (VERBOSE >= 3) and (not ((i +1) % 10000)):
                print (i+1)

            # Discard unmapped or improperly paired read pairs
            if reads[0].is_unmapped or (not reads[0].is_proper_pair) or \
               reads[1].is_unmapped or (not reads[1].is_proper_pair):
                continue
            
            # Store insert size (index by n_written so skipped pairs leave no gaps)
            i_fwd = reads[0].is_reverse
            insert_sizes[n_written] = reads[i_fwd].isize
            n_written += 1

    insert_sizes = insert_sizes[:n_written]
    insert_sizes.sort()

    # Bin it
    if bins is None:
        h = np.histogram(insert_sizes, density=density)
    else:
        h = np.histogram(insert_sizes, bins=bins, density=density)

    return insert_sizes, h
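
A usage sketch for get_insert_size_distribution (data_folder and adaID stand for real values; matplotlib is assumed to be available). Note that np.histogram returns a (frequencies, bin_edges) pair, so the edges must be converted to bin centers for plotting:

import numpy as np
import matplotlib.pyplot as plt

isz, (freq, bins) = get_insert_size_distribution(data_folder, adaID, 'F1',
                                                 bins=np.linspace(0, 1000, 30),
                                                 maxreads=10000)
bincenters = 0.5 * (bins[:-1] + bins[1:])
plt.plot(bincenters, freq, lw=2)
plt.xlabel('Insert size [bp]')
plt.ylabel('Density')
plt.show()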
Example #4
def get_allele_counts(data_folder, adaID, fragment, VERBOSE=0, maxreads=1e10):
    """Extract allele and insert counts from a bamfile"""

    # Read reference
    reffilename = get_consensus_filename(data_folder, adaID, fragment, trim_primers=True)
    refseq = SeqIO.read(reffilename, "fasta")

    # Open BAM file
    # Note: the reads should already be filtered of unmapped stuff at this point
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, type="bam", filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)

    # Call lower-level function
    # NOTE: qual_min is assumed to be a module-level constant
    return get_allele_counts_insertions_from_file(
        bamfilename, len(refseq), qual_min=qual_min, maxreads=maxreads, VERBOSE=VERBOSE
    )
Example #5
def make_consensus(data_folder,
                   adaID,
                   fragment,
                   n_iter,
                   qual_min=20,
                   VERBOSE=0,
                   coverage_min=10,
                   summary=True):
    '''Make consensus sequence from the mapped reads'''
    if VERBOSE:
        print 'Build consensus: ' + adaID + ' ' + fragment + ' iteration ' + str(
            n_iter)

    # Read reference
    reffilename = get_reference_filename(data_folder, adaID, fragment, n_iter)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    # Open BAM file
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, n_iter)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)

    # NOTE: match_len_min is assumed to be a module-level constant
    (counts, inserts) = get_allele_counts_insertions_from_file_unfiltered(
                                bamfilename, len(refseq), qual_min=qual_min,
                                match_len_min=match_len_min)

    consensus_final = build_consensus(counts,
                                      inserts,
                                      coverage_min=coverage_min,
                                      VERBOSE=VERBOSE)

    if summary:
        with open(get_summary_fn(data_folder, adaID, fragment), 'a') as f:
            f.write('Consensus built for iteration ' + str(n_iter))
            f.write('\n')

    return refseq, consensus_final
Example #6
def get_read_lengths(data_folder, adaID, fragment, VERBOSE=0, maxreads=-1):
    '''Get the read lengths'''

    # Lengths from 1 to 250 (reads are assumed to be at most 250 bp);
    # read_types is assumed to be a module-level list of the four read types
    lengths = np.zeros((len(read_types), 250), int)

    # Open BAM file
    # Note: the reads should already be filtered of unmapped stuff at this point
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                      filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        # Iterate over single reads (no linkage info needed)
        for i, read in enumerate(bamfile):

            # Max number of reads
            if i == maxreads:
                if VERBOSE >= 2:
                    print 'Max reads reached:', maxreads
                break
        
            # Print output
            if (VERBOSE >= 3) and (not ((i +1) % 10000)):
                print (i+1)
        
            # Divide by read 1/2 and forward/reverse
            js = 2 * read.is_read2 + read.is_reverse

            # Increment counter
            lengths[js, read.rlen - 1] += 1

            # Note: we do not delve into CIGARs because the reads are trimmed

    return lengths
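
A sketch of summarizing the matrix returned above, assuming read_types is the usual four-element list (read 1/2 times forward/reverse):

import numpy as np

lengths = get_read_lengths(data_folder, adaID, 'F2')
for js, read_type in enumerate(read_types):
    n_reads = lengths[js].sum()
    # Mean length: weight each length 1..250 by its count
    mean_length = np.dot(np.arange(1, 251), lengths[js]) / float(max(n_reads, 1))
    print read_type, n_reads, '{:.1f}'.format(mean_length)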
Example #7
def filter_reads(data_folder,
                 adaID,
                 fragment,
                 VERBOSE=0,
                 maxreads=-1,
                 contaminants=None,
                 n_cycles=600,
                 max_mismatches=30,
                 susp_mismatches=20,
                 summary=True,
                 plot=False):
    '''Filter the reads to good chunks'''
    frag_gen = fragment[:2]

    reffilename = get_consensus_filename(data_folder, adaID, frag_gen)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    bamfilename = get_mapped_filename(data_folder,
                                      adaID,
                                      frag_gen,
                                      type='bam',
                                      filtered=False)
    if not os.path.isfile(bamfilename):
        samfilename = get_mapped_filename(data_folder,
                                          adaID,
                                          frag_gen,
                                          type='sam',
                                          filtered=False)
        if os.path.isfile(samfilename):
            convert_sam_to_bam(bamfilename)
        else:
            if VERBOSE >= 1:
                print 'ERROR: ' + adaID + ', mapped file not found.'
            return

    outfilename = get_mapped_filename(data_folder,
                                      adaID,
                                      frag_gen,
                                      type='bam',
                                      filtered=True)
    suspiciousfilename = get_mapped_suspicious_filename(
        data_folder, adaID, frag_gen)
    trashfilename = outfilename[:-4] + '_trashed.bam'

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(suspiciousfilename, 'wb', template=bamfile) as suspfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            # Iterate over all pairs
            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_suspect = 0
            n_mismapped_edge = 0
            n_badcigar = 0
            histogram_distance_from_consensus = np.zeros(n_cycles + 1, int)
            binsize = 200
            histogram_dist_along = np.zeros(
                (len(ref) // binsize + 1, n_cycles + 1), int)
            for irp, reads in enumerate(pair_generator(bamfile)):

                # Limit to the first reads
                if irp == maxreads:
                    break

                # Assign names
                (read1, read2) = reads
                i_fwd = reads[0].is_reverse

                # Check a few things to make sure we are looking at paired reads
                if read1.qname != read2.qname:
                    n_wrongname += 1
                    raise ValueError('Read pair ' + str(irp) +
                                     ': reads have different names!')

                # Ignore unmapped reads
                if read1.is_unmapped or read2.is_unmapped:
                    if VERBOSE >= 2:
                        print 'Read pair ' + read1.qname + ': unmapped'
                    n_unmapped += 1
                    map(trashfile.write, reads)
                    continue

                # Ignore not properly paired reads (this includes mates sitting on
                # different fragments)
                if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                    if VERBOSE >= 2:
                        print 'Read pair ' + read1.qname + ': not properly paired'
                    n_unpaired += 1
                    map(trashfile.write, reads)
                    continue

                # Mismappings are sometimes at fragment edges:
                # Check for overhangs beyond the edge
                skip = check_overhanging_reads(reads, len(ref))
                if skip:
                    n_mismapped_edge += 1
                    map(trashfile.write, reads)
                    continue

                # Mismappings are often characterized by many mutations:
                # check the number of mismatches of the whole pair and skip reads with too many
                dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
                histogram_distance_from_consensus[dc.sum()] += 1
                hbin = (reads[i_fwd].pos + reads[i_fwd].isize / 2) // binsize
                histogram_dist_along[hbin, dc.sum()] += 1
                if (dc.sum() > max_mismatches):
                    if VERBOSE >= 2:
                        print n_mutator+1, irp, '{:2.1f}'.format(100.0 * (n_mutator + 1) / (irp + 1))+'%',\
                                'Read pair '+read1.qname+': too many mismatches '+\
                                '('+str(dc[0])+' + '+str(dc[1])+')'
                    n_mutator += 1
                    map(trashfile.write, reads)
                    continue

                # Check for contamination from other PCR plates. Typically,
                # contamination happens for only one fragment, whereas superinfection
                # happens for all. At this stage, we can only give clues about
                # cross-contamination, the rest will be done in a script downstream
                # (here we could TAG suspicious reads for contamination)
                elif (dc.sum() > susp_mismatches):
                    if contaminants is not None:
                        skip = check_suspect(reads,
                                             contaminants,
                                             VERBOSE=VERBOSE)
                    else:
                        skip = True
                    if skip:
                        n_suspect += 1
                        map(suspfile.write, reads)
                        continue

                # Trim the bad CIGARs from the sides, if there are any good ones
                skip = trim_bad_cigar(reads,
                                      match_len_min=match_len_min,
                                      trim_left=trim_bad_cigars,
                                      trim_right=trim_bad_cigars)
                if skip:
                    n_badcigar += 1
                    map(trashfile.write, reads)
                    continue

                # TODO: we might want to incorporate some more stringent
                # criterion here, to avoid short reads, cross-overhang, etc.

                # Write the output
                n_good += 1
                map(outfile.write, reads)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Mismapped at edge:', n_mismapped_edge
        print 'Many-mutations:', n_mutator
        print 'Suspect contaminations:', n_suspect
        print 'Bad CIGARs:', n_badcigar

    if summary:
        summary_filename = get_filter_mapped_summary_filename(
            data_folder, adaID, fragment)
        with open(summary_filename, 'a') as f:
            f.write('Filter results: adaID ' + adaID + fragment + '\n')
            f.write('Total:\t\t\t' + str(irp + 1) + '\n')
            f.write('Good:\t\t\t' + str(n_good) + '\n')
            f.write('Unmapped:\t\t' + str(n_unmapped) + '\n')
            f.write('Unpaired:\t\t' + str(n_unpaired) + '\n')
            f.write('Mismapped at edge:\t' + str(n_mismapped_edge) + '\n')
            f.write('Many-mutations:\t\t' + str(n_mutator) + '\n')
            f.write('Suspect contaminations:\t' + str(n_suspect) + '\n')
            f.write('Bad CIGARs:\t\t' + str(n_badcigar) + '\n')

    if plot:
        plot_distance_histogram(data_folder,
                                adaID,
                                frag_gen,
                                histogram_distance_from_consensus,
                                savefig=True)

        plot_distance_histogram_sliding_window(data_folder,
                                               adaID,
                                               frag_gen,
                                               len(ref),
                                               histogram_dist_along,
                                               binsize=binsize,
                                               savefig=True)
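
pair_generator is used by most examples in this collection but never shown. A minimal sketch, assuming the BAM file is sorted by read name so that mates are adjacent (the real helper may additionally verify that the two names match):

def pair_generator(iterable):
    '''Sketch: yield (read1, read2) pairs from a name-sorted BAM iterator'''
    it = iter(iterable)
    while True:
        try:
            read1 = it.next()
            read2 = it.next()
        except StopIteration:
            return   # drop a possible orphan read at the end
        yield (read1, read2)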
Example #8
def map_stampy(data_folder, adaID, fragment, VERBOSE=0, threads=1,
               cluster_time='23:59:59', maxreads=-1, summary=True,
               rescue=False, dry=False):
    '''Map using stampy'''
    frag_gen = fragment[:2]

    if summary:
        summary_filename = get_map_summary_filename(data_folder, adaID, frag_gen,
                                                    rescue=rescue)

    # Set mapping penalty scores: softer for rescues and F3 and F5
    global subsrate
    if rescue:
        subsrate = '0.2'
        stampy_gapopen = 5      # Default: 40
        stampy_gapextend = 1    # Default: 3

    elif frag_gen not in ('F3', 'F5'):
        stampy_gapopen = 60     # Default: 40
        stampy_gapextend = 5    # Default: 3
    else:
        stampy_gapopen = 30     # Default: 40
        stampy_gapextend = 2    # Default: 3

    if VERBOSE:
        print 'Map via stampy: '+adaID+' '+frag_gen

    if not rescue: 
        input_filename = get_divided_filename(data_folder, adaID, fragment, type='bam')

        # NOTE: we introduced fragment nomenclature late, e.g. F3a. Check for that
        if not os.path.isfile(input_filename):
            if frag_gen == 'F3':
                input_filename = input_filename.replace('F3a', 'F3')

    else:
        input_filename = get_divided_filename(data_folder, adaID, 'unmapped', type='bam')

    # Check existence of the input file, because stampy creates output anyway
    if not os.path.isfile(input_filename):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (input file for mapping not found).\n')

    raise ValueError(adaID+', fragment '+fragment+': input file not found.')

    # parallelize if requested
    if threads == 1:

        output_filename = get_mapped_filename(data_folder, adaID, frag_gen, type='sam',
                                              rescue=rescue)

        # Map
        call_list = [stampy_bin,
                     '-g', get_index_file(data_folder, adaID, frag_gen, ext=False),
                     '-h', get_hash_file(data_folder, adaID, frag_gen, ext=False), 
                     '-o', output_filename,
                     '--overwrite',
                     '--substitutionrate='+subsrate,
                     '--gapopen', stampy_gapopen,
                     '--gapextend', stampy_gapextend]
        if stampy_sensitive:
            call_list.append('--sensitive')

        # Take only a (random) subsample: stampy uses the fraction of reads
        # instead of the number
        if maxreads > 0:
            # FIXME: figure out the -s option and the --numrecords option
            call_list.extend(['--numrecords', maxreads])
            
            #n_pairs_tot = get_number_reads(input_filename, 'bam') / 2
            #frac_pairs = 1.0 * maxreads / n_pairs_tot
            #random_seed = np.random.randint(1e5)
            #call_list.extend(['-s', frac_pairs + random_seed])

        call_list = call_list + ['-M', input_filename]
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)

        if not dry:
            sp.call(call_list)

            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Stampy mapped (single thread).\n')

            output_filename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                                  rescue=rescue)
            convert_sam_to_bam(output_filename)
        else:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (single thread).\n')

            if VERBOSE >= 1:
                print 'Dry run works (single thread)'

            return

    else:

        # Submit map script
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')
        for j in xrange(threads):
        
            # Get output filename
            output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                                  type='sam', part=(j+1), rescue=rescue)
            # Map
            call_list = ['qsub','-cwd',
                         '-b', 'y',
                         '-S', '/bin/bash',
                         '-o', JOBLOGOUT,
                         '-e', JOBLOGERR,
                         '-N', 'm'+adaID.replace('-', '')+frag_gen+str(j+1),
                         '-l', 'h_rt='+cluster_time,
                         '-l', 'h_vmem='+vmem,
                         stampy_bin,
                         '-g', get_index_file(data_folder, adaID, frag_gen, ext=False),
                         '-h', get_hash_file(data_folder, adaID, frag_gen, ext=False), 
                         '-o', output_filename,
                         '--overwrite',
                         '--processpart='+str(j+1)+'/'+str(threads),
                         '--substitutionrate='+subsrate,
                         '--gapopen', stampy_gapopen,
                         '--gapextend', stampy_gapextend]
            if stampy_sensitive:
                call_list.append('--sensitive')
            call_list = call_list + ['-M', input_filename]
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)

            if not dry:
                job_ID = sp.check_output(call_list)
                job_ID = job_ID.split()[2]
                job_IDs[j] = job_ID

        if dry:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (multi thread).\n')

            if VERBOSE >= 1:
                print 'Dry run works (multi thread)'
            return

        # Monitor output
        output_file_parts = [get_mapped_filename(data_folder, adaID, frag_gen,
                                                 type='bam', part=(j+1), rescue=rescue)
                              for j in xrange(threads)]
        time_wait = 10 # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split('\n')[:-1] # The last is an empty line
            if len(qstat_output) < 3:
                jobs_done[:] = True
                break
            else:
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10 # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert mapped reads to BAM for merging: adaID '+\
                               adaID+', fragment '+frag_gen+', part '+str(j+1)+ ' of '+ \
                               str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait if we did the conversion (it takes
                    # longer than some secs)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy mapped ('+str(threads)+' threads).\n')

        # Concatenate output files
        output_filename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                              unsorted=True, rescue=rescue)
        if VERBOSE >= 1:
            print 'Concatenate mapped reads: adaID '+adaID+', fragment '+frag_gen
        pysam.cat('-o', output_filename, *output_file_parts)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (so that pair_generator sees mates adjacent)
        output_filename_sorted = get_mapped_filename(data_folder, adaID, frag_gen,
                                                     type='bam',
                                                     unsorted=False,
                                                     rescue=rescue)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort mapped reads: adaID '+adaID+', fragment '+frag_gen
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file (avoids a full BAM -> SAM -> BAM round trip)
        if VERBOSE >= 1:
            print 'Reheader mapped reads: adaID '+adaID+', fragment '+frag_gen
        header_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='sam', part=1, rescue=rescue)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

    # FIXME: check whether temp files are all deleted
    if VERBOSE >= 1:
        print 'Remove temporary files: adaID '+adaID+', fragment '+frag_gen
    remove_mapped_tempfiles(data_folder, adaID, frag_gen, VERBOSE=VERBOSE, rescue=rescue)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')
Example #9
        print 'fragments', fragments

    # Iterate over all requested samples
    for adaID in adaIDs:
        for fragment in fragments:

            # Read reference
            reffilename = get_consensus_filename(data_folder, adaID, fragment)
            refseq = SeqIO.read(reffilename, 'fasta')
            ref = np.array(refseq)
        
            # Open BAM
            bamfilename = get_mapped_filename(data_folder, adaID, fragment,
                                              filtered=False)
            if not os.path.isfile(bamfilename):
                convert_sam_to_bam(bamfilename)
            with pysam.Samfile(bamfilename, 'rb') as bamfile:
        
                # Iterate through reads
                for i, read in enumerate(bamfile):
                
                    # Limit to the first reads
                    if i >= maxreads: break
            
                    # Print output
                    if VERBOSE and not ((i +1) % 10000):
                        print (i+1)
                
                    # Ignore unmapped reads
                    if read.is_unmapped or not read.is_proper_pair:
                        continue
Example #10
def map_stampy(data_folder,
               adaID,
               fragment,
               VERBOSE=0,
               threads=1,
               cluster_time='23:59:59',
               maxreads=-1,
               summary=True,
               rescue=False,
               dry=False):
    '''Map using stampy'''
    frag_gen = fragment[:2]

    if summary:
        summary_filename = get_map_summary_filename(data_folder,
                                                    adaID,
                                                    frag_gen,
                                                    rescue=rescue)

    # Set mapping penalty scores: softer for rescues and F3 and F5
    global subsrate
    if rescue:
        subsrate = '0.2'
        stampy_gapopen = 5  # Default: 40
        stampy_gapextend = 1  # Default: 3

    elif frag_gen not in ('F3', 'F5'):
        stampy_gapopen = 60  # Default: 40
        stampy_gapextend = 5  # Default: 3
    else:
        stampy_gapopen = 30  # Default: 40
        stampy_gapextend = 2  # Default: 3

    if VERBOSE:
        print 'Map via stampy: ' + adaID + ' ' + frag_gen

    if not rescue:
        input_filename = get_divided_filename(data_folder,
                                              adaID,
                                              fragment,
                                              type='bam')

        # NOTE: we introduced fragment nomenclature late, e.g. F3a. Check for that
        if not os.path.isfile(input_filename):
            if frag_gen == 'F3':
                input_filename = input_filename.replace('F3a', 'F3')

    else:
        input_filename = get_divided_filename(data_folder,
                                              adaID,
                                              'unmapped',
                                              type='bam')

    # Check existence of the input file, because stampy creates output anyway
    if not os.path.isfile(input_filename):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (input file for mapping not found).\n')

    raise ValueError(adaID + ', fragment ' + fragment +
                     ': input file not found.')

    # parallelize if requested
    if threads == 1:

        output_filename = get_mapped_filename(data_folder,
                                              adaID,
                                              frag_gen,
                                              type='sam',
                                              rescue=rescue)

        # Map
        call_list = [
            stampy_bin, '-g',
            get_index_file(data_folder, adaID, frag_gen, ext=False), '-h',
            get_hash_file(data_folder, adaID, frag_gen, ext=False), '-o',
            output_filename, '--overwrite', '--substitutionrate=' + subsrate,
            '--gapopen', stampy_gapopen, '--gapextend', stampy_gapextend
        ]
        if stampy_sensitive:
            call_list.append('--sensitive')

        # Take only a (random) subsample: stampy uses the fraction of reads
        # instead of the number
        if maxreads > 0:
            # FIXME: figure out the -s option and the --numrecords option
            call_list.extend(['--numrecords', maxreads])

            #n_pairs_tot = get_number_reads(input_filename, 'bam') / 2
            #frac_pairs = 1.0 * maxreads / n_pairs_tot
            #random_seed = np.random.randint(1e5)
            #call_list.extend(['-s', frac_pairs + random_seed])

        call_list = call_list + ['-M', input_filename]
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)

        if not dry:
            sp.call(call_list)

            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Stampy mapped (single thread).\n')

            output_filename = get_mapped_filename(data_folder,
                                                  adaID,
                                                  frag_gen,
                                                  type='bam',
                                                  rescue=rescue)
            convert_sam_to_bam(output_filename)
        else:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (single thread).\n')

            if VERBOSE >= 1:
                print 'Dry run works (single thread)'

            return

    else:

        # Submit map script
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')
        for j in xrange(threads):

            # Get output filename
            output_filename = get_mapped_filename(data_folder,
                                                  adaID,
                                                  frag_gen,
                                                  type='sam',
                                                  part=(j + 1),
                                                  rescue=rescue)
            # Map
            call_list = [
                'qsub', '-cwd', '-b', 'y', '-S', '/bin/bash', '-o', JOBLOGOUT,
                '-e', JOBLOGERR, '-N',
                'm' + adaID.replace('-', '') + frag_gen + str(j + 1), '-l',
                'h_rt=' + cluster_time, '-l', 'h_vmem=' + vmem, stampy_bin,
                '-g',
                get_index_file(data_folder, adaID, frag_gen, ext=False), '-h',
                get_hash_file(data_folder, adaID, frag_gen,
                              ext=False), '-o', output_filename, '--overwrite',
                '--processpart=' + str(j + 1) + '/' + str(threads),
                '--substitutionrate=' + subsrate, '--gapopen', stampy_gapopen,
                '--gapextend', stampy_gapextend
            ]
            if stampy_sensitive:
                call_list.append('--sensitive')
            call_list = call_list + ['-M', input_filename]
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)

            if not dry:
                job_ID = sp.check_output(call_list)
                job_ID = job_ID.split()[2]
                job_IDs[j] = job_ID

        if dry:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (multi thread).\n')

            if VERBOSE >= 1:
                print 'Dry run works (multi thread)'
            return

        # Monitor output
        output_file_parts = [
            get_mapped_filename(data_folder,
                                adaID,
                                frag_gen,
                                type='bam',
                                part=(j + 1),
                                rescue=rescue) for j in xrange(threads)
        ]
        time_wait = 10  # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split(
                '\n')[:-1]  # The last is an empty line
            if len(qstat_output) < 3:
                jobs_done[:] = True
                break
            else:
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10  # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert mapped reads to BAM for merging: adaID '+\
                               adaID+', fragment '+frag_gen+', part '+str(j+1)+ ' of '+ \
                               str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait if we did the conversion (it takes
                    # longer than some secs)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy mapped (' + str(threads) + ' threads).\n')

        # Concatenate output files
        output_filename = get_mapped_filename(data_folder,
                                              adaID,
                                              frag_gen,
                                              type='bam',
                                              unsorted=True,
                                              rescue=rescue)
        if VERBOSE >= 1:
            print 'Concatenate mapped reads: adaID ' + adaID + ', fragment ' + frag_gen
        pysam.cat('-o', output_filename, *output_file_parts)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (so that pair_generator sees mates adjacent)
        output_filename_sorted = get_mapped_filename(data_folder,
                                                     adaID,
                                                     frag_gen,
                                                     type='bam',
                                                     unsorted=False,
                                                     rescue=rescue)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort mapped reads: adaID ' + adaID + ', fragment ' + frag_gen
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file (avoids a full BAM -> SAM -> BAM round trip)
        if VERBOSE >= 1:
            print 'Reheader mapped reads: adaID ' + adaID + ', fragment ' + frag_gen
        header_filename = get_mapped_filename(data_folder,
                                              adaID,
                                              frag_gen,
                                              type='sam',
                                              part=1,
                                              rescue=rescue)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

    # FIXME: check whether temp files are all deleted
    if VERBOSE >= 1:
        print 'Remove temporary files: adaID ' + adaID + ', fragment ' + frag_gen
    remove_mapped_tempfiles(data_folder,
                            adaID,
                            frag_gen,
                            VERBOSE=VERBOSE,
                            rescue=rescue)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')
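
convert_sam_to_bam appears in every example above; a minimal sketch with the old pysam Samfile API used throughout this code, assuming the SAM file sits next to the BAM file with the same stem:

import pysam

def convert_sam_to_bam(bamfilename, samfilename=None):
    '''Sketch: create the BAM file from its sibling SAM file'''
    if samfilename is None:
        samfilename = bamfilename[:-4] + '.sam'
    with pysam.Samfile(samfilename, 'r') as samfile:
        with pysam.Samfile(bamfilename, 'wb', template=samfile) as bamfile:
            for read in samfile:
                bamfile.write(read)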
Example #11
    for adaID in adaIDs:

        # Read reference (fragmented)
        reffilename = get_consensus_filename(data_folder, adaID, fragment)
        refseq = SeqIO.read(reffilename, 'fasta')
        ref = np.array(refseq)

        # read file
        bamfilename = get_mapped_filename(data_folder,
                                          adaID,
                                          fragment,
                                          type='bam',
                                          filtered=True)

        if not os.path.isfile(bamfilename):
            convert_sam_to_bam(bamfilename)
        bamfile = pysam.Samfile(bamfilename, 'rb')

        # Get the coverage for reads which have long insert sizes
        # (to be sure about their identity)
        cov_new = 0
        cov_old = 0
        for i_pairs, reads in enumerate(pair_generator(bamfile)):
            if i_pairs > 5000000:
                break

            if reads[0].isize < 300:
                continue

            for read in reads:
                if read.seq.find(primer_new) != -1:
Example #12
def premap_stampy(data_folder,
                  adaID,
                  VERBOSE=0,
                  threads=1,
                  summary=True,
                  maxreads=-1,
                  subsrate=0.05,
                  gapopen=40,
                  gapextend=3):
    '''Call stampy for actual mapping'''
    if VERBOSE:
        print 'Premapping: adaID ', adaID

    if summary:
        summary_filename = get_premap_summary_filename(data_folder, adaID)

    # Stampy can handle both gzipped and uncompressed fastq inputs
    input_filenames = get_read_filenames(data_folder, adaID, gzip=True)
    if not os.path.isfile(input_filenames[0]):
        input_filenames = get_read_filenames(data_folder, adaID, gzip=False)
    if not all(map(os.path.isfile, input_filenames)):
        raise OSError('Input files for mapping not found: ' +
                      input_filenames[0])

    # parallelize if requested
    if threads == 1:
        call_list = [
            stampy_bin,
            '--overwrite',
            '-g',
            get_reference_premap_index_filename(data_folder, adaID, ext=False),
            '-h',
            get_reference_premap_hash_filename(data_folder, adaID, ext=False),
            '-o',
            get_premapped_filename(data_folder, adaID, type='sam'),
            '--insertsize=450',
            '--insertsd=100',
            '--substitutionrate=' + str(subsrate),
            '--gapopen=' + str(gapopen),
            '--gapextend=' + str(gapextend),
        ]
        if maxreads > 0:
            call_list.append('--numrecords=' + str(maxreads))
        call_list.extend(['-M'] + input_filenames)
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)
        sp.call(call_list)

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID),
                      'a') as f:
                f.write('\nStampy premapped (single thread).\n')

        # Convert to compressed BAM
        convert_sam_to_bam(
            get_premapped_filename(data_folder, adaID, type='bam'))

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('\nSAM file converted to compressed BAM: '+\
                        get_premapped_filename(data_folder, adaID, type='bam')+'\n')

    else:

        # Multithreading works as follows: call qsub + stampy, monitor the process
        # IDs with qstat at regular intervals, and finally merge results with pysam
        output_file_parts = [
            get_premapped_filename(
                data_folder, adaID, type='bam', part=(j + 1))
            for j in xrange(threads)
        ]

        # Submit map script
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')

        # Submit map call
        import hivwholeseq
        JOBDIR = hivwholeseq.__path__[0].rstrip('/') + '/'
        JOBLOGOUT = JOBDIR + 'logout'
        JOBLOGERR = JOBDIR + 'logerr'
        cluster_time = ['23:59:59', '1:59:59']  # short queue when split over many jobs
        vmem = '8G'
        for j in xrange(threads):
            call_list = [
                'qsub', '-cwd', '-b', 'y', '-S', '/bin/bash', '-o', JOBLOGOUT,
                '-e', JOBLOGERR, '-N', adaID + 'p' + str(j + 1), '-l',  # no spaces in job names
                'h_rt=' + cluster_time[threads >= 30], '-l', 'h_vmem=' + vmem,
                stampy_bin, '--overwrite', '-g',
                get_reference_premap_index_filename(
                    data_folder, adaID, ext=False), '-h',
                get_reference_premap_hash_filename(
                    data_folder, adaID, ext=False), '-o',
                get_premapped_filename(
                    data_folder, adaID, type='sam', part=(j + 1)),
                '--processpart=' + str(j + 1) + '/' + str(threads),
                '--insertsize=450', '--insertsd=100', '--substitutionrate=' +
                str(subsrate), '--gapopen=' + str(gapopen),
                '--gapextend=' + str(gapextend), '-M'
            ] + input_filenames
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)
            job_ID = sp.check_output(call_list)
            job_ID = job_ID.split()[2]
            job_IDs[j] = job_ID

        # Monitor output
        time_wait = 10  # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split(
                '\n')[:-1]  # The last is an empty line
            if VERBOSE >= 3:
                print qstat_output
            if len(qstat_output) < 3:
                jobs_done[:] = True
                break
            else:
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10  # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert premapped reads to BAM for merging: adaID '+\
                               adaID+', part '+str(j+1)+ ' of '+ \
                               str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait if we did the conversion (it takes
                    # longer than some secs)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy premapped (' + str(threads) + ' threads).\n')

        # Concatenate output files
        if VERBOSE >= 1:
            print 'Concatenate premapped reads: adaID ' + adaID + '...',
        output_filename = get_premapped_filename(
            data_folder, adaID, type='bam', unsorted=True)
        pysam.cat('-o', output_filename, *output_file_parts)
        if VERBOSE >= 1:
            print 'done.'
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (so that pair_generator sees mates adjacent)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort premapped reads: adaID ' + adaID
        output_filename_sorted = get_premapped_filename(
            data_folder, adaID, type='bam', unsorted=False)
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file (avoids a full BAM -> SAM -> BAM round trip)
        if VERBOSE >= 1:
            print 'Reheader premapped reads: adaID ' + adaID
        header_filename = get_premapped_filename(
            data_folder, adaID, type='sam', part=1)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

    if VERBOSE >= 1:
        print 'Remove temporary files: adaID ' + adaID
    remove_premapped_tempfiles(data_folder, adaID, VERBOSE=VERBOSE)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp premapping files removed.\n')
            f.write('\n')
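
A usage sketch for premap_stampy; with threads=1 the qsub cluster branch is skipped entirely, whereas the multi-threaded path assumes an SGE cluster with qsub/qstat. The path and adapter ID below are placeholders:

premap_stampy('/path/to/run/folder/', 'TS2', VERBOSE=1, threads=1,
              maxreads=100000, subsrate=0.05)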
Example #13
def filter_mapped_reads(sample, fragment,
                        PCR=1,
                        maxreads=-1,
                        VERBOSE=0,
                        n_cycles=600,
                        max_mismatches=100,
                        match_len_min=30,
                        trim_bad_cigars=3,
                        summary=True):
    '''Filter the reads to good chunks'''
    pname = sample.patient
    samplename_pat = sample.name
    samplenames_seq = sample.samples_seq.index.tolist()

    if VERBOSE >= 1:
        print 'Filtering reads:', pname, samplename_pat, fragment, PCR

    reffilename = get_initial_reference_filename(pname, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    outfilename = get_mapped_filtered_filename(pname, samplename_pat, fragment,
                                               type='bam', PCR=PCR,
                                               decontaminated=False)
    trashfilename = outfilename[:-4]+'_trashed.bam'

    infilenames = [get_mapped_to_initial_filename(pname, samplename_pat,
                                                 samplename, fragment,
                                                 type='bam', PCR=PCR)
                   for samplename in samplenames_seq]
    infilenames = filter(os.path.isfile, infilenames)
    if not len(infilenames):
        print ('WARNING: No mapped files found: '+', '.join([pname, samplename_pat,
                                                              fragment, str(PCR)]))
        return

    # Take reads evenly distributed across sequencing repetitions
    # (integer division; a negative maxreads stays negative, i.e. no limit)
    maxreads /= len(infilenames)

    if VERBOSE >= 2:
        print 'Input mapped filenames:',
        if len(infilenames) >= 2:
            print ''
        print '\n'.join(infilenames)

    # Use first file as template for the new bamfile
    infilename = infilenames[0]
    if not os.path.isfile(infilename):
        convert_sam_to_bam(infilename)
 
    with pysam.Samfile(infilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:
 
            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_badcigar = 0
            n_tiny = 0
            binsize = 200
            hist_distance_from_consensus = np.zeros(n_cycles + 1, int)
            hist_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1), int)

            # Iterate over input files, the first is already open
            for infilename in infilenames:

                # The first file is already open as `bamfile`
                if infilename != infilenames[0]:
                    file_open = lambda: pysam.Samfile(infilename, 'rb')
                    file_close = lambda f: f.close()

                    if not os.path.isfile(infilename):
                        convert_sam_to_bam(infilename)

                else:
                    file_open = lambda: bamfile
                    file_close = lambda f: None

                try:
                    bamfile = file_open()
    
                    for irp, reads in enumerate(pair_generator(bamfile)):
                        if irp == maxreads:
                            break

                        pair_type = filter_read_pair(reads, ref,
                                                     hist_distance_from_consensus,
                                                     hist_dist_along,
                                                     binsize,
                                                     max_mismatches=max_mismatches,
                                                     match_len_min=match_len_min,
                                                     trim_bad_cigars=trim_bad_cigars,
                                                     VERBOSE=VERBOSE)
                    
                        if pair_type == 'unmapped':
                            n_unmapped += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'unpaired':
                            n_unpaired += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'mutator':
                            n_mutator += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'bad_cigar':
                            n_badcigar += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'tiny':
                            n_tiny += 1
                            map(trashfile.write, reads)

                        else:
                            n_good += 1
                            map(outfile.write, reads)

                finally:
                    file_close(bamfile)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Many-mutations:', n_mutator
        print 'Bad CIGARs:', n_badcigar
        print 'Tiny:', n_tiny
        print

    if summary:
        sfn = get_filter_mapped_init_summary_filename(pname, samplename_pat, fragment, PCR=PCR)
        with open(sfn, 'a') as f:
            f.write('Filter results: pname '+pname+', '+samplename_pat+', '+fragment+'\n')
            f.write('Total:\t\t\t'+str(irp + 1)+'\n')
            f.write('Good:\t\t\t'+str(n_good)+'\n')
            f.write('Unmapped:\t\t'+str(n_unmapped)+'\n')
            f.write('Unpaired:\t\t'+str(n_unpaired)+'\n')
            f.write('Many-mutations:\t\t'+str(n_mutator)+'\n')
            f.write('Bad CIGARs:\t\t'+str(n_badcigar)+'\n')
            f.write('Tiny:\t\t\t'+str(n_tiny)+'\n')
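
filter_read_pair is not included in this excerpt; below is a skeleton consistent with the labels dispatched above, reusing the per-pair checks from the filter_reads example (#7). The exact criterion for 'tiny' is an assumption here:

def filter_read_pair(reads, ref,
                     hist_dc, hist_dist_along, binsize,
                     max_mismatches=100, match_len_min=30,
                     trim_bad_cigars=3, VERBOSE=0):
    '''Sketch: classify a read pair, returning one of the labels used above'''
    (read1, read2) = reads
    if read1.is_unmapped or read2.is_unmapped:
        return 'unmapped'
    if (not read1.is_proper_pair) or (not read2.is_proper_pair):
        return 'unpaired'
    # Distance from consensus, binned along the fragment (as in example #7)
    dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
    hist_dc[dc.sum()] += 1
    i_fwd = reads[0].is_reverse
    hist_dist_along[(reads[i_fwd].pos + reads[i_fwd].isize / 2) // binsize,
                    dc.sum()] += 1
    if dc.sum() > max_mismatches:
        return 'mutator'
    if trim_bad_cigar(reads, match_len_min=match_len_min,
                      trim_left=trim_bad_cigars, trim_right=trim_bad_cigars):
        return 'bad_cigar'
    if (reads[0].rlen < match_len_min) or (reads[1].rlen < match_len_min):
        return 'tiny'
    return 'good'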
Example #14
def trim_and_divide_reads(data_folder, adaID, n_cycles, fragments,
                          maxreads=-1, VERBOSE=0,
                          minisize=100,
                          include_tests=False, summary=True):
    '''Trim reads and divide them into fragments'''
    if VERBOSE:
        print 'Trim and divide into fragments: adaID '+adaID+', fragments: '+\
                ' '.join(fragments)

    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Fragments used: '+' '.join(fragments)+'\n')

    ref_filename = get_reference_premap_filename(data_folder, adaID)
    refseq = SeqIO.read(ref_filename, 'fasta')
    smat = np.array(refseq, 'S1')
    len_reference = len(refseq)

    # Get the positions of fragment start/end, w/ and w/o primers
    frags_pos = get_fragment_positions(smat, fragments)
    store_reference_fragmented(data_folder, adaID, refseq,
                               dict(zip(fragments, frags_pos['trim'])))
    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Primer positions (for fragments):\n')
            for (fragment, poss_full, poss_trim) in izip(fragments,
                                                         frags_pos['full'],
                                                         frags_pos['trim']):
                f.write(fragment+': fwd '+str(poss_full[0])+' '+str(poss_trim[0])+\
                                 ', rev '+str(poss_trim[1])+' '+str(poss_full[1])+'\n')
    write_fragment_positions(data_folder, adaID, fragments, frags_pos)

    # Get the positions of the unwanted outer primers (in case we DO nested PCR
    # for that fragment)
    # NOTE: the LTRs make no problem, because the rev outer primer of F6
    # is not in the reference anymore if F6 has undergone nested PCR
    # FIXME: this might not work if we have mixed fragments (e.g. F5a+b) AND nesting
    from re import findall
    primers_out = {'fwd': [], 'rev': []}
    for i, fr in enumerate(fragments):
        if (i != 0) and findall(r'F[2-6][a-z]?i', fr):
            primers_out['fwd'].append(fr[:-1]+'o')
        if (i != len(fragments) - 1) and findall(r'F[1-5][a-z]?i', fr):
            primers_out['rev'].append(fr[:-1]+'o')

    # Get all possible unambiguous primers for the unwanted outer primers
    from hivwholeseq.data.primers import primers_PCR
    from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas
    primers_out_seq = {'fwd': [np.array(map(list, eas(primers_PCR[fr][0])),
                                        'S1', ndmin=2)
                               for fr in primers_out['fwd']],
                       'rev': [np.array(map(list, eas(primers_PCR[fr][1])),
                                        'S1', ndmin=2)
                               for fr in primers_out['rev']],
                      }
    primers_out_pos = {'fwd': [], 'rev': []}
    if primers_out['fwd']:
        primers_out_pos['fwd'] = map(itemgetter(0),
                                     get_primer_positions(smat,
                                                          primers_out['fwd'], 'fwd'))
    if primers_out['rev']:
        primers_out_pos['rev'] = map(itemgetter(1),
                                     get_primer_positions(smat,
                                                          primers_out['rev'], 'rev'))

    # Input and output files
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        convert_sam_to_bam(input_filename)
    output_filenames = get_divided_filenames(data_folder, adaID, fragments, type='bam')
    with pysam.Samfile(input_filename, 'rb') as bamfile:

        try:
            file_handles = [pysam.Samfile(ofn, 'wb', template=bamfile)
                            for ofn in output_filenames[:len(fragments)]]
    
            fo_am = pysam.Samfile(output_filenames[-4], 'wb', template=bamfile)
            fo_cm = pysam.Samfile(output_filenames[-3], 'wb', template=bamfile)
            fo_um = pysam.Samfile(output_filenames[-2], 'wb', template=bamfile)
            fo_lq = pysam.Samfile(output_filenames[-1], 'wb', template=bamfile)

            # Iterate over the mapped reads and assign fragments
            n_mapped = [0 for fragment in fragments]
            n_unmapped = 0
            n_crossfrag = 0
            n_ambiguous = 0
            n_outer = 0
            n_lowq = 0
            for irp, reads in enumerate(pair_generator(bamfile)):

                if irp == maxreads:
                    if VERBOSE:
                        print 'Maximal number of read pairs reached:', maxreads
                    break

                if VERBOSE >= 2:
                    if not ((irp+1) % 10000):
                        print irp+1

                i_fwd = reads[0].is_reverse

                # If either read is unmapped or not properly paired, if a read
                # is too short, if the insert size is too small, or if the
                # pair is divergent (fully cross-overlapping), discard it
                if reads[0].is_unmapped or (not reads[0].is_proper_pair) or \
                   reads[1].is_unmapped or (not reads[1].is_proper_pair) or \
                   (reads[0].rlen < 50) or (reads[1].rlen < 50) or \
                   (reads[i_fwd].isize < minisize):
                    if VERBOSE >= 3:
                        print 'Read pair unmapped/unpaired/tiny/divergent:', reads[0].qname
                    n_unmapped += 1
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # If the insert is a misamplification from the outer primers
                # in fragments that underwent nested PCR, trash it (it would
                # have skewed amplification anyway). We cannot find all of
                # those, only the ones still carrying the primer itself
                # (others have lost it during shearing). For those, no matter
                # what happens at the end (reading into adapters, etc.), ONE
                # of the reads in the pair starts exactly with an outer
                # primer: the rev read with a rev primer, the fwd read with a
                # fwd one. Test all of them (a sketch of this test follows
                # the function).
                if (len(primers_out_pos['fwd']) or len(primers_out_pos['rev'])) and \
                   test_outer_primer(reads,
                                     primers_out_pos, primers_out_seq,
                                     len_reference):
                    if VERBOSE >= 3:
                        print 'Read pair from outer primer:', reads[0].qname
                    n_outer += 1
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # FIXME: the following becomes a bit harder when we mix parallel
                # PCRs, e.g. F5a+b, to get more product

                # Assign to a fragment now, so that primer trimming is faster
                # (a sketch of assign_to_fragment follows this function)
                pair_identity = assign_to_fragment(reads, frags_pos['full'],
                                                   VERBOSE=VERBOSE)

                # 1. If no fragments are possible (e.g. one read crosses the
                # fragment boundary, they map to different fragments), dump it
                # into a special bucket
                if pair_identity == 'cross':
                    n_crossfrag += 1
                    fo_cm.write(reads[0])
                    fo_cm.write(reads[1])
                    continue

                # 2. If 2+ fragments are possible (tie), put into a special bucket
                # (essentially excluded, because we want two independent measurements
                # in the overlapping region, but we might want to recover them)
                elif pair_identity == 'ambiguous':
                    n_ambiguous += 1
                    fo_am.write(reads[0])
                    fo_am.write(reads[1])
                    continue

                # 3. If the intersection is a single fragment, good: trim the primers
                # NB: n_frag is the index IN THE POOL. If we sequence only F2-F5, F2 is n_frag = 0
                n_frag = int(pair_identity)
                frag_pos = frags_pos['trim'][n_frag]
                if not np.isscalar(frag_pos[0]):
                    frag_pos = [frag_pos[0]['inner'], frag_pos[1]['inner']]
                trashed_primers = trim_primers(reads, frag_pos,
                                               include_tests=include_tests)
                if trashed_primers or (reads[i_fwd].isize < 100):
                    n_unmapped += 1
                    if VERBOSE >= 3:
                        print 'Read pair is mismapped:', reads[0].qname
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # Quality trimming: if no decently long pair survives, trash
                #trashed_quality = main_block_low_quality(reads, phred_min=20,
                #                                         include_tests=include_tests)
                trashed_quality = trim_low_quality(reads, phred_min=20,
                                                   include_tests=include_tests)
                if trashed_quality or (reads[i_fwd].isize < 100):
                    n_lowq += 1
                    if VERBOSE >= 3:
                        print 'Read pair has low phred quality:', reads[0].qname
                    fo_lq.write(reads[0])
                    fo_lq.write(reads[1])
                    continue

                # Check for cross-overhangs or COH (reading into the adapters)
                #        --------------->
                #    <-----------
                # In that case, trim to perfect overlap.
                if test_coh(reads, VERBOSE=False):
                    trim_coh(reads, trim=0, include_tests=include_tests)

                # Change coordinates into the fragmented reference (primer-trimmed)
                for read in reads:
                    read.pos -= frag_pos[0]
                    read.mpos -= frag_pos[0]

                # Here the tests
                if include_tests:
                    lfr = frags_pos['trim'][n_frag][1] - frags_pos['trim'][n_frag][0]
                    if test_sanity(reads, n_frag, lfr):
                        print 'Tests failed:', reads[0].qname
                        import ipdb; ipdb.set_trace()

                # There we go!
                n_mapped[n_frag] += 1
                file_handles[n_frag].write(reads[0])
                file_handles[n_frag].write(reads[1])

        finally:
            for f in file_handles:
                f.close()
            fo_am.close()
            fo_cm.close()
            fo_um.close()
            fo_lq.close()


    if VERBOSE:
        print 'Trim and divide results: adaID '+adaID
        print 'Total:\t\t', irp + 1
        print 'Mapped:\t\t', sum(n_mapped), n_mapped
        print 'Unmapped/unpaired/tiny insert:\t', n_unmapped
        print 'Outer primer:\t', n_outer
        print 'Crossfrag:\t', n_crossfrag
        print 'Ambiguous:\t', n_ambiguous
        print 'Low-quality:\t', n_lowq

    # Write summary to file
    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\n')
            f.write('Trim and divide results: adaID '+adaID+'\n')
            f.write('Total:\t\t'+str(irp + 1)+'\n')
            f.write('Mapped:\t\t'+str(sum(n_mapped))+' '+str(n_mapped)+'\n')
            f.write('Unmapped/unpaired/tiny insert:\t'+str(n_unmapped)+'\n')
            f.write('Outer primer:\t'+str(n_outer)+'\n')
            f.write('Crossfrag:\t'+str(n_crossfrag)+'\n')
            f.write('Ambiguous:\t'+str(n_ambiguous)+'\n')
            f.write('Low-quality:\t'+str(n_lowq)+'\n')
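
# Minimal sketches of two helpers used above, assign_to_fragment and
# test_outer_primer, as they are assumed to behave here; illustrative only,
# the real library implementations may differ in detail.
import numpy as np

def assign_to_fragment_sketch(reads, fragpos_full, VERBOSE=0):
    '''Assign a read pair to a fragment index, or 'cross'/'ambiguous' '''
    # Fragments that fully contain each read
    frags_reads = []
    for read in reads:
        frags = [i for i, (start, end) in enumerate(fragpos_full)
                 if (read.pos >= start) and (read.pos + read.alen <= end)]
        frags_reads.append(frags)

    # The pair is assignable only to fragments compatible with BOTH reads
    frags_pair = set(frags_reads[0]) & set(frags_reads[1])
    if len(frags_pair) == 0:
        return 'cross'          # e.g. a read straddles a fragment boundary
    elif len(frags_pair) > 1:
        return 'ambiguous'      # the pair fits entirely within an overlap
    return frags_pair.pop()

def test_outer_primer_sketch(reads, primers_out_pos, primers_out_seq,
                             len_reference):
    '''Return True if one read of the pair starts exactly with an outer primer'''
    # len_reference mirrors the assumed signature; unused in this sketch
    i_fwd = reads[0].is_reverse
    read_fwd = reads[i_fwd]
    read_rev = reads[not i_fwd]

    # Fwd read starting at a fwd outer-primer position, matching its sequence
    for pos, prs in zip(primers_out_pos['fwd'], primers_out_seq['fwd']):
        if read_fwd.pos == pos:
            seq = np.fromstring(read_fwd.seq[:prs.shape[1]], 'S1')
            if (seq == prs).all(axis=1).any():
                return True

    # Rev read ending at a rev outer-primer position, matching its sequence
    for pos, prs in zip(primers_out_pos['rev'], primers_out_seq['rev']):
        if read_rev.pos + read_rev.alen == pos:
            seq = np.fromstring(read_rev.seq[-prs.shape[1]:], 'S1')
            if (seq == prs).all(axis=1).any():
                return True

    return False
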
Example #15
def filter_mapped_reads(sample,
                        fragment,
                        PCR=1,
                        maxreads=-1,
                        VERBOSE=0,
                        n_cycles=600,
                        max_mismatches=100,
                        match_len_min=30,
                        trim_bad_cigars=3,
                        summary=True):
    '''Filter the reads to good chunks'''
    pname = sample.patient
    samplename_pat = sample.name
    samplenames_seq = sample.samples_seq.index.tolist()

    if VERBOSE >= 1:
        print 'Filtering reads:', pname, samplename_pat, fragment, PCR

    reffilename = get_initial_reference_filename(pname, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    outfilename = get_mapped_filtered_filename(pname,
                                               samplename_pat,
                                               fragment,
                                               type='bam',
                                               PCR=PCR,
                                               decontaminated=False)
    trashfilename = outfilename[:-4] + '_trashed.bam'

    infilenames = [
        get_mapped_to_initial_filename(pname,
                                       samplename_pat,
                                       samplename,
                                       fragment,
                                       type='bam',
                                       PCR=PCR)
        for samplename in samplenames_seq
    ]
    infilenames = filter(os.path.isfile, infilenames)
    if not len(infilenames):
        print('WARNING: No mapped files found: ' +
              ', '.join([pname, samplename_pat, fragment,
                         str(PCR)]))
        return

    # Take reads evenly distributed across sequencing repetitions
    maxreads /= len(infilenames)
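    # (Python 2 integer division; with the default maxreads=-1 the quotient
    # stays negative, so the per-file read limit below never triggers)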

    if VERBOSE >= 2:
        print 'Input mapped filenames:',
        if len(infilenames) >= 2:
            print ''
        print '\n'.join(infilenames)

    # Use first file as template for the new bamfile
    infilename = infilenames[0]
    if not os.path.isfile(infilename):
        convert_sam_to_bam(infilename)

    with pysam.Samfile(infilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_badcigar = 0
            n_tiny = 0
            binsize = 200
            hist_distance_from_consensus = np.zeros(n_cycles + 1, int)
            hist_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1),
                                       int)

            # Iterate over input files, the first is already open
            for infilename in infilenames:

                if infilename != infilenames[0]:
                    file_open = lambda: pysam.Samfile(infilename, 'rb')
                    file_close = lambda f: f.close()

                    if not os.path.isfile(infilename):
                        convert_sam_to_bam(infilename)

                else:
                    file_open = lambda: bamfile
                    file_close = lambda f: None

                try:
                    bamfile = file_open()

                    for irp, reads in enumerate(pair_generator(bamfile)):
                        if irp == maxreads:
                            break

                        pair_type = filter_read_pair(
                            reads,
                            ref,
                            hist_distance_from_consensus,
                            hist_dist_along,
                            binsize,
                            max_mismatches=max_mismatches,
                            match_len_min=match_len_min,
                            trim_bad_cigars=trim_bad_cigars,
                            VERBOSE=VERBOSE)

                        if pair_type == 'unmapped':
                            n_unmapped += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'unpaired':
                            n_unpaired += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'mutator':
                            n_mutator += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'bad_cigar':
                            n_badcigar += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'tiny':
                            n_tiny += 1
                            map(trashfile.write, reads)

                        else:
                            n_good += 1
                            map(outfile.write, reads)

                finally:
                    file_close(bamfile)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Many-mutations:', n_mutator
        print 'Bad CIGARs:', n_badcigar
        print 'Tiny:', n_tiny
        print

    if summary:
        sfn = get_filter_mapped_init_summary_filename(pname,
                                                      samplename_pat,
                                                      fragment,
                                                      PCR=PCR)
        with open(sfn, 'a') as f:
            f.write('Filter results: pname ' + pname + ', ' + samplename_pat +
                    ', ' + fragment + '\n')
            f.write('Total:\t\t\t' + str(irp + 1) + '\n')
            f.write('Good:\t\t\t' + str(n_good) + '\n')
            f.write('Unmapped:\t\t' + str(n_unmapped) + '\n')
            f.write('Unpaired:\t\t' + str(n_unpaired) + '\n')
            f.write('Many-mutations:\t\t' + str(n_mutator) + '\n')
            f.write('Bad CIGARs:\t\t' + str(n_badcigar) + '\n')
            f.write('Tiny:\t\t\t' + str(n_tiny) + '\n')
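
# pair_generator (used here and in the mapping functions below) is assumed to
# yield consecutive read pairs from a name-sorted BAM file; a minimal sketch,
# not necessarily the library's exact implementation:
def pair_generator_sketch(bamfile):
    '''Yield (read1, read2) pairs of consecutive reads sharing a name'''
    reads = iter(bamfile)
    while True:
        try:
            read1 = next(reads)
            read2 = next(reads)
        except StopIteration:
            break
        # Mates are adjacent only if the file is sorted by read name, which
        # is why the mapping functions below sort with samtools sort -n
        if read1.qname != read2.qname:
            raise ValueError('BAM file not sorted by read name?')
        yield (read1, read2)
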
def map_stampy_multithread(sample, fragment, VERBOSE=0, threads=2, summary=True,
                           filtered=True):
    '''Map using stampy, multithread (via cluster requests, queueing race conditions possible)'''
    import hivwholeseq
    JOBDIR = hivwholeseq.__path__[0].rstrip('/')+'/'
    JOBLOGOUT = JOBDIR+'logout/'
    JOBLOGERR = JOBDIR+'logerr/'
    cluster_time = ['23:59:59', '0:59:59']
    vmem = '8G'

    # Sample attributes (same access pattern as map_stampy_singlethread below)
    pname = sample.patient
    samplename = sample.name
    seq_run = sample['seq run']
    data_folder = sample.sequencing_run['folder']
    adaID = sample['adapter']

    if VERBOSE:
        print 'Map via stampy: '+pname+' '+samplename+' '+fragment

    if summary:
        summary_filename = get_map_initial_summary_filename(pname, samplename, fragment)

    # Specific fragment (e.g. F5 --> F5bi)
    frag_spec = filter(lambda x: fragment in x, sample['fragments'])
    if not len(frag_spec):
        raise ValueError(pname+', '+samplename+': fragment '+fragment+' not found.')
    frag_spec = frag_spec[0]

    input_filename = get_input_filename(data_folder, adaID, frag_spec, type='bam')

    # Submit map scripts in parallel to the cluster
    jobs_done = np.zeros(threads, bool)
    job_IDs = np.zeros(threads, 'S30')
    for j in xrange(threads):
    
        output_filename = get_mapped_to_initial_filename(pname, samplename,
                                                         fragment,
                                                         type='sam', part=(j+1))
        # Map
        call_list = ['qsub','-cwd',
                     '-b', 'y',
                     '-S', '/bin/bash',
                     '-o', JOBLOGOUT,
                     '-e', JOBLOGERR,
                     '-N', 'm '+samplename+fragment+' p'+str(j+1),
                     '-l', 'h_rt='+cluster_time[threads >= 10],
                     '-l', 'h_vmem='+vmem,
                     stampy_bin,
                     '--overwrite',
                     '-g', get_initial_index_filename(pname, fragment, ext=False),
                     '-h', get_initial_hash_filename(pname, fragment, ext=False),
                     '-o', output_filename,
                     '--processpart='+str(j+1)+'/'+str(threads),
                     '--substitutionrate='+subsrate,
                     '--gapopen', stampy_gapopen,
                     '--gapextend', stampy_gapextend]
        if stampy_sensitive:
            call_list.append('--sensitive')
        call_list = call_list + ['-M', input_filename]

        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)
        job_ID = sp.check_output(call_list)
        job_ID = job_ID.split()[2]
        job_IDs[j] = job_ID

    # Monitor output
    output_file_parts = [get_mapped_to_initial_filename(pname, samplename,
                                                        fragment,
                                                        type='bam', part=(j+1))
                         for j in xrange(threads)]
    time_wait = 10 # secs
    while not jobs_done.all():

        # Sleep some time
        time.sleep(time_wait)

        # Get the output of qstat to check the status of jobs
        qstat_output = sp.check_output(['qstat'])
        qstat_output = qstat_output.split('\n')[:-1] # The last is an empty line
        if len(qstat_output) < 3:
            jobs_done[:] = True
            break
        else:
            qstat_output = [line.split()[0] for line in qstat_output[2:]]

        time_wait = 10 # secs
        for j in xrange(threads):
            if jobs_done[j]:
                continue

            if job_IDs[j] not in qstat_output:
                # Convert to BAM for merging
                if VERBOSE >= 1:
                    print 'Convert mapped reads to BAM for merging: sample '+\
                           samplename+', part '+str(j+1)+ ' of '+ \
                           str(threads)
                convert_sam_to_bam(output_file_parts[j])
                # No need to sleep again before the next poll: the conversion
                # itself took longer than the polling interval
                time_wait = 0
                jobs_done[j] = True

    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Stampy mapped ('+str(threads)+' threads).\n')

    # Concatenate output files
    output_filename = get_mapped_to_initial_filename(pname, samplename,
                                                     fragment,
                                                     type='bam', unsorted=True)
    if VERBOSE >= 1:
        print 'Concatenate mapped reads: sample '+samplename
    pysam.cat('-o', output_filename, *output_file_parts)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('BAM files concatenated (unsorted).\n')

    # Sort the file by read name (so that pair_generator sees mates adjacent)
    output_filename_sorted = get_mapped_to_initial_filename(pname, samplename,
                                                            fragment,
                                                            type='bam')
    # NOTE: we exclude the extension and the option -f because of a bug in samtools
    if VERBOSE >= 1:
        print 'Sort mapped reads: sample '+samplename
    pysam.sort('-n', output_filename, output_filename_sorted[:-4])
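    # (This samtools sort dialect takes an output *prefix* and appends '.bam'
    # itself, hence the stripped extension; '-f', which accepts a full
    # filename, was unreliable in the samtools version used here.)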
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Joint BAM file sorted.\n')

    # Reheader the file without BAM -> SAM -> BAM
    if VERBOSE >= 1:
        print 'Reheader mapped reads: sample '+samplename
    header_filename = get_mapped_to_initial_filename(pname, samplename,
                                                     fragment,
                                                     type='sam', part=1)
    pysam.reheader(header_filename, output_filename_sorted)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Joint BAM file reheaded.\n')

    if VERBOSE >= 1:
        print 'Remove temporary files: sample '+samplename
    remove_mapped_init_tempfiles(pname, samplename, fragment, VERBOSE=VERBOSE)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')
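
# convert_sam_to_bam is called throughout with the *BAM* filename; a sketch,
# assuming the SAM file sits next to it with the same stem (illustrative
# only, the library helper may differ):
def convert_sam_to_bam_sketch(bamfilename, samfilename=None):
    '''Convert a SAM file into its compressed BAM counterpart'''
    import pysam
    if samfilename is None:
        samfilename = bamfilename[:-4] + '.sam'
    with pysam.Samfile(samfilename, 'r') as samfile:
        with pysam.Samfile(bamfilename, 'wb', template=samfile) as bamfile:
            for read in samfile:
                bamfile.write(read)
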
def map_stampy_singlethread(sample, fragment, VERBOSE=0, n_pairs=-1,
                            summary=True, only_chunk=None, filtered=True):
    '''Map using stampy, single thread (no cluster queueing race conditions)'''
    pname = sample.patient
    samplename = sample.name
    samplename_pat = sample['patient sample']
    seq_run = sample['seq run']
    data_folder = sample.sequencing_run['folder']
    adaID = sample['adapter']
    PCR = int(sample.PCR)

    if VERBOSE:
        print 'Map via stampy (single thread): '+samplename+' '+fragment

    if summary:
        summary_filename = get_map_initial_summary_filename(pname, samplename_pat, 
                                                            samplename, fragment,
                                                            PCR=PCR)

    # Specific fragment (e.g. F5 --> F5bi)
    frag_spec = filter(lambda x: fragment in x, sample.regions_complete)
    if not len(frag_spec):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (specific fragment for '+fragment+' not found).\n')

        raise ValueError(samplename+': fragment '+fragment+' not found.')
    else:
        frag_spec = frag_spec[0]

    input_filename = get_input_filename(data_folder, adaID, frag_spec, type='bam',
                                        only_chunk=only_chunk, filtered=filtered)

    # NOTE: we introduced fragment nomenclature late, e.g. F3a. Check for that
    if not os.path.isfile(input_filename):
        if fragment == 'F3':
            input_filename = input_filename.replace('F3a', 'F3')

    # Check existence of the input file, because stampy creates output anyway
    if not os.path.isfile(input_filename):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (input file for mapping not found).\n')

        raise ValueError(samplename+', fragment '+fragment+': input file not found.')

    # Extract subsample of reads if requested
    if n_pairs > 0:
        from hivwholeseq.utils.mapping import extract_mapped_reads_subsample
        input_filename_sub = get_mapped_to_initial_filename(pname, samplename_pat,
                                                            samplename, fragment,
                                                            PCR=PCR,
                                                            type='bam')[:-4]+\
                '_unmapped.bam'
        n_written = extract_mapped_reads_subsample(input_filename,
                                                   input_filename_sub,
                                                   n_pairs, VERBOSE=VERBOSE)

    # Get output filename
    output_filename = get_mapped_to_initial_filename(pname, samplename_pat, 
                                                     samplename, fragment,
                                                     PCR=PCR,
                                                     type='sam', only_chunk=only_chunk)

    # Map
    call_list = [stampy_bin,
                 '-g', get_initial_index_filename(pname, fragment, ext=False),
                 '-h', get_initial_hash_filename(pname, fragment, ext=False),
                 '-o', output_filename,
                 '--overwrite',
                 '--substitutionrate='+subsrate,
                 '--gapopen', stampy_gapopen,
                 '--gapextend', stampy_gapextend]
    if stampy_sensitive:
        call_list.append('--sensitive')

    if n_pairs > 0:
        call_list = call_list + ['-M', input_filename_sub]
    else:
        call_list = call_list + ['-M', input_filename]
    call_list = map(str, call_list)
    if VERBOSE >=2:
        print ' '.join(call_list)
    sp.call(call_list)

    output_filename_bam = get_mapped_to_initial_filename(pname, samplename_pat,
                                                         samplename, fragment,
                                                         type='bam',
                                                         PCR=PCR,
                                                         only_chunk=only_chunk)
    convert_sam_to_bam(output_filename_bam)

    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Stampy mapped (single thread).\n')

    if only_chunk is None:
        if VERBOSE >= 1:
            print 'Remove temporary files: sample '+samplename
        remove_mapped_init_tempfiles(pname, samplename_pat,
                                     samplename, fragment,
                                     PCR=PCR,
                                     VERBOSE=VERBOSE, only_chunk=only_chunk)

    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')

    if n_pairs > 0:
        os.remove(input_filename_sub)
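
# extract_mapped_reads_subsample (imported above) is assumed to copy a random
# subset of read pairs into a new BAM file; a rough sketch of the idea, with
# pair_generator assumed to live in the same utils module:
def extract_mapped_reads_subsample_sketch(input_filename, output_filename,
                                          n_pairs, VERBOSE=0):
    '''Write a random subsample of read pairs to a new BAM file'''
    import numpy as np
    import pysam
    from hivwholeseq.utils.mapping import pair_generator

    # First pass: count the pairs, then draw which ones to keep
    with pysam.Samfile(input_filename, 'rb') as bamfile:
        n_pairs_tot = sum(1 for read in bamfile) // 2
    n_pairs = min(n_pairs, n_pairs_tot)
    inds = set(np.random.choice(n_pairs_tot, size=n_pairs, replace=False))

    # Second pass: copy the selected pairs
    n_written = 0
    with pysam.Samfile(input_filename, 'rb') as bamfile:
        with pysam.Samfile(output_filename, 'wb', template=bamfile) as outfile:
            for i, reads in enumerate(pair_generator(bamfile)):
                if i in inds:
                    outfile.write(reads[0])
                    outfile.write(reads[1])
                    n_written += 1
    return n_written
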
Example #18
def premap_stampy(data_folder,
                  adaID,
                  VERBOSE=0,
                  threads=1,
                  summary=True,
                  maxreads=-1,
                  subsrate=0.05,
                  gapopen=40,
                  gapextend=3):
    '''Call stampy for actual mapping'''
    if VERBOSE:
        print 'Premapping: adaID ', adaID

    if summary:
        summary_filename = get_premap_summary_filename(data_folder, adaID)

    # Stampy can handle both gzipped and uncompressed fastq inputs
    input_filenames = get_read_filenames(data_folder, adaID, gzip=True)
    if not os.path.isfile(input_filenames[0]):
        input_filenames = get_read_filenames(data_folder, adaID, gzip=False)
    if not all(map(os.path.isfile, input_filenames)):
        raise OSError('Input files for mapping not found: ' +
                      ', '.join(input_filenames))

    # Parallelize if requested
    if threads == 1:
        call_list = [
            stampy_bin,
            '--overwrite',
            '-g',
            get_reference_premap_index_filename(data_folder, adaID, ext=False),
            '-h',
            get_reference_premap_hash_filename(data_folder, adaID, ext=False),
            '-o',
            get_premapped_filename(data_folder, adaID, type='sam'),
            '--insertsize=450',
            '--insertsd=100',
            '--substitutionrate=' + str(subsrate),
            '--gapopen=' + str(gapopen),
            '--gapextend=' + str(gapextend),
        ]
        if maxreads > 0:
            call_list.append('--numrecords=' + str(maxreads))
        call_list.extend(['-M'] + input_filenames)
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)
        sp.call(call_list)

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID),
                      'a') as f:
                f.write('\nStampy premapped (single thread).\n')

        # Convert to compressed BAM
        convert_sam_to_bam(
            get_premapped_filename(data_folder, adaID, type='bam'))

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('\nSAM file converted to compressed BAM: '+\
                        get_premapped_filename(data_folder, adaID, type='bam')+'\n')

    else:

        # Multithreading works as follows: call qsub + stampy, monitor the process
        # IDs with qstat at regular intervals, and finally merge results with pysam
        output_file_parts = [
            get_premapped_filename(data_folder,
                                   adaID,
                                   type='bam',
                                   part=(j + 1)) for j in xrange(threads)
        ]

        # Submit map script
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')

        # Submit map call
        import hivwholeseq
        JOBDIR = hivwholeseq.__path__[0].rstrip('/') + '/'
        JOBLOGOUT = JOBDIR + 'logout'
        JOBLOGERR = JOBDIR + 'logerr'
        cluster_time = ['23:59:59', '1:59:59']
        vmem = '8G'
        for j in xrange(threads):
            call_list = [
                'qsub', '-cwd', '-b', 'y', '-S', '/bin/bash', '-o', JOBLOGOUT,
                '-e', JOBLOGERR, '-N', adaID + ' p' + str(j + 1), '-l',
                'h_rt=' + cluster_time[threads >= 30], '-l', 'h_vmem=' + vmem,
                stampy_bin, '--overwrite', '-g',
                get_reference_premap_index_filename(
                    data_folder, adaID, ext=False), '-h',
                get_reference_premap_hash_filename(
                    data_folder, adaID, ext=False), '-o',
                get_premapped_filename(
                    data_folder, adaID, type='sam', part=(j + 1)),
                '--processpart=' + str(j + 1) + '/' + str(threads),
                '--insertsize=450', '--insertsd=100', '--substitutionrate=' +
                str(subsrate), '--gapopen=' + str(gapopen),
                '--gapextend=' + str(gapextend), '-M'
            ] + input_filenames
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)
            job_ID = sp.check_output(call_list)
            job_ID = job_ID.split()[2]
            job_IDs[j] = job_ID

        # Monitor output
        time_wait = 10  # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split(
                '\n')[:-1]  # The last is an empty line
            if VERBOSE >= 3:
                print qstat_output
            if len(qstat_output) < 3:
                jobs_done[:] = True
                break
            else:
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10  # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert premapped reads to BAM for merging: adaID '+\
                               adaID+', part '+str(j+1)+ ' of '+ \
                               str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # No need to sleep again before the next poll: the
                    # conversion itself took longer than the polling interval
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy premapped (' + str(threads) + ' threads).\n')

        # Concatenate output files
        if VERBOSE >= 1:
            print 'Concatenate premapped reads: adaID ' + adaID + '...',
        output_filename = get_premapped_filename(data_folder,
                                                 adaID,
                                                 type='bam',
                                                 unsorted=True)
        pysam.cat('-o', output_filename, *output_file_parts)
        if VERBOSE >= 1:
            print 'done.'
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read name (so that pair_generator sees mates adjacent)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort premapped reads: adaID ' + adaID
        output_filename_sorted = get_premapped_filename(data_folder,
                                                        adaID,
                                                        type='bam',
                                                        unsorted=False)
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file without BAM -> SAM -> BAM
        if VERBOSE >= 1:
            print 'Reheader premapped reads: adaID ' + adaID
        header_filename = get_premapped_filename(data_folder,
                                                 adaID,
                                                 type='sam',
                                                 part=1)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

    if VERBOSE >= 1:
        print 'Remove temporary files: adaID ' + adaID
    remove_premapped_tempfiles(data_folder, adaID, VERBOSE=VERBOSE)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp premapping files removed.\n')
            f.write('\n')
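
# Hypothetical invocation (the run folder and adapter ID are illustrative):
#
#   data_folder = '/path/to/demultiplexed/run/folder/'
#   premap_stampy(data_folder, 'TS2', VERBOSE=1, threads=1, maxreads=10000)
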
Example #19
def filter_reads(data_folder,
                 adaID,
                 fragment,
                 VERBOSE=0,
                 maxreads=-1,
                 contaminants=None,
                 n_cycles=600,
                 max_mismatches=30,
                 susp_mismatches=20,
                 summary=True, plot=False):
    '''Filter the reads to good chunks'''
    frag_gen = fragment[:2]

    reffilename = get_consensus_filename(data_folder, adaID, frag_gen)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    bamfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                      filtered=False)
    if not os.path.isfile(bamfilename):
        samfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='sam',
                                          filtered=False)
        if os.path.isfile(samfilename):
            convert_sam_to_bam(bamfilename)
        else:
            if VERBOSE >= 1:
                print 'ERROR: '+adaID+', mapped file not found.'
            return

    outfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                     filtered=True)
    suspiciousfilename = get_mapped_suspicious_filename(data_folder, adaID, frag_gen)
    trashfilename = outfilename[:-4]+'_trashed.bam'
 
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(suspiciousfilename, 'wb', template=bamfile) as suspfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:
 
            # Iterate over all pairs
            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_suspect = 0
            n_mismapped_edge = 0
            n_badcigar = 0
            histogram_distance_from_consensus = np.zeros(n_cycles + 1, int)
            binsize = 200
            histogram_dist_along = np.zeros((len(ref) // binsize + 1,
                                             n_cycles + 1), int)
            for irp, reads in enumerate(pair_generator(bamfile)):

                # Limit to the first reads
                if irp == maxreads:
                    break
            
                # Assign names
                (read1, read2) = reads
                i_fwd = reads[0].is_reverse

                # Check a few things to make sure we are looking at paired reads
                if read1.qname != read2.qname:
                    n_wrongname += 1
                    raise ValueError('Read pair '+str(irp)+': reads have different names!')

                # Ignore unmapped reads
                if read1.is_unmapped or read2.is_unmapped:
                    if VERBOSE >= 2:
                        print 'Read pair '+read1.qname+': unmapped'
                    n_unmapped += 1
                    map(trashfile.write, reads)
                    continue
            
                # Ignore not properly paired reads (this includes mates sitting on
                # different fragments)
                if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                    if VERBOSE >= 2:
                        print 'Read pair '+read1.qname+': not properly paired'
                    n_unpaired += 1
                    map(trashfile.write, reads)
                    continue

                # Mismappings are sometimes at fragment edges:
                # Check for overhangs beyond the edge
                skip = check_overhanging_reads(reads, len(ref))
                if skip:
                    n_mismapped_edge += 1
                    map(trashfile.write, reads)
                    continue
                    
                # Mismappings are often characterized by many mutations:
                # check the number of mismatches of the whole pair and skip reads with too many
                dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
                histogram_distance_from_consensus[dc.sum()] += 1
                hbin = (reads[i_fwd].pos + reads[i_fwd].isize / 2) // binsize
                histogram_dist_along[hbin, dc.sum()] += 1
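                # (hbin is the bin of the insert midpoint along the reference,
                # so the mismatch load can be inspected position by position)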
                if (dc.sum() > max_mismatches):
                    if VERBOSE >= 2:
                        print n_mutator+1, irp, '{:2.1f}'.format(100.0 * (n_mutator + 1) / (irp + 1))+'%',\
                                'Read pair '+read1.qname+': too many mismatches '+\
                                '('+str(dc[0])+' + '+str(dc[1])+')'
                    n_mutator += 1
                    map(trashfile.write, reads)
                    continue

                # Check for contamination from other PCR plates. Typically,
                # contamination happens for only one fragment, whereas superinfection
                # happens for all. At this stage, we can only give clues about
                # cross-contamination, the rest will be done in a script downstream
                # (here we could TAG suspicious reads for contamination)
                elif (dc.sum() > susp_mismatches):
                    if contaminants is not None:
                        skip = check_suspect(reads, contaminants, VERBOSE=VERBOSE)
                    else:
                        skip = True
                    if skip:
                        n_suspect += 1
                        map(suspfile.write, reads)
                        continue

                # Trim the bad CIGARs from the sides, if there are any good ones
                skip = trim_bad_cigar(reads, match_len_min=match_len_min,
                                       trim_left=trim_bad_cigars,
                                       trim_right=trim_bad_cigars)
                if skip:
                    n_badcigar += 1
                    map(trashfile.write, reads)
                    continue

                # TODO: we might want to incorporate some more stringent
                # criterion here, to avoid short reads, cross-overhang, etc.

                # Write the output
                n_good += 1
                map(outfile.write, reads)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Mismapped at edge:', n_mismapped_edge
        print 'Many-mutations:', n_mutator
        print 'Suspect contaminations:', n_suspect
        print 'Bad CIGARs:', n_badcigar

    if summary:
        summary_filename = get_filter_mapped_summary_filename(data_folder, adaID, fragment)
        with open(summary_filename, 'a') as f:
            f.write('Filter results: adaID '+adaID+' '+fragment+'\n')
            f.write('Total:\t\t\t'+str(irp + 1)+'\n')
            f.write('Good:\t\t\t'+str(n_good)+'\n')
            f.write('Unmapped:\t\t'+str(n_unmapped)+'\n')
            f.write('Unpaired:\t\t'+str(n_unpaired)+'\n')
            f.write('Mismapped at edge:\t'+str(n_mismapped_edge)+'\n')
            f.write('Many-mutations:\t\t'+str(n_mutator)+'\n')
            f.write('Suspect contaminations:\t'+str(n_suspect)+'\n')
            f.write('Bad CIGARs:\t\t'+str(n_badcigar)+'\n')


    if plot:
        plot_distance_histogram(data_folder, adaID, frag_gen,
                                histogram_distance_from_consensus,
                                savefig=True)

        plot_distance_histogram_sliding_window(data_folder, adaID, frag_gen,
                                               len(ref),
                                               histogram_dist_along,
                                               binsize=binsize,
                                               savefig=True)
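
# get_distance_from_consensus (used above) is assumed to count, per read, the
# substitutions with respect to the consensus reference; a sketch that walks
# the CIGAR (illustrative only, the library version may treat indels too):
import numpy as np

def get_distance_from_consensus_sketch(ref, reads, VERBOSE=0):
    '''Return an array with the number of mismatches of each read from ref'''
    ds = []
    for read in reads:
        d = 0
        pos_ref = read.pos
        pos_read = 0
        for (bt, bl) in read.cigar:
            if bt == 0:      # match/mismatch block: compare base by base
                seq = np.fromstring(read.seq[pos_read: pos_read + bl], 'S1')
                d += (seq != ref[pos_ref: pos_ref + bl]).sum()
                pos_ref += bl
                pos_read += bl
            elif bt == 1:    # insertion: consumes the read only
                pos_read += bl
            elif bt == 2:    # deletion: consumes the reference only
                pos_ref += bl
        ds.append(d)
    return np.array(ds, int)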