Пример #1
0
def filter_reads(data_folder,
                 adaID,
                 fragment,
                 VERBOSE=0,
                 maxreads=-1,
                 contaminants=None,
                 n_cycles=600,
                 max_mismatches=30,
                 susp_mismatches=20,
                 summary=True,
                 plot=False):
    '''Filter mapped read pairs into good, suspicious and trashed BAM files.

    The unfiltered mapped BAM file for the sample is read pair by pair.
    Every pair ends up in exactly one of three output files: the filtered
    BAM file (good pairs), the "suspicious" BAM file (possible
    contaminations) or a "_trashed.bam" file next to the filtered one.

    Parameters:
       data_folder: folder of the sequencing run, forwarded to the filename
           helpers.
       adaID: adapter ID of the sample (a string: it is concatenated into
           messages).
       fragment: fragment name; only its first two characters are used to
           locate the consensus and the mapped files, the full name is used
           for the summary file.
       VERBOSE: verbosity level (>= 1 prints the final counts, >= 2 prints
           one line per discarded pair).
       maxreads: stop after this many read pairs; the default -1 never
           matches a pair index, i.e. no limit.
       contaminants: forwarded to check_suspect; if None, every pair with
           more than susp_mismatches mismatches is treated as suspicious.
       n_cycles: the mismatch histograms have n_cycles + 1 bins.
       max_mismatches: pairs with more mismatches than this are trashed.
       susp_mismatches: pairs with more mismatches than this (but at most
           max_mismatches) are checked for contamination.
       summary: if True, append the counts to the filter summary file.
       plot: if True, plot the mismatch histograms.

    Returns:
       None. If neither the BAM nor the SAM input file exists the function
       returns early without writing anything.

    Raises:
       ValueError: if the two reads of a pair have different names.
    '''
    frag_gen = fragment[:2]

    reffilename = get_consensus_filename(data_folder, adaID, frag_gen)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    bamfilename = get_mapped_filename(data_folder,
                                      adaID,
                                      frag_gen,
                                      type='bam',
                                      filtered=False)
    if not os.path.isfile(bamfilename):
        samfilename = get_mapped_filename(data_folder,
                                          adaID,
                                          frag_gen,
                                          type='sam',
                                          filtered=False)
        if os.path.isfile(samfilename):
            convert_sam_to_bam(bamfilename)
        else:
            if VERBOSE >= 1:
                print('ERROR: ' + adaID + ', mapped file not found.')
            return

    outfilename = get_mapped_filename(data_folder,
                                      adaID,
                                      frag_gen,
                                      type='bam',
                                      filtered=True)
    suspiciousfilename = get_mapped_suspicious_filename(
        data_folder, adaID, frag_gen)
    trashfilename = outfilename[:-4] + '_trashed.bam'

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(suspiciousfilename, 'wb', template=bamfile) as suspfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            # Counters, one per possible fate of a read pair
            n_total = 0
            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_suspect = 0
            n_mismapped_edge = 0
            n_badcigar = 0
            histogram_distance_from_consensus = np.zeros(n_cycles + 1, int)
            binsize = 200
            histogram_dist_along = np.zeros(
                (len(ref) // binsize + 1, n_cycles + 1), int)

            # Iterate over all pairs
            for irp, reads in enumerate(pair_generator(bamfile)):

                # Limit to the first reads
                if irp == maxreads:
                    break

                # Count the pairs that are actually examined: the loop index
                # is unbound for an empty file and one too large after the
                # break above, so it cannot be used for the summary.
                n_total += 1

                # Assign names
                (read1, read2) = reads
                # is_reverse of the first read doubles as the index (0 or 1)
                # of the forward read in the pair
                i_fwd = reads[0].is_reverse

                # Check a few things to make sure we are looking at paired reads
                if read1.qname != read2.qname:
                    n_wrongname += 1
                    raise ValueError('Read pair ' + str(irp) +
                                     ': reads have different names!')

                # Ignore unmapped reads
                if read1.is_unmapped or read2.is_unmapped:
                    if VERBOSE >= 2:
                        print('Read pair ' + read1.qname + ': unmapped')
                    n_unmapped += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Ignore not properly paired reads (this includes mates sitting on
                # different fragments)
                if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                    if VERBOSE >= 2:
                        print('Read pair ' + read1.qname + ': not properly paired')
                    n_unpaired += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Mismappings are sometimes at fragment edges:
                # Check for overhangs beyond the edge
                skip = check_overhanging_reads(reads, len(ref))
                if skip:
                    n_mismapped_edge += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Mismappings are often characterized by many mutations:
                # check the number of mismatches of the whole pair and skip reads with too many
                dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
                n_mismatches = dc.sum()
                # NOTE(review): a pair with more than n_cycles mismatches
                # would index past the end of the histograms -- confirm that
                # n_cycles is always a safe upper bound.
                histogram_distance_from_consensus[n_mismatches] += 1
                # Bin by the center of the insert; floor division keeps the
                # bin index an integer
                hbin = (reads[i_fwd].pos + reads[i_fwd].isize // 2) // binsize
                histogram_dist_along[hbin, n_mismatches] += 1
                if n_mismatches > max_mismatches:
                    if VERBOSE >= 2:
                        percent = '{:2.1f}'.format(
                            100.0 * (n_mutator + 1) / (irp + 1)) + '%'
                        print(' '.join([
                            str(n_mutator + 1),
                            str(irp),
                            percent,
                            'Read pair ' + read1.qname +
                            ': too many mismatches ' +
                            '(' + str(dc[0]) + ' + ' + str(dc[1]) + ')']))
                    n_mutator += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Check for contamination from other PCR plates. Typically,
                # contamination happens for only one fragment, whereas superinfection
                # happens for all. At this stage, we can only give clues about
                # cross-contamination, the rest will be done in a script downstream
                # (here we could TAG suspicious reads for contamination)
                elif n_mismatches > susp_mismatches:
                    if contaminants is not None:
                        skip = check_suspect(reads,
                                             contaminants,
                                             VERBOSE=VERBOSE)
                    else:
                        skip = True
                    if skip:
                        n_suspect += 1
                        for read in reads:
                            suspfile.write(read)
                        continue

                # Trim the bad CIGARs from the sides, if there are any good ones
                # NOTE(review): match_len_min and trim_bad_cigars are not
                # defined in this function; presumably module-level settings.
                skip = trim_bad_cigar(reads,
                                      match_len_min=match_len_min,
                                      trim_left=trim_bad_cigars,
                                      trim_right=trim_bad_cigars)
                if skip:
                    n_badcigar += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # TODO: we might want to incorporate some more stringent
                # criterion here, to avoid short reads, cross-overhang, etc.

                # Write the output
                n_good += 1
                for read in reads:
                    outfile.write(read)

    if VERBOSE >= 1:
        print('Read pairs: ')
        print('Good: ' + str(n_good))
        print('Unmapped: ' + str(n_unmapped))
        print('Unpaired: ' + str(n_unpaired))
        print('Mismapped at edge: ' + str(n_mismapped_edge))
        print('Many-mutations: ' + str(n_mutator))
        print('Suspect contaminations: ' + str(n_suspect))
        print('Bad CIGARs: ' + str(n_badcigar))

    if summary:
        summary_filename = get_filter_mapped_summary_filename(
            data_folder, adaID, fragment)
        with open(summary_filename, 'a') as f:
            f.write('Filter results: adaID ' + adaID + fragment + '\n')
            f.write('Total:\t\t\t' + str(n_total) + '\n')
            f.write('Good:\t\t\t' + str(n_good) + '\n')
            f.write('Unmapped:\t\t' + str(n_unmapped) + '\n')
            f.write('Unpaired:\t\t' + str(n_unpaired) + '\n')
            f.write('Mismapped at edge:\t' + str(n_mismapped_edge) + '\n')
            f.write('Many-mutations:\t\t' + str(n_mutator) + '\n')
            f.write('Suspect contaminations:\t' + str(n_suspect) + '\n')
            f.write('Bad CIGARs:\t\t' + str(n_badcigar) + '\n')

    if plot:
        plot_distance_histogram(data_folder,
                                adaID,
                                frag_gen,
                                histogram_distance_from_consensus,
                                savefig=True)

        plot_distance_histogram_sliding_window(data_folder,
                                               adaID,
                                               frag_gen,
                                               len(ref),
                                               histogram_dist_along,
                                               binsize=binsize,
                                               savefig=True)
Пример #2
0
    alis = {fr: AlignIO.read(get_consensi_alignment_filename('all', fr), 'fasta')
            for fr in fragments}

    for samplename, sample in samples.iterrows():
        sample = SampleSeq(sample)
        data_folder = sample.seqrun_folder
        adaID = sample.adapter
        pname = sample.patientname

        for fragment in fragments:
            if VERBOSE >= 1:
                print sample['seq run'], adaID, fragment, samplename,

            # Read the summary filename of the filter_mapped, and find out whether
            # there are many distant reads (a few are normal)
            fn = get_filter_mapped_summary_filename(data_folder, adaID, fragment)
            if os.path.isfile(fn):
                found = False
                with open(fn, 'r') as f:
                    for line in f:
                        line = line.rstrip('\n')
                        if line[:4] == 'Good':
                            n_good = int(line.split()[-1])

                        elif line[:14] == 'Many-mutations':
                            n_distant = int(line.split()[-1])
                            found = True
                            break

                if not found:
                    if VERBOSE >= 1:
def filter_reads(data_folder,
                 adaID,
                 fragment,
                 VERBOSE=0,
                 maxreads=-1,
                 contaminants=None,
                 n_cycles=600,
                 max_mismatches=30,
                 susp_mismatches=20,
                 summary=True, plot=False):
    '''Filter the reads to good chunks.

    Read pairs are taken from the unfiltered mapped BAM file and each pair
    is written to exactly one of: the filtered BAM file (good), the
    suspicious BAM file (possible contamination) or a "_trashed.bam" file.

    Parameters:
       data_folder: sequencing run folder, passed on to the filename helpers.
       adaID: adapter ID (string, it is concatenated into messages).
       fragment: fragment name; its first two characters select the
           consensus and mapped files, the full name selects the summary file.
       VERBOSE: verbosity level (>= 1: final counts, >= 2: one line per
           discarded pair).
       maxreads: number of pairs after which to stop; -1 (default) never
           equals a pair index, so all pairs are processed.
       contaminants: passed on to check_suspect; when None, all pairs above
           susp_mismatches are written to the suspicious file.
       n_cycles: histograms get n_cycles + 1 mismatch bins.
       max_mismatches: more mismatches than this trashes the pair.
       susp_mismatches: more mismatches than this triggers the
           contamination check.
       summary: append the counts to the filter summary file if True.
       plot: plot the mismatch histograms if True.

    Returns:
       None (also returned early when no mapped BAM/SAM file is found).

    Raises:
       ValueError: when the reads of a pair carry different names.
    '''
    frag_gen = fragment[:2]

    reffilename = get_consensus_filename(data_folder, adaID, frag_gen)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    bamfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                      filtered=False)
    if not os.path.isfile(bamfilename):
        samfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='sam',
                                          filtered=False)
        if os.path.isfile(samfilename):
            convert_sam_to_bam(bamfilename)
        else:
            if VERBOSE >= 1:
                print('ERROR: '+adaID+', mapped file not found.')
            return

    outfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                     filtered=True)
    suspiciousfilename = get_mapped_suspicious_filename(data_folder, adaID, frag_gen)
    trashfilename = outfilename[:-4]+'_trashed.bam'

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(suspiciousfilename, 'wb', template=bamfile) as suspfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            # Iterate over all pairs
            n_total = 0
            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_suspect = 0
            n_mismapped_edge = 0
            n_badcigar = 0
            histogram_distance_from_consensus = np.zeros(n_cycles + 1, int)
            binsize = 200
            histogram_dist_along = np.zeros((len(ref) // binsize + 1,
                                             n_cycles + 1), int)
            for irp, reads in enumerate(pair_generator(bamfile)):

                # Limit to the first reads
                if irp == maxreads:
                    break

                # Explicit tally of examined pairs: irp is undefined when the
                # file has no pairs and overshoots by one after the break.
                n_total += 1

                # Assign names
                (read1, read2) = reads
                # The boolean is used as the index of the forward read
                i_fwd = reads[0].is_reverse

                # Check a few things to make sure we are looking at paired reads
                if read1.qname != read2.qname:
                    n_wrongname += 1
                    raise ValueError('Read pair '+str(irp)+': reads have different names!')

                # Ignore unmapped reads
                if read1.is_unmapped or read2.is_unmapped:
                    if VERBOSE >= 2:
                        print('Read pair '+read1.qname+': unmapped')
                    n_unmapped += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Ignore not properly paired reads (this includes mates sitting on
                # different fragments)
                if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                    if VERBOSE >= 2:
                        print('Read pair '+read1.qname+': not properly paired')
                    n_unpaired += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Mismappings are sometimes at fragment edges:
                # Check for overhangs beyond the edge
                skip = check_overhanging_reads(reads, len(ref))
                if skip:
                    n_mismapped_edge += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Mismappings are often characterized by many mutations:
                # check the number of mismatches of the whole pair and skip reads with too many
                dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
                dist = dc.sum()
                # NOTE(review): dist > n_cycles would fall outside the
                # histograms -- confirm n_cycles bounds the pair distance.
                histogram_distance_from_consensus[dist] += 1
                # Floor division keeps the bin index integral
                hbin = (reads[i_fwd].pos + reads[i_fwd].isize // 2) // binsize
                histogram_dist_along[hbin, dist] += 1
                if dist > max_mismatches:
                    if VERBOSE >= 2:
                        fields = [str(n_mutator + 1),
                                  str(irp),
                                  '{:2.1f}'.format(100.0 * (n_mutator + 1) / (irp + 1))+'%',
                                  'Read pair '+read1.qname+': too many mismatches '+
                                  '('+str(dc[0])+' + '+str(dc[1])+')']
                        print(' '.join(fields))
                    n_mutator += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Check for contamination from other PCR plates. Typically,
                # contamination happens for only one fragment, whereas superinfection
                # happens for all. At this stage, we can only give clues about
                # cross-contamination, the rest will be done in a script downstream
                # (here we could TAG suspicious reads for contamination)
                elif dist > susp_mismatches:
                    if contaminants is not None:
                        skip = check_suspect(reads, contaminants, VERBOSE=VERBOSE)
                    else:
                        skip = True
                    if skip:
                        n_suspect += 1
                        for read in reads:
                            suspfile.write(read)
                        continue

                # Trim the bad CIGARs from the sides, if there are any good ones
                # NOTE(review): match_len_min and trim_bad_cigars are not
                # defined in this function; presumably module-level settings.
                skip = trim_bad_cigar(reads, match_len_min=match_len_min,
                                       trim_left=trim_bad_cigars,
                                       trim_right=trim_bad_cigars)
                if skip:
                    n_badcigar += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # TODO: we might want to incorporate some more stringent
                # criterion here, to avoid short reads, cross-overhang, etc.

                # Write the output
                n_good += 1
                for read in reads:
                    outfile.write(read)

    if VERBOSE >= 1:
        print('Read pairs: ')
        print('Good: '+str(n_good))
        print('Unmapped: '+str(n_unmapped))
        print('Unpaired: '+str(n_unpaired))
        print('Mismapped at edge: '+str(n_mismapped_edge))
        print('Many-mutations: '+str(n_mutator))
        print('Suspect contaminations: '+str(n_suspect))
        print('Bad CIGARs: '+str(n_badcigar))

    if summary:
        summary_filename = get_filter_mapped_summary_filename(data_folder, adaID, fragment)
        with open(summary_filename, 'a') as f:
            f.write('Filter results: adaID '+adaID+fragment+'\n')
            f.write('Total:\t\t\t'+str(n_total)+'\n')
            f.write('Good:\t\t\t'+str(n_good)+'\n')
            f.write('Unmapped:\t\t'+str(n_unmapped)+'\n')
            f.write('Unpaired:\t\t'+str(n_unpaired)+'\n')
            f.write('Mismapped at edge:\t'+str(n_mismapped_edge)+'\n')
            f.write('Many-mutations:\t\t'+str(n_mutator)+'\n')
            f.write('Suspect contaminations:\t'+str(n_suspect)+'\n')
            f.write('Bad CIGARs:\t\t'+str(n_badcigar)+'\n')


    if plot:
        plot_distance_histogram(data_folder, adaID, frag_gen,
                                histogram_distance_from_consensus,
                                savefig=True)

        plot_distance_histogram_sliding_window(data_folder, adaID, frag_gen,
                                               len(ref),
                                               histogram_dist_along,
                                               binsize=binsize,
                                               savefig=True)