def get_mapped_filtered_filename(self, samplename, fragment, PCR=1):
    '''Return the filename(s) of mapped+filtered reads for one sample.

    Thin wrapper around the module-level helper of the same name in
    hivwholeseq.patients.filenames, prefilling the patient name from
    this object.
    '''
    # Import lazily and alias to avoid shadowing confusion with this method
    from hivwholeseq.patients.filenames import \
        get_mapped_filtered_filename as _filename_fun
    return _filename_fun(self.patient, samplename, fragment, PCR=PCR)
def get_mapped_filtered_filename(self, fragment, PCR=1, **kwargs):
    '''Return the filename(s) of this sample's mapped+filtered reads.

    Delegates to the module-level helper in
    hivwholeseq.patients.filenames, prefilling patient and sample name
    from this object; extra keyword arguments are passed through.
    '''
    # Import lazily and alias to avoid shadowing confusion with this method
    from hivwholeseq.patients.filenames import \
        get_mapped_filtered_filename as _filename_fun
    return _filename_fun(self.patient, self.name, fragment,
                         PCR=PCR, **kwargs)
# Modules import os import argparse import datetime from hivwholeseq.patients.patients import load_samples_sequenced, Patient, \ SamplePat from hivwholeseq.patients.filenames import get_mapped_filtered_filename from hivwholeseq.utils.generic import modification_date from hivwholeseq.store.check_patients import ( pretty_print_info, pretty_print_info_genomewide) # Globals title_len = 15 cell_len = 7 get_decontaminated_filename = lambda *args, **kwargs: get_mapped_filtered_filename(*args, decontaminated=True, **kwargs) # Script if __name__ == '__main__': # Parse input args parser = argparse.ArgumentParser(description='Check patient samples') parser.add_argument('--samples', nargs='+', help='Samples to analyze') parser.add_argument('--verbose', type=int, default=0, help='Verbosity level [0-3]') args = parser.parse_args()
def check_pipeline_patient(p, VERBOSE=0):
    '''Check patient pipeline.

    Prints a status table for one patient: sample list, folder presence,
    per-fragment and genomewide references, annotation, and downstream
    pipeline products. Raises PipelineError on the first hard failure.
    '''
    from hivwholeseq.utils.exceptions import PipelineError

    def print_info_summary(p):
        # Count samples before/after discarding the non-sequenced ones,
        # then print the sample list and check the patient folder exists.
        n_samples = len(p.samples)
        p.discard_nonsequenced_samples()
        n_samples_seq = len(p.samples)
        print p.name, '# samples:', str(n_samples) + ' (' + str(
            n_samples_seq) + ' sequenced)'
        title = 'Samples'
        line = ('{:<' + str(title_len) + '}').format(title + ':')
        line = line + ' '.join(p.samples.index.tolist())
        print line

        fn = p.folder
        title = 'Folder'
        line = ('{:<' + str(title_len) + '}').format(title + ':')
        if os.path.isdir(fn):
            status = 'OK'
        else:
            status = 'MISS'
        line = line + ('{:<' + str(cell_len) + '}').format(status)
        print line
        # A missing patient folder makes everything downstream impossible
        if status != 'OK':
            print ''
            raise PipelineError('Missing patient folder')
        return status

    def print_info_references(p):
        '''Print info on references'''
        # Per-fragment amplicon references F1..F6; record their mtimes in
        # p.mod_dates so later steps can be checked for staleness.
        title = 'References'
        line = ('{:<' + str(title_len) + '}').format(title + ':')
        stati = []
        for fragment in ('F' + str(i + 1) for i in xrange(6)):
            fn = p.get_reference_filename(fragment)
            if os.path.isfile(fn):
                status = 'OK'
                p.mod_dates[('reference', fragment)] = modification_date(fn)
            else:
                status = 'MISS'
            stati.append(status)
            line = line + fragment + ': ' + (
                '{:>' + str(cell_len - len(fragment) - 1) + '}').format(status) + ' '
        print line
        # All six fragments must be present
        if frozenset(stati) != frozenset(['OK']):
            print ''
            raise PipelineError('Amplicon reference failed!')

        # Genomewide FASTA reference
        title = 'Genome ref'
        line = ('{:<' + str(title_len) + '}').format(title + ':')
        fn = p.get_reference_filename('genomewide', 'fasta')
        if os.path.isfile(fn):
            status = 'OK'
            p.mod_dates[('reference', 'genomewide')] = modification_date(fn)
        else:
            status = 'MISS'
        line = line + ('{:<' + str(cell_len) + '}').format(status)
        print line
        if status != 'OK':
            print ''
            raise PipelineError('Genomewide reference failed!')

        # NOTE(review): check_reference_overlap is not defined/imported in
        # the visible source — presumably a sibling helper; verify.
        check_reference_overlap(p)

        # Annotated (GenBank) reference must be at least as new as the
        # genomewide FASTA it annotates, otherwise it is stale ('OLD')
        title = 'Annotated'
        line = ('{:<' + str(title_len) + '}').format(title + ':')
        fn = p.get_reference_filename('genomewide', 'gb')
        if os.path.isfile(fn):
            md = modification_date(fn)
            if md >= p.mod_dates[('reference', 'genomewide')]:
                status = 'OK'
            else:
                status = 'OLD'
        else:
            status = 'MISS'
        line = line + ('{:<' + str(cell_len) + '}').format(status)
        print line
        if status != 'OK':
            print ''
            raise PipelineError('Annotated reference failed!')

    # Cache of modification dates, filled in by the checks above and read
    # by the downstream dependency checks
    p.mod_dates = {}
    print_info_summary(p)
    print_info_references(p)

    from hivwholeseq.patients.filenames import get_mapped_filtered_filename
    # NOTE(review): print_info / print_info_genomewide / print_info_patient
    # are not defined in the visible source (the module imports
    # pretty_print_info / pretty_print_info_genomewide) — confirm these
    # names resolve at runtime.
    # Each call checks one pipeline product against the step it depends on.
    print_info(
        p, 'Map + filter', 'filter',
        lambda pn, sn, fr: get_mapped_filtered_filename(
            pn, sn, fr, decontaminated=False), 'reference')
    print_info(
        p, 'Decontaminate', 'decontaminate',
        lambda pn, sn, fr: get_mapped_filtered_filename(
            pn, sn, fr, decontaminated=True), 'filter')
    print_info(p, 'Consensus', 'consensus', 'get_consensus_filename',
               'decontaminate')
    print_info_genomewide(p, 'Cons genomewide', 'consensus',
                          'get_consensus_filename')
    print_info(p, 'Allele counts', 'allele counts',
               'get_allele_counts_filename', 'decontaminate')
    print_info(p, 'Allele cocounts', 'allele cocounts',
               'get_allele_cocounts_filename', 'decontaminate')
    print_info_genomewide(p, 'Allele counts genomewide', 'allele counts',
                          'get_allele_counts_filename', require_all=False)
    print_info_patient(p, 'Maps to HXB2', 'reference',
                       'get_map_coordinates_reference_filename', 'reference')
    print ''
'''
# Modules
import os
import argparse
import datetime
from hivwholeseq.patients.patients import load_samples_sequenced, Patient, \
    SamplePat
from hivwholeseq.patients.filenames import get_mapped_filtered_filename
from hivwholeseq.utils.generic import modification_date
from hivwholeseq.store.check_patients import (pretty_print_info,
                                              pretty_print_info_genomewide)


# Globals
# Column widths for the tabular status report printed to stdout
title_len = 15
cell_len = 7
# Partial application: same as get_mapped_filtered_filename but always
# asking for the decontaminated version of the file
get_decontaminated_filename = lambda *args, **kwargs: get_mapped_filtered_filename(
    *args, decontaminated=True, **kwargs)


# Script
if __name__ == '__main__':

    # Parse input args
    parser = argparse.ArgumentParser(description='Check patient samples')
    parser.add_argument('--samples', nargs='+', help='Samples to analyze')
    parser.add_argument('--verbose', type=int, default=0,
                        help='Verbosity level [0-3]')
    args = parser.parse_args()
    VERBOSE = args.verbose
    samplenames = args.samples
def check_pipeline_patient(p, VERBOSE=0):
    '''Check patient pipeline.

    Prints a status table for one patient: sample list, folder presence,
    per-fragment and genomewide references, annotation, and downstream
    pipeline products. Raises PipelineError on the first hard failure.
    '''
    from hivwholeseq.utils.exceptions import PipelineError

    def print_info_summary(p):
        # Count samples before/after discarding the non-sequenced ones,
        # then print the sample list and check the patient folder exists.
        n_samples = len(p.samples)
        p.discard_nonsequenced_samples()
        n_samples_seq = len(p.samples)
        print p.name, '# samples:', str(n_samples)+' ('+str(n_samples_seq)+' sequenced)'
        title = 'Samples'
        line = ('{:<'+str(title_len)+'}').format(title+':')
        line = line+' '.join(p.samples.index.tolist())
        print line

        fn = p.folder
        title = 'Folder'
        line = ('{:<'+str(title_len)+'}').format(title+':')
        if os.path.isdir(fn):
            status = 'OK'
        else:
            status = 'MISS'
        line = line + ('{:<'+str(cell_len)+'}').format(status)
        print line
        # A missing patient folder makes everything downstream impossible
        if status != 'OK':
            print ''
            raise PipelineError('Missing patient folder')
        return status

    def print_info_references(p):
        '''Print info on references'''
        # Per-fragment amplicon references F1..F6; record their mtimes in
        # p.mod_dates so later steps can be checked for staleness.
        title = 'References'
        line = ('{:<'+str(title_len)+'}').format(title+':')
        stati = []
        for fragment in ('F'+str(i+1) for i in xrange(6)):
            fn = p.get_reference_filename(fragment)
            if os.path.isfile(fn):
                status = 'OK'
                p.mod_dates[('reference', fragment)] = modification_date(fn)
            else:
                status = 'MISS'
            stati.append(status)
            line = line + fragment + ': ' + ('{:>'+str(cell_len - len(fragment) - 1)+'}').format(status) + ' '
        print line
        # All six fragments must be present
        if frozenset(stati) != frozenset(['OK']):
            print ''
            raise PipelineError('Amplicon reference failed!')

        # Genomewide FASTA reference
        title = 'Genome ref'
        line = ('{:<'+str(title_len)+'}').format(title+':')
        fn = p.get_reference_filename('genomewide', 'fasta')
        if os.path.isfile(fn):
            status = 'OK'
            p.mod_dates[('reference', 'genomewide')] = modification_date(fn)
        else:
            status = 'MISS'
        line = line + ('{:<'+str(cell_len)+'}').format(status)
        print line
        if status != 'OK':
            print ''
            raise PipelineError('Genomewide reference failed!')

        # NOTE(review): check_reference_overlap is not defined/imported in
        # the visible source — presumably a sibling helper; verify.
        check_reference_overlap(p)

        # Annotated (GenBank) reference must be at least as new as the
        # genomewide FASTA it annotates, otherwise it is stale ('OLD')
        title = 'Annotated'
        line = ('{:<'+str(title_len)+'}').format(title+':')
        fn = p.get_reference_filename('genomewide', 'gb')
        if os.path.isfile(fn):
            md = modification_date(fn)
            if md >= p.mod_dates[('reference', 'genomewide')]:
                status = 'OK'
            else:
                status = 'OLD'
        else:
            status = 'MISS'
        line = line + ('{:<'+str(cell_len)+'}').format(status)
        print line
        if status != 'OK':
            print ''
            raise PipelineError('Annotated reference failed!')

    # Cache of modification dates, filled in by the checks above and read
    # by the downstream dependency checks
    p.mod_dates = {}
    print_info_summary(p)
    print_info_references(p)

    from hivwholeseq.patients.filenames import get_mapped_filtered_filename
    # NOTE(review): print_info / print_info_genomewide / print_info_patient
    # are not defined in the visible source (the module imports
    # pretty_print_info / pretty_print_info_genomewide) — confirm these
    # names resolve at runtime.
    # Each call checks one pipeline product against the step it depends on.
    print_info(p, 'Map + filter', 'filter',
               lambda pn, sn, fr: get_mapped_filtered_filename(pn, sn, fr, decontaminated=False),
               'reference')
    print_info(p, 'Decontaminate', 'decontaminate',
               lambda pn, sn, fr: get_mapped_filtered_filename(pn, sn, fr, decontaminated=True),
               'filter')
    print_info(p, 'Consensus', 'consensus', 'get_consensus_filename',
               'decontaminate')
    print_info_genomewide(p, 'Cons genomewide', 'consensus',
                          'get_consensus_filename')
    print_info(p, 'Allele counts', 'allele counts',
               'get_allele_counts_filename', 'decontaminate')
    print_info(p, 'Allele cocounts', 'allele cocounts',
               'get_allele_cocounts_filename', 'decontaminate')
    print_info_genomewide(p, 'Allele counts genomewide', 'allele counts',
                          'get_allele_counts_filename', require_all=False)
    print_info_patient(p, 'Maps to HXB2', 'reference',
                       'get_map_coordinates_reference_filename', 'reference')
    print ''
def filter_mapped_reads(sample, fragment, PCR=1, maxreads=-1, VERBOSE=0, n_cycles=600, max_mismatches=100, match_len_min=30, trim_bad_cigars=3, summary=True): '''Filter the reads to good chunks''' pname = sample.patient samplename_pat = sample.name samplenames_seq = sample.samples_seq.index.tolist() if VERBOSE >= 1: print 'Filtering reads:', pname, samplename_pat, fragment, PCR reffilename = get_initial_reference_filename(pname, fragment) refseq = SeqIO.read(reffilename, 'fasta') ref = np.array(refseq) outfilename = get_mapped_filtered_filename(pname, samplename_pat, fragment, type='bam', PCR=PCR, decontaminated=False) trashfilename = outfilename[:-4] + '_trashed.bam' infilenames = [ get_mapped_to_initial_filename(pname, samplename_pat, samplename, fragment, type='bam', PCR=PCR) for samplename in samplenames_seq ] infilenames = filter(os.path.isfile, infilenames) if not len(infilenames): print('WARNING: No mapped files found: ' + ', '.join([pname, samplename_pat, fragment, str(PCR)])) return # Take reads evenly distributed across sequencing repetitions maxreads /= len(infilenames) if VERBOSE >= 2: print 'Input mapped filenames:', if len(infilenames) >= 2: print '' print '\n'.join(infilenames) # Use first file as template for the new bamfile infilename = infilenames[0] if not os.path.isfile(infilename): convert_sam_to_bam(infilename) with pysam.Samfile(infilename, 'rb') as bamfile: with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\ pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile: n_good = 0 n_wrongname = 0 n_unmapped = 0 n_unpaired = 0 n_mutator = 0 n_badcigar = 0 n_tiny = 0 binsize = 200 hist_distance_from_consensus = np.zeros(n_cycles + 1, int) hist_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1), int) # Iterate over input files, the first is already open for infilename in infilenames: if infilename != infilename[0]: file_open = lambda: pysam.Samfile(infilename, 'rb') file_close = lambda f: f.close() if not 
os.path.isfile(infilename): convert_sam_to_bam(infilename) else: file_open = lambda: bamfile file_close = lambda f: None try: bamfile = file_open() for irp, reads in enumerate(pair_generator(bamfile)): if irp == maxreads: break pair_type = filter_read_pair( reads, ref, hist_distance_from_consensus, hist_dist_along, binsize, max_mismatches=max_mismatches, match_len_min=match_len_min, trim_bad_cigars=trim_bad_cigars, VERBOSE=VERBOSE) if pair_type == 'unmapped': n_unmapped += 1 map(trashfile.write, reads) elif pair_type == 'unpaired': n_unpaired += 1 map(trashfile.write, reads) elif pair_type == 'mutator': n_mutator += 1 map(trashfile.write, reads) elif pair_type == 'bad_cigar': n_badcigar += 1 map(trashfile.write, reads) elif pair_type == 'tiny': n_tiny += 1 map(trashfile.write, reads) else: n_good += 1 map(outfile.write, reads) finally: file_close(bamfile) if VERBOSE >= 1: print 'Read pairs: ' print 'Good:', n_good print 'Unmapped:', n_unmapped print 'Unpaired:', n_unpaired print 'Many-mutations:', n_mutator print 'Bad CIGARs:', n_badcigar print 'Tiny:', n_tiny print if summary: sfn = get_filter_mapped_init_summary_filename(pname, samplename_pat, fragment, PCR=PCR) with open(sfn, 'a') as f: f.write('Filter results: pname ' + pname + ', ' + samplename_pat + ', ' + fragment + '\n') f.write('Total:\t\t\t' + str(irp + 1) + '\n') f.write('Good:\t\t\t' + str(n_good) + '\n') f.write('Unmapped:\t\t' + str(n_unmapped) + '\n') f.write('Unpaired:\t\t' + str(n_unpaired) + '\n') f.write('Many-mutations:\t\t' + str(n_mutator) + '\n') f.write('Bad CIGARs:\t\t' + str(n_badcigar) + '\n') f.write('Tiny:\t\t\t' + str(n_tiny) + '\n')
def filter_mapped_reads(sample, fragment, PCR=1, maxreads=-1, VERBOSE=0, n_cycles=600, max_mismatches=100, match_len_min=30, trim_bad_cigars=3, summary=True): '''Filter the reads to good chunks''' pname = sample.patient samplename_pat = sample.name samplenames_seq = sample.samples_seq.index.tolist() if VERBOSE >= 1: print 'Filtering reads:', pname, samplename_pat, fragment, PCR reffilename = get_initial_reference_filename(pname, fragment) refseq = SeqIO.read(reffilename, 'fasta') ref = np.array(refseq) outfilename = get_mapped_filtered_filename(pname, samplename_pat, fragment, type='bam', PCR=PCR, decontaminated=False) trashfilename = outfilename[:-4]+'_trashed.bam' infilenames = [get_mapped_to_initial_filename(pname, samplename_pat, samplename, fragment, type='bam', PCR=PCR) for samplename in samplenames_seq] infilenames = filter(os.path.isfile, infilenames) if not len(infilenames): print ('WARNING: No mapped files found: '+', '.join([pname, samplename_pat, fragment, str(PCR)])) return # Take reads evenly distributed across sequencing repetitions maxreads /= len(infilenames) if VERBOSE >= 2: print 'Input mapped filenames:', if len(infilenames) >= 2: print '' print '\n'.join(infilenames) # Use first file as template for the new bamfile infilename = infilenames[0] if not os.path.isfile(infilename): convert_sam_to_bam(infilename) with pysam.Samfile(infilename, 'rb') as bamfile: with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\ pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile: n_good = 0 n_wrongname = 0 n_unmapped = 0 n_unpaired = 0 n_mutator = 0 n_badcigar = 0 n_tiny = 0 binsize = 200 hist_distance_from_consensus = np.zeros(n_cycles + 1, int) hist_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1), int) # Iterate over input files, the first is already open for infilename in infilenames: if infilename != infilename[0]: file_open = lambda: pysam.Samfile(infilename, 'rb') file_close = lambda f: f.close() if not 
os.path.isfile(infilename): convert_sam_to_bam(infilename) else: file_open = lambda: bamfile file_close = lambda f: None try: bamfile = file_open() for irp, reads in enumerate(pair_generator(bamfile)): if irp == maxreads: break pair_type = filter_read_pair(reads, ref, hist_distance_from_consensus, hist_dist_along, binsize, max_mismatches=max_mismatches, match_len_min=match_len_min, trim_bad_cigars=trim_bad_cigars, VERBOSE=VERBOSE) if pair_type == 'unmapped': n_unmapped += 1 map(trashfile.write, reads) elif pair_type == 'unpaired': n_unpaired += 1 map(trashfile.write, reads) elif pair_type == 'mutator': n_mutator += 1 map(trashfile.write, reads) elif pair_type == 'bad_cigar': n_badcigar += 1 map(trashfile.write, reads) elif pair_type == 'tiny': n_tiny += 1 map(trashfile.write, reads) else: n_good += 1 map(outfile.write, reads) finally: file_close(bamfile) if VERBOSE >= 1: print 'Read pairs: ' print 'Good:', n_good print 'Unmapped:', n_unmapped print 'Unpaired:', n_unpaired print 'Many-mutations:', n_mutator print 'Bad CIGARs:', n_badcigar print 'Tiny:', n_tiny print if summary: sfn = get_filter_mapped_init_summary_filename(pname, samplename_pat, fragment, PCR=PCR) with open(sfn, 'a') as f: f.write('Filter results: pname '+pname+', '+samplename_pat+', '+fragment+'\n') f.write('Total:\t\t\t'+str(irp + 1)+'\n') f.write('Good:\t\t\t'+str(n_good)+'\n') f.write('Unmapped:\t\t'+str(n_unmapped)+'\n') f.write('Unpaired:\t\t'+str(n_unpaired)+'\n') f.write('Many-mutations:\t\t'+str(n_mutator)+'\n') f.write('Bad CIGARs:\t\t'+str(n_badcigar)+'\n') f.write('Tiny:\t\t\t'+str(n_tiny)+'\n')