def check_already_decontaminated(sample, fragment, PCR):
    '''Check from the summary and output file whether decontamination has been done'''
    import os
    from hivwholeseq.utils.mapping import get_number_reads

    # Samples listed in maj_contnames are treated as already done for fragment F4
    if (fragment == 'F4') and (sample.name in maj_contnames):
        return True

    fn_sum = get_decontaminate_summary_filename(sample.patient, sample.name,
                                                fragment, PCR=PCR)
    fn_in = sample.get_mapped_filtered_filename(fragment, PCR=PCR,
                                                decontaminated=False)
    fn_out = sample.get_mapped_filtered_filename(fragment, PCR=PCR,
                                                 decontaminated=True)

    # Summary, input, and output files must all exist
    if not all(map(os.path.isfile, [fn_sum, fn_in, fn_out])):
        print sample.patient, sample.name, fragment, PCR
        return False

    n_reads_in = get_number_reads(fn_in)
    n_reads_out = get_number_reads(fn_out)
    (n_reads_good, n_reads_cont) = get_number_reads_summary(fn_sum)

    # The output BAM must agree with the summary
    if n_reads_out != n_reads_good:
        print sample.patient, sample.name, fragment, PCR, \
                n_reads_in, n_reads_out, n_reads_good, n_reads_cont
        return False

    # Losing more than half of the input reads indicates something went wrong
    if n_reads_good < 0.5 * n_reads_in:
        print sample.patient, sample.name, fragment, PCR, \
                n_reads_in, n_reads_out, n_reads_good, n_reads_cont
        return False

    return True
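# Usage sketch (not part of the original module): collect the (fragment, PCR)
# combinations of one sample that still lack a consistent decontamination
# output. The fragment and PCR lists below are illustrative assumptions.
def get_pending_decontaminations(sample,
                                 fragments=('F1', 'F2', 'F3', 'F4', 'F5', 'F6'),
                                 PCRs=(1, 2)):
    '''List (fragment, PCR) pairs of a sample that still need decontamination'''
    pending = []
    for fragment in fragments:
        for PCR in PCRs:
            if not check_already_decontaminated(sample, fragment, PCR):
                pending.append((fragment, PCR))
    return pending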
def get_coallele_counts_from_file(bamfilename, length, qual_min=30,
                                  maxreads=-1, VERBOSE=0, use_tests=False):
    '''Get counts of joint occurrence of two alleles'''
    from .utils.mapping import (test_read_pair_exotic_cigars,
                                test_read_pair_exceed_reference)

    if VERBOSE >= 1:
        print 'Getting coallele counts'

    # Precompute conversion tables
    alpha_mapping = {a: alphal.index(a) for a in alphal}
    SANGER_SCORE_OFFSET = ord("!")
    q_mapping = dict()
    for letter in xrange(0, 255):
        q_mapping[chr(letter)] = letter - SANGER_SCORE_OFFSET

    if VERBOSE >= 2:
        print 'Initializing matrix of cocounts'

    # NOTE: we are ignoring fwd/rev and read1/2
    counts = np.zeros((len(alpha), len(alpha), length, length), int)
    posall = np.zeros(1000, dtype=[('pos', int), ('aind', int)])

    if VERBOSE >= 2:
        from hivwholeseq.utils.mapping import get_number_reads
        print 'Scanning read pairs ('+str(get_number_reads(bamfilename) // 2)+')'

    # NOTE: the reads should already be filtered of unmapped stuff at this point
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for ir, reads in enumerate(pair_generator(bamfile)):
            if ir == maxreads:
                if VERBOSE:
                    print 'Max read number reached:', maxreads
                break

            if (VERBOSE >= 2) and (not ((ir + 1) % 10)):
                if (VERBOSE == 2) and (ir + 1 != 10):
                    sys.stdout.write("\x1b[1A")
                print ir + 1

            if use_tests:
                # NOTE: the CIGAR block type is not known yet at this point,
                # so the error message cannot report it
                if test_read_pair_exotic_cigars(reads):
                    raise ValueError('Exotic CIGAR type not recognized')
                if test_read_pair_exceed_reference(reads, length):
                    raise ValueError('Read pair exceeds reference length of '+str(length))

            # Temp structures
            posall[:] = (-1, -1)
            iall = 0

            # Collect alleles
            for read in reads:
                alleles_ind = np.fromiter((alpha_mapping[x] for x in read.seq),
                                          np.uint8, len(read.seq))
                allqual_ind = np.fromiter((q_mapping[x] for x in read.qual),
                                          np.uint8, len(read.qual))

                pos_ref = read.pos
                pos_read = 0
                for (bt, bl) in read.cigar:
                    # Insertion in the read: no reference position, skip
                    if bt == 1:
                        pos_read += bl
                    # Deletion from the reference
                    elif bt == 2:
                        # NOTE: no CIGAR can start NOR end with a deletion
                        qual_deletion = allqual_ind[pos_read: pos_read + 2].min()
                        if qual_deletion >= qual_min:
                            posall['pos'][iall: iall + bl] = np.arange(pos_ref, pos_ref + bl)
                            posall['aind'][iall: iall + bl] = 4
                            iall += bl
                        pos_ref += bl
                    # (Mis)match: take alleles above the quality threshold
                    else:
                        alleles_indb = alleles_ind[pos_read: pos_read + bl]
                        allqual_indb = allqual_ind[pos_read: pos_read + bl]
                        for i in xrange(len(alpha)):
                            aitmp = (alleles_indb == i) & (allqual_indb >= qual_min)
                            aitmp = aitmp.nonzero()[0] + pos_ref
                            aitmplen = len(aitmp)
                            posall['pos'][iall: iall + aitmplen] = aitmp
                            posall['aind'][iall: iall + aitmplen] = i
                            iall += aitmplen
                        pos_read += bl
                        pos_ref += bl

            # Avoid doubles (paired reads are twice the same biological molecule)
            posall.sort(order=('pos', 'aind'))
            iall = (posall['pos'] != -1).nonzero()[0][0]
            while iall < len(posall) - 1:
                if posall['pos'][iall + 1] == posall['pos'][iall]:
                    # If both reads agree at an overlap allele, take a single call
                    if posall['aind'][iall + 1] == posall['aind'][iall]:
                        posall[iall + 1] = (-1, -1)
                    # else, pick one at random (FIXME: pick the highest phred)
                    else:
                        ibin = np.random.randint(2)
                        posall[iall + ibin] = (-1, -1)
                    iall += 1
                iall += 1

            # Add allele cocounts to the matrix
            # NOTE: this already takes care of the symmetry
            poss = [posall['pos'][posall['aind'] == i1] for i1 in xrange(len(alpha))]
            for i1 in xrange(len(alpha)):
                poss1 = poss[i1]
                if not len(poss1):
                    continue
                for i2 in xrange(len(alpha)):
                    poss2 = poss[i2]
                    if not len(poss2):
                        continue
                    # Ravelling voodoo for efficiency
                    cobra = counts[i1, i2].ravel()
                    ind = poss1.repeat(len(poss2)) * length + np.tile(poss2, len(poss1))
                    cobra[ind] += 1

    return counts
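# Hedged example (not in the original module): turn the cocount matrix
# returned by get_coallele_counts_from_file into joint allele frequencies for
# one pair of positions. Pure numpy; no additional project API is assumed.
def get_cocount_frequencies(counts, pos1, pos2):
    '''Joint allele frequencies at a pair of positions from the cocount matrix'''
    # counts has shape (len(alpha), len(alpha), length, length)
    cocounts = np.asarray(counts[:, :, pos1, pos2], float)
    total = cocounts.sum()
    if total == 0:
        return cocounts
    return cocounts / total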
def check_status(sample, step, detail=1):
    '''Check for a sample a certain step of the pipeline at a certain detail'''
    if detail == 1:
        if step == 'premapped':
            return [os.path.isfile(sample.get_premapped_filename())]
        elif step == 'divided':
            return [(fr, os.path.isfile(sample.get_divided_filename(fr)))
                    for fr in sample.regions_complete]
        elif step == 'consensus':
            return [(fr, os.path.isfile(sample.get_consensus_filename(fr)))
                    for fr in sample.regions_generic]
        elif step == 'mapped':
            return [(fr, os.path.isfile(sample.get_mapped_filename(fr, filtered=False)))
                    for fr in sample.regions_generic]
        elif step == 'filtered':
            return [(fr, os.path.isfile(sample.get_mapped_filename(fr, filtered=True)))
                    for fr in sample.regions_generic]
        elif step == 'mapped_initial':
            return [(fr, os.path.isfile(sample.get_mapped_to_initial_filename(fr)))
                    for fr in sample.regions_generic]
        elif step == 'mapped_filtered':
            # Check whether the mapped filtered is older than the mapped_initial
            from hivwholeseq.utils.generic import modification_date
            out = []
            for fr in sample.regions_generic:
                fn_mi = sample.get_mapped_to_initial_filename(fr)
                fn_mf = sample.get_mapped_filtered_filename(fr)
                if not os.path.isfile(fn_mf):
                    out.append((fr, False))
                    continue
                if not os.path.isfile(fn_mi):
                    out.append((fr, True))
                    continue
                md_mi = modification_date(fn_mi)
                md_mf = modification_date(fn_mf)
                if md_mf < md_mi:
                    out.append((fr, 'OLD'))
                else:
                    out.append((fr, True))
            return out

    elif detail == 2:
        if step in ('filtered', 'consensus'):
            return check_status(sample, step, detail=3)
        else:
            return check_status(sample, step, detail=1)

    elif detail == 3:
        if step == 'premapped':
            if os.path.isfile(sample.get_premapped_filename()):
                return [get_number_reads(sample.get_premapped_filename())]
            else:
                return [False]
        elif step == 'divided':
            stati = []
            for fr in sample.regions_complete:
                fn = sample.get_divided_filename(fr)
                if os.path.isfile(fn):
                    status = (fr, get_number_reads(fn))
                else:
                    status = (fr, False)
                stati.append(status)
            return stati
        elif step == 'consensus':
            stati = []
            for fr in sample.regions_generic:
                fn = sample.get_consensus_filename(fr)
                if os.path.isfile(fn):
                    status = (fr, len(SeqIO.read(fn, 'fasta')))
                else:
                    status = (fr, False)
                stati.append(status)
            return stati
        elif step == 'mapped':
            stati = []
            for fr in sample.regions_generic:
                fn = sample.get_mapped_filename(fr, filtered=False)
                if os.path.isfile(fn):
                    status = (fr, get_number_reads(fn))
                else:
                    status = (fr, False)
                stati.append(status)
            return stati
        elif step == 'filtered':
            stati = []
            for fr in sample.regions_generic:
                fn = sample.get_mapped_filename(fr, filtered=True)
                if os.path.isfile(fn):
                    status = (fr, get_number_reads(fn))
                else:
                    status = (fr, False)
                stati.append(status)
            return stati

        # TODO: add mapped_to_initial and downstream
        elif step in ('mapped_initial', 'mapped_filtered'):
            return check_status(sample, step, detail=1)
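# Usage sketch (assumption: `sample` is a sequencing sample object exposing
# the get_*_filename accessors used by check_status above; the step list
# mirrors the steps handled there).
def print_sample_status(sample, detail=1):
    '''Print the status of every pipeline step for one sample'''
    steps = ['premapped', 'divided', 'consensus', 'mapped', 'filtered',
             'mapped_initial', 'mapped_filtered']
    for step in steps:
        print step, check_status(sample, step, detail=detail)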
def filter_contamination(bamfilename, bamfilename_out, contseqs, samplename,
                         VERBOSE=0, deltascore_max_self=60,
                         deltascore_max_other=24, maxreads=-1, **kwargs):
    '''Fish contaminated reads from mapped reads

    The function checks the distance to the expected consensus first, and only
    if that exceeds the threshold does it check all other samples.

    Args:
      deltascore_max_self (int): the maximal delta in alignment score to the
        consensus to be considered pure
      deltascore_max_other (int): the maximal delta in alignment score to any
        other sample to be considered a contamination
      **kwargs: passed down to the pairwise alignment function
    '''
    import pysam
    from collections import defaultdict
    from operator import itemgetter
    from seqanpy import align_overlap

    from hivwholeseq.utils.mapping import pair_generator, get_number_reads

    if 'score_match' in kwargs:
        score_match = kwargs['score_match']
    else:
        score_match = 3

    bamfilename_trash = bamfilename_out[:-4] + '_trashed.bam'

    contseqs = contseqs.copy()
    consseq = contseqs.pop(samplename)

    if VERBOSE >= 2:
        print 'Scanning reads (' + str(get_number_reads(bamfilename) // 2) + ')'

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(bamfilename_out, 'wb', template=bamfile) as bamfileout, \
             pysam.Samfile(bamfilename_trash, 'wb', template=bamfile) as bamfiletrash:

            n_good = 0
            n_cont = defaultdict(int)

            for irp, reads in enumerate(pair_generator(bamfile)):
                if irp == maxreads:
                    break

                if VERBOSE >= 2:
                    if not ((irp + 1) % 100):
                        if not ((irp + 1) == 100):
                            sys.stdout.write('\x1b[1A')
                        print irp + 1

                for read in reads:

                    # Look at the distance to the own consensus: if that's small, move on
                    alignments_read = {}
                    deltas_read = {}
                    (score, alis1, alis2) = align_overlap(consseq, read.seq, **kwargs)
                    (alis1, alis2) = trim_align_overlap((alis1, alis2))
                    scoremax = len(alis1) * score_match
                    delta_read = scoremax - score
                    deltas_read[samplename] = delta_read
                    alignments_read[samplename] = (alis1, alis2)
                    if delta_read <= deltascore_max_self:
                        if VERBOSE >= 4:
                            print 'Read is very close to its own consensus', scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2], width=90,
                                                      name1='ref', name2='read')
                        continue

                    # Otherwise, compare to all other sequences and find the closest neighbour
                    for contname, contseq in contseqs.iteritems():
                        (score, ali1, ali2) = align_overlap(contseq, read.seq, **kwargs)
                        (ali1, ali2) = trim_align_overlap((ali1, ali2))
                        scoremax = len(ali1) * score_match
                        delta_read = scoremax - score
                        deltas_read[contname] = delta_read
                        alignments_read[contname] = (ali1, ali2)

                    if VERBOSE >= 5:
                        print samplename
                        for key, d in deltas_read.iteritems():
                            print key, d

                    (contname, delta_read) = min(deltas_read.iteritems(), key=itemgetter(1))

                    # Again, the correct consensus has precedence
                    if deltas_read[samplename] == delta_read:
                        contname = samplename

                    (ali1, ali2) = alignments_read[contname]

                    # The read may be closest to its own consensus, if not very close
                    if contname == samplename:
                        if VERBOSE >= 4:
                            print 'Read is closest to its consensus', scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2], width=90,
                                                      name1='ref', name2='read')

                    # The read may come from another consensus (contamination)
                    elif delta_read <= deltascore_max_other:
                        n_cont[contname] += 1
                        bamfiletrash.write(reads[0])
                        bamfiletrash.write(reads[1])

                        if VERBOSE >= 2:
                            print 'Contaminated read found! Good:', n_good, \
                                  'cont:', sum(n_cont.itervalues()), \
                                  'sources:', n_cont

                        if VERBOSE >= 3:
                            print 'Read is contaminated by', contname, scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2], width=90,
                                                      name1='self', name2='read')
                            print ''
                            pretty_print_pairwise_ali([ali1, ali2], width=90,
                                                      name1='ref', name2='read')

                        if VERBOSE >= 2:
                            print ''

                        break

                    # Finally, the read is not really close to anything: accept
                    else:
                        if VERBOSE >= 4:
                            print 'Read is close to nothing really', scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2], width=90,
                                                      name1='ref', name2='read')

                # for/else: no break means neither read was flagged, keep the pair
                else:
                    n_good += 1
                    bamfileout.write(reads[0])
                    bamfileout.write(reads[1])

    n_cont = dict(n_cont)

    return (n_good, n_cont)
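# Hedged helper (not in the original module): summarize the (n_good, n_cont)
# pair returned by filter_contamination as contamination fractions per source
# sample. Only the return value documented above is assumed.
def summarize_contamination(n_good, n_cont):
    '''Fraction of read pairs trashed as contamination, by source sample'''
    n_bad = sum(n_cont.itervalues())
    n_total = n_good + n_bad
    if not n_total:
        return {}
    return dict((source, 1.0 * n / n_total) for source, n in n_cont.iteritems())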
def print_info_genomewide(p, title, name, method, VERBOSE=0, require_all=True):
    '''Pretty printer for patient pipeline info'''
    mod_dates = p.mod_dates

    def check_requisite_genomewide(md, name_requisite, samplename, mod_dates,
                                   require_all=True):
        '''Check requisites for genomewide observables'''
        stati = []
        fragments = ['F'+str(i+1) for i in xrange(6)]
        for fragment in fragments:
            if (name_requisite, fragment, samplename) not in mod_dates:
                stati.append('MISS')
            elif md < mod_dates[(name_requisite, fragment, samplename)]:
                stati.append('OLD')
            else:
                stati.append('OK')

        if 'OLD' in stati:
            return 'OLD'
        else:
            if require_all:
                if 'MISS' in stati:
                    return 'MISS'
                else:
                    return 'OK'
            else:
                if 'OK' in stati:
                    return 'OK'
                else:
                    return 'MISS'

    def check_contamination_genomewide(sample):
        '''Check whether any of the fragment samples is contaminated'''
        fragments = ['F'+str(i+1) for i in xrange(6)]
        for fragment in fragments:
            if 'contaminated' in sample[fragment]:
                return True
        return False

    import os, sys
    from hivwholeseq.patients.samples import SamplePat

    # NOTE: this function is used to check both entire patients and single samples
    if isinstance(p, SamplePat):
        sample_iter = [(p.name, p)]
    else:
        sample_iter = p.samples.iterrows()

    stati = set()

    line = ('{:<'+str(title_len)+'}').format(title+':')
    print line

    for samplename, sample in sample_iter:
        sample = SamplePat(sample)
        title = sample.name
        line = ('{:<'+str(title_len)+'}').format(title+':')

        if isinstance(method, basestring) and hasattr(sample, method):
            fun = getattr(sample, method)
            fn = fun('genomewide')
        else:
            fn = method(sample.patient, samplename, 'genomewide')

        if os.path.isfile(fn):
            md = modification_date(fn)
            mod_dates[(name, 'genomewide', samplename)] = md

            if name is None:
                status = 'OK'
            elif check_contamination_genomewide(sample):
                status = 'CONT'
            else:
                status = check_requisite_genomewide(md, name, samplename, mod_dates,
                                                    require_all=require_all)
        else:
            status = 'MISS'

        # Check the number of reads if requested
        if (status == 'OK') and (fn[-3:] == 'bam') and (VERBOSE >= 3):
            status = str(get_number_reads(fn))

        stati.add(status)
        line = line + ('{:<'+str(cell_len)+'}').format(status)
        print line

    if 'OLD' in stati:
        raise ValueError('OLD status found')
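# Usage sketch (assumptions: `patient` carries .mod_dates and .samples as used
# by print_info_genomewide above, and each `method_name` is the name of a
# SamplePat filename accessor; the tuple layout here is illustrative).
def print_genomewide_overview(patient, observables, VERBOSE=0):
    '''Print genomewide status for several (title, name, method) observables'''
    for title, name, method_name in observables:
        print_info_genomewide(patient, title, name, method_name, VERBOSE=VERBOSE)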
def print_info(p, title, name, method, name_requisite=None, VERBOSE=0):
    '''Pretty printer for patient pipeline info'''
    import os, sys
    from hivwholeseq.patients.samples import SamplePat
    from hivwholeseq.utils.mapping import get_number_reads

    mod_dates = p.mod_dates

    # NOTE: this function is used to check both entire patients and single samples
    if isinstance(p, SamplePat):
        sample_iter = [(p.name, p)]
    else:
        sample_iter = p.samples.iterrows()

    fragments = ['F'+str(i+1) for i in xrange(6)]

    stati = set()

    line = ('{:<'+str(title_len)+'}').format(title+':')
    print line

    for samplename, sample in sample_iter:
        sample = SamplePat(sample)
        title = sample.name
        line = ('{:<'+str(title_len)+'}').format(title+':')
        for fragment in fragments:
            if isinstance(method, basestring) and hasattr(sample, method):
                fun = getattr(sample, method)
                fn = fun(fragment)
            else:
                fn = method(sample.patient, samplename, fragment)

            if os.path.isfile(fn):
                md = modification_date(fn)
                mod_dates[(name, fragment, samplename)] = md

                if name_requisite is None:
                    status = 'OK'

                elif ((name_requisite, fragment, samplename) in mod_dates):
                    if md > mod_dates[(name_requisite, fragment, samplename)]:
                        status = 'OK'
                    else:
                        status = 'OLD'
                        print fn, md, mod_dates[(name_requisite, fragment, samplename)]

                elif ((name_requisite, fragment) in mod_dates):
                    if md > mod_dates[(name_requisite, fragment)]:
                        status = 'OK'
                    else:
                        status = 'OLD'

                        # NOTE: on Nov 13, 2014 I updated the mod dates of all
                        # references by mistake, without actually changing the
                        # sequences (ironically, probably testing a backup system
                        # for the refs themselves). So if the requisite is a ref
                        # seq and the date is this one, it's OK
                        if ((name_requisite == 'reference') and
                            (mod_dates[(name_requisite, fragment)].date() ==
                             datetime.date(2014, 11, 13))):
                            status = 'OK'

                elif 'contaminated' in sample[fragment]:
                    status = 'CONT'

                else:
                    status = 'ERROR'

            else:
                status = 'MISS'

            # Check the number of reads if requested
            if (status == 'OK') and (fn[-3:] == 'bam') and (VERBOSE >= 3):
                status = str(get_number_reads(fn))

            stati.add(status)
            line = (line + fragment + ': ' +
                    ('{:>'+str(cell_len - len(fragment) - 1)+'}').format(status) + ' ')
        print line

    if 'OLD' in stati:
        raise ValueError('OLD status found')
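# Usage sketch (assumptions: `patient` is a Patient object with .mod_dates and
# .samples; `steps` is an ordered list of (title, name, method, requisite)
# tuples describing successive pipeline steps, so that each step's requisite
# has already been registered in mod_dates by the previous print_info call;
# the tuple layout is illustrative, not part of the original module).
def print_patient_pipeline(patient, steps, VERBOSE=0):
    '''Print per-fragment status for an ordered list of pipeline steps'''
    for title, name, method_name, requisite in steps:
        print_info(patient, title, name, method_name,
                   name_requisite=requisite, VERBOSE=VERBOSE)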
def check_premap(data_folder, adaID, fragments, seq_run, samplename,
                 qual_min=30, match_len_min=10, maxreads=-1, VERBOSE=0,
                 savefig=True, title=None):
    '''Check premap to reference: coverage, etc.'''
    refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID), 'fasta')

    # FIXME: do this possibly better than parsing the description!
    try:
        fields = refseq.description.split()
        refseq_start = int(fields[fields.index('(indices') - 3])
    except ValueError:
        refseq_start = 550

    fragpos_filename = get_fragment_positions_filename(data_folder, adaID)
    if os.path.isfile(fragpos_filename):
        # Load the fragment positions, considering mixed fragments (e.g. F5a+b)
        fragtmp = []
        postmp = []
        with open(fragpos_filename, 'r') as f:
            f.readline()  # HEADER
            for line in f:
                fields = line[:-1].split('\t')
                fragtmp.append(fields[0])
                if 'inner' not in fields[1]:
                    postmp.append([fields[1], fields[4]])
                else:
                    start = int(fields[1].split(',')[1].split(': ')[1].rstrip('}'))
                    end = int(fields[4].split(',')[1].split(': ')[1].rstrip('}'))
                    postmp.append([start, end])

        postmp = np.array(postmp, int)

        # NOTE: In a lot of old files, it says F3o instead of F3ao
        if 'F3o' in fragtmp:
            fragtmp[fragtmp.index('F3o')] = 'F3ao'
        elif 'F3i' in fragtmp:
            fragtmp[fragtmp.index('F3i')] = 'F3ai'

        frags_pos = np.array([postmp[fragtmp.index(fr)] for fr in fragments], int).T

    else:
        frags_pos = None

    frags_pos_out = None

    # Open BAM and scan reads
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        if VERBOSE:
            print 'Premapped BAM file not found'
        return (None, None)

    # Count reads if requested
    n_reads = get_number_reads(input_filename)
    if VERBOSE:
        print 'N. of reads:', n_reads

    # Get counts
    counts, inserts = get_allele_counts_insertions_from_file_unfiltered(
        input_filename,
        len(refseq),
        qual_min=qual_min,
        match_len_min=match_len_min,
        maxreads=maxreads,
        VERBOSE=VERBOSE)

    # Plot results
    if title is None:
        title = ', '.join(['run '+seq_run+' '+adaID,
                           'sample '+samplename,
                           'reads '+str(min(maxreads, n_reads))+'/'+str(n_reads),
                          ])
    plot_coverage(counts,
                  offset_x=refseq_start,
                  frags_pos=frags_pos,
                  frags_pos_out=frags_pos_out,
                  title=title)

    if savefig:
        from hivwholeseq.sequencing.adapter_info import foldername_adapter
        plt.savefig(data_folder+foldername_adapter(adaID)+
                    'figures/coverage_premapped_'+samplename+'.png')

    return (counts, inserts)
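# Usage sketch (assumptions: `samples_by_adapter` maps adapter IDs to sample
# names for one sequencing run, and `fragments` matches the fragment names in
# that run's fragment-position table; only the arguments of check_premap above
# are used).
def check_premap_run(data_folder, seq_run, samples_by_adapter, fragments,
                     maxreads=10000, VERBOSE=0):
    '''Run the premap coverage check on every adapter of a sequencing run'''
    results = {}
    for adaID, samplename in samples_by_adapter.iteritems():
        results[adaID] = check_premap(data_folder, adaID, list(fragments),
                                      seq_run, samplename,
                                      maxreads=maxreads, VERBOSE=VERBOSE)
    return results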