def assert_equal_bam_reads(testCase, bam_filename1, bam_filename2):
    ''' Assert that two bam files are equivalent.

        This function converts each file to sam (plaintext) format,
        without the header, since the header can be variable.
        Test data should be stored in bam format rather than sam to
        save space, and to ensure the bam->sam conversion is the same
        for both files.
    '''
    samtools = SamtoolsTool()

    sam_one = util.file.mkstempfname(".sam")
    sam_two = util.file.mkstempfname(".sam")

    # write the bam files to sam format, without header (no -h)
    samtools.view(args=[], inFile=bam_filename1, outFile=sam_one)
    samtools.view(args=[], inFile=bam_filename2, outFile=sam_two)

    try:
        testCase.assertTrue(filecmp.cmp(sam_one, sam_two, shallow=False))
    finally:
        for fname in [sam_one, sam_two]:
            if os.path.exists(fname):
                os.unlink(fname)
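
# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of how assert_equal_bam_reads might be wired into a
# unittest.TestCase. The fixture paths 'expected.bam' and 'actual.bam' are
# hypothetical placeholders, not files shipped with this repository.
import unittest

class ExampleBamComparisonTest(unittest.TestCase):
    def test_reads_match_expected(self):
        expected_bam = 'expected.bam'  # hypothetical reference fixture
        actual_bam = 'actual.bam'      # hypothetical pipeline output
        # The helper performs the assertion itself via the TestCase passed in.
        assert_equal_bam_reads(self, expected_bam, actual_bam)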
def compute_library_bias(isnvs, inBam, inConsFasta):
    ''' For each variant, compute read counts in each library and a p-value
        for library bias; append them to the string for each variant.
        Format is allele:totalF:totalR:1stLibFCount:1stLibRCount:2ndLibFCount:...:p-val.
        Library counts are in alphabetical order of library IDs.
        Note: the total was computed by V-Phaser and the library counts by
        samtools mpileup, so the total might not equal the sum of the library counts.
    '''
    alleleCol = 7  # First column of output with allele counts
    samtoolsTool = SamtoolsTool()
    rgs_by_lib = sorted((rg['LB'], rg['ID']) for rg in samtoolsTool.getReadGroups(inBam).values())
    rgs_by_lib = itertools.groupby(rgs_by_lib, lambda x: x[0])
    libBams = []
    header_sam = util.file.mkstempfname('.sam')
    samtoolsTool.dumpHeader(inBam, header_sam)
    for lib, rgs in rgs_by_lib:
        rgs = list(id for lb, id in rgs)

        # Create libBam containing all the readgroups in rgs.
        # In samtools 1.1, this can be done by including -r multiple times on
        # a single command line, but that doesn't work in 0.1.19, so instead
        # extract readgroups one by one and then concatenate.
        rgBams = []
        for id in rgs:
            rgBam = util.file.mkstempfname('.bam')
            samtoolsTool.view(['-b', '-r', id], inBam, rgBam)
            samtoolsTool.index(rgBam)
            if samtoolsTool.count(rgBam) > 0:
                rgBams.append(rgBam)
            else:
                # most samtools functions don't like empty input bams, so skip them
                os.unlink(rgBam)
        if rgBams:
            if len(rgBams) > 1:
                libBam = util.file.mkstempfname('.bam')
                samtoolsTool.merge(rgBams, libBam, ['-f', '-1', '-h', header_sam])
                for bam in rgBams:
                    os.unlink(bam)
            else:
                # samtools merge cannot deal with only one (or zero) input bams
                libBam = rgBams[0]
            samtoolsTool.index(libBam)
            n_reads = samtoolsTool.count(libBam)
            log.debug("LB:%s has %s reads in %s read groups (%s)", lib, n_reads, len(rgs), ', '.join(rgs))
            libBams.append(libBam)

    for row in isnvs:
        consensusAllele = row[3]
        pos = int(row[1]) if consensusAllele != 'i' else int(row[1]) - 1
        chrom = row[0]
        libCounts = [get_mpileup_allele_counts(libBam, chrom, pos, inConsFasta) for libBam in libBams]
        numAlleles = len(row) - alleleCol
        countsMatrix = [[0] * numAlleles for lib in libBams]
        libCountsByAllele = []
        for alleleInd in range(numAlleles):
            allele = row[alleleCol + alleleInd].split(':')[0]
            libCountsByAllele.append([])
            for libAlleleCounts, countsRow in zip(libCounts, countsMatrix):
                f, r = libAlleleCounts.get(allele, [0, 0])
                libCountsByAllele[-1].append([f, r])
                countsRow[alleleInd] += f + r
        for alleleInd in range(numAlleles):
            contingencyTable = [
                [countsRow[alleleInd] for countsRow in countsMatrix],
                [sum(countsRow) - countsRow[alleleInd] for countsRow in countsMatrix]]
            rowSums = list(map(sum, contingencyTable))
            dofs = len(libCounts) - 1
            if dofs < 1:
                pval = 1.0
            elif min(rowSums) ** dofs / dofs < 10000:
                # At this cutoff, fisher_exact should take <~ 0.1 sec
                pval = fisher_exact(contingencyTable)
            else:
                pval = chi2_contingency(contingencyTable)
            row[alleleCol + alleleInd] = str(AlleleFieldParser(None,
                *(row[alleleCol + alleleInd].split(':') + [pval, libCountsByAllele[alleleInd]])))
        yield row

    for bam in libBams:
        os.unlink(bam)
    os.unlink(header_sam)
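
# --- Illustrative usage sketch (not part of the original module) ---
# compute_library_bias is a generator: it consumes iSNV rows, appends per-library
# counts and a library-bias p-value to each allele field, and yields the modified
# rows. The file names below ('sample.mapped.bam', 'sample.consensus.fasta') are
# hypothetical placeholders; the structure of isnv_rows is assumed to match the
# V-Phaser-derived rows this module produces elsewhere.
def example_annotate_library_bias(isnv_rows):
    # isnv_rows: iterable of variant rows (lists of strings) with chrom in
    # column 0, position in column 1, and allele fields from column 7 onward.
    for row in compute_library_bias(isnv_rows, 'sample.mapped.bam', 'sample.consensus.fasta'):
        print('\t'.join(map(str, row)))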