Пример #1
0
def assert_equal_bam_reads(testCase, bam_filename1, bam_filename2):
    ''' Assert that two bam files are equivalent

        This function converts each file to sam (plaintext) format,
        without header, since the header can be variable.

        Test data should be stored in bam format rather than sam
        to save space, and to ensure the bam->sam conversion
        is the same for both files.

        testCase is the object whose assertTrue() reports the result
        (presumably a unittest.TestCase); bam_filename1 and bam_filename2
        are the paths of the bam files to compare.

        The temporary sam files are removed whether the comparison
        passes, fails, or the conversion itself raises.
    '''

    samtools = SamtoolsTool()

    # Temp files are registered as soon as they are created so that the
    # finally clause removes them even if a later step raises.
    sam_files = []
    try:
        for bam_filename in (bam_filename1, bam_filename2):
            sam_filename = util.file.mkstempfname(".sam")
            sam_files.append(sam_filename)
            # write the bam file to sam format, without header (no -h)
            samtools.view(args=[], inFile=bam_filename, outFile=sam_filename)

        # shallow=False forces a byte-by-byte comparison of the contents
        testCase.assertTrue(
            filecmp.cmp(sam_files[0], sam_files[1], shallow=False),
            'bam files differ (headers ignored): %s vs %s'
            % (bam_filename1, bam_filename2))
    finally:
        for fname in sam_files:
            if os.path.exists(fname):
                os.unlink(fname)
Пример #2
0
def assert_equal_bam_reads(testCase, bam_filename1, bam_filename2):
    ''' Assert that two bam files are equivalent

        This function converts each file to sam (plaintext) format,
        without header, since the header can be variable.

        Test data should be stored in bam format rather than sam
        to save space, and to ensure the bam->sam conversion
        is the same for both files.

        testCase is the object whose assertTrue() reports the result
        (presumably a unittest.TestCase); bam_filename1 and bam_filename2
        are the paths of the bam files to compare.

        The temporary sam files are removed whether the comparison
        passes, fails, or the conversion itself raises.
    '''

    samtools = SamtoolsTool()

    # Temp files are registered as soon as they are created so that the
    # finally clause removes them even if a later step raises.
    sam_files = []
    try:
        for bam_filename in (bam_filename1, bam_filename2):
            sam_filename = util.file.mkstempfname(".sam")
            sam_files.append(sam_filename)
            # write the bam file to sam format, without header (no -h)
            samtools.view(args=[], inFile=bam_filename, outFile=sam_filename)

        # shallow=False forces a byte-by-byte comparison of the contents
        testCase.assertTrue(
            filecmp.cmp(sam_files[0], sam_files[1], shallow=False),
            'bam files differ (headers ignored): %s vs %s'
            % (bam_filename1, bam_filename2))
    finally:
        for fname in sam_files:
            if os.path.exists(fname):
                os.unlink(fname)
Пример #3
0
def compute_library_bias(isnvs, inBam, inConsFasta) :
    ''' For each variant, compute read counts in each library and p-value for
          library bias; append them to string for each variant.
        Format is allele:totalF:totalR:1stLibFCount:1stLibRCount:2ndLibFCount:...:p-val.
        Library counts are in alphabetical order of library IDs.
        Note: Total was computed by vphaser, library counts by samtools mpileup,
          so total might not be sum of library counts.

        This is a generator: each row of isnvs is yielded after its allele
        fields (columns alleleCol onward) have been rewritten in place.
        No work happens until iteration starts. The temporary per-library
        bams and the header sam are removed when the generator is exhausted,
        closed early, or an exception propagates out of it.
    '''
    alleleCol = 7 # First column of output with allele counts
    samtoolsTool = SamtoolsTool()

    # Every temp file created below is registered here so the finally clause
    # can remove whatever is still on disk, however the generator ends.
    # NOTE(review): samtoolsTool.index() presumably writes an index file next
    # to each bam; those are not tracked here -- confirm and clean up if so.
    tempFiles = []
    try:
        # Sort by library so groupby sees each library as one contiguous run.
        rgs_by_lib = sorted((rg['LB'], rg['ID'])
            for rg in samtoolsTool.getReadGroups(inBam).values())
        rgs_by_lib = itertools.groupby(rgs_by_lib, lambda x: x[0])
        libBams = []
        header_sam = util.file.mkstempfname('.sam')
        tempFiles.append(header_sam)
        samtoolsTool.dumpHeader(inBam, header_sam)
        for lib, libRgs in rgs_by_lib:
            rgIds = [rgId for _, rgId in libRgs]

            # Create libBam containing all the readgroups in rgIds.
            # In samtools 1.1, this can be done by including -r multiple times on
            # a single command line, but that doesn't work in 0.1.19, so instead
            # extract readgroups one by one and then concatenate.
            rgBams = []
            for rgId in rgIds :
                rgBam = util.file.mkstempfname('.bam')
                tempFiles.append(rgBam)
                samtoolsTool.view(['-b', '-r', rgId], inBam, rgBam)
                samtoolsTool.index(rgBam)
                if samtoolsTool.count(rgBam) > 0:
                    rgBams.append(rgBam)
                else:
                    # most samtools functions don't like empty input bams, so skip them
                    os.unlink(rgBam)
            if rgBams:
                if len(rgBams) > 1:
                    libBam = util.file.mkstempfname('.bam')
                    tempFiles.append(libBam)
                    samtoolsTool.merge(rgBams, libBam, ['-f', '-1', '-h', header_sam])
                    # The per-readgroup bams are no longer needed once merged.
                    for bam in rgBams :
                        os.unlink(bam)
                else:
                    # samtools merge cannot deal with only one (or zero) input bams
                    libBam = rgBams[0]
                samtoolsTool.index(libBam)
                n_reads = samtoolsTool.count(libBam)
                log.debug("LB:%s has %s reads in %s read groups (%s)",
                    lib, n_reads, len(rgIds), ', '.join(rgIds))
                libBams.append(libBam)

        # Degrees of freedom depend only on the number of libraries, so
        # compute them once rather than per allele.
        dofs = len(libBams) - 1

        for row in isnvs :
            consensusAllele = row[3]
            # For a consensus allele of 'i' the pileup is queried one
            # position earlier.
            pos = int(row[1]) if consensusAllele != 'i' else int(row[1]) - 1
            chrom = row[0]
            libCounts = [get_mpileup_allele_counts(libBam, chrom, pos, inConsFasta)
                         for libBam in libBams]
            numAlleles = len(row) - alleleCol
            # countsMatrix[lib][allele] = forward + reverse count
            countsMatrix = [[0] * numAlleles for lib in libBams]
            # libCountsByAllele[allele][lib] = [forward, reverse]
            libCountsByAllele = []
            for alleleInd in range(numAlleles) :
                allele = row[alleleCol + alleleInd].split(':')[0]
                libCountsByAllele.append([])
                for libAlleleCounts, countsRow in zip(libCounts, countsMatrix) :
                    f, r = libAlleleCounts.get(allele, [0, 0])
                    libCountsByAllele[-1].append([f, r])
                    countsRow[alleleInd] += f + r

            # Per-library totals over all alleles; needs the matrix fully
            # populated, hence the second pass over the alleles below.
            libTotals = [sum(countsRow) for countsRow in countsMatrix]
            for alleleInd in range(numAlleles) :
                # 2 x nLibs table: this allele vs. all other alleles.
                contingencyTable = [
                    [countsRow[alleleInd] for countsRow in countsMatrix],
                    [libTotal - countsRow[alleleInd]
                     for libTotal, countsRow in zip(libTotals, countsMatrix)]]
                rowSums = [sum(tableRow) for tableRow in contingencyTable]
                if dofs < 1 :
                    # Fewer than two libraries: bias cannot be assessed.
                    pval = 1.0
                elif min(rowSums) ** dofs / dofs < 10000 :
                    # At this cutoff, fisher_exact should take <~ 0.1 sec
                    pval = fisher_exact(contingencyTable)
                else :
                    pval = chi2_contingency(contingencyTable)
                row[alleleCol + alleleInd] = str(AlleleFieldParser(None,
                    *(row[alleleCol + alleleInd].split(':') +
                      [pval, libCountsByAllele[alleleInd]])))
            yield row
    finally:
        # Runs on normal exhaustion, on generator close(), and on errors;
        # files already unlinked above are skipped by the exists check.
        for fname in tempFiles:
            if os.path.exists(fname):
                os.unlink(fname)