Example #1
 def __init__(self):
     import os
     self.root = os.path.dirname(
         os.path.abspath(__file__)) + "/test/test_corrGC/"
     self.tbitFile = self.root + "sequence.2bit"
     self.bamFile = self.root + "test.bam"
     self.mappability = self.root + "mappability.bw"
     self.chrNameBam = '2L'
     self.chrNameBit = 'chr2L'
     self.samtools = cfg.config.get('external_tools', 'samtools')
     bam = bamHandler.openBam(self.bamFile)
     bit = twobit.TwoBitFile(open(self.tbitFile))
     global debug
     debug = 0
     global global_vars
     global_vars = {
         '2bit': self.tbitFile,
         'bam': self.bamFile,
         'filter_out': None,
         'mappability': self.mappability,
         'extra_sampling_file': None,
         'max_reads': 5,
         'min_reads': 0,
         'reads_per_bp': 0.3,
         'total_reads': bam.mapped,
         'genome_size': sum([bit[x].size for x in bit.index])
     }
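
All of the examples on this page share the same 2bit access pattern. Assuming the `twobit` module here is bx-python's `bx.seq.twobit` (which matches the calls used throughout: `TwoBitFile(open(...))`, `.index`, `.size`, `.get(start, end)`), a minimal standalone sketch looks like this; the file path is a placeholder:

from bx.seq import twobit

bit = twobit.TwoBitFile(open("sequence.2bit", "rb"))  # placeholder path
# sequence names and lengths; bit.index maps name -> record
for name in bit.index:
    print("{0}\t{1}".format(name, bit[name].size))
genome_size = sum([bit[x].size for x in bit.index])
seq = bit["chr2L"].get(0, 50)  # first 50 bases of chr2L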
Example #2
def chunk_scaffolds(genome, size=10000000):
    """Given a genome in many scaffolds, build temp files of `size` Mbp
    for easier querying"""
    print '\tChunking files into {0} bp...'.format(size)
    chromos = []
    tb = twobit.TwoBitFile(file(genome))
    # split target file into `options.size` (~10 Mbp) chunks
    fd, out = tempfile.mkstemp(suffix='.fasta')
    os.close(fd)
    temp = open(out, 'w')
    length = 0
    for seq in tb.keys():
        sequence = tb[seq][0:]
        length += len(sequence)
        # write it to the outfile
        temp.write('>{0}\n{1}\n'.format(seq, sequence))
        if length > size:
            temp.close()
            # put tempfile name on stack
            chromos.append(out + '[multiple]')
            # open a new temp file
            fd, out = tempfile.mkstemp(suffix='.fasta')
            os.close(fd)
            temp = open(out, 'w')
            # reset sequence length
            length = 0
    # close the final chunk and add it to the stack; without this, any
    # sequence written after the last rollover would be lost
    temp.close()
    if length > 0:
        chromos.append(out + '[multiple]')
    else:
        os.remove(out)
    return chromos
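
A minimal usage sketch for `chunk_scaffolds` (the genome path is a placeholder). The trailing `[multiple]` is lastz's multi-sequence target qualifier, so the marker is meant to be passed through to the aligner rather than stripped:

chunks = chunk_scaffolds('genome.2bit', size=10000000)  # placeholder path
for chunk in chunks:
    print('chunk target for lastz: {0}'.format(chunk))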
Example #3
def countReadsPerGC_worker(chromNameBam,
                           start, end, stepSize, regionSize,
                           chrNameBamToBit, verbose=False):
    """given a genome region defined by
    (start, end), the GC content is quantified for
    regions of size regionSize that are contiguous
    """

    chromNameBit = chrNameBamToBit[chromNameBam]
    tbit = twobit.TwoBitFile(open(global_vars['2bit']))
    bam = bamHandler.openBam(global_vars['bam'])
    c = 1
    sub_reads_per_gc = []
    positions_to_sample = getPositionsToSample(chromNameBit,
                                               start, end, stepSize)

    for index in xrange(len(positions_to_sample)):
        i = positions_to_sample[index]
        # stop if region extends over the chromosome end
        if tbit[chromNameBit].size < i + regionSize:
            break

        try:
            gc = getGC_content(tbit[chromNameBit].get(i, i + regionSize))
        except Exception as detail:
            if verbose:
                print "{}:{}-{}".format(chromNameBit, i, i + regionSize)
                print detail
            continue
        numberReads = bam.count(chromNameBam, i, i + regionSize)
        sub_reads_per_gc.append((numberReads, gc))
        c += 1

    return sub_reads_per_gc
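
`getGC_content` is called throughout these examples but never shown. A hypothetical sketch consistent with those calls (including the `as_fraction` keyword used in Example #17) might be:

def getGC_content(sequence, as_fraction=True):
    # hypothetical stand-in: count G/C bases, case-insensitively
    seq = str(sequence).upper()
    gc = seq.count('G') + seq.count('C')
    if as_fraction:
        return float(gc) / len(seq) if len(seq) else 0.0
    return gc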
Example #4
def main():
    args = get_args()
    if args.nprocs > 1:
        pool = Pool(args.nprocs)
    # get and print start time
    begin_run = start()
    conf = ConfigParser.ConfigParser()
    conf.read(args.conf)
    params = (args.query, args.coverage, args.identity)
    # get the alignment targets ("chromos"/"scaffolds")
    if conf.has_section("chromos"):
        for genome in conf.items("chromos"):
            name, f = genome
            print "{0}\nWorking on {1}\n{0}\n".format("=" * 30, name)
            chromos = [os.path.join(f, chromo)
                    for chromo in twobit.TwoBitFile(file(f)).keys()]
            work = [(chromo, params) for chromo in chromos]
            if args.nprocs > 1:
                results = pool.map(align_query_to_genomes, work)
            else:
                results = map(align_query_to_genomes, work)
            save_results_and_cleanup(args.output, name, results)
    if conf.has_section("scaffolds"):
        for genome in conf.items("scaffolds"):
            name, f = genome
            print "{0}\nWorking on {1}\n{0}\n".format("=" * 30, name)
            chunks = chunk_scaffolds(f)
            work = [(chunk, params) for chunk in chunks]
            if args.nprocs > 1:
                results = pool.map(align_query_to_genomes, work)
            else:
                results = map(align_query_to_genomes, work)
            save_results_and_cleanup(args.output, name, results, chunks)
    # get and print end time
    stop(begin_run)
Example #5
 def __init__(self):
     import os
     self.root = os.path.dirname(
         os.path.abspath(__file__)) + "/test/test_corrGC/"
     self.tbitFile = self.root + "sequence.2bit"
     self.bamFile = self.root + "test.bam"
     self.chrNameBam = '2L'
     self.chrNameBit = 'chr2L'
     bam = pysam.Samfile(self.bamFile)
     bit = twobit.TwoBitFile(open(self.tbitFile))
     global debug
     debug = 0
     global global_vars
     global_vars = {
         '2bit': self.tbitFile,
         'bam': self.bamFile,
         'filter_out': None,
         'extra_sampling_file': None,
         'max_reads': 5,
         'min_reads': 0,
         'reads_per_bp': 0.3,
         'total_reads': bam.mapped,
         'genome_size': sum([bit[x].size for x in bit.index])
     }
Example #6
 def test_3_deletions(self):
     """Convert BEDPE breakends that form a deletion.
     """
     genome_2bit = twobit.TwoBitFile(open(self.genome_file))
     parts = _get_vcf_breakends(self.in_file, genome_2bit,
                                {"max_single_size": 5000})
     deletion = parts.next()
     assert deletion.alt == "<DEL>", deletion
     assert "SVLEN=-4348" in deletion.info
Example #7
 def test_2_vcf_parts(self):
     """Convert BEDPE input line into VCF output parts.
     """
     genome_2bit = twobit.TwoBitFile(open(self.genome_file))
     breakends = hydra_parser(self.in_file)
     brend1, brend2 = build_vcf_parts(breakends.next(), genome_2bit)
     assert brend1.alt == "G]chr22:10112]"
     assert brend2.alt == "C]chr22:9764]"
     assert brend2.info == "SVTYPE=BND;MATEID=hydra1a;IMPRECISE;CIPOS=0,102", brend2.info
     brend1, brend2 = build_vcf_parts(breakends.next(), genome_2bit)
     assert brend1.alt == "A[chr22:12112["
     assert brend2.alt == "]chr22:7764]G"
     brend1, brend2 = build_vcf_parts(breakends.next(), genome_2bit)
     assert brend1.alt == "[chr22:11112[A"
     assert brend2.alt == "[chr22:8764[T"
     brend1, brend2 = build_vcf_parts(breakends.next(), genome_2bit)
     assert brend1.alt == "]chr22:13112]G", brend1.alt
     assert brend2.alt == "A[chr22:9764[", brend2.alt
Example #8
def main():
    args = get_args()
    conf = ConfigParser.ConfigParser()
    conf.read(args.conf)
    all_files = get_all_files_from_conf(conf)
    for genome in all_files:
        name, twobit_name = genome
        out_file = os.path.join(args.output, name) + ".fasta"
        out = fasta.FastaWriter(out_file)
        tb = twobit.TwoBitFile(file(twobit_name))
        lz = os.path.join(args.lastz, name) + ".lastz"
        count = 0
        for row in lastz.Reader(lz, long_format=True):
            sequence = slice_and_return_fasta(tb, row, args.flank)
            out.write(sequence)
            count += 1
        print "\t{} sequences written to {}".format(count, out_file)
        out.close()
Example #9
def main():
    args = get_args()
    conf = ConfigParser.ConfigParser()
    conf.optionxform = str
    conf.read(args.conf)
    all_files = get_all_files_from_conf(conf, args.pattern)
    #pdb.set_trace()
    for genome in all_files:
        short_name, long_name, twobit_name = genome
        if not args.exclude or (short_name not in args.exclude):
            out_file = os.path.join(args.output, short_name) + ".fasta"
            out = fasta.FastaWriter(out_file)
            tb = twobit.TwoBitFile(file(twobit_name))
            lz = os.path.join(args.lastz, long_name)
            count = 0
            for row in lastz.Reader(lz, long_format=True):
                sequence = slice_and_return_fasta(tb, row, args.flank)
                out.write(sequence)
                count += 1
            print "\t{} sequences written to {}".format(count, out_file)
            out.close()
Example #10
def fix_nonref_positions(in_file, ref_file):
    """Fix Genotyping VCF positions where the bases are all variants

    The plink/pseq output does not handle these correctly, and
    has all reference/variant bases reversed
    """
    ignore_chrs = ["."]
    ref2bit = twobit.TwoBitFile(open(ref_file))
    out_file = in_file.replace("-raw.vcf", ".vcf")

    with open(in_file) as in_handle:
        with open(out_file, "w") as out_handle:
            for line in in_handle:
                if line.startswith("#"):
                    out_handle.write(line)
                else:
                    parts = line.rstrip("\r\n").split("\t")
                    pos = int(parts[1])
                    # handle chr/non-chr naming
                    if parts[0] not in ref2bit.keys() and parts[0].replace(
                            "chr", "") in ref2bit.keys():
                        parts[0] = parts[0].replace("chr", "")
                    # handle X chromosome
                    elif parts[0] not in ref2bit.keys() and parts[0] == "23":
                        for test in ["X", "chrX"]:
                            if test in ref2bit.keys():
                                parts[0] = test
                    ref_base = None
                    if parts[0] not in ignore_chrs:
                        try:
                            ref_base = ref2bit[parts[0]].get(pos - 1,
                                                             pos).upper()
                        except Exception as msg:
                            print(
                                f"Skipping line. Failed to retrieve reference base for {str(parts)}\n{msg}"
                            )
                    parts = fix_vcf_line(parts, ref_base)
                    if parts is not None:
                        out_handle.write("\t".join(parts) + "\n")
        return out_file
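
`fix_vcf_line` is not shown on this page. Based on the docstring (reference and variant bases reversed), a hypothetical sketch of the swap might be:

def fix_vcf_line(parts, ref_base):
    # hypothetical sketch: parts is a split VCF line, so parts[3] is REF
    # and parts[4] is ALT; if the true reference base appears among the
    # ALT alleles, swap it with the recorded REF
    if ref_base is None:
        return parts
    alts = parts[4].split(",")
    if parts[3] != ref_base and ref_base in alts:
        alts[alts.index(ref_base)] = parts[3]
        parts[3] = ref_base
        parts[4] = ",".join(alts)
    return parts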
Example #11
def fix_nonref_positions(in_file, ref_file):
    """Fix Genotyping VCF positions where the bases are all variants.

    The plink/pseq output does not handle these correctly, and
    has all reference/variant bases reversed.
    """
    ignore_chrs = ["."]
    ref2bit = twobit.TwoBitFile(open(ref_file))
    out_file = "{0}-fix{1}".format(*os.path.splitext(in_file))

    with open(in_file) as in_handle:
        with open(out_file, "w") as out_handle:
            for line in in_handle:
                if line.startswith("#"):
                    out_handle.write(line)
                else:
                    parts = line.rstrip("\r\n").split("\t")
                    pos = int(parts[1])
                    # handle chr/non-chr naming
                    if parts[0] not in ref2bit.keys():
                        #parts[0] = parts[0].replace("chr", "")
                        parts[0] = "chr" + parts[0]
                    ref_base = None
                    if parts[0] not in ignore_chrs:
                        try:
                            #print(parts[0])
                            ref_base = ref2bit[parts[0]].get(pos - 1,
                                                             pos).upper()
                        except Exception as msg:
                            # off the end of the chromosome
                            if str(msg).startswith("end before start"):
                                print msg
                            else:
                                print parts
                                raise
                    parts = fix_vcf_line(parts, ref_base)
                    if parts is not None:
                        parts[0] = parts[0].replace("chr", "")
                        out_handle.write("\t".join(parts) + "\n")
        return out_file
Example #12
def main():
    args = get_args()
    tb = twobit.TwoBitFile(file(args.twobit))
    filtered = 0
    kept = 0
    skipped = 0
    with open(args.output, 'w') as outf:
        with open(args.fasta, 'rU') as fasta:
            for record in SeqIO.parse(fasta, 'fasta'):
                chromo, start, end = get_positions_from_coords(
                    record.description.split('|')[1])
                delta = args.buffer_length - (end - start)
                if delta < args.buffer_length:
                    split = int(round(delta / 2.))
                    new_start = start - split
                    if new_start < 0:
                        new_start = 0
                    new_end = end + split
                    sequence = tb[str(chromo)][new_start:new_end]
                    if n_count(
                            sequence) <= args.max_n and not sequence_is_masked(
                                args.mask, sequence):
                        seq = create_sequence_object(record.id, sequence,
                                                     chromo, new_start,
                                                     new_end)
                        outf.write(seq.format('fasta'))
                        kept += 1
                    else:
                        filtered += 1
                else:
                    outf.write(record.format('fasta'))
                    skipped += 1

    print "Total {} sequences.  Expanded {} and filtered {} with > {}% masked bases or > {} masked bases. Kept {}.".format(
        kept + filtered + skipped, kept + filtered, filtered, args.mask * 100,
        args.max_n, kept + skipped)
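
`get_positions_from_coords` is assumed to parse a coordinate string out of the FASTA description (the `'|'`-delimited field above). A hypothetical sketch, assuming a `chrom:start-end` format:

def get_positions_from_coords(coord_string):
    # hypothetical sketch: parse 'chr2L:1000-2000' into its parts
    chromo, span = coord_string.split(':')
    start, end = [int(pos) for pos in span.split('-')]
    return chromo, start, end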
Example #13
def writeCorrectedSam_worker(chrNameBam,
                             chrNameBit,
                             start,
                             end,
                             step=None,
                             tag_but_not_change_number=False,
                             verbose=True):
    r"""
    Writes a SAM file, deleting and adding some reads in order to compensate
    for the GC bias. **This is a stochastic method.**

    First, check if samtools can be executed, otherwise the test will fail
    >>> resp = cfg.checkProgram(samtools, 'view', '')
    >>> np.random.seed(1)
    >>> test = Tester()
    >>> args = test.testWriteCorrectedSam()
    >>> tempFile = writeCorrectedSam_worker(*args, \
    ... tag_but_not_change_number=True, verbose=False)
    >>> res = os.system("{} index {}".format(test.samtools, tempFile))
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['CP'] for r in bam.fetch(args[0], 200, 250)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    >>> tempFile = \
    ... writeCorrectedSam_worker(*test.testWriteCorrectedSam_paired(),\
    ... tag_but_not_change_number=True, verbose=False)
    >>> res = os.system("{} index {}".format(test.samtools, tempFile))
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['CP'] for r in bam.fetch('chr2L', 0, 50)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    if verbose:
        print "Sam for %s %s %s " % (chrNameBit, start, end)
    i = 0

    tbit = twobit.TwoBitFile(open(global_vars['2bit']))

    bam = pysam.Samfile(global_vars['bam'])
    tempFileName = utilities.getTempFileName(suffix='.sam')

    outfile = pysam.Samfile(tempFileName, 'wh', template=bam)
    startTime = time.time()
    matePairs = {}
    read_repetitions = 0
    removed_duplicated_reads = 0
    # cache data
    # r.flag & 4 == 0 filters out unmapped reads that
    # were nevertheless assigned a genomic position
    reads = [
        r for r in bam.fetch(chrNameBam, start, end)
        if r.pos > start and r.flag & 4 == 0
    ]

    r_index = -1
    for read in reads:
        r_index += 1
        copies = None
        gc = None

        # check if a mate has already been processed
        # to apply the same correction
        try:
            copies = matePairs[read.qname]['copies']
            gc = matePairs[read.qname]['gc']
            del matePairs[read.qname]
        except KeyError:
            # this exception happens when a mate is
            # not present. This could
            # happen because of removal of the mate
            # by some filtering
            gc = getReadGCcontent(tbit, read, fragmentLength, chrNameBit)
            if gc:
                copies = numCopiesOfRead(float(1) / R_gc[gc])
            else:
                copies = 1
        # is this read in the same orientation and position as the previous?
        if gc and r_index > 0 and read.pos == reads[r_index - 1].pos \
                and read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                copies = 0  # in other words do not take into account this read
                removed_duplicated_reads += 1
        else:
            read_repetitions = 0

        readName = read.qname
        readTag = read.tags
        if gc:
            GC = int(100 * np.round(float(gc) / fragmentLength, decimals=2))
            readTag.append(('CO', float(round(float(1) / R_gc[gc], 2))))
            readTag.append(('CP', copies))
        else:
            GC = -1

        readTag.append(('GC', GC))
        read.tags = readTag

        if read.is_paired and read.is_proper_pair \
                and not read.mate_is_unmapped \
                and not read.is_reverse:
            matePairs[readName] = {'copies': copies, 'gc': gc}
        """
        outfile.write(read)
        """
        if tag_but_not_change_number:
            outfile.write(read)
            continue

        for numCop in range(1, copies + 1):
            # the read has to be renamed such that newly
            # formed pairs will match
            if numCop > 1:
                read.qname = readName + "_%d" % (numCop)
            outfile.write(read)

        if verbose:
            if i % 500000 == 0 and i > 0:
                endTime = time.time()
                print "{},  processing {} ({:.1f} per sec) reads " \
                    "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                        i, i / (endTime - startTime),
                                        chrNameBit, start, end)
        i += 1

    outfile.close()
    if verbose:
        endTime = time.time()
        print "{},  processing {} ({:.1f} per sec) reads " \
            "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                i, i / (endTime - startTime),
                                chrNameBit, start, end)
        percentage = float(removed_duplicated_reads) * 100 / len(reads) \
            if len(reads) > 0 else 0
        print "duplicated reads removed %d of %d (%.2f) " % \
            (removed_duplicated_reads, len(reads), percentage)

    # convert sam to bam.
    command = '{0} view -bS {1} 2> /dev/null > {1}.bam'.format(
        samtools, tempFileName)
    if verbose:
        sys.stderr.write("running {}\n".format(command))

    run_shell_command(command)

    os.remove(tempFileName)
    return tempFileName + ".bam"
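
`numCopiesOfRead` is what makes the correction stochastic, but it is not defined on this page. A plausible sketch is floor-plus-random-remainder rounding, so the expected copy number equals the correction ratio (name and behavior are assumptions):

import numpy as np

def numCopiesOfRead(value):
    # hypothetical sketch: keep int(value) copies, plus one more with
    # probability equal to the fractional part, so E[copies] == value
    copies = int(value)
    if np.random.rand() < value - copies:
        copies += 1
    return copies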
Example #14
def main(svevents_file, genome_file):
    genome_2bit = twobit.TwoBitFile(open(genome_file))
    for event in svevent_reader(svevents_file):
        for vcf_line in _svevent_to_vcf(event):
            print vcf_line
Example #15
def main(hydra_file, genome_file, min_support=0):
    options = {"min_support": min_support, "max_single_size": 10000}
    out_file = "{0}.vcf".format(os.path.splitext(hydra_file)[0])
    genome_2bit = twobit.TwoBitFile(open(genome_file))
    with open(out_file, "w") as out_handle:
        hydra_to_vcf_writer(hydra_file, genome_2bit, options, out_handle)
Example #16
def main(args=None):
    args = parse_arguments().parse_args(args)
    # argparse opened these files to check that they can be written;
    # keep only the paths and close the handles
    if args.filterOut:
        filter_out_file = args.filterOut.name
        args.filterOut.close()
    else:
        filter_out_file = None

    if args.extraSampling:
        extra_sampling_file = args.extraSampling.name
        args.extraSampling.close()
    else:
        extra_sampling_file = None

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile
    global_vars['filter_out'] = filter_out_file
    global_vars['extra_sampling_file'] = extra_sampling_file

    bit = twobit.TwoBitFile(open(global_vars['2bit']))
    bam = bamHandler.openBam(global_vars['bam'])

    if args.fragmentLength:
        fragment_len_dict = \
            {'median': args.fragmentLength}

    else:
        fragment_len_dict, __ = \
            get_read_and_fragment_length(args.bamfile, None,
                                         numberOfProcessors=args.numberOfProcessors,
                                         verbose=args.verbose)
        if not fragment_len_dict:
            print "\nPlease provide the fragment length used for the " \
                "sample preparation.\n"
            exit(1)

        fragment_len_dict = {'median': int(fragment_len_dict['median'])}

    chrNameBitToBam = tbitToBamChrName(bit.index.keys(), bam.references)

    global_vars['genome_size'] = sum([bit[x].size for x in bit.index])
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    confidence_p_value = float(1) / args.sampleSize

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    # use a Poisson distribution to identify peaks that should be discarded.
    # The rate is multiplied by 4 because the real distribution of reads
    # varies depending on the GC content, and the global number of reads
    # per bp may be too low; empirically, a value of at least 4 times the
    # reads_per_bp was found to work.
    # Similarly, for the min value the rate is divided by 4.
    global_vars['max_reads'] = \
        poisson(4 * global_vars['reads_per_bp'] *
                fragment_len_dict['median']).isf(confidence_p_value)
    # this may be of no use unless the sequencing depth is really high,
    # as this value is close to 0
    global_vars['min_reads'] = \
        poisson(0.25 * global_vars['reads_per_bp'] *
                fragment_len_dict['median']).ppf(confidence_p_value)

    for key in global_vars:
        print "{}: {}".format(key, global_vars[key])

    print "computing frequencies"
    # the GC of the genome is sampled each stepSize bp.
    stepSize = max(int(global_vars['genome_size'] / args.sampleSize), 1)
    print "stepSize: {}".format(stepSize)
    data = tabulateGCcontent(fragment_len_dict,
                             chrNameBitToBam,
                             stepSize,
                             chromSizes,
                             numberOfProcessors=args.numberOfProcessors,
                             verbose=args.verbose,
                             region=args.region)

    np.savetxt(args.GCbiasFrequenciesFile.name, data)

    if args.biasPlot:
        reads_per_gc = countReadsPerGC(
            args.regionSize,
            chrNameBitToBam,
            stepSize * 10,
            chromSizes,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose,
            region=args.region)
        plotGCbias(args.biasPlot,
                   data,
                   reads_per_gc,
                   args.regionSize,
                   image_format=args.plotFileFormat)
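
The `max_reads`/`min_reads` cutoffs above use the Poisson survival and percent-point functions. Assuming `poisson` is `scipy.stats.poisson` (which matches the `isf`/`ppf` calls), the computation can be reproduced in isolation with made-up inputs:

from scipy.stats import poisson

# illustrative numbers only
reads_per_bp, fragment_len, confidence_p = 0.3, 200, 1.0 / 50000
max_reads = poisson(4 * reads_per_bp * fragment_len).isf(confidence_p)
min_reads = poisson(0.25 * reads_per_bp * fragment_len).ppf(confidence_p)
print("max_reads={0:.0f} min_reads={1:.0f}".format(max_reads, min_reads))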
Example #17
def tabulateGCcontent_worker(chromNameBam,
                             start,
                             end,
                             stepSize,
                             fragmentLength,
                             chrNameBamToBit,
                             verbose=False):
    r""" given genome regions, the GC content of the genome is tabulated for
    fragments of length 'fragmentLength' each 'stepSize' positions.

    >>> test = Tester()
    >>> args = test.testTabulateGCcontentWorker()
    >>> N_gc, F_gc = tabulateGCcontent_worker(*args)

    The forward read positions are:
    [1,  4,  10, 10, 16, 18]
    which correspond to a GC of
    [1,  1,  1,  1,  2,  1]

    The evaluated positions are
    [0,  2,  4,  6,  8, 10, 12, 14, 16, 18]
    the corresponding GC is
    [2,  1,  1,  2,  2,  1,  2,  3,  2,  1]

    >>> print N_gc
    [0 4 5 1]
    >>> print F_gc
    [0 4 1 0]
    >>> test.set_filter_out_file()
    >>> chrNameBam2bit =  {'2L': 'chr2L'}

    Test for the filter out option
    >>> N_gc, F_gc = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)
    >>> test.unset_filter_out_file()

    The evaluated positions are
    [ 0  2  8 10 12 14 16 18]
    >>> print N_gc
    [0 3 4 1]
    >>> print F_gc
    [0 3 1 0]

    Test for extra_sampling option
    >>> test.set_extra_sampling_file()
    >>> chrNameBam2bit =  {'2L': 'chr2L'}
    >>> res = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)

    The new positions evaluated are
    [0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18]
    and the GC is
    [2, 1, 1, 0, 1, 2, 2, 1,  2,  3,  2,  1]
    >>> print res[0]
    [1 5 5 1]
    >>> print res[1]
    [0 5 1 0]

    """
    if start > end:
        raise NameError("start %d bigger than end %d" % (start, end))

    chromNameBit = chrNameBamToBit[chromNameBam]

    # array to keep track of the GC from regions of length 'fragmentLength'
    # from the genome. The index of the array is used to
    # indicate the gc content. The values inside the
    # array are counts. Thus, if N_gc[10] = 3, that means
    # that 3 regions have a gc_content of 10.
    subN_gc = np.zeros(fragmentLength['median'] + 1, dtype='int')
    subF_gc = np.zeros(fragmentLength['median'] + 1, dtype='int')

    tbit = twobit.TwoBitFile(open(global_vars['2bit']))
    bam = bamHandler.openBam(global_vars['bam'])
    peak = 0
    startTime = time.time()

    if verbose:
        print "[{:.3f}] computing positions to " \
            "sample".format(time.time() - startTime)

    positions_to_sample = getPositionsToSample(chromNameBit, start, end,
                                               stepSize)

    read_counts = []
    # Optimize IO.
    # If the sample regions are far apart from each other, it is faster
    # to go to each location and fetch the reads found there.
    # Otherwise, if the regions to sample are close to each other, it is
    # faster to load all the reads of a large region into memory and
    # consider only those falling into the positions to sample.
    # The following code caches reads when the sampling positions lie
    # close together
    if np.mean(np.diff(positions_to_sample)) < 1000:
        start_pos = min(positions_to_sample)
        end_pos = max(positions_to_sample)
        if verbose:
            print "[{:.3f}] caching reads".format(time.time() - startTime)

        counts = np.bincount([
            r.pos - start_pos
            for r in bam.fetch(chromNameBam, start_pos, end_pos + 1)
            if not r.is_reverse and r.pos >= start_pos
        ],
                             minlength=end_pos - start_pos + 2)

        read_counts = counts[positions_to_sample - min(positions_to_sample)]
        if verbose:
            print "[{:.3f}] finish caching reads.".format(time.time() -
                                                          startTime)

    countTime = time.time()

    c = 1
    for index in xrange(len(positions_to_sample)):
        i = positions_to_sample[index]
        # stop if the end of the chromosome is reached
        if i + fragmentLength['median'] > tbit[chromNameBit].size:
            break

        try:
            gc = getGC_content(tbit[chromNameBit].get(
                i, i + fragmentLength['median']),
                               as_fraction=False)
        except Exception as detail:
            if verbose:
                print detail
            continue

        subN_gc[gc] += 1

        # count all reads at position 'i'
        if len(read_counts) == 0:  # case when no cache was done
            num_reads = len([
                x.pos for x in bam.fetch(chromNameBam, i, i + 1)
                if x.is_reverse is False and x.pos == i
            ])
        else:
            num_reads = read_counts[index]

        if num_reads >= global_vars['max_reads']:
            peak += 1
            continue

        subF_gc[gc] += num_reads
        if verbose:
            if index % 50000 == 0:
                endTime = time.time()
                print "%s processing %d (%.1f per sec) @ %s:%s-%s %s" % \
                    (multiprocessing.current_process().name,
                     index, index / (endTime - countTime),
                     chromNameBit, start, end, stepSize)
        c += 1

    if verbose:
        endTime = time.time()
        print "%s processing %d (%.1f per sec) @ %s:%s-%s %s" % \
            (multiprocessing.current_process().name,
             index, index / (endTime - countTime),
             chromNameBit, start, end, stepSize)
        print "%s total time %.1f @ %s:%s-%s %s" % (
            multiprocessing.current_process().name,
            (endTime - startTime), chromNameBit, start, end, stepSize)

    return (subN_gc, subF_gc)
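
`getPositionsToSample` is exercised by the doctests but not defined here. A minimal hypothetical sketch that ignores the filter-out and extra-sampling files those doctests toggle:

import numpy as np

def getPositionsToSample(chrom, start, end, stepSize):
    # hypothetical sketch: one sampling position every stepSize bp;
    # the real helper also honours filter_out/extra_sampling BED files
    return np.arange(start, end, stepSize)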
Example #18
def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
    r"""writes a bedgraph file containing the GC correction of
    a region from the genome

    >>> test = Tester()
    >>> tempFile = writeCorrected_worker(*test.testWriteCorrectedChunk())
    >>> open(tempFile, 'r').readlines()
    ['chr2L\t200\t225\t31.6\n', 'chr2L\t225\t250\t33.8\n', 'chr2L\t250\t275\t37.9\n', 'chr2L\t275\t300\t40.9\n']
    >>> os.remove(tempFile)
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    cvg_corr = np.zeros(end - start)

    i = 0

    tbit = twobit.TwoBitFile(open(global_vars['2bit']))
    bam = pysam.Samfile(global_vars['bam'])
    read_repetitions = 0
    removed_duplicated_reads = 0
    startTime = time.time()

    # caching seems to be faster
    # r.flag & 4 == 0 skips unmapped reads that are
    # nevertheless assigned a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end) if r.flag & 4 == 0]

    bam.close()
    r_index = -1
    for read in reads:
        r_index += 1
        try:
            # calculate GC content of read fragment
            gc = getReadGCcontent(tbit, read, fragmentLength, chrNameBit)
        except Exception as detail:
            # this exception happens when the end of a
            # chromosome is reached
            print detail
            continue
        if not gc:
            continue

        # is this read in the same orientation and position as the previous?
        if r_index > 0 and read.pos == reads[r_index - 1].pos and \
                read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                removed_duplicated_reads += 1
                continue
        else:
            read_repetitions = 0

        try:
            fragmentStart, fragmentEnd = getFragmentFromRead(
                read, fragmentLength, extendPairedEnds=True)
            vectorStart = max(fragmentStart - start, 0)
            vectorEnd = min(fragmentEnd - start, end - start)
        except TypeError:
            # the get_fragment_from_read functions returns None in some cases.
            # Those cases are to be skipped, hence the continue line.
            continue

        cvg_corr[vectorStart:vectorEnd] += float(1) / R_gc[gc]
        i += 1
    if debug:
        endTime = time.time()
        print "{}, processing {} ({:.1f} per sec) " \
            "reads @ {}:{}-{}".format(multiprocessing.current_process().name,
                                      i, i / (endTime - startTime),
                                      chrNameBit, start, end)

    if i == 0:
        return None

    _file = open(utilities.getTempFileName(suffix='.bg'), 'w')
    # save in bedgraph format
    for bin in xrange(0, len(cvg_corr), step):
        # cvg_corr is indexed relative to `start`; the slice clamps at the end
        value = np.mean(cvg_corr[bin:bin + step])
        if value > 0:
            writeStart = start + bin
            writeEnd = min(start + bin + step, end)
            _file.write("%s\t%d\t%d\t%.1f\n" %
                        (chrNameBit, writeStart, writeEnd, value))

    tempFileName = _file.name
    _file.close()
    return tempFileName
Example #19
def main(args=None):
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # for each fragment GC value, use the binomial distribution to compute
    # the maximum number of reads expected at a single position; counts
    # above this threshold are treated as redundant (duplicate) reads
    max_dup_gc = [
        binom.isf(1e-7, F_gc[x], 1.0 /
                  N_gc[x]) if F_gc[x] > 0 and N_gc[x] > 0 else 1
        for x in range(len(F_gc))
    ]

    global_vars['max_dup_gc'] = max_dup_gc

    bit = twobit.TwoBitFile(open(global_vars['2bit']))
    bam = pysam.Samfile(global_vars['bam'])

    global_vars['genome_size'] = sum([bit[x].size for x in bit.index])
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print "applying correction"
    # divide the genome in fragments containing about 4e5 reads.
    # This amount of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print "genome partition size for multiprocessing: {}".format(chunkSize)
    print "using region {}".format(args.region)
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(bit.index.keys(), bam.references)
    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.iteritems()])
    print chrNameBitToBam, chrNameBamToBit
    c = 1
    for chrom, size in chromSizes:
        start = 0 if regionStart == 0 else regionStart
        for i in xrange(start, size, chunkSize):
            try:
                chrNameBamToBit[chrom]
            except KeyError:
                print "no sequence information for "
                "chromosome {} in 2bit file".format(chrom)
                print "Reads in this chromosome will be skipped"
                continue
            length = min(size, i + chunkSize)
            mp_args.append(
                (chrom, chrNameBamToBit[chrom], i, length, bedGraphStep))
            c += 1

    pool = multiprocessing.Pool(args.numberOfProcessors)

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print("using {} processors for {} "
                  "number of tasks".format(args.numberOfProcessors,
                                           len(mp_args)))

            res = pool.map_async(writeCorrectedSam_wrapper,
                                 mp_args).get(9999999)
        else:
            res = map(writeCorrectedSam_wrapper, mp_args)

        if len(res) == 1:
            command = "cp {} {}".format(res[0], args.correctedFile.name)
            run_shell_command(command)
        else:
            print "concatenating (sorted) intermediate BAMs"
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb", template=header)
            header.close()
            for f in res:
                f = pysam.Samfile(f)
                for e in f.fetch(until_eof=True):
                    of.write(e)
                f.close()
            of.close()

        print "indexing BAM"
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith('bg') or \
            args.correctedFile.name.endswith('bw'):

        _temp_bg_file_name = utilities.getTempFileName(suffix='_all.bg')
        if len(mp_args) > 1 and args.numberOfProcessors > 1:

            res = pool.map_async(writeCorrected_wrapper, mp_args).get(9999999)
        else:
            res = map(writeCorrected_wrapper, mp_args)

        # concatenate intermediary bedgraph files
        _temp_bg_file = open(_temp_bg_file_name, 'w')
        for tempFileName in res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                # bedgraph file
                shutil.copyfileobj(open(tempFileName, 'rb'), _temp_bg_file)
                os.remove(tempFileName)
        _temp_bg_file.close()
        args.correctedFile.close()

        if args.correctedFile.name.endswith('bg'):
            shutil.move(_temp_bg_file_name, args.correctedFile.name)

        else:
            chromSizes = [(x, bit[x].size) for x in bit.keys()]
            writeBedGraph.bedGraphToBigWig(chromSizes, _temp_bg_file_name,
                                           args.correctedFile.name)
            os.remove(_temp_bg_file_name)
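
The `max_dup_gc` threshold above can be illustrated in isolation. Assuming `binom` is `scipy.stats.binom`, with made-up counts for one GC value:

from scipy.stats import binom

# illustrative numbers only: F_gc fragments observed among N_gc
# sampled positions for one GC value
F_gc_x, N_gc_x = 50000, 200000
# the read count per position exceeded by chance with p = 1e-7;
# anything above it is treated as a duplication artifact
print(binom.isf(1e-7, F_gc_x, 1.0 / N_gc_x))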
Example #20
def writeCorrectedSam_worker(chrNameBam,
                             chrNameBit,
                             start,
                             end,
                             step=None,
                             tag_but_not_change_number=False,
                             verbose=True):
    r"""
    Writes a BAM file, deleting and adding some reads in order to compensate
    for the GC bias. **This is a stochastic method.**
    >>> np.random.seed(1)
    >>> test = Tester()
    >>> args = test.testWriteCorrectedSam()
    >>> tempFile = writeCorrectedSam_worker(*args, \
    ... tag_but_not_change_number=True, verbose=False)
    >>> from StringIO import StringIO
    >>> ostdout = sys.stdout
    >>> import tempfile
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch(args[0], 200, 250)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    >>> tempFile = \
    ... writeCorrectedSam_worker(*test.testWriteCorrectedSam_paired(),\
    ... tag_but_not_change_number=True, verbose=False)
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch('chr2L', 0, 50)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    if verbose:
        print "Sam for %s %s %s " % (chrNameBit, start, end)
    i = 0

    tbit = twobit.TwoBitFile(open(global_vars['2bit']))

    bam = pysam.Samfile(global_vars['bam'])
    tempFileName = utilities.getTempFileName(suffix='.bam')

    outfile = pysam.Samfile(tempFileName, 'wb', template=bam)
    startTime = time.time()
    matePairs = {}
    read_repetitions = 0
    removed_duplicated_reads = 0
    # cache data
    # r.flag & 4 == 0 filters out unmapped reads that
    # were nevertheless assigned a genomic position
    reads = [
        r for r in bam.fetch(chrNameBam, start, end)
        if r.pos > start and r.flag & 4 == 0
    ]

    r_index = -1
    for read in reads:
        r_index += 1
        copies = None
        gc = None

        # check if a mate has already been processed
        # to apply the same correction
        try:
            copies = matePairs[read.qname]['copies']
            gc = matePairs[read.qname]['gc']
            del matePairs[read.qname]
        except KeyError:
            # this exception happens when a mate is
            # not present. This could
            # happen because of removal of the mate
            # by some filtering
            gc = getReadGCcontent(tbit, read, fragmentLength, chrNameBit)
            if gc:
                copies = numCopiesOfRead(float(1) / R_gc[gc])
            else:
                copies = 1
        # is this read in the same orientation and position as the previous?
        if gc and r_index > 0 and read.pos == reads[r_index - 1].pos \
                and read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                copies = 0  # in other words do not take into account this read
                removed_duplicated_reads += 1
        else:
            read_repetitions = 0

        readName = read.qname
        # Each tag is a tuple of (tag name, value, type)
        # Note that get_tags() returns ord(type) rather than type and this must
        # be fixed!
        # It turns out that the "with_value_type" option only started working in
        # pysam-0.8.4, so we can't reliably add tags on earlier versions without
        # potentially creating BAM files that break HTSJDK/IGV/etc.

        readTag = read.get_tags(with_value_type=True)
        replace_tags = False
        if len(readTag) > 0:
            if len(readTag[0]) == 3:
                if type(readTag[0][2]) is int:
                    readTag = [(x[0], x[1], chr(x[2])) for x in readTag]
                replace_tags = True
        else:
            replace_tags = True

        if gc:
            GC = int(100 * np.round(float(gc) / fragmentLength, decimals=2))
            readTag.append(('YC', float(round(float(1) / R_gc[gc], 2)), "f"))
            readTag.append(('YN', copies, "i"))
        else:
            GC = -1

        readTag.append(('YG', GC, "i"))
        if replace_tags:
            read.set_tags(readTag)

        if read.is_paired and read.is_proper_pair \
                and not read.mate_is_unmapped \
                and not read.is_reverse:
            matePairs[readName] = {'copies': copies, 'gc': gc}
        """
        outfile.write(read)
        """
        if tag_but_not_change_number:
            outfile.write(read)
            continue

        for numCop in range(1, copies + 1):
            # the read has to be renamed such that newly
            # formed pairs will match
            if numCop > 1:
                read.qname = readName + "_%d" % (numCop)
            outfile.write(read)

        if verbose:
            if i % 500000 == 0 and i > 0:
                endTime = time.time()
                print "{},  processing {} ({:.1f} per sec) reads " \
                    "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                        i, i / (endTime - startTime),
                                        chrNameBit, start, end)
        i += 1

    outfile.close()
    if verbose:
        endTime = time.time()
        print "{},  processing {} ({:.1f} per sec) reads " \
            "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                i, i / (endTime - startTime),
                                chrNameBit, start, end)
        percentage = float(removed_duplicated_reads) * 100 / len(reads) \
            if len(reads) > 0 else 0
        print "duplicated reads removed %d of %d (%.2f) " % \
            (removed_duplicated_reads, len(reads), percentage)

    return tempFileName
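
The value-type juggling above (pysam >= 0.8.4, per the comment) is the fragile part. A small round-trip sketch with pysam's tag API shows the intent, using placeholder file names:

import pysam

bam = pysam.AlignmentFile("input.bam")                       # placeholder
out = pysam.AlignmentFile("tagged.bam", "wb", template=bam)  # placeholder
for read in bam.fetch(until_eof=True):
    tags = read.get_tags(with_value_type=True)  # [(name, value, type), ...]
    tags.append(('YG', 42, 'i'))                # explicit integer type
    read.set_tags(tags)
    out.write(read)
out.close()
bam.close()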