import pysam

files = ["m54119_180806_194558.subreads.bam",
"m54119_180807_160930.subreads.bam",
"m54119_180808_103633.subreads.bam"]
total_size = 0
for fn in files:
    bam = pysam.AlignmentFile(fn, 'rb',  check_header=False, check_sq=False)
    for index,read in enumerate(bam):
        total_size += len(read.seq)
        if index % 1000000 == 0:
            print(total_size)
            print(index)
            print(fn)
Example #2
def sampleNameBam(bamFile):
    """get @RG SM: information as sample name from BAM header"""
    bam = pysam.AlignmentFile(bamFile)
    name = bam.header['RG'][0]['SM']
    return name
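
The helper above assumes a single read group. A minimal variant sketch that collects every SM value in the header (helper name is hypothetical; assumes a pysam version providing header.to_dict()):

def sampleNamesBam(bamFile):
    """Collect all @RG SM values from a BAM header."""
    with pysam.AlignmentFile(bamFile) as bam:
        read_groups = bam.header.to_dict().get('RG', [])
        return sorted({rg['SM'] for rg in read_groups if 'SM' in rg})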
Example #3
import pysam

infile = pysam.AlignmentFile("-", "rb")
outfile = pysam.AlignmentFile("-", "w", template=infile)
for s in infile:
    outfile.write(s)
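
A minimal sketch of the same stdin-to-stdout pattern with a filtering step added ("-" with "rb" reads BAM from stdin and "w" writes SAM to stdout, so the script can sit in a shell pipeline such as: samtools view -b in.bam | python filter.py > out.sam; the mapped-only filter is just an example):

import pysam

infile = pysam.AlignmentFile("-", "rb")
outfile = pysam.AlignmentFile("-", "w", template=infile)
for s in infile:
    if not s.is_unmapped:  # example filter: keep mapped reads only
        outfile.write(s)
outfile.close()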
Example #4
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--template-bam-file", dest="filename_genome_bam", type="string",
                      help="input bam file for header information [%default]")

    parser.add_option("-s", "--contigs-tsv-file", dest="filename_contigs", type="string",
                      help="filename with contig sizes [%default]")

    parser.add_option("-o", "--colour", dest="colour_mismatches", action="store_true",
                      help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option("-i", "--ignore-mismatches", dest="ignore_mismatches", action="store_true",
                      help="ignore mismatches [%default]")

    parser.add_option("-c", "--remove-contigs", dest="remove_contigs", type="string",
                      help="','-separated list of contigs to remove [%default]")

    parser.add_option("-f", "--force-output", dest="force", action="store_true",
                      help="force overwriting of existing files [%default]")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.set_defaults(
        filename_genome_bam=None,
        filename_gtf=None,
        filename_mismapped=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    genomefile, referencenames, referencelengths = None, None, None

    if options.filename_genome_bam:
        genomefile = pysam.AlignmentFile(options.filename_genome_bam, "rb")
    elif options.filename_contigs:
        contigs = IOTools.ReadMap(IOTools.openFile(options.filename_contigs))
        data = list(zip(*list(contigs.items())))
        referencenames, referencelengths = data[0], list(map(int, data[1]))
    else:
        raise ValueError(
            "please provide either --template-bam-file or --contigs-tsv-file")

    infile = pysam.AlignmentFile("-", "rb")
    outfile = pysam.AlignmentFile("-", "wb", template=genomefile,
                                  referencenames=referencenames,
                                  referencelengths=referencelengths)

    if options.colour_mismatches:
        tag = "CM"
    else:
        tag = "NM"

    nambiguous = 0
    ninput = 0
    nunmapped = 0
    ncigar = 0
    nfull = 0
    noutput = 0

    contig2tid = {name: tid for tid, name in enumerate(outfile.references)}

    for qname, readgroup in itertools.groupby(infile, lambda x: x.qname):
        ninput += 1
        reads = list(readgroup)
        if reads[0].is_unmapped:
            nunmapped += 1
            continue

        # filter for best match
        best = min([x.opt(tag) for x in reads])
        reads = [x for x in reads if x.opt(tag) == best]
        if len(reads) > 1:
            nambiguous += 1
            continue

        read = reads[0]

        # reject complicated matches (indels, etc)
        # to simplify calculations below.
        if len(read.cigar) > 1:
            ncigar += 1
            continue

        # set NH flag to latest count
        t = dict(read.tags)
        t['NH'] = 1
        read.tags = list(t.items())

        sname = infile.getrname(read.tid)

        contig, first_exon_start, middle, last_exon_end, splice, strand = sname.split(
            "|")
        first_exon_end, last_exon_start = middle.split("-")
        first_exon_start, first_exon_end, last_exon_start, last_exon_end = list(map(int, (
            first_exon_start, first_exon_end, last_exon_start, last_exon_end)))
        first_exon_end += 1

        total = first_exon_end - first_exon_start + \
            last_exon_end - last_exon_start
        first_exon_length = first_exon_end - first_exon_start

        match1 = first_exon_length - read.pos
        intron_length = last_exon_start - first_exon_end
        match2 = read.qlen - match1

        # match lies fully in one exon - ignore
        if match1 <= 0 or match2 <= 0:
            nfull += 1
            continue

        # increment pos
        read.pos = first_exon_start + read.pos
        read.tid = contig2tid[contig]
        # 3 = BAM_CREF_SKIP
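        # 0 = BAM_CMATCH; the read becomes two matched blocks separated by an N (skip) gap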
        read.cigar = [(0, match1), (3, intron_length), (0, match2)]

        outfile.write(read)

        noutput += 1

    outfile.close()
    if genomefile:
        genomefile.close()

    c = E.Counter()
    c.input = ninput
    c.output = noutput
    c.full = nfull
    c.cigar = ncigar
    c.ambiguous = nambiguous
    c.unmapped = nunmapped

    E.info("%s" % str(c))

    # write footer and output benchmark information.
    E.Stop()
Example #5
def count_bam_file_length(bam_file: str) -> int:
    """Get length of BAM indexed file"""
    import pysam

    return pysam.AlignmentFile(bam_file).count()
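
If the BAM lacks a .bai index, count() may not be usable; a hedged fallback sketch (function name is hypothetical) that simply streams through the file instead:

def count_bam_reads_unindexed(bam_file: str) -> int:
    """Count all alignments by iterating the file; no .bai index required."""
    import pysam

    with pysam.AlignmentFile(bam_file, "rb", check_sq=False) as bam:
        return sum(1 for _ in bam.fetch(until_eof=True))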
Example #6
def makemut(args, chrom, start, end, vaf, ins, avoid, alignopts):
    ''' if ins is a sequence, it is inserted at start; otherwise bases are deleted from start to end'''

    if args.seed is not None: random.seed(int(args.seed) + int(start))

    mutid = chrom + '_' + str(start) + '_' + str(end) + '_' + str(vaf)
    if ins is None:
        mutid += ':DEL'
    else:
        mutid += ':INS:' + ins

    bamfile = pysam.AlignmentFile(args.bamFileName, 'rb')
    bammate = pysam.AlignmentFile(
        args.bamFileName, 'rb')  # use for mates to avoid iterator problems
    reffile = pysam.Fastafile(args.refFasta)
    vcffile = pysam.VariantFile(args.germline,
                                'r') if args.germline is not None else None
    tmpbams = []

    is_insertion = ins is not None
    is_deletion = ins is None

    snvfrac = float(args.snvfrac)

    mutstr = get_mutstr(chrom, start, end, ins, reffile)

    del_ln = 0
    if is_deletion:
        del_ln = end - start

    mutpos = start
    mutpos_list = [start]

    # optional CNV file
    cnv = None
    if (args.cnvfile):
        cnv = pysam.Tabixfile(args.cnvfile, 'r')

    log = open(
        'addindel_logs_' + os.path.basename(args.outBamFile) + '/' +
        os.path.basename(args.outBamFile) + "." + "_".join(
            (chrom, str(start), str(end))) + ".log", 'w')

    tmpoutbamname = args.tmpdir + "/" + mutid + ".tmpbam." + str(
        uuid4()) + ".bam"
    logger.info("%s creating tmp bam: %s" % (mutid, tmpoutbamname))
    outbam_muts = pysam.AlignmentFile(tmpoutbamname, 'wb', template=bamfile)

    mutfail, hasSNP, maxfrac, outreads, mutreads, mutmates = mutation.mutate(
        args,
        log,
        bamfile,
        bammate,
        chrom,
        mutpos,
        mutpos + del_ln + 1,
        mutpos_list,
        avoid=avoid,
        mutid_list=[mutid],
        is_insertion=is_insertion,
        is_deletion=is_deletion,
        ins_seq=ins,
        reffile=reffile,
        indel_start=start,
        indel_end=end,
        vcffile=vcffile)

    if mutfail:
        outbam_muts.close()
        os.remove(tmpoutbamname)
        return None

    # pick reads to change
    readlist = []
    for extqname, read in outreads.items():
        if read.seq != mutreads[extqname]:
            readlist.append(extqname)

    logger.info("%s len(readlist): %d" % (mutid, len(readlist)))
    readlist.sort()
    random.shuffle(readlist)

    if len(readlist) < int(args.mindepth):
        logger.warning("%s skipped, too few reads in region: %d" %
                       (mutid, len(readlist)))
        outbam_muts.close()
        os.remove(tmpoutbamname)
        return None

    if vaf is None:
        vaf = float(args.mutfrac
                    )  # default minor allele freq if not otherwise specified

    if cnv:  # cnv file is present
        if chrom in cnv.contigs:
            for cnregion in cnv.fetch(chrom, start, end):
                cn = float(
                    cnregion.strip().split()[3])  # expect chrom,start,end,CN
                logger.info(mutid + "\t" +
                            ' '.join(("copy number in snp region:", chrom,
                                      str(start), str(end), "=", str(cn))))
                if float(cn) > 0.0:
                    vaf = vaf / float(cn)
                else:
                    vaf = 0.0
                logger.info("%s adjusted VAF: %f" % (mutid, vaf))
    else:
        logger.info("%s selected VAF: %f" % (mutid, vaf))

    lastread = int(len(readlist) * vaf)

    # pick at least args.minmutreads if possible
    if lastread < int(args.minmutreads):
        if len(readlist) > int(args.minmutreads):
            lastread = int(args.minmutreads)
            logger.warning("%s forced %d reads" % (mutid, lastread))
        else:
            logger.warning(
                "%s dropped site with fewer reads than --minmutreads" % mutid)
            os.remove(tmpoutbamname)
            return None

    readtrack = dd(list)

    for readname in readlist:
        orig_name, readpos, pairend = readname.split(',')
        readtrack[orig_name].append('%s,%s' % (readpos, pairend))

    usedreads = 0
    newreadlist = []

    for orig_name in readtrack:
        for read_instance in readtrack[orig_name]:
            newreadlist.append(orig_name + ',' + read_instance)
            usedreads += 1

        if usedreads >= lastread:
            break

    readlist = newreadlist

    logger.info("%s picked: %d reads" % (mutid, len(readlist)))

    wrote = 0
    nmut = 0
    mut_out = {}
    # change reads from .bam to mutated sequences
    for extqname, read in outreads.items():
        if read.seq != mutreads[extqname]:
            if not args.nomut and extqname in readlist:
                qual = read.qual  # changing seq resets qual (see pysam API docs)
                read.seq = mutreads[extqname]  # make mutation
                read.qual = qual
                nmut += 1
        if not hasSNP or args.force:
            wrote += 1
            mut_out[extqname] = read

    muts_written = {}

    for extqname in mut_out:
        if extqname not in muts_written:
            outbam_muts.write(mut_out[extqname])
            muts_written[extqname] = True

            if mutmates[extqname] is not None:
                # is mate also in mutated list?
                mate_read = mutmates[extqname]

                pairname = 'F'  # read is first in pair
                if mate_read.is_read2:
                    pairname = 'S'  # read is second in pair
                if not mate_read.is_paired:
                    pairname = 'U'  # read is unpaired

                mateqname = ','.join(
                    (mate_read.qname, str(mate_read.pos), pairname))

                if mateqname in mut_out:
                    # yes: output mutated mate
                    outbam_muts.write(mut_out[mateqname])
                    muts_written[mateqname] = True

                else:
                    # no: output original mate
                    outbam_muts.write(mate_read)

    logger.info("%s wrote: %d, mutated: %d" % (mutid, wrote, nmut))

    if not hasSNP or args.force:
        outbam_muts.close()
        aligners.remap_bam(args.aligner,
                           tmpoutbamname,
                           args.refFasta,
                           alignopts,
                           threads=int(args.alignerthreads),
                           mutid=mutid,
                           paired=(not args.single),
                           insane=args.insane)

        outbam_muts = pysam.AlignmentFile(tmpoutbamname, 'rb')
        coverwindow = 1
        incover = countReadCoverage(bamfile, chrom, mutpos - coverwindow,
                                    mutpos + del_ln + coverwindow)
        outcover = countReadCoverage(outbam_muts, chrom, mutpos - coverwindow,
                                     mutpos + del_ln + coverwindow)

        avgincover = float(sum(incover)) / float(len(incover))
        avgoutcover = float(sum(outcover)) / float(len(outcover))
        spikein_frac = 0.0
        if wrote > 0:
            spikein_frac = float(nmut) / float(wrote)

        # qc cutoff for final snv depth
        if (avgoutcover > 0 and avgincover > 0 and avgoutcover / avgincover >=
                float(args.coverdiff)) or args.force:
            tmpbams.append(tmpoutbamname)
            indelstr = ''
            if is_insertion:
                indelstr = ':'.join(('INS', chrom, str(start), ins))
            else:
                indelstr = ':'.join(('DEL', chrom, str(start), str(end)))

            snvstr = chrom + ":" + str(start) + "-" + str(
                end) + " (VAF=" + str(vaf) + ")"
            log.write("\t".join(("indel", indelstr, str(mutpos), mutstr,
                                 str(avgincover), str(avgoutcover),
                                 str(spikein_frac), str(maxfrac))) + "\n")
        else:
            outbam_muts.close()
            os.remove(tmpoutbamname)
            if os.path.exists(tmpoutbamname + '.bai'):
                os.remove(tmpoutbamname + '.bai')

            logger.warning("%s dropped for outcover/incover < %s" %
                           (mutid, str(args.coverdiff)))
            return None

    outbam_muts.close()
    bamfile.close()
    bammate.close()
    log.close()

    return sorted(tmpbams)
Example #7
def get_position_matrix(bam, chrom, start, stop, reffile, stepper='all'):
    """
    Given a coordinate range, return a dataframe containing positional
    coverage.

    :param bam: basestring
        BAM file name
    :param chrom: basestring
        chromosome (ie. chr1)
    :param start: int
        start position (start at this position)
    :param stop: int
        stop position (do not exceed this position)
    :param reffile: basestring
        reference fasta file
    :param stepper: basestring
        pileup stepper setting passed to pysam (e.g. 'all')
    :return: pandas.DataFrame of per-position base counts
    """
    total_reads = 0
    reference = pybedtools.BedTool.seq([chrom, 0, stop], reffile)
    infile = pysam.AlignmentFile(bam, "rb", reference_filename=reffile)
    count = start  # running counter for each position added
    alphabet = {}
    positions = []
    offset = 0
    max_offset = 0
    MAX_DEPTH = 10000000
    check = start
    for pileupcolumn in infile.pileup(chrom,
                                      start,
                                      stop,
                                      stepper=stepper,
                                      max_depth=MAX_DEPTH):
        if pileupcolumn.pos >= start:
            st = ""
            # print("pileuppos: {}".format(pileupcolumn.pos))
            # print("count: {}".format(count))
            if count >= stop or pileupcolumn.pos >= stop:  # I think this works because of sorted reads?
                break
            """
            if there is no read coverage at the beginning positions
            """
            while (count < pileupcolumn.pos):
                alphabet['A'] = 0
                alphabet['T'] = 0
                alphabet['C'] = 0
                alphabet['G'] = 0
                alphabet['del'] = 0
                alphabet['ref'] = reference[count].upper()  # ref base at the uncovered position
                # print(alphabet)
                positions.append(alphabet)
                alphabet = {}
                # print("ADDING COUNT")
                count = count + 1

                # print('{}\t0'.format(count))
            # print str(pp.pos)+'\t'+str(pp.n)
            # print(len(pileupcolumn.pileups))
            for pileupread in pileupcolumn.pileups:  # for each pileup read
                total_reads = total_reads + 1
                if not pileupread.is_del and not pileupread.indel and not pileupread.is_refskip:
                    st = st + pileupread.alignment.query_sequence[
                        pileupread.query_position]
                elif pileupread.is_del:
                    st = st + 'd'
                elif pileupread.is_refskip:
                    st = st + 's'
                else:
                    st = st + '-'

                    # print(st)
                    # print("ADDING: {} at step: {}, at pos: {}".format(st, count, pileupcolumn.reference_pos))
            alphabet['A'] = st.count('A')
            alphabet['T'] = st.count('T')
            alphabet['C'] = st.count('C')
            alphabet['G'] = st.count('G')
            alphabet['del'] = st.count('d')
            alphabet['ref'] = reference[pileupcolumn.reference_pos].upper()
            count = count + 1
            # print(alphabet)

            positions.append(alphabet)
            alphabet = {}
            # print('{} '.format(count)),
    """
    If there are positions in the end without read coverage
    """
    while count < stop:
        # count = count + 1
        alphabet['A'] = 0
        alphabet['T'] = 0
        alphabet['C'] = 0
        alphabet['G'] = 0
        alphabet['del'] = 0
        alphabet['ref'] = reference[count].upper()
        # print(alphabet)
        count = count + 1
        positions.append(alphabet)
    # print(start, stop, len(positions), max_offset, check-start)
    # print(total_reads)
    return pd.DataFrame(positions)
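
A hedged usage sketch (file names and coordinates below are placeholders):

# df = get_position_matrix("sample.bam", "chr1", 10000, 10100, "genome.fa")
# df[["A", "T", "C", "G", "del", "ref"]].head()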
Example #8
 def __init__(
     self,
     sites,
     bam,
     logs_dir,
     # send_end,
     name,
     cells=None,
     log_every=25000,
     pad_left=15,
     pad_right=20,
     min_reads=2,
     min_freq=0.65,
     site_keys=None,
     add_seq=False,
     chr_tag="",
     **kwargs,
 ):
     super().__init__(name=name, **kwargs)
     self.sites = sites
     self.chr_tag = chr_tag
     self.bam = pysam.AlignmentFile(bam, "r")
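      # when reading, pysam auto-detects SAM/BAM/CRAM regardless of the "r" mode string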
     self.logs_dir = logs_dir
     self.log_every = int(log_every)
     # self.send_end = send_end
     self.pad_left = int(pad_left)
     self.pad_right = int(pad_right)
     self.min_reads = min_reads
     self.min_freq = min_freq
     self.add_seq = add_seq
     self.iter = 0
     self.n_corr = 0
     self.n_umi = 0
     self.rm_reads = []
     self.info = dict()
     self.info["umi"] = dict(
         cfreq=[],
         freq=[],
         count=[],
         nucl=[],
         qual=[],
         avg_qual=[],
         motif_len=[],
         variants=[],
         chrom=[],
         start=[],
         stop=[],
     )
     self.info["global"] = dict(
         n_umi=[],
         n_umi_corr=[],
         n_reads=[],
         n_reads_corr=[],
         f_min_reads=[],
         f_min_freq=[],
         nucl=[],
         motif_len=[],
         variants=[],
         variants_kept=[],
         chrom=[],
         start=[],
         stop=[],
     )
     self.sites_kept = set()
     if cells is not None:
         self.cells = set(
             map(lambda x: x.split()[0], open(cells, "r").read().splitlines())
         )
         self.has_cells = True
     else:
         self.cells = set()
         self.has_cells = False
     if site_keys is None:
         self.site_keys = OrderedDict(
             {v: i for i, v in enumerate(["chrom", "start", "stop"])}
         )
     else:
         self.site_keys = OrderedDict(site_keys)
     self.variant_f = open(self.pr_path.format("variants", "txt"), "w")
     self.variant_f.write(
         "chrom start cb a1 a2 n-umi-1 n-umi-2 n-reads-1 n-reads-2\n"
     )
Example #9
def rawAssignment(SRAList, Names, Params):
    # changes: 04.Jul :: restructure h5 file - creates h5 files with keys like "For_rpm/I" and sets "Position" to index
    makeDirectory("6-AssignRaw")
    makeDirectory("6-AssignRaw/Reports")
    # include_mapped_twice influences how the normalisation factor is computed, see below
    include_mapped_twice = Params[
        'MappedTwice']  # includes reads mapped twice NH:i:2
    save_csv = bool(Params["Save2csv"]
                    )  # save output to tab-delimited csv in addition to hdf5

    rlmin = int(Params["ReadLenMiN"])
    rlmax = int(Params["ReadLenMaX"])
    Mapping = Params["Mapping"]  # Mapping 5 or 3 prime end
    rlrange = str(rlmin) + "-" + str(rlmax)  # read length range 4 filename

    for iN in Names:

        BamName = "5-Aligned/" + iN + ".bam"  # sorted and indexed BAM
        bamfile = pysam.AlignmentFile(BamName, "rb")  # open BAM file
        outfile_for = "6-AssignRaw/" + iN + "_raw" + "_For.txt"
        outfile_rev = "6-AssignRaw/" + iN + "_raw" + "_Rev.txt"
        outfile_hdf = "6-AssignRaw/" + iN + ".h5"
        outf_idx_hdf = "6-AssignRaw/" + iN + "_" + "idx" + ".h5"
        LogFileName = "6-AssignRaw/Reports/" + iN + "_log.txt"
        LOG_FILE = open(LogFileName, "wt")
        # counters for log
        c2_twice = c_once = total_no = 0
        # empty dataframe for collecting data
        df_for_sum = pd.DataFrame()
        df_rev_sum = pd.DataFrame()
        # Process Log
        report = "\nBamFile: {}\nrlmin:   {}\nrlmax:   {}\nName:    {}\nMapping: {}".format(
            BamName, rlmin, rlmax, iN, Mapping)
        LOG_FILE.write(report + "\n")
        print(report, "\n")

        # yeastChr() gives an ordered list of chromosomes
        for ref in yeastChr():
            c1 = 0
            c2 = 0
            reads_mapped_ref = 0
            ref_total_read_count = 0

            defF = defaultdict(list)  # DefaultDict  For
            defR = defaultdict(list)  # DefaultDict  Rev
            ForDict = {}  # Collecting  data For
            RevDict = {}  # Collecting  data Rev

            for read in bamfile.fetch(ref):
                ref_total_read_count += 1
                # collect number of reads
                if (read.get_tag("NH") == 1):
                    c1 += 1
                elif (read.get_tag("NH") == 2):
                    c2 += 1
                else:
                    pass

                # the NH tag (NH:i:1) tells how many times a read mapped to the genome
                if read.get_tag("NH") == 1:
                    reads_mapped_ref += 1
                    readl = read.query_length  # get read length
                    # Redefining leftmost & rightmost
                    if not read.is_reverse:  # read is Forward
                        beg = read.reference_start  # 5'
                        end = read.reference_end - 1  # 3' correct by -1
                    else:  # read is Reverse
                        beg = read.reference_end - 1  # 5' correct by -1
                        end = read.reference_start  # 3'

                    if Mapping == "5":
                        defR[readl].append(
                            beg) if read.is_reverse else defF[readl].append(
                                beg)
                    if Mapping == "3":
                        defR[readl].append(
                            end) if read.is_reverse else defF[readl].append(
                                end)
                # to include those mapped twice
                if (read.get_tag("NH") == 2) & (include_mapped_twice == "Yes"):
                    reads_mapped_ref += 1
                    readl = read.query_length  # get read length
                    # Redefining leftmost & rightmost
                    if not read.is_reverse:  # read is Forward
                        beg = read.reference_start  # 5'
                        end = read.reference_end - 1  # 3' correct by -1
                    else:  # read is Reverse
                        beg = read.reference_end - 1  # 5' correct by -1
                        end = read.reference_start  # 3'

                    if Mapping == "5":
                        defR[readl].append(
                            beg) if read.is_reverse else defF[readl].append(
                                beg)
                    if Mapping == "3":
                        defR[readl].append(
                            end) if read.is_reverse else defF[readl].append(
                                end)

            dummy = [0]
            for rlen in range(rlmin, rlmax + 1):
                ForDict[rlen] = Counter(defF.get(
                    rlen, dummy))  # .get() method if rlen
                RevDict[rlen] = Counter(defR.get(
                    rlen, dummy))  # if don't exist use dummy

            df_for = update_df(pd.DataFrame(ForDict), Chr=ref, strand="+")
            df_rev = update_df(pd.DataFrame(RevDict), Chr=ref, strand="-")

            df_for_sum = pd.concat([df_for_sum, df_for],
                                   ignore_index=True)  # collect summary table
            df_rev_sum = pd.concat([df_rev_sum, df_rev],
                                   ignore_index=True)  # collect summary table

            # Log_File pr Chr
            report = "{:<5s}\t{:>10,d} reads".format(ref, reads_mapped_ref)
            LOG_FILE.write(report + "\n")
            print(report)
            # Reset/collect counter data
            c_once += c1
            c2_twice += c2  # mapped twice
            total_no += ref_total_read_count
            reads_mapped_ref = ref_total_read_count = 0

        # Per Name !!!
        # convert int column names to str
        df_for_sum.rename(columns={i: str(i)
                                   for i in range(rlmin, rlmax + 1)},
                          inplace=True)  # num col_names to str
        df_rev_sum.rename(columns={i: str(i)
                                   for i in range(rlmin, rlmax + 1)},
                          inplace=True)  # num col_names to str

        ## Log Report summary
        report = "\nTotal No of reads {:>11,} mapped to genome\n".format(
            total_no)
        report += "Number   of reads {:>11,d} mapped once to genome\n".format(
            c_once)
        report += "Number   of reads {:>11,d} mapped twice to genome reports # will be added if MappedTwice == True \n".format(
            c2_twice)
        report += "Number   of reads {:>11,d} mapped more than counted already\n".format(
            total_no - (c_once + c2_twice))
        LOG_FILE.write(report)
        print(report)
        ##>>
        report = "\nOutput tables are stored:"
        LOG_FILE.write(report + "\n")
        print(report)

        if save_csv == True:
            df_for_sum.to_csv(outfile_for, sep='\t', header=True,
                              index=True)  # csv table output
            df_rev_sum.to_csv(outfile_rev, sep='\t', header=True,
                              index=True)  # csv table output
            report = "{}\n{}\n".format(outfile_for, outfile_rev)
            LOG_FILE.write(report + "\n")
            print(report)

        report = "{}\tkeys: 'For_raw', 'Rev_raw'".format(outfile_hdf)

        store = pd.HDFStore(outfile_hdf, complevel=5, complib="zlib", mode="w")
        store.put("For_raw", df_for_sum, format="table", data_columns=True)
        store.put("Rev_raw", df_rev_sum, format="table", data_columns=True)
        LOG_FILE.write("\n" + report + "\n")
        print(report)
        # Convert to RPM s
        report = "\n Converting raw -> rpm \n"
        LOG_FILE.write(report + "\n")
        print(report)
        report = ""
        #
        ## Convert RAW -> RPM
        #
        # include_mapped_twice = Yes  mapped twice are included to RPM normalisation
        #
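        # RPM = raw_count / (mapped_reads / 1e6); normFactor computed below is that divisor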
        normFactor = 0
        if include_mapped_twice == "Yes":
            n_reads = sum(1 for read in bamfile.fetch()
                          if read.get_tag("NH") <= 2)  # reads mapped once & twice
            normFactor = n_reads / 10**6  # normalisation factor
            report = "Normalization factor {} is computed based on reads mapped once and twice {:,}".format(
                normFactor, n_reads)
        else:
            n_reads = sum(1 for read in bamfile.fetch()
                          if read.get_tag("NH") == 1)  # reads mapped once
            normFactor = n_reads / 10**6  # normalisation factor
            report = "Normalization factor {} is computed based on reads mapped once {:,}".format(
                normFactor, n_reads)

        LOG_FILE.write(report + "\n")
        print(report)
        report = ""
        col2norm = [str(i) for i in (range(rlmin, rlmax + 1))] + ["sum"]

        for iX in col2norm:  # normalization

            df_for_sum[iX] = df_for_sum[iX] / normFactor
            df_rev_sum[iX] = df_rev_sum[iX] / normFactor
            line = "Normal factor for {} - {:7.4f}".format(iX, normFactor)
            report += line + "\n"
            print(line)

        LOG_FILE.write(report + "\n")
        print("")

        if save_csv == True:
            outfile_for, outfile_rev = outfile_for.replace(
                "_raw", "_rpm"), outfile_rev.replace("_raw", "_rpm")
            df_for_sum.to_csv(outfile_for, sep='\t', header=True,
                              index=True)  # csv table output
            df_rev_sum.to_csv(outfile_rev, sep='\t', header=True,
                              index=True)  # csv table output
            report = "{}\n{}\n".format(outfile_for, outfile_rev)
            LOG_FILE.write(report + "\n")
            print(report)

        store.put("For_rpm", df_for_sum, format="table", data_columns=True)
        store.put("Rev_rpm", df_rev_sum, format="table", data_columns=True)
        store.close()
        report = "\n{}\tkeys: 'For_rpm', 'Rev_rpm'\n".format(outfile_hdf)
        report += "\n{}\tTime taken thus far: {}".format(
            iN,
            time.time() - Start)
        LOG_FILE.write(report + "\n")
        print(report)

        # restructure the hdf file
        infile = outfile_hdf
        outfile = outf_idx_hdf
        restructurate_hd5(infile, outfile, close_outfile=True)
        report = "Restructurate hdf\nInfile:   {}\nOutfile:  {}".format(
            infile, outfile)
        LOG_FILE.write(report + "\n")
        print(report, "\n")

    LOG_FILE.close()
    bamfile.close()
Example #10
def main():
    args = parse_args()
    og_bam = args.original_bam
    read_set_out = args.read_set_out
    read_set = args.read_set
    files_to_delete = args.files_to_delete
    files_to_delete_path = args.files_to_delete_path
    new_file_path = args.new_file_path
    new_bams = args.new_bam_filename
    new_fastqs = args.new_fastq_filename

    print dir(gzip_module)

    bam_files = []
    fastq_files = []

    original_reads = set()
    new_reads = set()

    to_remove = []

    if og_bam == None and read_set == None:
        raise ValueError('--original_bam or --read_set parameter must be set.')

    if not og_bam == None and read_set_out == None:
        raise ValueError(
            '--read_set_out parameter must be set when using --original_bam.')

    if og_bam == None and not read_set_out == None:
        raise ValueError(
            '--original_bam parameter must be set when using --read_set_out.')

    if not og_bam == None and not read_set == None:
        raise ValueError(
            '--original_bam and --read_set parameters cannot both be set. Only use one.'
        )

    if not new_bams == None and not new_fastqs == None:
        raise ValueError(
            '--new_bam and --new_fastq parameters cannot both be set. Only use one.'
        )

    if new_bams == None and new_fastqs == None:
        raise ValueError(
            'Either --new_bam_filename or --new_fastq_filename must be set.')

    if not files_to_delete == None and files_to_delete_path == None:
        print "Using './' path as default filepath for files indicated by --files_to_delete parameter."

    if new_file_path == None:
        print "Using './' path as default filepath for new BAMs or new FASTQs."

    if not og_bam == None and read_set == None:
        # original bam parameter set and read_set parameter not set.

        samfile = pysam.AlignmentFile(os.path.abspath(og_bam), 'rb')

        for read in samfile.fetch(until_eof=True):
            if read.is_read1:
                original_reads.add(read.query_name + '/1')
            elif read.is_read2:
                original_reads.add(read.query_name + '/2')
            else:
                original_reads.add(read.query_name)

        p = subprocess.Popen('gzip > ' + read_set_out + '.gz',
                             bufsize=-1,
                             shell=True,
                             stdin=subprocess.PIPE)
        p.stdin.write('\n'.join(original_reads))

    elif og_bam == None and not read_set == None:
        # read set parameter set and original bam parameter not set.
        if '.gz' in os.path.basename(read_set):
            with gzip_module.open(read_set, 'rb') as f:
                original_reads = set(r.rstrip() for r in f)
        else:
            with open(read_set) as f:
                original_reads = set(r.rstrip() for r in f)

    # Parse new bam or new fastqs into set
    if not new_bams == None and new_fastqs == None:
        if len(new_bams) == 1:
            if '*' in new_bams[0]:
                bam_files = glob.glob(
                    os.path.join(new_file_path, os.path.basename(new_bams[0])))
            else:
                bam_files.append(
                    os.path.join(new_file_path, os.path.basename(new_bams[0])))
        else:
            bam_files = [
                os.path.join(new_file_path, os.path.basename(b))
                for b in new_bams
            ]

        for bam_file in bam_files:
            samfile = pysam.AlignmentFile(os.path.abspath(bam_file), 'rb')

            for read in samfile.fetch(until_eof=True):
                if read.is_read1:
                    new_reads.add(read.query_name + '/1')
                elif read.is_read2:
                    new_reads.add(read.query_name + '/2')
                else:
                    new_reads.add(read.query_name)

    elif new_bams == None and not new_fastqs == None:
        if len(new_fastqs) == 1:
            if '*' in new_fastqs[0]:
                fastq_files = glob.glob(
                    os.path.join(new_file_path,
                                 os.path.basename(new_fastqs[0])))
            else:
                fastq_files.append(
                    os.path.join(new_file_path,
                                 os.path.basename(new_fastqs[0])))
        else:
            fastq_files = [
                os.path.join(new_file_path, os.path.basename(f))
                for f in new_fastqs
            ]

        for fastq_file in fastq_files:
            abs_fastq_file = os.path.abspath(fastq_file)
            if '.gz' in fastq_file:
                with gzip_module.open(abs_fastq_file) as fastq:
                    i = 0
                    for line in fastq:
                        if i % 4 == 0:
                            qname = line.rstrip()
                            new_reads.add(qname[1:])
                        i += 1
            else:
                with open(abs_fastq_file) as fastq:
                    i = 0
                    for line in fastq:
                        if i % 4 == 0:
                            qname = line.rstrip()
                            new_reads.add(qname[1:])
                        i += 1

    # Check if the two sets are the same.
    if not len(original_reads - new_reads) == 0:
        print original_reads - new_reads
        raise ValueError(
            "FAIL: Read names are missing in the new files compared to original file."
        )
    elif not len(new_reads - original_reads) == 0:
        print new_reads - original_reads
        raise ValueError(
            "FAIL: More read names in new file(s) compared to original file.")
    else:
        print "SUCCESS: Read names in original file matches read names in new file(s)."
        to_remove = []
        if not files_to_delete == None:
            if len(files_to_delete) == 1:
                if '*' in files_to_delete[0]:
                    to_remove = glob.glob(
                        os.path.join(files_to_delete_path,
                                     os.path.basename(files_to_delete[0])))
                else:
                    to_remove.append(
                        os.path.join(files_to_delete_path,
                                     os.path.basename(files_to_delete[0])))
            else:
                to_remove = [
                    os.path.join(files_to_delete_path, os.path.basename(f))
                    for f in files_to_delete
                ]
            subprocess.call(["rm"] + to_remove)

    return
Example #11
 def openFile(self, dataFile):
     return pysam.AlignmentFile(dataFile)
Example #12
    return distances[-1]


# criteria
# the boundary of real and noisy STAMPs
num_of_STAMPs = int(sys.argv[3])
# num_of_STAMPs = 500
edit_distance_threthold = 1
minimal_mapping_quality = 10

bam_file = sys.argv[2]

with open(sys.argv[1], 'r') as f:
    selected_STAMPs = [next(f).split('\t')[1] for x in range(num_of_STAMPs)]

f = pysam.AlignmentFile(bam_file, 'rb')
output_bam_noisy = bam_file.replace('.bam', '') + '_mapQ' + \
                   str(minimal_mapping_quality) + '_below' + \
                   str(num_of_STAMPs) + '.bam'
f_output_noisy = pysam.AlignmentFile(output_bam_noisy, 'wb', template=f)

output_bam_real = bam_file.replace('.bam', '') + '_mapQ' + \
                  str(minimal_mapping_quality) + '_above' + \
                  str(num_of_STAMPs) + '.bam'
f_output_real = pysam.AlignmentFile(output_bam_real, 'wb', template=f)

for read in f.fetch(until_eof=True):
    if read.mapping_quality >= minimal_mapping_quality:
        indicator = int()
        for i in read.tags:
            if i[0] == 'XC':
Example #13
    def _parse_sam_file_and_vcf(
        cls,
        samfile,
        query_vcf_file,
        flank_length,
        allow_mismatches,
        exclude_regions=None,
        max_soft_clipped=3,
        number_ns=0,
    ):
        if exclude_regions is None:
            exclude_regions = {}

        found = []
        match_flag = []
        correct_allele = []
        gt_conf = []
        allele = []

        samfile_handle = pysam.AlignmentFile(samfile, "r")
        sam_previous_record_name = None
        for sam_record in samfile_handle.fetch(until_eof=True):
            if sam_record.query_name == sam_previous_record_name:
                continue
            sam_previous_record_name = sam_record.query_name
            found_conf = False
            found_allele = False

            # see if excluded region in bed file
            ref, start, ref_num, var_num, allele_num = sam_record.query_name.rsplit(
                ".", maxsplit=5)
            start = int(start) + flank_length
            exclude = False
            for ref_name in exclude_regions.keys():
                end = int(start) + 1
                interval = pyfastaq.intervals.Interval(start, end)
                exclude = EvaluateRecall._interval_intersects_an_interval_in_list(
                    interval, exclude_regions[ref_name])
            if exclude:
                found.append("Exclude")
                gt_conf.append(0)
                allele.append("0")
                continue

            match = EvaluateRecall._check_if_sam_match_is_good(
                sam_record,
                flank_length,
                query_sequence=sam_record.query_sequence,
                allow_mismatches=allow_mismatches,
                max_soft_clipped=max_soft_clipped,
            )
            alignment_start = str(sam_record).split("\t")[3]
            match_flag.append(match)
            if match == "Good":
                logging.debug("SAM record is a good match")
                logging.debug("SAM record reference is %s" %
                              sam_record.reference_name)
                ref_name, expected_start, vcf_pos_index, vcf_record_index, allele_index = sam_record.reference_name.rsplit(
                    ".", maxsplit=4)

                vcf_reader = pysam.VariantFile(query_vcf_file)
                vcf_interval_start = (int(expected_start) +
                                      int(alignment_start) + flank_length - 2 -
                                      number_ns)
                vcf_interval_end = (int(expected_start) +
                                    int(alignment_start) + flank_length -
                                    number_ns)
                logging.debug(
                    "Find VCF records matching ref %s in interval [%i,%i]" %
                    (ref_name, vcf_interval_start, vcf_interval_end))
                for i, vcf_record in enumerate(
                        vcf_reader.fetch(ref_name, vcf_interval_start,
                                         vcf_interval_end)):
                    if i == int(vcf_pos_index):
                        sample_name = vcf_record.samples.keys()[0]
                        if ("GT" in vcf_record.format.keys() and len(
                                set(vcf_record.samples[sample_name]["GT"]))
                                == 1):
                            if int(allele_index) == int(
                                    vcf_record.samples[sample_name]["GT"][0]):
                                found.append("1")
                                allele.append(str(allele_index))
                                correct_allele.append("1")
                                found_allele = True
                                if "GT_CONF" in vcf_record.format.keys():
                                    gt_conf.append(
                                        int(
                                            float(
                                                vcf_record.samples[sample_name]
                                                ["GT_CONF"])))
                                    found_conf = True

            if not found_allele:
                found.append("0")
                allele.append("0")
                correct_allele.append("0")
            if not found_conf:
                gt_conf.append(0)
        assert len(found) == len(gt_conf)
        assert len(found) == len(allele)
        assert len(found) == len(match_flag)
        assert len(found) == len(correct_allele)
        return found, gt_conf, allele, match_flag, correct_allele
Example #14
                        "--fasta",
                        help="Input fasta file",
                        required=True)
    parser.add_argument("-l", "--length", help="Read length", default=100)
    parser.add_argument("-o",
                        "--output",
                        help="Output BAM file",
                        required=True)
    parser.add_argument("-s", "--seed", help="Random seed", default=2)
    parser.add_argument("--verbose", help="Verbose mode", action="store_true")
    opts = parser.parse_args()

    seqs = loadSeqsFromFasta(opts.fasta)
    header = makeHeader(seqs)

    with pysam.AlignmentFile(opts.output, "wb", header=header) as outf:
        for i, seq in enumerate(seqs):
            n = 0
            if opts.verbose:
                print("Chromosome: {}".format(seq))
            for pos in range(len(seqs[seq]) - int(opts.length)):
                n += 1
                a = pysam.AlignedSegment()
                a.query_name = "read_" + str(pos) + "_" + str(n)

                a.query_sequence = seqs[seq][pos:pos + int(opts.length)]
                a.flag = 0
                a.reference_id = i
                a.reference_start = pos
                a.mapping_quality = 20
                a.cigarstring = str(opts.length) + 'M'
Example #15
def to_bam(string):
    return pysam.AlignmentFile(string, mode="rb")
Example #16
            chrom_ind += chrom_span
        if len(gene_merge_list) != 0:
            merge_dict[sort_df.loc[i, 'name2']] = gene_merge_list
        i += sym_span
    return merge_dict


samfile_path = '/Users/liuzhen/intern/data/test/tophat_map_g1/accepted_hits.bam'
refgene_path = '/Users/liuzhen/intern/data/test/refGene.txt'
result_path = '/Users/liuzhen/intern/data/test/RPKM_gene_dpsfd_avglen_result.txt'
refgene_column = [
    'bin', 'name', 'chrom', 'strand', 'txStart', 'txEnd', 'cdsStart', 'cdsEnd',
    'exonCount', 'exonStarts', 'exonEnds', 'score', 'name2', 'cdsStartStat',
    'cdsEndStat', 'exonFrames'
]
samfile = pysam.AlignmentFile(samfile_path, 'rb')
refgene_df = pd.read_table(refgene_path,
                           sep='\t',
                           header=None,
                           names=refgene_column)

mapped_reads_amount = samfile.count()
merge_dict = mergeRefGeneDf(refgene_df)
result_list = []
for gene in merge_dict:
    read_counts = 0
    exon_length = 0
    chrom_num = 0
    trans_id_list = []
    tss_list = []
    chrom_list = []
Example #17
def main(args):
    logger.info("starting %s called with args: %s" %
                (sys.argv[0], ' '.join(sys.argv)))
    bedfile = open(args.varFileName, 'r')
    reffile = pysam.Fastafile(args.refFasta)

    if not os.path.exists(args.bamFileName + '.bai'):
        logger.error("input bam must be indexed, not .bai file found for %s" %
                     args.bamFileName)
        sys.exit(1)

    alignopts = {}
    if args.alignopts is not None:
        alignopts = dict([o.split(':') for o in args.alignopts.split(',')])
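        # e.g. "k:19,A:2" becomes {'k': '19', 'A': '2'} (option names here are only illustrative)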

    aligners.checkoptions(args.aligner, alignopts)

    # load readlist to avoid, if specified
    avoid = None
    if args.avoidreads is not None:
        avoid = dictlist(args.avoidreads)

    # make a temporary file to hold mutated reads
    outbam_mutsfile = "addindel." + str(uuid4()) + ".muts.bam"
    bamfile = pysam.AlignmentFile(args.bamFileName, 'rb')
    outbam_muts = pysam.AlignmentFile(outbam_mutsfile, 'wb', template=bamfile)
    outbam_muts.close()
    bamfile.close()
    tmpbams = []

    if not os.path.exists(args.tmpdir):
        os.mkdir(args.tmpdir)
        logger.info("created tmp directory: %s" % args.tmpdir)

    if not os.path.exists('addindel_logs_' +
                          os.path.basename(args.outBamFile)):
        os.mkdir('addindel_logs_' + os.path.basename(args.outBamFile))
        logger.info("created directory: addindel_logs_%s" %
                    os.path.basename(args.outBamFile))

    assert os.path.exists('addindel_logs_' + os.path.basename(args.outBamFile)
                          ), "could not create output directory!"
    assert os.path.exists(args.tmpdir), "could not create temporary directory!"

    pool = Pool(processes=int(args.procs))
    results = []

    ntried = 0
    for bedline in bedfile:
        if ntried < int(args.numsnvs) or int(args.numsnvs) == 0:
            c = bedline.strip().split()
            chrom = c[0]
            start = int(c[1])
            end = int(c[2])
            vaf = float(c[3])
            type = c[4]
            ins = None

            assert type in ('INS', 'DEL')
            if type == 'INS':
                ins = c[5]

            # make mutation (submit job to thread pool)
            result = pool.apply_async(
                makemut, [args, chrom, start, end, vaf, ins, avoid, alignopts])
            results.append(result)
            ntried += 1

    for result in results:
        tmpbamlist = result.get()
        if tmpbamlist is not None:
            for tmpbam in tmpbamlist:
                if os.path.exists(tmpbam):
                    tmpbams.append(tmpbam)

    if len(tmpbams) == 0:
        logger.error("no succesful mutations")
        sys.exit()

    tmpbams.sort()

    # merge tmp bams
    if len(tmpbams) == 1:
        os.rename(tmpbams[0], outbam_mutsfile)
    elif len(tmpbams) > 1:
        mergebams(tmpbams, outbam_mutsfile, maxopen=int(args.maxopen))

    bedfile.close()

    # cleanup
    for bam in tmpbams:
        if os.path.exists(bam):
            os.remove(bam)
        if os.path.exists(bam + '.bai'):
            os.remove(bam + '.bai')

    if os.listdir(args.tmpdir) == []:
        os.rmdir(args.tmpdir)

    if args.skipmerge:
        logger.info("skipping merge, plase merge reads from %s manually." %
                    outbam_mutsfile)
    else:
        if args.tagreads:
            from bamsurgeon.markreads import markreads
            tmp_tag_bam = 'tag.%s.bam' % str(uuid4())
            markreads(outbam_mutsfile, tmp_tag_bam)
            move(tmp_tag_bam, outbam_mutsfile)
            logger.info("tagged reads.")

        logger.info("done making mutations, merging mutations into %s --> %s" %
                    (args.bamFileName, args.outBamFile))
        replace(args.bamFileName,
                outbam_mutsfile,
                args.outBamFile,
                seed=args.seed)

        #cleanup
        os.remove(outbam_mutsfile)

    var_basename = '.'.join(os.path.basename(args.varFileName).split('.')[:-1])
    bam_basename = '.'.join(os.path.basename(args.outBamFile).split('.')[:-1])

    vcf_fn = bam_basename + '.addindel.' + var_basename + '.vcf'

    makevcf.write_vcf_indel(
        'addindel_logs_' + os.path.basename(args.outBamFile), args.refFasta,
        vcf_fn)

    logger.info('vcf output written to ' + vcf_fn)
Example #18
import pysam
import sys

samfile = pysam.AlignmentFile(sys.argv[1], "r")
for align in samfile:
    if not align.is_supplementary \
      and not align.is_secondary \
      and not align.is_unmapped \
      and align.reference_length >= 500 \
      and align.mapping_quality > 0:
        try:
            sa = align.get_tag('SA')
        except KeyError:
            # no SA tag: the alignment has no supplementary alignment, so keep it
            ref = samfile.get_reference_name(align.reference_id)
            ref, contig = ref.split("__")

            print("%s\t%s\t%s\t%s" %
                  (ref, contig, align.query_name, align.alen))
Example #19
import pysam

sam = pysam.AlignmentFile("SRR3189743.join.aligned.sort.bam")

# print(sam.header)

#sam.count(contig=sam.references[100])
#sam.count_coverage(contig=sam.references[100])
#sam.pileup(contig=sam.references[100])

#for c in sorted([(sam.count(contig=c), c) for c in sam.references]):
#    print(f"{c[0]}\t{c[1]}")

target_contig = sam.references[0]

# select significant column
cc = sam.count_coverage(contig=target_contig)

# per-position total coverage across the four base arrays (A, C, G, T)
total_coverage = [sum(base_counts) for base_counts in zip(*cc)]

# divide reads into new tmp bams

# recur for new bams

sam.close()
Example #20
def retrieve_reads(in_bam, in_vcf, quality, out_list):
    print("\033[32m%s\033[0m Reading VCF" %
          (time.strftime('[%H:%M:%S]', time.localtime(time.time()))))
    snp_db, snp_pos_db = read_vcf(in_vcf)
    reads_db = {}
    for chrn in snp_pos_db:
        reads_db[chrn] = {}
        for pos in snp_pos_db[chrn]:
            reads_db[chrn][pos] = {'ref': {}, 'alt': {}}
    bamfile = pysam.AlignmentFile(in_bam, 'rb')

    print("\033[32m%s\033[0m Reading bam" %
          (time.strftime('[%H:%M:%S]', time.localtime(time.time()))))
    for line in bamfile:
        read_name = line.query_name
        flag = bin(line.flag)
        #if line.mapping_quality < quality or line.mapping_quality == 255: # filter data with mapping quality
        #	continue
        if flag[-3] == '1':  # this flag means segment unmapped
            continue
        if len(flag) > 7 and flag[
                -5] == '1':  # this flag means query seq is reverse complemented
            is_rev = True
        else:
            is_rev = False
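        # note: pysam also exposes these flag bits directly as line.is_unmapped / line.is_reverse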
        chrn = line.reference_name
        sp = line.reference_start
        ep = sp + line.reference_length
        search_pos = get_pos_in_range(snp_pos_db[chrn], sp, ep)
        cigar = line.cigartuples
        query_sequence = line.query_sequence
        alignment_length = line.query_alignment_length
        mapping_quality = line.mapping_quality
        for i in search_pos:
            offset = i - sp - 1
            if offset > len(query_sequence):
                continue
            ref = snp_db[chrn][i][0]
            alt = snp_db[chrn][i][1]
            query_base = get_base_pos_with_offset(cigar, query_sequence,
                                                  offset, is_rev)
            if query_base.lower() == ref.lower():
                if read_name not in reads_db[chrn][i]['alt']:
                    reads_db[chrn][i]['ref'][read_name] = [
                        mapping_quality, alignment_length
                    ]
                else:
                    amapq, amapl = reads_db[chrn][i]['alt'][read_name]
                    if mapping_quality > amapq:
                        reads_db[chrn][i]['alt'].pop(read_name)
                        reads_db[chrn][i]['ref'][read_name] = [
                            mapping_quality, alignment_length
                        ]
                    elif alignment_length > amapl:
                        reads_db[chrn][i]['alt'].pop(read_name)
                        reads_db[chrn][i]['ref'][read_name] = [
                            mapping_quality, alignment_length
                        ]
            elif query_base.lower() == alt.lower():
                if read_name not in reads_db[chrn][i]['ref']:
                    reads_db[chrn][i]['alt'][read_name] = [
                        mapping_quality, alignment_length
                    ]
                else:
                    rmapq, rmapl = reads_db[chrn][i]['ref'][read_name]
                    if mapping_quality > rmapq:
                        reads_db[chrn][i]['ref'].pop(read_name)
                        reads_db[chrn][i]['alt'][read_name] = [
                            mapping_quality, alignment_length
                        ]
                    elif alignment_length > rmapl:
                        reads_db[chrn][i]['ref'].pop(read_name)
                        reads_db[chrn][i]['alt'][read_name] = [
                            mapping_quality, alignment_length
                        ]

    bamfile.close()
    print("\033[32m%s\033[0m Writing result" %
          (time.strftime('[%H:%M:%S]', time.localtime(time.time()))))
    with open(out_list, 'w') as f_out:
        for chrn in sorted(reads_db):
            for pos in sorted(reads_db[chrn]):
                f_out.write("%s,%d,%s," %
                            (chrn, pos, '|'.join(snp_db[chrn][pos])))
                ref_list = []
                for read_name in sorted(reads_db[chrn][pos]['ref']):
                    ref_list.append(
                        "%s(%d;%d)" %
                        (read_name, reads_db[chrn][pos]['ref'][read_name][0],
                         reads_db[chrn][pos]['ref'][read_name][1]))
                alt_list = []
                for read_name in sorted(reads_db[chrn][pos]['alt']):
                    alt_list.append(
                        "%s(%d;%d)" %
                        (read_name, reads_db[chrn][pos]['alt'][read_name][0],
                         reads_db[chrn][pos]['alt'][read_name][1]))
                f_out.write("%s,%s\n" %
                            ('|'.join(ref_list), '|'.join(alt_list)))
    print("\033[32m%s\033[0m Finished" %
          (time.strftime('[%H:%M:%S]', time.localtime(time.time()))))
Example #21
def main():
    start = timeit.default_timer()
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)-15s [%(processName)s.%(levelname)s] %(message)s")
    parser = argparse.ArgumentParser(description="Utility for retrieving tumor specific kmers in RNA-seq reads.")
    parser.add_argument("--Kmer_file", required=True, type=str, nargs='?', help="provide Kmer file here")
    parser.add_argument("--input_bam_file", required=True, type=str, nargs='?', help="provide input bam file path here")
    parser.add_argument("--out_bam_file", required=True, type=str, nargs='?', help="provide output bam file path here")
    args = parser.parse_args()

    cigar_map = {0:'M', 1:'I', 2:'D', 3:'N', 4:'S', 5:'H', 6:'P', 7:'=', 8:'X', 9:'B'}

    samfile = pysam.AlignmentFile(args.input_bam_file, "rb")
    kmer_reads = pysam.AlignmentFile(args.out_bam_file, "wb", template=samfile)

    trie = ahocorasick.Automaton()

    with open(args.Kmer_file) as f:
        for line in f:
            line_split = line.strip().split('\t')
            trie.add_word(line_split[0], line_split[0])

    trie.make_automaton()
    logging.info("finished making automaton")

    for read in samfile.fetch():
        if not read.is_unmapped and not read.is_secondary and read.is_proper_pair and read.is_paired and\
                not read.is_duplicate and not read.is_supplementary and 'N' in read.cigarstring:

            for end, kmer in trie.iter(read.query_sequence.upper()):
                start_index = end - len(kmer) + 1
                end_index = end + 1

                if all(ref_pos is None for ref_pos in
                       read.get_reference_positions(full_length=True)[start_index:end_index]):
                    continue

                quality_string = read.to_string().split('\t')[10][start_index:end_index]

                kmer_read = pysam.AlignedSegment()
                kmer_read.query_name = read.query_name
                kmer_read.query_sequence = read.query_sequence[start_index:end_index].upper()
                kmer_read.flag = read.flag
                kmer_read.reference_id = read.reference_id

                for ref_pos in read.get_reference_positions(full_length=True)[start_index:end_index]:
                    if ref_pos is not None:
                        kmer_read.reference_start = ref_pos
                        break

                kmer_read.mapping_quality = read.mapping_quality

                current_ind = 0
                in_kmer = False
                cigarString_temp = ""

                for operation, count in read.cigartuples:
                    if current_ind >= end_index:
                        break

                    if cigar_map[operation] == 'N' or cigar_map[operation] == 'D':
                        if in_kmer:
                            cigarString_temp += str(count) + cigar_map[operation]

                    elif cigar_map[operation] == 'M' or cigar_map[operation] == 'I' or cigar_map[operation] == 'S':
                        if current_ind + count > start_index:
                            cigarString_temp += str(
                                min(end_index, current_ind + count) - max(current_ind, start_index)) + cigar_map[
                                                    operation]
                            in_kmer = True
                        current_ind += count
                    else:
                        logging.warning('Unexpected cigar op {}'.format(cigar_map[operation]))

                if "S" in cigarString_temp or "N" not in cigarString_temp:
                    continue

                kmer_read.cigarstring = cigarString_temp
                kmer_read.query_qualities = pysam.qualitystring_to_array(quality_string)
                kmer_reads.write(kmer_read)

    kmer_reads.close()
    samfile.close()
    logging.info("Done!")
    stop = timeit.default_timer()
    logging.info("total search time: {}".format(stop - start))
Example #22
0
def check_raw_alignments(df, args, pon):

    # get soft-clip position and direction
    clips = []
    for chrA, posA, contA, chrB, posB, contB, idx, svlen, spanning in zip(
            df.chrA, df.posA, df.contigA, df.chrB, df.posB, df.contigB,
            df.index, df.svlen, df.spanning):
        if spanning:
            clips.append((chrA, posA, 3, idx, chrA == chrB, svlen))
            clips.append((chrB, posB, 3, idx, chrA == chrB, svlen))
        else:
            if contA:
                start_lower = contA[0].islower()
                end_lower = contA[-1].islower()
                if start_lower and not end_lower:
                    clip_side = 0
                elif not start_lower and end_lower:
                    clip_side = 1
                else:  # both ends lower-case (or neither): clip side ambiguous
                    clip_side = 3  # any side
                clips.append((chrA, posA, clip_side, idx, chrA == chrB, svlen))
            if contB:
                start_lower = contB[0].islower()
                end_lower = contB[-1].islower()
                if start_lower and not end_lower:
                    clip_side = 0
                elif not start_lower and end_lower:
                    clip_side = 1
                else:
                    clip_side = 3
                clips.append((chrB, posB, clip_side, idx, chrA == chrB, svlen))

    clips = sorted(clips, key=lambda x: (x[0], x[1]))

    opts = {"bam": "rb", "cram": "rc", "sam": "r", "-": "rb", "stdin": "rb"}
    pad = 20
    found = set([])
    for pth, _ in pon:
        # open alignment file
        kind = pth.split(".")[-1]
        bam_mode = opts[kind]

        pysam.set_verbosity(0)
        infile = pysam.AlignmentFile(
            pth,
            bam_mode,
            threads=1,
            reference_filename=None if kind != "cram" else args["ref"])
        pysam.set_verbosity(3)

        for chrom, pos, cs, index, intra, svlen in clips:

            if index in found:
                continue

            for a in infile.fetch(chrom, pos - pad if pos - pad > 0 else 0,
                                  pos + pad):
                if not a.cigartuples:
                    continue
                # if pos == 3786481 and a.cigartuples[-1][0] == 4:
                #     echo(a.cigartuples, abs(pos - a.pos), abs(pos - a.reference_end))
                if a.cigartuples[0][0] == 4 and cs != 1:
                    current_pos = a.pos
                    if abs(current_pos - pos) < 8:
                        found.add(index)
                        break
                if a.cigartuples[-1][0] == 4 and cs != 0:
                    current_pos = a.reference_end
                    if abs(current_pos - pos) < 8:
                        found.add(index)
                        break

    df = df.drop(found)

    return df
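check_raw_alignments keys on CIGAR operation 4 (soft clip) at either end of an alignment near a candidate breakpoint. A minimal sketch of that test in isolation; the file path, contig, and distance threshold are placeholders:

import pysam

SOFT_CLIP = 4  # BAM CIGAR operation code for a soft clip
PAD = 20

def has_nearby_softclip(aln_path, chrom, pos, max_dist=8):
    """True if any read has a soft-clipped end within max_dist of pos."""
    with pysam.AlignmentFile(aln_path) as af:
        for a in af.fetch(chrom, max(0, pos - PAD), pos + PAD):
            if not a.cigartuples:
                continue
            # leading soft clip: the clip sits at the alignment start
            if a.cigartuples[0][0] == SOFT_CLIP and abs(a.reference_start - pos) < max_dist:
                return True
            # trailing soft clip: the clip sits at the alignment end
            if a.cigartuples[-1][0] == SOFT_CLIP and abs(a.reference_end - pos) < max_dist:
                return True
    return False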
Example #23
0
    def bam_worker(bam_q, progress_q, worker_i):

        worker = worker_i

        slices = 0
        crumbs = 0
        covered_snps = 0

        bam = pysam.AlignmentFile(bam_path)

        while True:
            work_block = bam_q.get()
            if work_block is None:
                progress_q.put({
                    "pos": None,
                    "worker_i": worker_i,
                    "slices": slices,
                    "crumbs": crumbs,
                    "covered_snps": covered_snps,
                })
                break

            reads = {}
            dreads = set([])

            for p_col in bam.pileup(reference=target_contig,
                                    start=work_block["start"] - 1,
                                    stop=work_block["end"],
                                    ignore_overlaps=False,
                                    min_base_quality=0,
                                    stepper=stepper):

                if p_col.reference_pos + 1 > end_pos:
                    # Ignore positions beyond the end_pos
                    break

                if vcf_handler["region"][p_col.reference_pos + 1] != 1:
                    # Ignore non-SNPs
                    continue

                for p_read in p_col.pileups:

                    curr_read_1or2 = 0
                    if p_read.alignment.is_paired:
                        if p_read.alignment.is_read1:
                            curr_read_1or2 = 1
                        elif p_read.alignment.is_read2:
                            curr_read_1or2 = 2
                        else:
                            #TODO Probably indicative of bad data
                            pass

                    curr_read_name = "%s_%s_%d" % (p_read.alignment.query_name,
                                                   str(p_read.alignment.flag),
                                                   curr_read_1or2)

                    LEFTMOST_1pos = p_read.alignment.reference_start + 1  # Convert 0-based reference_start to 1-based position (to match region array and 1-based VCF)

                    # Special case: Consider reads that begin before the start_pos, but overlap the 0th block
                    if work_block["i"] == 0:
                        if LEFTMOST_1pos < start_pos:
                            # Read starts before the start_pos
                            if p_read.alignment.reference_start + 1 + p_read.alignment.query_alignment_length < start_pos:
                                # Read ends before the start_pos
                                continue
                            LEFTMOST_1pos = start_pos
                    else:
                        # This read begins before the start of the current (non-0) block
                        # and will have already been covered by the block that preceded it
                        if LEFTMOST_1pos < work_block["start"]:
                            continue

                    sequence = None
                    qual = None
                    if p_read.is_del:
                        # TODO Not sure about how to estimate quality of deletion?
                        sequence = "-" * (abs(p_read.indel) + 1)
                        qual = p_read.alignment.query_qualities[
                            p_read.query_position_or_next] * (
                                abs(p_read.indel) + 1)
                    elif p_read.indel > 0:
                        # p_read.indel peeks at the next CIGAR operation and reports whether the base FOLLOWING this one begins an insertion
                        sequence = p_read.alignment.query_sequence[
                            p_read.query_position:p_read.query_position +
                            p_read.indel + 1]
                        qual = p_read.alignment.query_qualities[
                            p_read.query_position:p_read.query_position +
                            p_read.indel + 1]
                    else:
                        sequence = p_read.alignment.query_sequence[
                            p_read.query_position]
                        qual = p_read.alignment.query_qualities[
                            p_read.query_position]

                    if not sequence:
                        print(
                            "[WARN] Could not recover sequence data from read %s"
                            % p_read.alignment.query_name)
                        continue

                    if curr_read_name not in reads:
                        reads[curr_read_name] = {
                            "rank": np.sum(
                                vcf_handler["region"]
                                [1:LEFTMOST_1pos]),  # non-inclusive 1pos end
                            "seq": [],
                            "quals": [],
                            "refs_1pos": [],
                            "read_variants_0pos": [],
                        }
                        if p_read.alignment.query_name in debug_reads:
                            dreads.add(curr_read_name)
                    reads[curr_read_name]["seq"].append(sequence)
                    reads[curr_read_name]["quals"].append(qual)
                    reads[curr_read_name]["refs_1pos"].append(
                        p_col.reference_pos + 1)
                    reads[curr_read_name]["read_variants_0pos"].append(
                        p_read.query_position)

            for dread in sorted(dreads):
                r = reads[dread]
                if r:
                    for snp_i, ref_pos in enumerate(r["refs_1pos"]):
                        print(dread, ref_pos, r["seq"][snp_i])
                    print("RANK", dread, r["rank"])

            if debug_pos:
                for read in reads:
                    for d_pos in set(reads[read]["refs_1pos"]) & debug_pos:
                        i = reads[read]["refs_1pos"].index(d_pos)
                        print(read, d_pos, reads[read]["seq"][i])

            num_reads = len(reads)
            for qi, qname in enumerate(reads):
                progress_q.put({
                    "pos": num_reads - (qi + 1),
                    "worker_i": worker_i
                })

                if not len(reads[qname]["seq"]) > 1:
                    # Ignore reads without evidence
                    continue
                slices += 1

                rank = reads[qname]["rank"]
                support_len = len(reads[qname]["seq"])

                support_seq = "".join(
                    [b[0] for b in reads[qname]["seq"]]
                )  # b[0] has the effect of capturing the base before any insertion
                covered_snps += len(
                    support_seq.replace("N", "").replace("_", ""))

                # For each position in the supporting sequence (that is, each covered SNP)
                for i in range(0, support_len):
                    snp_a = support_seq[i]

                    #if support_len == 1:
                    #    if rank == 0:
                    #        hansel.add_observation('_', snp_a, 0, 1)
                    #        hansel.add_observation(snp_a, '_', 1, 2)
                    #    else:
                    #        hansel.add_observation(snp_a, '_', rank+1, rank+2)

                    # For each position in the supporting sequence following i
                    for j in range(i + 1, support_len):
                        snp_b = support_seq[j]

                        # Ignore observations who are from an invalid transition
                        if snp_a in ['_', 'N']:
                            continue

                        # Sentinel->A
                        if i == 0 and j == 1 and rank == 0:
                            # If this is the first position in the support (i == 0),
                            # the read starts at the first SNP (rank == 0),
                            # and SNPs a, b are adjacent (j == 1)
                            hansel.add_observation('_', snp_a, 0, 1)
                            hansel.add_observation(snp_a, snp_b, 1, 2)
                            crumbs += 1

                        # B->Sentinel
                        elif (j + rank +
                              1) == vcf_handler["N"] and abs(i - j) == 1:
                            # Last observation (abs(i-j)==1),
                            # that ends on the final SNP (j+rank+1 == N)
                            hansel.add_observation(snp_a, snp_b,
                                                   vcf_handler["N"] - 1,
                                                   vcf_handler["N"])
                            hansel.add_observation(snp_b, '_',
                                                   vcf_handler["N"],
                                                   vcf_handler["N"] + 1)
                            crumbs += 1

                        # A regular observation (A->B)
                        else:
                            hansel.add_observation(snp_a, snp_b, i + rank + 1,
                                                   j + rank + 1)
                            crumbs += 1

                            if use_end_sentinels:
                                if j == (support_len - 1) and abs(i - j) == 1:
                                    # The last SNP on a read, needs a sentinel afterward
                                    hansel.add_observation(
                                        snp_b, '_', j + rank + 1, j + rank + 2)
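The worker above leans on a few pysam pileup fields: is_del, indel (which looks ahead to the next CIGAR operation), and query_position. A minimal sketch of the same base/quality extraction, assuming an indexed BAM and a set of 0-based SNP positions (both placeholders):

import pysam

def read_bases_at_snps(bam_path, chrom, snp_positions):
    """Map read name -> {0-based SNP position: (base(s), quality)} at the given sites."""
    calls = {}
    with pysam.AlignmentFile(bam_path, 'rb') as bam:
        for col in bam.pileup(chrom, min(snp_positions), max(snp_positions) + 1,
                              min_base_quality=0, ignore_overlaps=False):
            if col.reference_pos not in snp_positions:
                continue
            for p in col.pileups:
                if p.is_del:
                    seq, qual = '-', None  # deletion spans this site
                elif p.indel > 0:
                    # base at this site plus the inserted bases that follow it
                    seq = p.alignment.query_sequence[
                        p.query_position:p.query_position + p.indel + 1]
                    qual = p.alignment.query_qualities[
                        p.query_position:p.query_position + p.indel + 1]
                else:
                    seq = p.alignment.query_sequence[p.query_position]
                    qual = p.alignment.query_qualities[p.query_position]
                calls.setdefault(p.alignment.query_name, {})[col.reference_pos] = (seq, qual)
    return calls

# e.g. calls = read_bases_at_snps('sample.bam', 'chr1', {1000, 1050, 1200})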
Example #24
0
def getBs(rlist):
    [r1,r2,r3,r4]=rlist
    currentID=r1.strip().split(' ')[0][1:]
    fastqOutput="%s%s%s%s" % (r1,r2,r3,r4)
    eachFaPos=faPos[currentID].split(";")
    eachChr=eachFaPos[0]
    eachLeft=int(eachFaPos[1])
    eachRight=int(eachFaPos[2])
    eachLen=int(eachFaPos[3])
    eachFaSeq=faOutput[currentID]
    foFastq=open(outPrefixTmp+"seq1_"+currentID+".fastq",'w')
    foFastq.write(fastqOutput)
    foFastq.close()
    foFa=open(outPrefixTmp+"seq2_"+currentID+".fa",'w')
    foFa.write(eachFaSeq)
    foFa.close()
    cmd="minimap2 -ax splice "+strandFastq+" -k14 "+outPrefixTmp+"seq2_"+currentID+".fa "+outPrefixTmp+"seq1_"+currentID+".fastq >"+outPrefixTmp+currentID+".sam 2>/dev/null"
    os.system(cmd)
    if not os.path.exists(outPrefixTmp+currentID+".sam"):
        return('')
    samfile=pysam.AlignmentFile(outPrefixTmp+currentID+".sam","r")
    BSright=[]
    Mright=[]
    BSleft=[]
    Mleft=[]
    for read in samfile.fetch():
        if (read.flag & 4) != 4:  # keep only mapped reads (flag bit 4 unset)
            readInfo=getReadInfo(read)
        else:
            continue
        ExonS=readInfo[0]
        ExonE=readInfo[1]
        ExonS_diff=abs(np.array(ExonS)-eachLen-hangLen)
        ExonE_diff=abs(np.array(ExonE)-eachLen+hangLen)
        ExonS_diff_idx=np.where(ExonS_diff==min(ExonS_diff))[0]
        ExonE_diff_idx=np.where(ExonE_diff==min(ExonE_diff))[0]
        commonIdx=set(ExonS_diff_idx-1) & set(ExonE_diff_idx)
        if len(commonIdx)==0:
            if len(ExonS_diff_idx)==1 and len(ExonE_diff_idx)==1:
                if ExonS_diff_idx[0] == ExonE_diff_idx[0]:
                    if ExonS[ExonS_diff_idx[0]]<eachLen:
                        if  ExonS_diff_idx[0]<len(ExonS)-1:
                            if ExonS[ExonS_diff_idx[0]+1]>eachLen:
                                commonIdx=[ExonE_diff_idx[0]]
                    else:
                        if  ExonE_diff_idx[0]>0:
                            if ExonE[ExonE_diff_idx[0]-1]<eachLen:
                                commonIdx=[ExonE_diff_idx[0]-1]
        for index in commonIdx:
            # 1-based position
            tmpright=eachLeft+ExonE[index]-1
            tmpleft=eachLeft+ExonS[index+1]-eachLen
            BSright.append(tmpright)
            BSleft.append(tmpleft)
            Mright.append(genome.sequence({'chr': eachChr, 'start':tmpright+1, 'stop':tmpright+2}).upper())
            Mleft.append(genome.sequence({'chr': eachChr, 'start':tmpleft-2, 'stop':tmpleft-1}).upper())
    samfile.close()
    os.remove(outPrefixTmp+"seq1_"+currentID+".fastq")
    os.remove(outPrefixTmp+"seq2_"+currentID+".fa")
    os.remove(outPrefixTmp+currentID+".sam")
    if len(BSleft)==0:
        return('')
    return(currentID+"\t"+eachChr+"\t"+','.join([str(i) for i in BSleft])+"\t"+','.join([str(i) for i in BSright])+"\t"+','.join([str(i) for i in Mleft])+"\t"+','.join([str(i) for i in Mright]))
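getBs shells out to minimap2 with os.system and then parses the temporary SAM with pysam. A minimal sketch of the same shell-out using subprocess, which avoids building the command by string concatenation; the file names are placeholders and the minimap2 options mirror the call above:

import subprocess
import pysam

def align_and_open(ref_fa, reads_fq, sam_out):
    """Run minimap2 (splice preset, as above) and open the resulting SAM with pysam."""
    with open(sam_out, 'w') as sam_fh:
        subprocess.run(['minimap2', '-ax', 'splice', '-k14', ref_fa, reads_fq],
                       stdout=sam_fh, stderr=subprocess.DEVNULL, check=True)
    return pysam.AlignmentFile(sam_out, 'r')

# e.g. samfile = align_and_open('seq2_x.fa', 'seq1_x.fastq', 'x.sam')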
Example #25
0
    print("File: " + str(x))

    bam = os.path.isfile(os.path.splitext(x)[0] + ".bam")
    sorted_bam = os.path.isfile(os.path.splitext(x)[0] + ".sorted.bam")
    bai = os.path.isfile(os.path.splitext(x)[0] + ".sorted.bam.bai")

    if not (bam):
        os.system("samtools view -bS " + x + " > " + os.path.splitext(x)[0] +
                  ".bam")
    if not (sorted_bam):
        os.system("samtools sort " + os.path.splitext(x)[0] + ".bam " +
                  os.path.splitext(x)[0] + ".sorted")
    if not (bai):
        os.system("samtools index " + os.path.splitext(x)[0] + ".sorted.bam")

    samfile = pysam.AlignmentFile(os.path.splitext(x)[0] + ".sorted.bam", "rb")

    splitRef = []
    splitAlt = []
    count = 0.0
    countRef = 0.0
    countAlt = 0.0

    for pileupcolumn in samfile.pileup(chrStr):
        if pileupcolumn.pos == loc:
            for pileupread in pileupcolumn.pileups:
                if not pileupread.is_del:
                    count += 1
                    if pileupread.alignment.query_sequence[
                            pileupread.query_position] == refAllele:
                        splitRef.append(pileupread.alignment.query_name)
Example #26
0
File: snps.py Project: pythseq/megalodon
def _get_snps_queue(snps_q, snps_conn, snps_db_fn, snps_txt_fn, db_safety,
                    pr_refs_fn, pr_ref_filts, whatshap_map_fn,
                    ref_names_and_lens, ref_fn):
    def write_whatshap_alignment(read_id, snp_seq, snp_quals, chrm, strand,
                                 r_st, snp_cigar):
        a = pysam.AlignedSegment()
        a.query_name = read_id
        a.flag = 0 if strand == 1 else 16
        a.reference_id = whatshap_map_fp.get_tid(chrm)
        a.reference_start = r_st
        a.template_length = len(snp_seq)
        a.mapping_quality = WHATSHAP_MAX_QUAL
        a.set_tags([('RG', WHATSHAP_RG_ID)])

        # convert to reference based sequence
        if strand == -1:
            snp_seq = mh.revcomp(snp_seq)
            snp_quals = snp_quals[::-1]
            snp_cigar = snp_cigar[::-1]
        a.query_sequence = snp_seq
        a.query_qualities = array('B', snp_quals)
        a.cigartuples = snp_cigar
        whatshap_map_fp.write(a)

        return

    def get_snp_call():
        # note strand is +1 for fwd or -1 for rev
        r_snp_calls, (read_id, chrm, strand, r_start, ref_seq, read_len, q_st,
                      q_en, cigar) = snps_q.get(block=False)
        snps_db.executemany(
            ADDMANY_SNPS,
            [(read_id, chrm, strand, pos, alt_lp, snp_ref_seq, snp_alt_seq,
              snp_id)
             for pos, alt_lps, snp_ref_seq, snp_alt_seqs, snp_id in r_snp_calls
             for alt_lp, snp_alt_seq in zip(alt_lps, snp_alt_seqs)])
        if snps_txt_fp is not None and len(r_snp_calls) > 0:
            snps_txt_fp.write('\n'.join((
                ('\t'.join('{}' for _ in field_names)).format(
                    read_id, chrm, strand, pos, np.log1p(-np.exp(alt_lps).sum(
                    )), alt_lp, snp_ref_seq, snp_alt_seq, snp_id) for pos,
                alt_lps, snp_ref_seq, snp_alt_seqs, snp_id in r_snp_calls
                for alt_lp, snp_alt_seq in zip(alt_lps, snp_alt_seqs))) + '\n')
            snps_txt_fp.flush()
        if do_ann_snps:
            if not mapping.read_passes_filters(pr_ref_filts, read_len, q_st,
                                               q_en, cigar):
                return
            snp_seq, snp_quals, snp_cigar = annotate_snps(
                r_start, ref_seq, r_snp_calls, strand)
            if pr_refs_fn is not None:
                pr_refs_fp.write('>{}\n{}\n'.format(read_id, snp_seq))
                pr_refs_fp.flush()
            if whatshap_map_fn is not None:
                write_whatshap_alignment(read_id, snp_seq, snp_quals, chrm,
                                         strand, r_start, snp_cigar)

        return

    snps_db = sqlite3.connect(snps_db_fn)
    if db_safety < 2:
        snps_db.execute(SET_ASYNC_MODE)
    if db_safety < 1:
        snps_db.execute(SET_NO_ROLLBACK_MODE)
    snps_db.execute(CREATE_SNPS_TBLS)
    if snps_txt_fn is None:
        snps_txt_fp = None
    else:
        snps_txt_fp = open(snps_txt_fn, 'w')
        field_names = ('read_id', 'chrm', 'strand', 'pos', 'ref_log_prob',
                       'alt_log_prob', 'ref_seq', 'alt_seq', 'snp_id')
        snps_txt_fp.write('\t'.join(field_names) + '\n')

    if pr_refs_fn is not None:
        pr_refs_fp = open(pr_refs_fn, 'w')

    if whatshap_map_fn is not None:
        _, map_fmt = os.path.splitext(whatshap_map_fn)
        if map_fmt == '.bam': w_mode = 'wb'
        elif map_fmt == '.cram': w_mode = 'wc'
        elif map_fmt == '.sam': w_mode = 'w'
        else:
            raise mh.MegaError('Invalid mapping output format')
        header = {
            'HD': {
                'VN': '1.4'
            },
            'SQ': [{
                'LN': ref_len,
                'SN': ref_name
            } for ref_name, ref_len in sorted(zip(*ref_names_and_lens))],
            'RG': [
                {
                    'ID': WHATSHAP_RG_ID,
                    'SM': SAMPLE_NAME
                },
            ]
        }
        whatshap_map_fp = pysam.AlignmentFile(whatshap_map_fn,
                                              w_mode,
                                              header=header,
                                              reference_filename=ref_fn)

    do_ann_snps = whatshap_map_fn is not None or pr_refs_fn is not None

    while True:
        try:
            get_snp_call()
        except queue.Empty:
            if snps_conn.poll():
                break
            sleep(0.1)
            continue

    while not snps_q.empty():
        get_snp_call()
    if snps_txt_fp is not None: snps_txt_fp.close()
    if pr_refs_fn is not None: pr_refs_fp.close()
    if whatshap_map_fn is not None: whatshap_map_fp.close()
    snps_db.execute(CREATE_SNPS_IDX)
    snps_db.commit()
    snps_db.close()

    return
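write_whatshap_alignment constructs pysam.AlignedSegment records and writes them against a header built as a plain dict. A minimal sketch of writing one synthetic record that way; the output name, contig, read group, and sequence are placeholders:

import pysam
from array import array

# A header assembled as a plain dict; contig name, length and read group are placeholders.
header = {
    'HD': {'VN': '1.4'},
    'SQ': [{'SN': 'chr1', 'LN': 248956422}],
    'RG': [{'ID': 'rg1', 'SM': 'sample1'}],
}

with pysam.AlignmentFile('toy.bam', 'wb', header=header) as out:
    a = pysam.AlignedSegment()
    a.query_name = 'read_0001'
    a.flag = 0                            # mapped, forward strand
    a.reference_id = out.get_tid('chr1')
    a.reference_start = 1000
    a.mapping_quality = 40
    a.query_sequence = 'ACGTACGTAC'
    a.query_qualities = array('B', [30] * 10)
    a.cigartuples = [(0, 10)]             # 10M
    a.set_tags([('RG', 'rg1')])
    out.write(a)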
Example #27
0
def Filter(inputBAM,
           outputBAM,
           log,
           bed,
           MQ=2,
           minIdentity=0.8,
           NM=-1,
           printOnly=False,
           verbose=True,
           force=False):
    if (printOnly or checkStep([inputBAM], [outputBAM], force)):

        mappedReads = 0
        unmappedReads = 0
        filteredReads = 0

        mqFiltered = 0
        idFiltered = 0
        nmFiltered = 0
        multimapper = 0

        infile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=infile)

        # Default filtering without bed
        if (bed is None):

            print("#No bed-file supplied. Running default filtering on " +
                  inputBAM + ".",
                  file=log)

            for read in infile:

                if (not read.is_secondary and not read.is_supplementary):
                    if (read.is_unmapped):
                        unmappedReads += 1
                    else:
                        mappedReads += 1

                if (read.is_unmapped):
                    continue
                if (read.mapping_quality < MQ):
                    mqFiltered += 1
                    continue
                if (float(read.get_tag("XI")) < minIdentity):
                    idFiltered += 1
                    continue
                if (NM > -1 and int(read.get_tag("NM")) > NM):
                    nmFiltered += 1
                    continue

                if (not read.is_secondary and not read.is_supplementary):
                    filteredReads += 1

                outfile.write(read)

            print("Criterion\tFiltered reads", file=log)
            print("MQ < " + str(MQ) + "\t" + str(mqFiltered), file=log)
            print("ID < " + str(minIdentity) + "\t" + str(idFiltered),
                  file=log)
            print("NM > " + str(NM) + "\t" + str(nmFiltered), file=log)
            print("MM\t0", file=log)
        else:
            # Multimap retention strategy filtering when bed is supplied

            random.seed(1)

            print(
                "#Bed-file supplied. Running multimap retention filtering strategy on "
                + inputBAM + ".",
                file=log)

            mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered, nmFiltered, multimapper = multimapUTRRetainment(
                infile, outfile, bed, minIdentity, NM, log)
            #mappedReads, unmappedReads, filteredReads = multimapUTRRetainment (infile, outfile, bed, minIdentity, NM, log)

        # Add number of sequenced and number of mapped reads to the read group description
        # Used for creating summary file
        inFileBamHeader = outfile.header
        if ('RG' in inFileBamHeader and len(inFileBamHeader['RG']) > 0):
            slamseqInfo = SlamSeqInfo()
            slamseqInfo.SequencedReads = mappedReads + unmappedReads
            slamseqInfo.MappedReads = mappedReads
            slamseqInfo.FilteredReads = filteredReads
            slamseqInfo.MQFilteredReads = mqFiltered
            slamseqInfo.IdFilteredReads = idFiltered
            slamseqInfo.NmFilteredReads = nmFiltered
            slamseqInfo.MultimapperReads = multimapper

            if (bed is not None):
                slamseqInfo.AnnotationName = os.path.basename(bed)
                slamseqInfo.AnnotationMD5 = md5(bed)
            else:
                slamseqInfo.AnnotationName = ""
                slamseqInfo.AnnotationMD5 = ""

            if not isinstance(inFileBamHeader, dict):
                inFileBamHeader = inFileBamHeader.to_dict()
            inFileBamHeader['RG'][0]['DS'] = str(slamseqInfo)
            #inFileBamHeader['RG'][0]['DS'] = "{'sequenced':" + str(mappedReads + unmappedReads) + "," + "'mapped':" + str(mappedReads) + "," + "'filtered':" + str(filteredReads) + "}"

        slamDunkPG = {
            'ID': 'slamdunk',
            'PN': 'slamdunk filter v' + __version__,
            'VN': __bam_version__
        }
        if ('PG' in inFileBamHeader):
            inFileBamHeader['PG'].append(slamDunkPG)
        else:
            inFileBamHeader['PG'] = [slamDunkPG]

        infile.close()
        outfile.close()

        # Sort afterwards
        bamSort(outputBAM, log, inFileBamHeader, verbose)

        pysamIndex(outputBAM)
        #pysamFlagstat(outputBAM)
        #runFlagstat(outputBAM, log, verbose=verbose, dry=printOnly)

    else:
        print("Skipped filtering for " + inputBAM, file=log)
Example #28
0
    # all of the non-uk mutations
    other_variants = all_mutations - uk_variant_mutations
    # L5F is not included in either list
    other_variants.remove('L5F')

    all_tables = {}

    files_list = glob.glob(bam_dir + '/*.mapped.sorted.bam')

    # iterate all bam files:
    for file in files_list:
        pileup_table = pd.DataFrame(np.empty(shape=(29903, 6)) * np.nan,
                                    columns=['C', 'A', 'G', 'T', 'N', 'del'],
                                    index=list(
                                        range(29903)))  # empty pileup table
        bam = pysam.AlignmentFile(file, 'rb')  # open bam file
        pileup_iter = bam.pileup(stepper='nofilter')  # samtools pileup
        # iterate over reads in each position and count nucleotides, Ns and deletions.
        for position in pileup_iter:
            c = Counter({'C': 0, 'A': 0, 'G': 0, 'T': 0, 'N': 0, 'del': 0})
            for pileupread in position.pileups:
                if not pileupread.is_del and not pileupread.is_refskip:
                    c[pileupread.alignment.query_sequence[
                        pileupread.query_position].upper()] += 1
                elif pileupread.is_del:
                    c['del'] += 1
                elif pileupread.is_refskip:  # N?
                    c['N'] += 1
            pileup_table.loc[position.reference_pos] = pd.Series(c)
        # produce pileup table(for each bam): pos,A,C,T,G,N,del,totaldepth,
        pileup_table.index.name = 'pos'
Example #29
0
#!/usr/bin/python3

import os
import pysam
import sys
import numpy as np

genome = sys.argv[1]
output_folder = sys.argv[2]

input_fasta = output_folder + "/results/" + genome + "_reconstructed_genome.fna"
samfile = pysam.AlignmentFile(
    output_folder + "/artifacts/sorted_contigs_alignment_on_rgenome.bam", "r")
predicted_file_name = output_folder + "/results/" + genome + "_predictedCDSs"
predicted_filtered_file_name = output_folder + "/results/" + genome + "_predictedCDSs_filtered"
frags_with_no_genes = output_folder + "/results/" + genome + "_frags_with_no_genes.txt"
predicted_filtered_only_genes_file_name = output_folder + "/artifacts/" + genome + "_predictedCDSs_filtered_only_genes.bed"
gene_positions = os.popen("grep 'CDS' " + predicted_file_name +
                          " | awk '{print $2}'").read()

sequence_identifier = os.popen("grep '>' " + input_fasta).read()
sequence_identifier = sequence_identifier[1:-1]
sequence_identifier = sequence_identifier.split(" ")[0]

gene_positions = gene_positions.split("\n")
ref_name = samfile.references[0]
deduced_bases = []
for gene in gene_positions:
    in_reverse_strands = False
    if not gene:
        continue
Example #30
0
    def categorize_no_overlap_outcomes(self, max_reads=None):
        outcomes = defaultdict(list)

        with self.fns['no_overlap_outcome_list'].open('w') as fh:
            fh.write(f'## Generated at {utilities.current_time_string()}\n')

            alignment_groups = self.no_overlap_alignment_groups()

            if max_reads is not None:
                alignment_groups = islice(alignment_groups, max_reads)

            for name, als in self.progress(
                    alignment_groups,
                    desc='Categorizing non-overlapping read pairs'):
                try:
                    pair_layout = layout_module.NonoverlappingPairLayout(
                        als['R1'], als['R2'], self.target_info)
                    pair_layout.categorize()
                except:
                    print(self.sample_name, name)
                    raise

                outcomes[pair_layout.category,
                         pair_layout.subcategory].append(name)

                outcome = self.final_Outcome.from_layout(pair_layout)
                fh.write(f'{outcome}\n')

        # To make plotting easier, for each outcome, write a file listing all of the
        # qnames for that outcome and a bam file (sorted by name) with all of the
        # alignments for those qnames.

        qname_to_outcome = {}
        bam_fhs = {}

        with ExitStack() as stack:
            full_bam_fns = {
                which:
                self.fns_by_read_type['bam_by_name'][f'{which}_no_overlap']
                for which in ['R1', 'R2']
            }
            full_bam_fhs = {
                which:
                stack.enter_context(pysam.AlignmentFile(full_bam_fns[which]))
                for which in ['R1', 'R2']
            }

            for outcome, qnames in outcomes.items():
                outcome_fns = self.outcome_fns(outcome)
                outcome_fns['dir'].mkdir(exist_ok=True)
                for which in ['R1', 'R2']:
                    bam_fn = outcome_fns['bam_by_name'][f'{which}_no_overlap']
                    bam_fhs[outcome, which] = stack.enter_context(
                        pysam.AlignmentFile(bam_fn,
                                            'wb',
                                            template=full_bam_fhs[which]))

                fh = stack.enter_context(
                    outcome_fns['no_overlap_query_names'].open('w'))
                for qname in qnames:
                    qname_to_outcome[qname] = outcome
                    fh.write(qname + '\n')

            for which in ['R1', 'R2']:
                for al in full_bam_fhs[which]:
                    if al.query_name in qname_to_outcome:
                        outcome = qname_to_outcome[al.query_name]
                        bam_fhs[outcome, which].write(al)
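The ExitStack above keeps an arbitrary number of per-outcome BAM handles open at once. A minimal sketch of the same pattern for splitting one BAM into per-group files, where the grouping function and file names are placeholders:

import pysam
from contextlib import ExitStack

def split_bam_by_group(in_path, group_of):
    """Write each alignment to <group>.bam, where group_of(query name) picks the group."""
    with ExitStack() as stack:
        src = stack.enter_context(pysam.AlignmentFile(in_path, 'rb'))
        outs = {}
        for al in src:
            group = group_of(al.query_name)
            if group not in outs:
                outs[group] = stack.enter_context(
                    pysam.AlignmentFile('%s.bam' % group, 'wb', template=src))
            outs[group].write(al)

# e.g. split_bam_by_group('by_name.bam', lambda qname: qname.split(':')[0])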