示例#1
0
def parse_bam_differential(afn, bfn, regs, step):
    """(internal) Parses two bam files in differential mode. Proceeds by counting,
    in both files, reads mapping onto consecutive windows of ``step`` bases inside
    each segment (chr, start, end). No normalization is done at this step.

    Args:
        afn: Path of the first BAM file (passed through ``str``).
        bfn: Path of the second BAM file (passed through ``str``).
        regs: Iterable of regions; the first three items of each region are
            read as ``(chr, start, end)``.
        step: Window width, also the stride between two window starts.

    Returns:
        A tuple ``(acount, bcount)`` of two parallel lists with the per-window
        counts of the first and second file. A ``-1`` is appended to both
        lists after the windows of every region.
    """
    acount = []
    bcount = []
    # NOTE(review): the tracker is seeded with "chr1", so a "chr1 counted"
    # line is logged even when the first region lies on another chromosome.
    oldchr = "chr1"
    abam = Samfile(str(afn), "rb")
    try:
        bbam = Samfile(str(bfn), "rb")
        try:
            for reg in regs:
                chrom, start, end = reg[:3]
                if chrom != oldchr:
                    log("files: %s - %s : %s counted" % (afn, bfn, oldchr))
                    oldchr = chrom
                # this could be improved
                # xrange is kept from the original (use range on Python 3).
                for s in xrange(start, end, step):
                    # Windows have a fixed width: the last one extends past
                    # `end` when (end - start) is not a multiple of step.
                    e = s + step
                    an = abam.count(chrom, s, e)
                    bn = bbam.count(chrom, s, e)
                    acount.append(an)
                    bcount.append(bn)
                acount.append(-1)
                bcount.append(-1)
        finally:
            # Release the second handle even if counting fails.
            bbam.close()
    finally:
        # Release the first handle even if the second file cannot be opened.
        abam.close()
    log("files: %s - %s : %s counted (finished)" % (afn, bfn, oldchr))
    return acount, bcount
示例#2
0
def parse_bam_differential(afn, bfn, regs, step):
    """(internal) Parses two bam files in differential mode. Proceeds by counting,
    in both files, reads mapping onto consecutive windows of ``step`` bases inside
    each segment (chr, start, end). No normalization is done at this step.

    Args:
        afn: Path of the first BAM file (passed through ``str``).
        bfn: Path of the second BAM file (passed through ``str``).
        regs: Iterable of regions; the first three items of each region are
            read as ``(chr, start, end)``.
        step: Window width, also the stride between two window starts.

    Returns:
        A tuple ``(acount, bcount)`` of two parallel lists with the per-window
        counts of the first and second file. A ``-1`` is appended to both
        lists after the windows of every region.
    """
    acount = []
    bcount = []
    # NOTE(review): the tracker is seeded with "chr1", so a "chr1 counted"
    # line is logged even when the first region lies on another chromosome.
    oldchr = "chr1"
    abam = Samfile(str(afn), "rb")
    try:
        bbam = Samfile(str(bfn), "rb")
        try:
            for reg in regs:
                chrom, start, end = reg[:3]
                if chrom != oldchr:
                    log("files: %s - %s : %s counted" % (afn, bfn, oldchr))
                    oldchr = chrom
                # this could be improved
                # xrange is kept from the original (use range on Python 3).
                for s in xrange(start, end, step):
                    # Windows have a fixed width: the last one extends past
                    # `end` when (end - start) is not a multiple of step.
                    e = s + step
                    an = abam.count(chrom, s, e)
                    bn = bbam.count(chrom, s, e)
                    acount.append(an)
                    bcount.append(bn)
                acount.append(-1)
                bcount.append(-1)
        finally:
            # Release the second handle even if counting fails.
            bbam.close()
    finally:
        # Release the first handle even if the second file cannot be opened.
        abam.close()
    log("files: %s - %s : %s counted (finished)" % (afn, bfn, oldchr))
    return acount, bcount
示例#3
0
def parse_bam_absolute(fn, regs):
    """(internal) Parses bam file in absolute mode. Proceeds by counting reads mapping
    onto a segment (chr, start, end) and normalizes the count by the segment's length.

    Args:
        fn: Path of the BAM file (passed through ``str``).
        regs: Iterable of regions; the first three items of each region are
            read as ``(chr, start, end)``.

    Returns:
        A list with one float per region: the read count divided by
        ``end - start``.

    Raises:
        ZeroDivisionError: If a region has ``end == start``.
    """
    bam = Samfile(str(fn), "rb")
    try:
        count = []
        for reg in regs:
            chrom, start, end = reg[:3]
            n = bam.count(chrom, start, end)
            # float() keeps the division fractional on Python 2 as well.
            count.append(float(n) / (end - start))
    finally:
        # The handle used to stay open; release it even if counting fails.
        bam.close()
    return count
示例#4
0
def parse_bam_absolute(fn, regs):
    """(internal) Parses bam file in absolute mode. Proceeds by counting reads mapping
    onto a segment (chr, start, end) and normalizes the count by the segment's length.

    Args:
        fn: Path of the BAM file (passed through ``str``).
        regs: Iterable of regions; the first three items of each region are
            read as ``(chr, start, end)``.

    Returns:
        A list with one float per region: the read count divided by
        ``end - start``.

    Raises:
        ZeroDivisionError: If a region has ``end == start``.
    """
    bam = Samfile(str(fn), "rb")
    try:
        count = []
        for reg in regs:
            chrom, start, end = reg[:3]
            n = bam.count(chrom, start, end)
            # float() keeps the division fractional on Python 2 as well.
            count.append(float(n) / (end - start))
    finally:
        # The handle used to stay open; release it even if counting fails.
        bam.close()
    return count
示例#5
0
def ace(args):
    """
    %prog ace bamfile fastafile

    convert bam format to ace format. This often allows the remapping to be
    assessed as a denovo assembly format. bam file needs to be indexed. also
    creates a .mates file to be used in amos/bambus, and .astat file to mark
    whether the contig is unique or repetitive based on A-statistics in Celera
    assembler.
    """
    # The docstring above doubles as the command-line usage text (it is handed
    # to OptionParser below), so it is deliberately left untouched.
    #
    # `args` is the list of command-line tokens. Outputs are written next to
    # the BAM file: <prefix>.ace always, <prefix>.astat with --astat and
    # <prefix>.reads with --readids.
    #
    # NOTE(review): --splitdir, --unpaired, --minreadno and --minctgsize are
    # parsed but never read in this function; --unpaired also has no action,
    # so it expects a value on the command line.
    p = OptionParser(ace.__doc__)
    p.add_option(
        "--splitdir",
        dest="splitdir",
        default="outRoot",
        help="split the ace per contig to dir",
    )
    p.add_option(
        "--unpaired",
        dest="unpaired",
        default=False,
        help="remove read pairs on the same contig",
    )
    p.add_option(
        "--minreadno",
        dest="minreadno",
        default=3,
        type="int",
        help="minimum read numbers per contig",
    )
    p.add_option(
        "--minctgsize",
        dest="minctgsize",
        default=100,
        type="int",
        help="minimum contig size per contig",
    )
    p.add_option(
        "--astat",
        default=False,
        action="store_true",
        help="create .astat to list repetitiveness",
    )
    p.add_option(
        "--readids",
        default=False,
        action="store_true",
        help="create file of mapped and unmapped ids",
    )

    import os

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    astat = opts.astat
    readids = opts.readids

    f = Fasta(fastafile)
    # Cut at the first dot of the file NAME only. Splitting the whole path
    # turned "./x.bam" into an empty prefix (outputs ".ace", ".reads", ...).
    dirname, basename = os.path.split(bamfile)
    prefix = os.path.join(dirname, basename.split(".")[0])
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))
    s = Samfile(bamfile, "rb")
    fw = astatfw = readsfw = None
    try:
        ncontigs = s.nreferences
        genomesize = sum(size for _, size in f.itersizes())
        logging.debug(
            "Total {0} contigs with size {1} base".format(ncontigs, genomesize)
        )
        qual = "20"  # default qual

        totalreads = sum(s.count(x) for x in s.references)
        logging.debug("Total {0} reads mapped".format(totalreads))

        fw = open(acefile, "w")
        if astat:
            astatfw = open(astatfile, "w")
        if readids:
            readsfw = open(readsfile, "w")

        # File header: number of contigs and total number of reads.
        print("AS {0} {1}".format(ncontigs, totalreads), file=fw)
        print(file=fw)

        for contig in s.references:
            cseq = f[contig]
            nbases = len(cseq)

            mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
            nreads = len(mapped_reads)

            # Contig record followed by its wrapped sequence.
            nsegments = 0
            print(
                "CO {0} {1} {2} {3} U".format(contig, nbases, nreads, nsegments),
                file=fw,
            )
            print(fill(str(cseq.seq)), file=fw)
            print(file=fw)

            if astat:
                # Stored under its own name: rebinding `astat` here replaced
                # the boolean flag with the statistic, so a value of 0
                # silently disabled the output for all following contigs.
                astat_value = Astat(nbases, nreads, genomesize, totalreads)
                print("{0}\t{1:.1f}".format(contig, astat_value), file=astatfw)

            # Every base gets the same default quality.
            text = fill([qual] * nbases, delimiter=" ", width=30)
            print("BQ\n{0}".format(text), file=fw)
            print(file=fw)

            # One AF line per mapped read.
            rnames = []
            for a in mapped_reads:
                readname = a.qname
                rname = readname

                if readids:
                    print(readname, file=readsfw)
                rnames.append(rname)

                strand = "C" if a.is_reverse else "U"
                paddedstart = a.pos + 1  # 0-based to 1-based
                af = "AF {0} {1} {2}".format(rname, strand, paddedstart)
                print(af, file=fw)

            print(file=fw)

            # One RD/QA pair per read for which cigar_to_seq yields a sequence.
            for a, rname in zip(mapped_reads, rnames):
                aseq, _npadded = cigar_to_seq(a)
                if aseq is None:
                    continue

                ninfos = 0
                ntags = 0
                alen = len(aseq)
                rd = "RD {0} {1} {2} {3}\n{4}".format(
                    rname, alen, ninfos, ntags, fill(aseq)
                )
                qs = "QA 1 {0} 1 {0}".format(alen)

                print(rd, file=fw)
                print(file=fw)
                print(qs, file=fw)
                print(file=fw)
    finally:
        # Flush and release every handle, including on errors.
        for handle in (fw, astatfw, readsfw, s):
            if handle is not None:
                handle.close()
示例#6
0
文件: sam.py 项目: arvin580/jcvi
def ace(args):
    """
    %prog ace bamfile fastafile

    convert bam format to ace format. This often allows the remapping to be
    assessed as a denovo assembly format. bam file needs to be indexed. also
    creates a .mates file to be used in amos/bambus, and .astat file to mark
    whether the contig is unique or repetitive based on A-statistics in Celera
    assembler.
    """
    # The docstring above doubles as the command-line usage text (it is handed
    # to OptionParser below), so it is deliberately left untouched.
    #
    # `args` is the list of command-line tokens. Outputs are written next to
    # the BAM file: <prefix>.ace always, <prefix>.astat with --astat and
    # <prefix>.reads with --readids.
    #
    # NOTE(review): --splitdir, --unpaired, --minreadno and --minctgsize are
    # parsed but never read in this function; --unpaired also has no action,
    # so it expects a value on the command line.
    p = OptionParser(ace.__doc__)
    p.add_option("--splitdir", dest="splitdir", default="outRoot",
            help="split the ace per contig to dir [default: %default]")
    p.add_option("--unpaired", dest="unpaired", default=False,
            help="remove read pairs on the same contig [default: %default]")
    p.add_option("--minreadno", dest="minreadno", default=3, type="int",
            help="minimum read numbers per contig [default: %default]")
    p.add_option("--minctgsize", dest="minctgsize", default=100, type="int",
            help="minimum contig size per contig [default: %default]")
    p.add_option("--astat", default=False, action="store_true",
            help="create .astat to list repetitiveness [default: %default]")
    p.add_option("--readids", default=False, action="store_true",
            help="create file of mapped and unmapped ids [default: %default]")

    import os

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    astat = opts.astat
    readids = opts.readids

    f = Fasta(fastafile)
    # Cut at the first dot of the file NAME only. Splitting the whole path
    # turned "./x.bam" into an empty prefix (outputs ".ace", ".reads", ...).
    dirname, basename = os.path.split(bamfile)
    prefix = os.path.join(dirname, basename.split(".")[0])
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))
    s = Samfile(bamfile, "rb")
    fw = astatfw = readsfw = None
    try:
        ncontigs = s.nreferences
        genomesize = sum(size for _, size in f.itersizes())
        logging.debug("Total {0} contigs with size {1} base".format(ncontigs,
            genomesize))
        qual = "20"  # default qual

        totalreads = sum(s.count(x) for x in s.references)
        logging.debug("Total {0} reads mapped".format(totalreads))

        fw = open(acefile, "w")
        if astat:
            astatfw = open(astatfile, "w")
        if readids:
            readsfw = open(readsfile, "w")

        # Output goes through write() instead of the Python 2 only
        # "print >> fw" statement: the bytes written are the same and the
        # function also runs on Python 3.

        # File header: number of contigs and total number of reads.
        fw.write("AS {0} {1}\n".format(ncontigs, totalreads))
        fw.write("\n")

        for contig in s.references:
            cseq = f[contig]
            nbases = len(cseq)

            mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
            nreads = len(mapped_reads)

            # Contig record followed by its wrapped sequence.
            nsegments = 0
            fw.write("CO {0} {1} {2} {3} U\n".format(contig, nbases, nreads,
                    nsegments))
            fw.write("{0}\n".format(fill(str(cseq.seq))))
            fw.write("\n")

            if astat:
                # Stored under its own name: rebinding `astat` here replaced
                # the boolean flag with the statistic, so a value of 0
                # silently disabled the output for all following contigs.
                astat_value = Astat(nbases, nreads, genomesize, totalreads)
                astatfw.write("{0}\t{1:.1f}\n".format(contig, astat_value))

            # Every base gets the same default quality.
            text = fill([qual] * nbases, delimiter=" ", width=30)
            fw.write("BQ\n{0}\n".format(text))
            fw.write("\n")

            # One AF line per mapped read.
            rnames = []
            for a in mapped_reads:
                readname = a.qname
                rname = readname

                if readids:
                    readsfw.write("{0}\n".format(readname))
                rnames.append(rname)

                strand = "C" if a.is_reverse else "U"
                paddedstart = a.pos + 1  # 0-based to 1-based
                af = "AF {0} {1} {2}".format(rname, strand, paddedstart)
                fw.write(af + "\n")

            fw.write("\n")

            # One RD/QA pair per read for which cigar_to_seq yields a sequence.
            for a, rname in zip(mapped_reads, rnames):
                aseq, _npadded = cigar_to_seq(a)
                if aseq is None:
                    continue

                ninfos = 0
                ntags = 0
                alen = len(aseq)
                rd = "RD {0} {1} {2} {3}\n{4}".format(rname, alen, ninfos, ntags,
                        fill(aseq))
                qs = "QA 1 {0} 1 {0}".format(alen)

                fw.write(rd + "\n")
                fw.write("\n")
                fw.write(qs + "\n")
                fw.write("\n")
    finally:
        # Flush and release every handle, including on errors.
        for handle in (fw, astatfw, readsfw, s):
            if handle is not None:
                handle.close()
示例#7
0
def create_multi_table(max_dist, alias_file_name, genes_file_name, exp_file_name, dsb_file_name, dist_file_name, output_location):
  """Write the per-gene table (distance, expression, DSB) and plot it with R.

  The table is written to ``output_location + "table.txt"`` with the columns
  GENE, DISTANCE, EXPRESSION and DSB. The 3Dplot.R and 2Dplot.R scripts
  located next to this file are then run through ``Rscript`` on that table.

  The EXPRESSION and DSB columns are not the raw inputs: both are combined
  with the distance and with seeded random jitter (see the loop body).

  Args:
    max_dist: Genes whose distance is >= this value are skipped; the value is
      also passed to both R scripts.
    alias_file_name: Passed to ``read_alias_dictionary``.
    genes_file_name: Tab-separated BED file with columns chromosome, start,
      end, gene_name, score (not used), strand.
    exp_file_name: Passed to ``fetch_expression``.
    dsb_file_name: BAM file opened with ``Samfile``; reads are counted in a
      window of 2000 bases next to each gene.
    dist_file_name: Passed to ``create_distance_dictionary``.
    output_location: Prefix prepended verbatim to every output file name, so
      a directory must be given with its trailing path separator.

  Raises:
    SystemExit: With status 1 when the DSB BAM file cannot be opened.
  """
  # Imported here so this block does not depend on the file-level imports.
  import subprocess

  # Global Parameters
  seed(111)  # fixed seed, presumably random.seed: keeps the jitter repeatable
  promExt = 2000
  # Same effect as "mkdir -p", without a shell (paths with spaces work).
  if output_location and not os.path.isdir(output_location):
    os.makedirs(output_location)

  # Allowed chromosomes (list() keeps the concatenation valid on Python 3)
  chrom_list = ["chr"+str(e) for e in list(range(1,23))+["X"]]

  # Fetch alias dictionary
  alias_dict = read_alias_dictionary(alias_file_name)

  # Fetch expression
  exp_dict = fetch_expression(exp_file_name)

  # Fetch distance
  dist_dict = create_distance_dictionary(dist_file_name)

  # Input Files
  try:
    dsbFile = Samfile(dsb_file_name, "rb")
  except Exception:
    print("ERROR: Could not open DSB BAM file. Check your Pysam installation.")
    raise SystemExit(1)

  # Output file
  output_file_name = output_location + "table.txt"

  try:
    # Plain "r": the "U" flag was removed in Python 3.11, and strip() below
    # already discards a trailing "\r".
    with open(genes_file_name, "r") as allGenesFile, \
         open(output_file_name, "w") as outputFile:
      outputFile.write("\t".join(["GENE", "DISTANCE", "EXPRESSION", "DSB"])+"\n")

      # Iterating in gene file
      for line in allGenesFile:

        # Initialization
        ll = line.strip().split("\t")
        try:
          chrom = ll[0]
          p1 = int(ll[1])
          p2 = int(ll[2])
          gene = ll[3].upper()
          strand = ll[5]
        except (IndexError, ValueError):
          print("ERROR: The genes file must be a tab-separated bed file with columns: chromosome, start, end, gene_name, score (not used), strand")
          # Skip the malformed line; processing it would reuse the values
          # of the previous line (or fail on the very first one).
          continue
        # Lookup tables come from helpers outside this block, hence the
        # broad except clauses kept on the lookups below.
        try: gene = alias_dict[gene]
        except Exception: pass
        if chrom not in chrom_list: continue

        # Counting window next to the gene. Any strand other than "+" is
        # handled like "-", so the window is always defined.
        if strand == "+":
          # Clamped at 0 so the start cannot go negative for genes near
          # the chromosome start (assumes 0-based coordinates).
          tss1 = max(0, p1 - promExt)
          tss2 = p1
        else:
          tss1 = p2
          tss2 = p2 + promExt

        # Fetch distance
        try: distance = dist_dict[gene]
        except Exception: continue
        if distance >= max_dist: continue

        # Fetch expression 1
        # NOTE(review): `gene` was already mapped through alias_dict above,
        # so this is a second alias lookup; kept as in the original.
        try: exp = exp_dict[alias_dict[gene]]
        except Exception: continue
        if exp <= 0: continue
        dfactor = 0.9 if distance <= 0 else distance
        exp = (200./dfactor) + exp * exp
        jitt = random() * 3
        exp = exp - jitt

        # Fetch DSB counts
        if not dsbFile: continue
        dsbCount = dsbFile.count(chrom, tss1, tss2)
        dsbCount = (exp + (dsbCount/10.) + (random()*3.)) / 1000.

        # Fetch expression 2
        try:
          jitt = 50 * random() * ((100 * dsbCount)**2)
          exp = exp + jitt
        except Exception: continue

        # Writing to file
        outputFile.write("\t".join([str(e) for e in [gene, distance, exp, dsbCount]])+"\n")
  finally:
    # Closing files (the two text files are closed by the with statement)
    if dsbFile: dsbFile.close()

  # Script path
  script_path = os.path.dirname(os.path.realpath(__file__))

  # Creating plots
  output_dist_dsb_exp = output_location + "3D_dist_dsb_exp.pdf"
  output_dist_dsb = output_location + "2D_dist_dsb.pdf"
  output_dist_exp = output_location + "2D_dist_exp.pdf"
  output_exp_dsb = output_location + "2D_exp_dsb.pdf"
  plot_jobs = [
    ("3Dplot.R", [output_dist_dsb_exp]),
    ("2Dplot.R", [output_dist_dsb, output_dist_exp, output_exp_dsb]),
  ]
  for script_name, plot_files in plot_jobs:
    # Argument list instead of a shell string: no quoting problems.
    command = ["Rscript", os.path.join(script_path, script_name), str(max_dist), output_file_name] + plot_files
    try:
      subprocess.call(command)
    except OSError as err:
      # Plotting stays best-effort, as with the previous os.system call.
      print("ERROR: Could not run Rscript: " + str(err))