Exemplo n.º 1
0
def nucmer(args):
    """
    %prog nucmer mappings.bed MTR.fasta assembly.fasta chr1 3

    Select specific chromosome region based on MTR mapping. The above command
    will extract chr1:2,000,001-3,000,000.
    """
    # The docstring above is also handed to OptionParser as the usage text,
    # so it is deliberately left untouched.
    parser = OptionParser(nucmer.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 5:
        sys.exit(not parser.print_help())

    mapbed, mtrfasta, asmfasta, seqid, window = args
    window = int(window)
    window_size = 1000000

    # Write the requested window (the `window`-th 1,000,000-bp slice of
    # `seqid`) as a one-line BED file.
    region_fields = (seqid, (window - 1) * window_size, window * window_size)
    bedfile = "sample.bed"
    region = Bed()
    region.add("\t".join(str(field) for field in region_fields))
    region.print_to_file(bedfile)

    # Column 4 of every mapping that intersects the window goes to query.ids.
    idsfile = "query.ids"
    intersect_cmd = "intersectBed -a {0} -b {1} -nonamecheck -sorted | cut -f4".format(mapbed, bedfile)
    sh(intersect_cmd, outfile=idsfile)

    # Subject: the window taken from the MTR FASTA.
    # Query: the assembly records listed in query.ids.
    sfasta = fastaFromBed(bedfile, mtrfasta)
    qfasta = "query.fasta"
    sh("faSomeRecords {0} {1} {2}".format(asmfasta, idsfile, qfasta))

    sh("nucmer {0} {1}".format(sfasta, qfasta))

    # Plot the alignment, then rename the resulting out.pdf after the region.
    mummerplot_main(["out.delta", "--refcov=0"])
    sh("mv out.pdf {0}.{1}.pdf".format(seqid, window))
Exemplo n.º 2
0
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    # The docstring above doubles as the usage text of the option parser.
    p = OptionParser(bed.__doc__)
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale", type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale

    # Collect, for the first member of every anchor pair, all of its partners.
    ac = AnchorFile(anchorsfile)
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        if q.accn not in pairs:
            continue
        # One BED line is written per anchor partner. The working values are
        # re-read from `q` and `s` on every pass so that a --switch swap can
        # never leak into the next pair of the same query feature.
        for partner in pairs[q.accn]:
            si, s = sorder[partner]
            qseqid, qstart, qend, qaccn = q.seqid, q.start, q.end, q.accn
            sseqid, sstart, send, saccn = s.seqid, s.start, s.end, s.accn
            if switch:
                qseqid, sseqid = sseqid, qseqid
                qstart, sstart = sstart, qstart
                qend, send = send, qend
                qaccn, saccn = saccn, qaccn
            # A missing (None) or zero scale leaves the position untouched.
            if scale:
                sstart /= scale
            try:
                newsseqid = get_number(sseqid)
            except ValueError:
                # Call form of raise: valid on both Python 2 and Python 3.
                raise ValueError(
                    "`{0}` is on `{1}` with no number to extract".
                    format(saccn, sseqid))
            bedline = "\t".join(str(x) for x in (qseqid, qstart - 1, qend,
                                "{0}:{1}".format(newsseqid, sstart)))
            bd.add(bedline)

    bd.print_to_file(filename=opts.outfile, sorted=True)
Exemplo n.º 3
0
def nucmer(args):
    """
    %prog nucmer mappings.bed MTR.fasta assembly.fasta chr1 3

    Select specific chromosome region based on MTR mapping. The above command
    will extract chr1:2,000,001-3,000,000.
    """
    # NOTE: the docstring is reused verbatim as the OptionParser usage string.
    parser = OptionParser(nucmer.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 5:
        sys.exit(not parser.print_help())

    mapbed, mtrfasta, asmfasta, seqid, slice_no = args
    slice_no = int(slice_no)
    slice_len = 1000000
    slice_start = (slice_no - 1) * slice_len
    slice_end = slice_no * slice_len

    # Step 1: a single-interval BED file describing the requested slice.
    bedfile = "sample.bed"
    target = Bed()
    target.add("\t".join(str(v) for v in (seqid, slice_start, slice_end)))
    target.print_to_file(bedfile)

    # Step 2: 4th column of the mappings that overlap the slice -> query.ids.
    idsfile = "query.ids"
    sh(
        "intersectBed -a {0} -b {1} -nonamecheck -sorted | cut -f4".format(
            mapbed, bedfile
        ),
        outfile=idsfile,
    )

    # Step 3: subject FASTA (slice of the MTR FASTA) and query FASTA
    # (assembly records named in query.ids).
    sfasta = fastaFromBed(bedfile, mtrfasta)
    qfasta = "query.fasta"
    sh("faSomeRecords {0} {1} {2}".format(asmfasta, idsfile, qfasta))

    # Step 4: align, plot, and rename the plot after the slice.
    sh("nucmer {0} {1}".format(sfasta, qfasta))
    mummerplot_main(["out.delta", "--refcov=0"])
    sh("mv out.pdf {0}.{1}.pdf".format(seqid, slice_no))
Exemplo n.º 4
0
def insert(args):
    """
    %prog insert candidates.bed gaps.bed chrs.fasta unplaced.fasta

    Insert scaffolds into assembly.
    """
    from jcvi.formats.agp import mask, bed
    from jcvi.formats.sizes import agp

    # The docstring above is also the usage text given to OptionParser.
    p = OptionParser(insert.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    candidates, gapsbed, chrfasta, unplacedfasta = args
    refinedbed = refine([candidates, gapsbed])
    sizes = Sizes(unplacedfasta).mapping
    cbed = Bed(candidates)
    corder = cbed.order
    gbed = Bed(gapsbed)
    gorder = gbed.order

    gpbed = Bed()
    gappositions = {}  # (chr, start, end) => gapid

    gap_to_scf = defaultdict(list)  # gapid => [(scaffold, strand), ...]
    seen = set()
    # Read the refined BED inside a context manager so the handle is always
    # closed, even if a row fails to parse.
    with open(refinedbed) as fp:
        for row in fp:
            atoms = row.split()
            if len(atoms) <= 6:
                continue
            # NOTE(review): rows with 7-9 fields pass the guard above but
            # would raise IndexError on atoms[9]; confirm the refined BED
            # always carries at least 10 columns.
            unplaced = atoms[3]
            strand = atoms[5]
            gapid = atoms[9]
            if gapid not in seen:
                # First time this gap shows up: remember its record and
                # position.
                seen.add(gapid)
                _, gb = gorder[gapid]
                gpbed.append(gb)
                gappositions[(gb.seqid, gb.start, gb.end)] = gapid
            gap_to_scf[gapid].append((unplaced, strand))

    gpbedfile = "candidate.gaps.bed"
    gpbed.print_to_file(gpbedfile, sorted=True)

    # chromosome FASTA -> AGP -> AGP masked with the candidate gaps -> BED
    agpfile = agp([chrfasta])
    maskedagpfile = mask([agpfile, gpbedfile])
    maskedbedfile = maskedagpfile.rsplit(".", 1)[0] + ".bed"
    bed([maskedagpfile, "--outfile={0}".format(maskedbedfile)])

    mbed = Bed(maskedbedfile)
    finalbed = Bed()
    for b in mbed:
        sid = b.seqid
        key = (sid, b.start, b.end)
        if key not in gappositions:
            # Not a candidate gap: copy the record through unchanged.
            finalbed.add("{0}\n".format(b))
            continue

        gapid = gappositions[key]
        scfs = gap_to_scf[gapid]

        # For scaffolds placed in the same gap, sort according to positions
        scfs.sort(key=lambda x: corder[x[0]][1].start + corder[x[0]][1].end)
        for scf, strand in scfs:
            size = sizes[scf]
            finalbed.add("\t".join(str(x) for x in (scf, 0, size, sid, 1000, strand)))

    finalbedfile = "final.bed"
    finalbed.print_to_file(finalbedfile)

    # Clean-up
    toclean = [gpbedfile, agpfile, maskedagpfile, maskedbedfile]
    FileShredder(toclean)
Exemplo n.º 5
0
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    # The docstring above doubles as the usage text of the option parser.
    p = OptionParser(bed.__doc__)
    p.add_option(
        "--switch",
        default=False,
        action="store_true",
        help="Switch reference and aligned map elements",
    )
    p.add_option(
        "--scale", type="float", help="Scale the aligned map distance by factor"
    )
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (anchorsfile,) = args
    switch = opts.switch
    scale = opts.scale

    # Collect, for the first member of every anchor pair, all of its partners.
    ac = AnchorFile(anchorsfile)
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        if q.accn not in pairs:
            continue
        # One BED line is written per anchor partner. The working values are
        # re-read from `q` and `s` on every pass so that a --switch swap can
        # never leak into the next pair of the same query feature.
        for partner in pairs[q.accn]:
            si, s = sorder[partner]
            qseqid, qstart, qend, qaccn = q.seqid, q.start, q.end, q.accn
            sseqid, sstart, send, saccn = s.seqid, s.start, s.end, s.accn
            if switch:
                qseqid, sseqid = sseqid, qseqid
                qstart, sstart = sstart, qstart
                qend, send = send, qend
                qaccn, saccn = saccn, qaccn
            # A missing (None) or zero scale leaves the position untouched.
            if scale:
                sstart /= scale
            try:
                newsseqid = get_number(sseqid)
            except ValueError as exc:
                raise ValueError(
                    "`{0}` is on `{1}` with no number to extract".format(saccn, sseqid)
                ) from exc
            bedline = "\t".join(
                str(x)
                for x in (qseqid, qstart - 1, qend, "{0}:{1}".format(newsseqid, sstart))
            )
            bd.add(bedline)

    bd.print_to_file(filename=opts.outfile, sorted=True)
Exemplo n.º 6
0
def ancestral(args):
    """
    %prog ancestral vplanifoliaA.vplanifoliaA.anchors > vplanifoliaA_blocks.bed

    Paint 14 chromosomes following alpha WGD.
    """
    # The docstring above is also the usage text given to OptionParser.
    parser = OptionParser(ancestral.__doc__)
    parser.set_beds()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    (anchorsfile, ) = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, parser, opts)

    # Chromosome-number pairs of interest, each stored as (smaller, larger).
    target_pairs = {
        (1, 1), (1, 6), (1, 8), (1, 13),
        (2, 4),
        (3, 12), (3, 14),
        (5, 6), (5, 8),
        (7, 9), (7, 11),
        (9, 10),
        (10, 11),
    }
    # Chromosome numbers that may be used as the label of a painted region;
    # every pair above contains at least one of them.
    label_chrs = (1, 2, 3, 5, 7, 10)

    def get_target(achr, bchr):
        # Return the ordered chromosome-number pair when it is a target pair,
        # otherwise None. Pairs where neither name contains "chr" are ignored.
        if not ("chr" in achr or "chr" in bchr):
            return None
        first, second = get_number(achr), get_number(bchr)
        if first > second:
            candidate = (second, first)
        else:
            candidate = (first, second)
        if candidate not in target_pairs:
            return None
        return candidate

    def build_bedline(astart, aend, target_pair):
        # The region spans from the start of `astart` to the end of `aend`
        # and is named after the first member of the pair found in label_chrs.
        labels = [str(num) for num in target_pair if num in label_chrs]
        fields = (astart.seqid, astart.start, aend.end, labels[0])
        return "\t".join(str(field) for field in fields)

    # Walk over the blocks and record both sides of every block that hits one
    # of the target pairs.
    outbed = Bed()
    for block in AnchorFile(anchorsfile).blocks:
        a, b, scores = zip(*block)
        a_hits = [qorder[name] for name in a]
        b_hits = [sorder[name] for name in b]
        # Each hit is indexed with [1] to get the feature record; min/max
        # pick the extreme hits of the block on each side.
        astart, aend = min(a_hits)[1], max(a_hits)[1]
        bstart, bend = min(b_hits)[1], max(b_hits)[1]

        target = get_target(astart.seqid, bstart.seqid)
        if target is not None:
            outbed.add(build_bedline(astart, aend, target))
            outbed.add(build_bedline(bstart, bend, target))
    outbed.print_to_file(sorted=True)
Exemplo n.º 7
0
def insert(args):
    """
    %prog insert candidates.bed gaps.bed chrs.fasta unplaced.fasta

    Insert scaffolds into assembly.
    """
    from jcvi.formats.agp import mask, bed
    from jcvi.formats.sizes import agp

    # The docstring above is also the usage text given to OptionParser.
    p = OptionParser(insert.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    candidates, gapsbed, chrfasta, unplacedfasta = args
    refinedbed = refine([candidates, gapsbed])
    sizes = Sizes(unplacedfasta).mapping
    cbed = Bed(candidates)
    corder = cbed.order
    gbed = Bed(gapsbed)
    gorder = gbed.order

    gpbed = Bed()
    gappositions = {}  # (chr, start, end) => gapid

    gap_to_scf = defaultdict(list)  # gapid => [(scaffold, strand), ...]
    seen = set()
    # The refined BED is read inside a context manager so that the handle is
    # released even when a row fails to parse.
    with open(refinedbed) as fp:
        for row in fp:
            atoms = row.split()
            if len(atoms) <= 6:
                continue
            # NOTE(review): a row with 7-9 fields passes the guard above but
            # raises IndexError on atoms[9]; confirm the refined BED always
            # has at least 10 columns.
            unplaced = atoms[3]
            strand = atoms[5]
            gapid = atoms[9]
            if gapid not in seen:
                # Register each gap (record + position) only once.
                seen.add(gapid)
                _, gb = gorder[gapid]
                gpbed.append(gb)
                gappositions[(gb.seqid, gb.start, gb.end)] = gapid
            gap_to_scf[gapid].append((unplaced, strand))

    gpbedfile = "candidate.gaps.bed"
    gpbed.print_to_file(gpbedfile, sorted=True)

    # chromosome FASTA -> AGP -> AGP masked with the candidate gaps -> BED
    agpfile = agp([chrfasta])
    maskedagpfile = mask([agpfile, gpbedfile])
    maskedbedfile = maskedagpfile.rsplit(".", 1)[0] + ".bed"
    bed([maskedagpfile, "--outfile={0}".format(maskedbedfile)])

    mbed = Bed(maskedbedfile)
    finalbed = Bed()
    for b in mbed:
        sid = b.seqid
        key = (sid, b.start, b.end)
        if key not in gappositions:
            # Not a candidate gap: copy the record through unchanged.
            finalbed.add("{0}\n".format(b))
            continue

        gapid = gappositions[key]
        scfs = gap_to_scf[gapid]

        # For scaffolds placed in the same gap, sort according to positions
        scfs.sort(key=lambda x: corder[x[0]][1].start + corder[x[0]][1].end)
        for scf, strand in scfs:
            size = sizes[scf]
            finalbed.add("\t".join(str(x) for x in
                                   (scf, 0, size, sid, 1000, strand)))

    finalbedfile = "final.bed"
    finalbed.print_to_file(finalbedfile)

    # Clean-up
    toclean = [gpbedfile, agpfile, maskedagpfile, maskedbedfile]
    FileShredder(toclean)
Exemplo n.º 8
0
def extract(args):
    """Print the sequences of the requested locations from a FASTA database.

    Attributes read from ``args``:

    * ``db``: path to a FASTA file, or a name that is resolved to
      ``$genome/data/<db>/10_genome.fna``.
    * ``loc``: either an existing file or a comma-separated list of location
      strings. A file is read as one sequence id per line when ``list`` is
      set, otherwise it is loaded as a BED file. A location string is either
      ``id:beg-end`` (``.`` or ``..`` also separate the coordinates, and the
      numbers may contain commas) or a bare ``id`` meaning the whole sequence.
    * ``list``: treat the ``loc`` file as a list of ids; also makes the bare
      sequence id the output id.
    * ``padding``: pad intervals that run past the sequence bounds with ``N``.
    * ``tsv``: print tab-separated ``id, beg, end, seq`` instead of FASTA.

    Output goes to standard output. Intervals whose sequence id is not in the
    database are reported and skipped.
    """
    import re
    from jcvi.formats.bed import Bed

    if op.isfile(args.db):
        db = Fasta(args.db)
    else:
        f_db = "%s/data/%s/10_genome.fna" % (os.environ["genome"], args.db)
        assert op.isfile(f_db), "cannot find %s" % args.db
        db = Fasta(f_db)

    # Raw strings: the patterns are unchanged, but the backslash sequences
    # are no longer parsed as (invalid) string escapes.
    reg1 = re.compile(r"^([\w\-]+)\:([\d,]+)(\-|\.{1,2})([\d,]+)$")
    reg2 = re.compile(r"^([\w\-]+)$")

    def add_whole_sequence(target, sid):
        # Bare ids select the full sequence; ids missing from db are skipped
        # without a message.
        if sid in db:
            target.add("%s\t%d\t%d\n" % (sid, 0, len(db[sid])))

    bed = Bed()
    if op.isfile(args.loc):
        if args.list:
            fho = must_open(args.loc, 'r')
            try:
                for line in fho:
                    add_whole_sequence(bed, line.strip())
            finally:
                # Make sure the id list is closed even if a lookup fails.
                fho.close()
        else:
            bed = Bed(args.loc, sorted=False)
    else:
        for loc in args.loc.split(","):
            res = reg1.match(loc)
            if res:
                sid, beg, end = res.group(1), res.group(2), res.group(4)
                beg = int(beg.replace(",", ""))
                end = int(end.replace(",", ""))
                # The location string is written out with start - 1.
                bed.add("%s\t%d\t%d\n" % (sid, beg - 1, end))
                continue
            res = reg2.match(loc)
            if res:
                add_whole_sequence(bed, res.group(1))
            else:
                logging.error("%s: unknown locstr => skipped", loc)

    for b in bed:
        # NOTE(review): the arithmetic below treats b.start as 1-based
        # inclusive - confirm against the Bed implementation.
        sid, beg, end = b.seqid, b.start, b.end
        oid = sid if args.list else f"{sid}-{beg}-{end}"
        if b.accn:
            oid = b.accn
        if sid not in db:
            print("%s not in db => skipped" % sid)
            continue

        seqlen = len(db[sid])
        size = end - beg + 1

        # Clip the interval to [1, seqlen], counting in bp_pad what is
        # clipped; the padding branch below uses it to restore the length.
        bp_pad = 0
        if beg < 1:
            bp_pad += 1 - beg
            beg = 1
        if beg > seqlen:
            bp_pad = 1
            beg = seqlen
        if end > seqlen:
            bp_pad += end - seqlen
            end = seqlen
        seq = db[sid][beg - 1:end].seq

        if args.padding:
            if bp_pad > 0:
                if end - beg + 1 < 30:
                    # Fewer than 30 bases remain after clipping: emit Ns only.
                    seq = "N" * size
                else:
                    seq += "N" * bp_pad
            assert len(seq) == size, "error in seq size: %s:%d-%d %d" % (
                sid, beg, end, bp_pad)

        if args.tsv:
            print("\t".join([sid, str(beg), str(end), seq]))
        else:
            rcd = SeqRecord(Seq(seq), id=oid, description='')
            SeqIO.write([rcd], sys.stdout, 'fasta')