예제 #1
0
def liftover(args):
    """
    %prog liftover agpfile bedfile

    Given coordinates in components, convert to the coordinates in chromosomes.
    """
    p = OptionParser(liftover.__doc__)
    p.add_option("--prefix",
                 default=False,
                 action="store_true",
                 help="Prepend prefix to accn names [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile).order
    bed = Bed(bedfile)
    newbed = Bed()
    for b in bed:
        component = b.seqid
        if component not in agp:
            newbed.append(b)
            continue

        i, a = agp[component]

        assert a.component_beg < a.component_end
        arange = a.component_beg, a.component_end
        assert b.start < b.end
        brange = b.start, b.end

        st = range_intersect(arange, brange)
        if not st:
            continue
        start, end = st
        assert start <= end

        if a.orientation == '-':
            d = a.object_end + a.component_beg
            s, t = d - end, d - start
        else:
            d = a.object_beg - a.component_beg
            s, t = d + start, d + end

        name = b.accn.replace(" ", "_")
        if opts.prefix:
            name = component + "_" + name
        bline = "\t".join(str(x) for x in (a.object, s - 1, t, name))
        newbed.append(BedLine(bline))

    newbed.print_to_file(sorted=True)
예제 #2
0
파일: agp.py 프로젝트: bennyyu/jcvi
def liftover(args):
    """
    %prog liftover agpfile bedfile

    Given coordinates in components, convert to the coordinates in chromosomes.
    """
    p = OptionParser(liftover.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
                 help="Prepend prefix to accn names [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile).order
    bed = Bed(bedfile)
    newbed = Bed()
    for b in bed:
        component = b.seqid
        if component not in agp:
            newbed.append(b)
            continue

        i, a = agp[component]

        assert a.component_beg < a.component_end
        arange = a.component_beg, a.component_end
        assert b.start < b.end
        brange = b.start, b.end

        st = range_intersect(arange, brange)
        if not st:
            continue
        start, end = st
        assert start <= end

        if a.orientation == '-':
            d = a.object_end + a.component_beg
            s, t = d - end, d - start
        else:
            d = a.object_beg - a.component_beg
            s, t = d + start, d + end

        name = b.accn.replace(" ", "_")
        if opts.prefix:
            name = component + "_" + name
        bline = "\t".join(str(x) for x in (a.object, s - 1, t, name))
        newbed.append(BedLine(bline))

    newbed.sort(key=newbed.nullkey)
    newbed.print_to_file()
예제 #3
0
파일: bed.py 프로젝트: radaniba/jcvi
def refine(args):
    """
    %prog refine bedfile1 bedfile2 refinedbed

    Refine bed file using a second bed file. The final bed is keeping all the
    intervals in bedfile1, but refined by bedfile2 whenever they have
    intersection.
    """
    p = OptionParser(refine.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    abedfile, bbedfile, refinedbed = args
    fw = open(refinedbed, "w")
    intersected = refined = 0
    for a, b in intersectBed_wao(abedfile, bbedfile):
        if b is None:
            print >> fw, a
            continue

        intersected += 1
        aspan_before = a.span
        arange = (a.start, a.end)
        brange = (b.start, b.end)
        irange = range_intersect(arange, brange)
        a.start, a.end = irange
        aspan_after = a.span
        if aspan_before > aspan_after:
            refined += 1
        print >> fw, a

    fw.close()
    print >> sys.stderr, "Total intersected: {0}".format(intersected)
    print >> sys.stderr, "Total refined: {0}".format(refined)
    summary([abedfile])
    summary([refinedbed])
예제 #4
0
파일: bed.py 프로젝트: yangjl/jcvi
def refine(args):
    """
    %prog refine bedfile1 bedfile2 refinedbed

    Refine bed file using a second bed file. The final bed is keeping all the
    intervals in bedfile1, but refined by bedfile2 whenever they have
    intersection.
    """
    p = OptionParser(refine.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    abedfile, bbedfile, refinedbed = args
    fw = open(refinedbed, "w")
    intersected = refined = 0
    for a, b in intersectBed_wao(abedfile, bbedfile):
        if b is None:
            print >> fw, a
            continue

        intersected += 1
        aspan_before = a.span
        arange = (a.start, a.end)
        brange = (b.start, b.end)
        irange = range_intersect(arange, brange)
        a.start, a.end = irange
        aspan_after = a.span
        if aspan_before > aspan_after:
            refined += 1
        print >> fw, a

    fw.close()
    print >> sys.stderr, "Total intersected: {0}".format(intersected)
    print >> sys.stderr, "Total refined: {0}".format(refined)
    summary([abedfile])
    summary([refinedbed])
예제 #5
0
def connect(args):
    """
    %prog connect assembly.fasta read_mapping.blast

    Connect contigs using long reads.
    """
    p = OptionParser(connect.__doc__)
    p.add_option(
        "--clip",
        default=2000,
        type="int",
        help="Only consider end of contigs",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, blastfile = args
    clip = opts.clip

    sizes = Sizes(fastafile).mapping
    blast = Blast(blastfile)
    blasts = []
    for b in blast:
        seqid = b.subject
        size = sizes[seqid]
        start, end = b.sstart, b.sstop
        cstart, cend = min(size, clip), max(0, size - clip)
        if start > cstart and end < cend:
            continue
        blasts.append(b)

    key = lambda x: x.query
    blasts.sort(key=key)
    g = BiGraph()
    for query, bb in groupby(blasts, key=key):
        bb = sorted(bb, key=lambda x: x.qstart)
        nsubjects = len(set(x.subject for x in bb))
        if nsubjects == 1:
            continue
        print("\n".join(str(x) for x in bb))
        for a, b in pairwise(bb):
            astart, astop = a.qstart, a.qstop
            bstart, bstop = b.qstart, b.qstop
            if a.subject == b.subject:
                continue

            arange = astart, astop
            brange = bstart, bstop
            ov = range_intersect(arange, brange)
            alen = astop - astart + 1
            blen = bstop - bstart + 1
            if ov:
                ostart, ostop = ov
                ov = ostop - ostart + 1

            print(ov, alen, blen)
            if ov and (ov > alen / 2 or ov > blen / 2):
                print("Too much overlap ({0})".format(ov))
                continue

            asub = a.subject
            bsub = b.subject
            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            g.add_edge(asub, bsub, atag, btag)

    graph_to_agp(g, blastfile, fastafile, verbose=False)
예제 #6
0
파일: agp.py 프로젝트: bennyyu/jcvi
def mask(args):
    """
    %prog mask agpfile bedfile

    Mask given ranges in componets to gaps.
    """
    p = OptionParser(mask.__doc__)
    p.add_option("--split", default=False, action="store_true",
                 help="Split object and create new names [default: %default]")
    p.add_option("--log", default=False, action="store_true",
                 help="Write verbose logs to .masklog file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    # agp lines to replace original ones, keyed by the component
    agp_fixes = defaultdict(list)

    newagpfile = agpfile.replace(".agp", ".masked.agp")
    logfile = bedfile.replace(".bed", ".masklog")
    fw = open(newagpfile, "w")
    if opts.log:
        fwlog = open(logfile, "w")

    for component, intervals in bed.sub_beds():
        if opts.log:
            print >> fwlog, "\n".join(str(x) for x in intervals)
        i, a = simple_agp[component]
        object = a.object
        component_span = a.component_span
        orientation = a.orientation
        if opts.log:
            print >> fwlog, a

        assert a.component_beg, a.component_end
        arange = a.component_beg, a.component_end

        # Make sure `ivs` contain DISJOINT ranges, and located within `arange`
        ivs = []
        for i in intervals:
            iv = range_intersect(arange, (i.start, i.end))
            if iv is not None:
                ivs.append(iv)

        # Sort the ends of `ivs` as well as the arange
        arange = a.component_beg - 1, a.component_end + 1
        endpoints = sorted(flatten(ivs + [arange]))
        # reverse if component on negative strand
        if orientation == '-':
            endpoints.reverse()

        sum_of_spans = 0
        # assign complements as sequence components
        for i, (a, b) in enumerate(pairwise(endpoints)):
            if orientation == '-':
                a, b = b, a
            if orientation not in ('+', '-'):
                orientation = '+'

            oid = object + "_{0}".format(i / 2) if opts.split else object
            aline = [oid, 0, 0, 0]
            if i % 2 == 0:
                cspan = b - a - 1
                aline += ['D', component, a + 1, b - 1, orientation]
                is_gap = False
            else:
                cspan = b - a + 1
                aline += ["N", cspan, "fragment", "yes"]
                is_gap = True
            if cspan <= 0:
                continue

            sum_of_spans += cspan
            aline = "\t".join(str(x) for x in aline)
            if not (opts.split and is_gap):
                agp_fixes[component].append(aline)

            if opts.log:
                print >> fwlog, aline

        assert component_span == sum_of_spans
        if opts.log:
            print >> fwlog

    # Finally write the masked agp
    for a in agp:
        if not a.is_gap and a.component_id in agp_fixes:
            print >> fw, "\n".join(agp_fixes[a.component_id])
        else:
            print >> fw, a

    fw.close()
    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile
예제 #7
0
파일: scaffold.py 프로젝트: rrane/jcvi
def connect(args):
    """
    %prog connect assembly.fasta read_mapping.blast

    Connect contigs using long reads.
    """
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.blast import Blast
    from jcvi.utils.iter import pairwise
    from jcvi.utils.range import range_intersect
    from jcvi.algorithms.graph import BiGraph, BiEdge
    from jcvi.assembly.syntenypath import graph_to_agp

    p = OptionParser(connect.__doc__)
    p.add_option("--clip", default=2000, type="int",
            help="Only consider end of contigs [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, blastfile = args
    clip = opts.clip

    sizes = Sizes(fastafile).mapping
    blast = Blast(blastfile)
    blasts = []
    for b in blast:
        seqid = b.subject
        size = sizes[seqid]
        start, end = b.sstart, b.sstop
        cstart, cend = min(size, clip), max(0, size - clip)
        if start > cstart and end < cend:
            continue
        blasts.append(b)

    key = lambda x: x.query
    blasts.sort(key=key)
    g = BiGraph()
    for query, bb in groupby(blasts, key=key):
        bb = sorted(bb, key=lambda x: x.qstart)
        nsubjects = len(set(x.subject for x in bb))
        if nsubjects == 1:
            continue
        print "\n".join(str(x) for x in bb)
        for a, b in pairwise(bb):
            astart, astop = a.qstart, a.qstop
            bstart, bstop = b.qstart, b.qstop
            if a.subject == b.subject:
                continue

            arange = astart, astop
            brange = bstart, bstop
            ov = range_intersect(arange, brange)
            alen = astop - astart + 1
            blen = bstop - bstart + 1
            if ov:
                ostart, ostop = ov
                ov = ostop - ostart + 1

            print ov, alen, blen
            if ov and (ov > alen / 2 or ov > blen / 2):
                print "Too much overlap ({0})".format(ov)
                continue

            asub = a.subject
            bsub = b.subject
            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            e = BiEdge(asub, bsub, atag, btag)
            g.add_edge(e)
            print "=" * 5, e

    graph_to_agp(g, blastfile, fastafile, verbose=False)
예제 #8
0
def mask(args):
    """
    %prog mask agpfile bedfile

    Mask given ranges in components to gaps.
    """
    p = OptionParser(mask.__doc__)
    p.add_option("--split",
                 default=False,
                 action="store_true",
                 help="Split object and create new names [default: %default]")
    p.add_option(
        "--log",
        default=False,
        action="store_true",
        help="Write verbose logs to .masklog file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    # agp lines to replace original ones, keyed by the component
    agp_fixes = defaultdict(list)

    newagpfile = agpfile.replace(".agp", ".masked.agp")
    logfile = bedfile.replace(".bed", ".masklog")
    fw = open(newagpfile, "w")
    if opts.log:
        fwlog = open(logfile, "w")

    for component, intervals in bed.sub_beds():
        if opts.log:
            print >> fwlog, "\n".join(str(x) for x in intervals)
        i, a = simple_agp[component]
        object = a.object
        component_span = a.component_span
        orientation = a.orientation
        if opts.log:
            print >> fwlog, a

        assert a.component_beg, a.component_end
        arange = a.component_beg, a.component_end

        # Make sure `ivs` contain DISJOINT ranges, and located within `arange`
        ivs = []
        for i in intervals:
            iv = range_intersect(arange, (i.start, i.end))
            if iv is not None:
                ivs.append(iv)

        # Sort the ends of `ivs` as well as the arange
        arange = a.component_beg - 1, a.component_end + 1
        endpoints = sorted(flatten(ivs + [arange]))
        # reverse if component on negative strand
        if orientation == '-':
            endpoints.reverse()

        sum_of_spans = 0
        # assign complements as sequence components
        for i, (a, b) in enumerate(pairwise(endpoints)):
            if orientation == '-':
                a, b = b, a
            if orientation not in ('+', '-'):
                orientation = '+'

            oid = object + "_{0}".format(i / 2) if opts.split else object
            aline = [oid, 0, 0, 0]
            if i % 2 == 0:
                cspan = b - a - 1
                aline += ['D', component, a + 1, b - 1, orientation]
                is_gap = False
            else:
                cspan = b - a + 1
                aline += ["N", cspan, "fragment", "yes"]
                is_gap = True
            if cspan <= 0:
                continue

            sum_of_spans += cspan
            aline = "\t".join(str(x) for x in aline)
            if not (opts.split and is_gap):
                agp_fixes[component].append(aline)

            if opts.log:
                print >> fwlog, aline

        #assert component_span == sum_of_spans
        if opts.log:
            print >> fwlog

    # Finally write the masked agp
    for a in agp:
        if not a.is_gap and a.component_id in agp_fixes:
            print >> fw, "\n".join(agp_fixes[a.component_id])
        else:
            print >> fw, a

    fw.close()
    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile
예제 #9
0
def test_range_intersect(a, b, expected):
    from jcvi.utils.range import range_intersect

    assert range_intersect(a, b) == expected