예제 #1
0
def dedup(args):
    """
    %prog dedup scaffolds.fasta

    Remove redundant contigs with CD-HIT. This is run prior to
    assembly.sspace.embed().
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing output and is kept verbatim.
    #
    # Flow: split the scaffolds at gaps, run the CD-HIT wrappers on the split
    # FASTA (only when outputs are out of date), keep the AGP components whose
    # IDs appear in the resulting .ids file, write the filtered AGP, then tidy
    # it and rebuild a FASTA from it.
    #
    # args: command-line style list holding exactly one scaffolds FASTA path.
    # Returns: path of the deduplicated FASTA (`<prefix>.dedup.fasta`).
    # Exits via sys.exit() after printing help when the argument count is wrong.
    from jcvi.formats.fasta import gaps
    from jcvi.apps.cdhit import deduplicate, ids

    p = OptionParser(dedup.__doc__)
    p.set_align(pctid=GoodPct)
    p.set_mingap(default=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    scaffolds, = args
    # gaps() hands back three paths; the middle one is not needed here.
    splitfile, _oagpfile, cagpfile = gaps(
        [scaffolds, "--split", "--mingap={0}".format(opts.mingap)])

    dd = splitfile + ".cdhit"
    clstrfile = dd + ".clstr"
    idsfile = dd + ".ids"
    if need_update(splitfile, clstrfile):
        deduplicate([splitfile, "--pctid={0}".format(opts.pctid)])
    if need_update(clstrfile, idsfile):
        ids([clstrfile])

    agp = AGP(cagpfile)
    # The last whitespace-separated field of every line is a component ID to
    # keep. Blank lines are skipped so they cannot raise IndexError, and the
    # handle is closed deterministically.
    with open(idsfile) as fp:
        reps = set(row.split()[-1] for row in fp if row.strip())

    pf = scaffolds.rsplit(".", 1)[0]
    dedupagp = pf + ".dedup.agp"

    ndropped = ndroppedbases = 0
    # `with` guarantees the AGP is flushed and closed even if a line fails.
    with open(dedupagp, "w") as fw:
        for a in agp:
            if not a.is_gap and a.component_id not in reps:
                span = a.component_span
                logging.debug("Drop component {0} ({1})".
                              format(a.component_id, span))
                ndropped += 1
                ndroppedbases += span
                continue
            # Gap lines and retained components are written unchanged.
            fw.write(str(a) + "\n")

    logging.debug("Dropped components: {0}, Dropped bases: {1}".
                  format(ndropped, ndroppedbases))
    logging.debug("Deduplicated file written to `{0}`.".format(dedupagp))

    tidyagp = tidy([dedupagp, splitfile])
    dedupfasta = pf + ".dedup.fasta"
    build([tidyagp, dd, dedupfasta])

    return dedupfasta
예제 #2
0
파일: goldenpath.py 프로젝트: Hensonmw/jcvi
def dedup(args):
    """
    %prog dedup scaffolds.fasta

    Remove redundant contigs with CD-HIT. This is run prior to
    assembly.sspace.embed().
    """
    # NOTE: the docstring above doubles as the OptionParser usage text, so it
    # is kept exactly as written.
    #
    # args: command-line style list with a single scaffolds FASTA path.
    # Returns: path of the deduplicated FASTA (`<prefix>.dedup.fasta`).
    # Exits via sys.exit() after printing help when the argument count is wrong.
    from jcvi.formats.fasta import gaps
    from jcvi.apps.cdhit import deduplicate, ids

    p = OptionParser(dedup.__doc__)
    p.set_align(pctid=GoodPct)
    p.set_mingap(default=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    scaffolds, = args
    # Split scaffolds at gaps; only the split FASTA and the third returned
    # AGP path are used below.
    splitfile, _oagpfile, cagpfile = gaps(
        [scaffolds, "--split", "--mingap={0}".format(opts.mingap)])

    # CD-HIT outputs are derived from the split FASTA name and are only
    # regenerated when need_update() reports them stale.
    dd = splitfile + ".cdhit"
    clstrfile = dd + ".clstr"
    idsfile = dd + ".ids"
    if need_update(splitfile, clstrfile):
        deduplicate([splitfile, "--pctid={0}".format(opts.pctid)])
    if need_update(clstrfile, idsfile):
        ids([clstrfile])

    agp = AGP(cagpfile)
    # Collect the IDs to keep (last field on each line). The file is closed
    # by the context manager and empty lines are ignored instead of crashing.
    with open(idsfile) as fp:
        reps = set(row.split()[-1] for row in fp if row.strip())

    pf = scaffolds.rsplit(".", 1)[0]
    dedupagp = pf + ".dedup.agp"

    ndropped = ndroppedbases = 0
    # The output handle is closed even when writing a line raises.
    with open(dedupagp, "w") as fw:
        for a in agp:
            if not a.is_gap and a.component_id not in reps:
                span = a.component_span
                logging.debug("Drop component {0} ({1})".
                              format(a.component_id, span))
                ndropped += 1
                ndroppedbases += span
                continue
            # Everything else (gaps and kept components) passes through.
            fw.write(str(a) + "\n")

    logging.debug("Dropped components: {0}, Dropped bases: {1}".
                  format(ndropped, ndroppedbases))
    logging.debug("Deduplicated file written to `{0}`.".format(dedupagp))

    tidyagp = tidy([dedupagp, splitfile])
    dedupfasta = pf + ".dedup.fasta"
    build([tidyagp, dd, dedupfasta])

    return dedupfasta
예제 #3
0
파일: goldenpath.py 프로젝트: Hensonmw/jcvi
def anneal(args):
    """
    %prog anneal agpfile contigs.fasta

    Merge adjacent overlapping contigs and make new AGP file.

    By default it will also anneal lines like these together (unless --nozipshreds):
    scaffold4       1       1608    1       W       ca-bacs.5638.frag11.22000-23608 1       1608    -
    scaffold4       1609    1771    2       N       163     scaffold        yes     paired-ends
    scaffold4       1772    3771    3       W       ca-bacs.5638.frag10.20000-22000 1       2000    -

    These are most likely shreds, which we look for based on names.
    """
    # NOTE: the docstring is passed to OptionParser as the usage text, so it
    # is runtime output and is kept verbatim.
    #
    # args: command-line style list: [agpfile, contigs.fasta] plus options.
    # Returns: the name of the rebuilt FASTA ("annealed.fasta").
    parser = OptionParser(anneal.__doc__)
    parser.set_align(pctid=GoodPct, hitlen=GoodOverlap)
    parser.add_option("--hang", default=GoodOverhang, type="int",
                      help="Maximum overhang length [default: %default]")
    parser.set_outdir(outdir="outdir")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    agpfile, contigs = args
    outdir = opts.outdir
    # The contigs are only split (faSplit byname) when outdir is first made.
    if not op.exists(outdir):
        mkdir(outdir)
        sh("faSplit byname {0} {1}/".format(contigs, outdir))

    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))

    agp = AGP(agpfile)
    # Reuse an existing blast file when present, otherwise populate it.
    blastfile = agpfile.replace(".agp", ".blast")
    if not op.exists(blastfile):
        populate_blastfile(blastfile, agp, outdir, opts)

    assert op.exists(blastfile)
    logging.debug("File `{0}` found. Start loading.".format(blastfile))
    blast = BlastSlow(blastfile).to_dict()

    newagp = deepcopy(agp)
    clrstore = {}
    for a, b, qreverse in agp.iter_paired_components():
        aid, bid = a.component_id, b.component_id
        pair = (aid, bid)

        # Use the preloaded hit if there is one; otherwise compute the
        # overlap on demand and skip the pair when nothing is found.
        if pair not in blast:
            found = overlap(get_overlap_opts(aid, bid, qreverse, outdir, opts))
            if not found:
                continue
            blastline = found.blastline
        else:
            blastline = blast[pair]

        ovl = Overlap(blastline, a.component_span, b.component_span,
                      cutoff, qreverse=qreverse)

        # One CLR object per component, created the first time it is seen.
        for cid, agpline in ((aid, a), (bid, b)):
            if cid not in clrstore:
                clrstore[cid] = CLR.from_agpline(agpline)

        aclr = clrstore[aid]
        bclr = clrstore[bid]

        ovl.print_graphic()
        if ovl.anneal(aclr, bclr):
            newagp.delete_between(aid, bid, verbose=True)

        if ovl.otype == 2:  # b ~ a
            ovl = ovl.swapped
            ovl.print_graphic()
            if ovl.anneal(bclr, aclr):
                newagp.switch_between(bid, aid, verbose=True)
                newagp.delete_between(bid, aid, verbose=True)

    logging.debug("A total of {0} components with modified CLR.".
                  format(len(clrstore)))

    # Components whose CLR is no longer valid are turned into gaps.
    for cid, clr in clrstore.items():
        if not clr.is_valid:
            print >> sys.stderr, "Remove {0}".format(clr)
            newagp.convert_to_gap(cid, verbose=True)

    # Update all ranges that has modified clr
    for agpline in newagp:
        if agpline.is_gap or agpline.component_id not in clrstore:
            continue
        clr = clrstore[agpline.component_id]
        agpline.component_beg = clr.start
        agpline.component_end = clr.end

    annealedagp = "annealed.agp"
    annealedfasta = "annealed.fasta"
    newagp.print_to_file(annealedagp)
    tidyagp = tidy([annealedagp, contigs])

    build([tidyagp, contigs, annealedfasta])
    return annealedfasta
예제 #4
0
파일: sspace.py 프로젝트: arvin580/jcvi
def embed(args):
    """
    %prog embed evidencefile scaffolds.fasta contigs.fasta

    Use SSPACE evidencefile to scaffold contigs into existing scaffold
    structure, as in `scaffolds.fasta`. Contigs.fasta were used by SSPACE
    directly to scaffold.

    Rules:
    1. Only update existing structure by embedding contigs small enough to fit.
    2. Promote singleton contigs only if they are big (>= min_length).
    """
    # NOTE: the docstring is passed to OptionParser as the usage text, so it
    # is runtime output and is kept verbatim.
    #
    # args: command-line style list: [evidencefile, scaffolds, contigs].
    # Writes "embedded.agp" and runs tidy() on it; returns nothing.
    parser = OptionParser(embed.__doc__)
    parser.set_mingap(default=10)
    parser.add_option("--min_length", default=200, type="int",
                      help="Minimum length to consider [default: %default]")
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    evidencefile, scaffolds, contigs = args
    min_length = opts.min_length
    mingap_flag = "--mingap={0}".format(opts.mingap)
    _splitfasta, _oagp, cagp = gaps([scaffolds, "--split", mingap_flag])

    agp = AGP(cagp)
    ref_graph = agp.graph

    evidence = EvidenceFile(evidencefile, contigs)
    sizes = evidence.sz
    patch_graph = evidence.graph

    logging.debug("Reference graph: {0}".format(ref_graph))
    logging.debug("Patch graph: {0}".format(patch_graph))

    newagp = deepcopy(agp)

    seen = set()
    deleted = set()
    for agpline in agp:
        if agpline.is_gap:
            continue

        name = agpline.component_id
        scaffold = agpline.object
        if name in deleted:
            print >> sys.stderr, "* Skip {0}, already embedded".format(name)
            continue

        seen.add(name)

        target_name, tag = get_target(ref_graph, name)
        path = patch_graph.get_path(name, target_name, tag=tag)
        if path:
            path_size = sum(sizes[node.v] for node, _ in path)
            # Heuristic, the patch must not be too long
            if path_size > min_length and len(path) > 3:
                path = None
        else:
            path_size = None

        if not path:
            print >> sys.stderr, name, target_name, path, path_size, NO_UPDATE
            continue

        # Reject the patch if it walks back onto a component already handled;
        # only the first such component is reported.
        revisited = [node.v for node, _ in path if node.v in seen]
        if revisited:
            print >> sys.stderr, \
                "* Does not allow backward patch on {0}".format(revisited[0])
            continue

        # Build the path plus the ends
        path.appendleft((patch_graph.get_node(name), tag))
        if tag == ">":
            path.reverse()
            status = INSERT_BEFORE
        elif target_name is None:
            status = INSERT_AFTER
        else:
            path.append((patch_graph.get_node(target_name), tag))
            status = INSERT_BETWEEN

        print >> sys.stderr, name, target_name, path, path_size, status

        # Trim the ends off from the constructed AGPLines
        lines = path_to_agp(patch_graph, path, scaffold, sizes, status)
        if status == INSERT_BEFORE:
            touched = newagp.insert_lines(name, lines[:-1],
                                          delete=True, verbose=True)
        elif status == INSERT_AFTER:
            touched = newagp.insert_lines(name, lines[1:], after=True,
                                          delete=True, verbose=True)
        else:
            touched = newagp.update_between(name, target_name, lines[1:-1],
                                            delete=True, verbose=True)
        deleted |= touched
        seen |= touched

    # Recruit big singleton contigs: anything not yet seen and at least
    # min_length long is appended as its own line.
    for ctg, size in sizes.items():
        if ctg in seen or size < min_length:
            continue
        newagp.append(AGPLine.cline(ctg, ctg, sizes, "?"))

    # Write a new AGP file
    newagpfile = "embedded.agp"
    newagp.print_to_file(newagpfile, index=True)
    tidy([newagpfile, contigs])
예제 #5
0
def anneal(args):
    """
    %prog anneal agpfile contigs.fasta

    Merge adjacent overlapping contigs and make new AGP file.

    By default it will also anneal lines like these together (unless --nozipshreds):
    scaffold4       1       1608    1       W       ca-bacs.5638.frag11.22000-23608 1       1608    -
    scaffold4       1609    1771    2       N       163     scaffold        yes     paired-ends
    scaffold4       1772    3771    3       W       ca-bacs.5638.frag10.20000-22000 1       2000    -

    These are most likely shreds, which we look for based on names.
    """
    # NOTE: the docstring is the OptionParser usage text (runtime output), so
    # it is reproduced exactly.
    #
    # args: command-line style list: [agpfile, contigs.fasta] plus options.
    # Returns: the name of the rebuilt FASTA ("annealed.fasta").
    parser = OptionParser(anneal.__doc__)
    parser.set_align(pctid=GoodPct, hitlen=GoodOverlap)
    parser.add_option("--hang", default=GoodOverhang, type="int",
                      help="Maximum overhang length [default: %default]")
    parser.set_outdir(outdir="outdir")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    agpfile, contigs = args
    outdir = opts.outdir
    # Splitting the contigs happens only when outdir does not exist yet.
    if not op.exists(outdir):
        mkdir(outdir)
        sh("faSplit byname {0} {1}/".format(contigs, outdir))

    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))

    agp = AGP(agpfile)
    # An existing blast file is reused; otherwise it is populated first.
    blastfile = agpfile.replace(".agp", ".blast")
    if not op.exists(blastfile):
        populate_blastfile(blastfile, agp, outdir, opts)

    assert op.exists(blastfile)
    logging.debug("File `{0}` found. Start loading.".format(blastfile))
    blast = BlastSlow(blastfile).to_dict()

    newagp = deepcopy(agp)
    clrstore = {}
    for a, b, qreverse in agp.iter_paired_components():
        aid, bid = a.component_id, b.component_id
        pair = (aid, bid)

        # Prefer the preloaded hit; fall back to an on-demand overlap and
        # move on to the next pair when that finds nothing.
        if pair not in blast:
            found = overlap(get_overlap_opts(aid, bid, qreverse, outdir, opts))
            if not found:
                continue
            blastline = found.blastline
        else:
            blastline = blast[pair]

        ovl = Overlap(blastline,
                      a.component_span,
                      b.component_span,
                      cutoff,
                      qreverse=qreverse)

        # Lazily create one CLR object per component ID.
        for cid, agpline in ((aid, a), (bid, b)):
            if cid not in clrstore:
                clrstore[cid] = CLR.from_agpline(agpline)

        aclr = clrstore[aid]
        bclr = clrstore[bid]

        ovl.print_graphic()
        if ovl.anneal(aclr, bclr):
            newagp.delete_between(aid, bid, verbose=True)

        if ovl.otype == 2:  # b ~ a
            ovl = ovl.swapped
            ovl.print_graphic()
            if ovl.anneal(bclr, aclr):
                newagp.switch_between(bid, aid, verbose=True)
                newagp.delete_between(bid, aid, verbose=True)

    logging.debug("A total of {0} components with modified CLR.".
                  format(len(clrstore)))

    # Any component left with an invalid CLR is converted to a gap.
    for cid, clr in clrstore.items():
        if not clr.is_valid:
            print >> sys.stderr, "Remove {0}".format(clr)
            newagp.convert_to_gap(cid, verbose=True)

    # Update all ranges that has modified clr
    for agpline in newagp:
        if agpline.is_gap or agpline.component_id not in clrstore:
            continue
        clr = clrstore[agpline.component_id]
        agpline.component_beg = clr.start
        agpline.component_end = clr.end

    annealedagp = "annealed.agp"
    annealedfasta = "annealed.fasta"
    newagp.print_to_file(annealedagp)
    tidyagp = tidy([annealedagp, contigs])

    build([tidyagp, contigs, annealedfasta])
    return annealedfasta
예제 #6
0
파일: gaps.py 프로젝트: zhaotao1987/jcvi
def annotate(args):
    """
    %prog annotate agpfile gaps.linkage.bed assembly.fasta

    Annotate AGP file with linkage info of `paired-end` or `map`.
    File `gaps.linkage.bed` is generated by assembly.gaps.estimate().
    """
    # NOTE: the docstring is passed to OptionParser as the usage text, so it
    # is runtime output and is kept verbatim.
    #
    # args: command-line style list: [agpfile, linkagebed, assemblyfasta].
    # Writes `<agp prefix>.linkage.agp` and `<fasta prefix>.contigs.bed`,
    # then runs tidy() on the new AGP; returns nothing.
    from jcvi.formats.agp import AGP, bed, tidy

    p = OptionParser(annotate.__doc__)
    # type="int" matters: without it a user-supplied --minsize is a string,
    # and comparing an int span against a string does not do a numeric
    # comparison (on Python 2 it is always "smaller").
    p.add_option("--minsize", default=200, type="int",
                 help="Smallest component size [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    agpfile, linkagebed, assemblyfasta = args
    # Intervals from the linkage BED whose score is 0, keyed the same way as
    # the gap lookup below: (name, start, end).
    spannedgaps = set()
    for b in Bed(linkagebed):
        if int(b.score) == 0:
            spannedgaps.add((b.accn, b.start, b.end))

    agp = AGP(agpfile)
    newagpfile = agpfile.rsplit(".", 1)[0] + ".linkage.agp"
    contig_id = 0
    minsize = opts.minsize
    # `with` ensures the AGP is flushed and closed before bed()/tidy() read it,
    # even if a line fails part-way through.
    with open(newagpfile, "w") as newagp:
        for a in agp:
            if not a.is_gap:
                cs = a.component_span
                if cs < minsize:
                    # Components below the size threshold are rewritten as
                    # scaffold gaps of the same length.
                    a.is_gap = True
                    a.component_type = "N"
                    a.gap_length = cs
                    a.gap_type = "scaffold"
                    a.linkage = "yes"
                    a.linkage_evidence = []
                else:
                    # Remaining components are renamed sequentially and
                    # re-based to cover 1..span.
                    contig_id += 1
                    a.component_id = "contig{0:04d}".format(contig_id)
                    a.component_beg = 1
                    a.component_end = cs
                    a.component_type = "W"

                newagp.write(str(a) + "\n")
                continue

            gapinfo = (a.object, a.object_beg, a.object_end)

            # Gaps of length exactly 100 that are absent from the score-0 set
            # get type "U" and "map" evidence; all others are "paired-ends".
            if a.gap_length == 100 and gapinfo not in spannedgaps:
                a.component_type = "U"
                tag = "map"
            else:
                tag = "paired-ends"

            a.linkage_evidence.append(tag)
            newagp.write(str(a) + "\n")

    logging.debug("Annotated AGP written to `{0}`.".format(newagpfile))

    contigbed = assemblyfasta.rsplit(".", 1)[0] + ".contigs.bed"
    bedfile = bed([newagpfile, "--nogaps", "--outfile=" + contigbed])

    contigfasta = fastaFromBed(bedfile, assemblyfasta, name=True, stranded=True)

    tidy([newagpfile, contigfasta])
예제 #7
0
파일: sspace.py 프로젝트: biologyguy/jcvi
def embed(args):
    """
    %prog embed evidencefile scaffolds.fasta contigs.fasta

    Use SSPACE evidencefile to scaffold contigs into existing scaffold
    structure, as in `scaffolds.fasta`. Contigs.fasta were used by SSPACE
    directly to scaffold.

    Rules:
    1. Only update existing structure by embedding contigs small enough to fit.
    2. Promote singleton contigs only if they are big (>= min_length).
    """
    # NOTE: the docstring is the OptionParser usage text (runtime output), so
    # it is reproduced exactly.
    #
    # args: command-line style list: [evidencefile, scaffolds, contigs].
    # Writes "embedded.agp" and runs tidy() on it; returns nothing.
    parser = OptionParser(embed.__doc__)
    parser.set_mingap(default=10)
    parser.add_option("--min_length", default=200, type="int",
                      help="Minimum length to consider [default: %default]")
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    evidencefile, scaffolds, contigs = args
    min_length = opts.min_length
    mingap_flag = "--mingap={0}".format(opts.mingap)
    _splitfasta, _oagp, cagp = gaps([scaffolds, "--split", mingap_flag])

    agp = AGP(cagp)
    ref_graph = agp.graph

    evidence = EvidenceFile(evidencefile, contigs)
    sizes = evidence.sz
    patch_graph = evidence.graph

    logging.debug("Reference graph: {0}".format(ref_graph))
    logging.debug("Patch graph: {0}".format(patch_graph))

    newagp = deepcopy(agp)

    seen = set()
    deleted = set()
    for agpline in agp:
        if agpline.is_gap:
            continue

        name = agpline.component_id
        scaffold = agpline.object
        if name in deleted:
            print >> sys.stderr, "* Skip {0}, already embedded".format(name)
            continue

        seen.add(name)

        target_name, tag = get_target(ref_graph, name)
        path = patch_graph.get_path(name, target_name, tag=tag)
        if path:
            path_size = sum(sizes[node.v] for node, _ in path)
            # Heuristic, the patch must not be too long
            if path_size > min_length and len(path) > 3:
                path = None
        else:
            path_size = None

        if not path:
            print >> sys.stderr, name, target_name, path, path_size, NO_UPDATE
            continue

        # A patch may not run back over a component that was already
        # processed; report the first offender and skip this line.
        revisited = [node.v for node, _ in path if node.v in seen]
        if revisited:
            print >> sys.stderr, \
                "* Does not allow backward patch on {0}".format(revisited[0])
            continue

        # Build the path plus the ends
        path.appendleft((patch_graph.get_node(name), tag))
        if tag == ">":
            path.reverse()
            status = INSERT_BEFORE
        elif target_name is None:
            status = INSERT_AFTER
        else:
            path.append((patch_graph.get_node(target_name), tag))
            status = INSERT_BETWEEN

        print >> sys.stderr, name, target_name, path, path_size, status

        # Trim the ends off from the constructed AGPLines
        lines = path_to_agp(patch_graph, path, scaffold, sizes, status)
        if status == INSERT_BEFORE:
            touched = newagp.insert_lines(name, lines[:-1],
                                          delete=True, verbose=True)
        elif status == INSERT_AFTER:
            touched = newagp.insert_lines(name, lines[1:], after=True,
                                          delete=True, verbose=True)
        else:
            touched = newagp.update_between(name, target_name, lines[1:-1],
                                            delete=True, verbose=True)
        deleted |= touched
        seen |= touched

    # Recruit big singleton contigs: unseen contigs of at least min_length
    # are appended as their own lines.
    for ctg, size in sizes.items():
        if ctg in seen or size < min_length:
            continue
        newagp.append(AGPLine.cline(ctg, ctg, sizes, "?"))

    # Write a new AGP file
    newagpfile = "embedded.agp"
    newagp.print_to_file(newagpfile, index=True)
    tidy([newagpfile, contigs])