def dedup(args): """ %prog dedup scaffolds.fasta Remove redundant contigs with CD-HIT. This is run prior to assembly.sspace.embed(). """ from jcvi.formats.fasta import gaps from jcvi.apps.cdhit import deduplicate, ids p = OptionParser(dedup.__doc__) p.set_align(pctid=GoodPct) p.set_mingap(default=10) opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) scaffolds, = args mingap = opts.mingap splitfile, oagpfile, cagpfile = gaps( [scaffolds, "--split", "--mingap={0}".format(mingap)]) dd = splitfile + ".cdhit" clstrfile = dd + ".clstr" idsfile = dd + ".ids" if need_update(splitfile, clstrfile): deduplicate([splitfile, "--pctid={0}".format(opts.pctid)]) if need_update(clstrfile, idsfile): ids([clstrfile]) agp = AGP(cagpfile) reps = set(x.split()[-1] for x in open(idsfile)) pf = scaffolds.rsplit(".", 1)[0] dedupagp = pf + ".dedup.agp" fw = open(dedupagp, "w") ndropped = ndroppedbases = 0 for a in agp: if not a.is_gap and a.component_id not in reps: span = a.component_span logging.debug("Drop component {0} ({1})".\ format(a.component_id, span)) ndropped += 1 ndroppedbases += span continue print >> fw, a fw.close() logging.debug("Dropped components: {0}, Dropped bases: {1}".\ format(ndropped, ndroppedbases)) logging.debug("Deduplicated file written to `{0}`.".format(dedupagp)) tidyagp = tidy([dedupagp, splitfile]) dedupfasta = pf + ".dedup.fasta" build([tidyagp, dd, dedupfasta]) return dedupfasta
def dedup(args): """ %prog dedup scaffolds.fasta Remove redundant contigs with CD-HIT. This is run prior to assembly.sspace.embed(). """ from jcvi.formats.fasta import gaps from jcvi.apps.cdhit import deduplicate, ids p = OptionParser(dedup.__doc__) p.set_align(pctid=GoodPct) p.set_mingap(default=10) opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) scaffolds, = args mingap = opts.mingap splitfile, oagpfile, cagpfile = gaps([scaffolds, "--split", "--mingap={0}".format(mingap)]) dd = splitfile + ".cdhit" clstrfile = dd + ".clstr" idsfile = dd + ".ids" if need_update(splitfile, clstrfile): deduplicate([splitfile, "--pctid={0}".format(opts.pctid)]) if need_update(clstrfile, idsfile): ids([clstrfile]) agp = AGP(cagpfile) reps = set(x.split()[-1] for x in open(idsfile)) pf = scaffolds.rsplit(".", 1)[0] dedupagp = pf + ".dedup.agp" fw = open(dedupagp, "w") ndropped = ndroppedbases = 0 for a in agp: if not a.is_gap and a.component_id not in reps: span = a.component_span logging.debug("Drop component {0} ({1})".\ format(a.component_id, span)) ndropped += 1 ndroppedbases += span continue print >> fw, a fw.close() logging.debug("Dropped components: {0}, Dropped bases: {1}".\ format(ndropped, ndroppedbases)) logging.debug("Deduplicated file written to `{0}`.".format(dedupagp)) tidyagp = tidy([dedupagp, splitfile]) dedupfasta = pf + ".dedup.fasta" build([tidyagp, dd, dedupfasta]) return dedupfasta
def embed(args): """ %prog embed evidencefile scaffolds.fasta contigs.fasta Use SSPACE evidencefile to scaffold contigs into existing scaffold structure, as in `scaffolds.fasta`. Contigs.fasta were used by SSPACE directly to scaffold. Rules: 1. Only update existing structure by embedding contigs small enough to fit. 2. Promote singleton contigs only if they are big (>= min_length). """ p = OptionParser(embed.__doc__) p.set_mingap(default=10) p.add_option("--min_length", default=200, type="int", help="Minimum length to consider [default: %default]") opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) evidencefile, scaffolds, contigs = args min_length = opts.min_length splitfasta, oagp, cagp = gaps([scaffolds, "--split", "--mingap={0}".format(opts.mingap)]) agp = AGP(cagp) p = agp.graph ef = EvidenceFile(evidencefile, contigs) sizes = ef.sz q = ef.graph logging.debug("Reference graph: {0}".format(p)) logging.debug("Patch graph: {0}".format(q)) newagp = deepcopy(agp) seen = set() deleted = set() for a in agp: if a.is_gap: continue name = a.component_id object = a.object if name in deleted: print >>sys.stderr, "* Skip {0}, already embedded".format(name) continue seen.add(name) target_name, tag = get_target(p, name) path = q.get_path(name, target_name, tag=tag) path_size = sum([sizes[x.v] for x, t in path]) if path else None status = NO_UPDATE # Heuristic, the patch must not be too long if path and path_size > min_length and len(path) > 3: path = None if not path: print >>sys.stderr, name, target_name, path, path_size, status continue backward = False for x, t in path: if x.v in seen: print >>sys.stderr, "* Does not allow backward" " patch on {0}".format(x.v) backward = True break if backward: continue # Build the path plus the ends vv = q.get_node(name) path.appendleft((vv, tag)) if tag == ">": path.reverse() status = INSERT_BEFORE elif target_name is None: status = INSERT_AFTER else: target = q.get_node(target_name) path.append((target, tag)) status = INSERT_BETWEEN print >>sys.stderr, name, target_name, path, path_size, status # Trim the ends off from the constructed AGPLines lines = path_to_agp(q, path, object, sizes, status) if status == INSERT_BEFORE: lines = lines[:-1] td = newagp.insert_lines(name, lines, delete=True, verbose=True) elif status == INSERT_AFTER: lines = lines[1:] td = newagp.insert_lines(name, lines, after=True, delete=True, verbose=True) else: lines = lines[1:-1] td = newagp.update_between(name, target_name, lines, delete=True, verbose=True) deleted |= td seen |= td # Recruite big singleton contigs CUTOFF = opts.min_length for ctg, size in sizes.items(): if ctg in seen: continue if size < CUTOFF: continue newagp.append(AGPLine.cline(ctg, ctg, sizes, "?")) # Write a new AGP file newagpfile = "embedded.agp" newagp.print_to_file(newagpfile, index=True) tidy([newagpfile, contigs])
def embed(args): """ %prog embed evidencefile scaffolds.fasta contigs.fasta Use SSPACE evidencefile to scaffold contigs into existing scaffold structure, as in `scaffolds.fasta`. Contigs.fasta were used by SSPACE directly to scaffold. Rules: 1. Only update existing structure by embedding contigs small enough to fit. 2. Promote singleton contigs only if they are big (>= min_length). """ p = OptionParser(embed.__doc__) p.set_mingap(default=10) p.add_option("--min_length", default=200, type="int", help="Minimum length to consider [default: %default]") opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) evidencefile, scaffolds, contigs = args min_length = opts.min_length splitfasta, oagp, cagp = gaps([scaffolds, "--split", "--mingap={0}".format(opts.mingap)]) agp = AGP(cagp) p = agp.graph ef = EvidenceFile(evidencefile, contigs) sizes = ef.sz q = ef.graph logging.debug("Reference graph: {0}".format(p)) logging.debug("Patch graph: {0}".format(q)) newagp = deepcopy(agp) seen = set() deleted = set() for a in agp: if a.is_gap: continue name = a.component_id object = a.object if name in deleted: print >> sys.stderr, "* Skip {0}, already embedded".format(name) continue seen.add(name) target_name, tag = get_target(p, name) path = q.get_path(name, target_name, tag=tag) path_size = sum([sizes[x.v] for x, t in path]) if path else None status = NO_UPDATE # Heuristic, the patch must not be too long if path and path_size > min_length and len(path) > 3: path = None if not path: print >> sys.stderr, name, target_name, path, path_size, status continue backward = False for x, t in path: if x.v in seen: print >> sys.stderr, "* Does not allow backward" \ " patch on {0}".format(x.v) backward = True break if backward: continue # Build the path plus the ends vv = q.get_node(name) path.appendleft((vv, tag)) if tag == ">": path.reverse() status = INSERT_BEFORE elif target_name is None: status = INSERT_AFTER else: target = q.get_node(target_name) path.append((target, tag)) status = INSERT_BETWEEN print >> sys.stderr, name, target_name, path, path_size, status # Trim the ends off from the constructed AGPLines lines = path_to_agp(q, path, object, sizes, status) if status == INSERT_BEFORE: lines = lines[:-1] td = newagp.insert_lines(name, lines, \ delete=True, verbose=True) elif status == INSERT_AFTER: lines = lines[1:] td = newagp.insert_lines(name, lines, after=True, \ delete=True, verbose=True) else: lines = lines[1:-1] td = newagp.update_between(name, target_name, lines, \ delete=True, verbose=True) deleted |= td seen |= td # Recruite big singleton contigs CUTOFF = opts.min_length for ctg, size in sizes.items(): if ctg in seen: continue if size < CUTOFF: continue newagp.append(AGPLine.cline(ctg, ctg, sizes, "?")) # Write a new AGP file newagpfile = "embedded.agp" newagp.print_to_file(newagpfile, index=True) tidy([newagpfile, contigs])