def dedup(args):
    """
    %prog dedup scaffolds.fasta

    Remove redundant contigs with CD-HIT. This is run prior to
    assembly.sspace.embed().
    """
    # NOTE: the docstring above is passed to OptionParser and is therefore
    # runtime help text; developer documentation lives in comments instead.
    #
    # args: list of command-line tokens; exactly one positional argument
    #       (the scaffolds FASTA path) is accepted, plus the options
    #       registered by set_align() / set_mingap().
    # Returns the path of the deduplicated FASTA (`<prefix>.dedup.fasta`).
    # Exits via sys.exit() after printing help when the positional argument
    # count is not one.
    #
    # NOTE(review): an identical `dedup` appears to be defined again later in
    # this file; if both are at module scope the later one wins.
    from jcvi.formats.fasta import gaps
    from jcvi.apps.cdhit import deduplicate, ids

    p = OptionParser(dedup.__doc__)
    p.set_align(pctid=GoodPct)
    p.set_mingap(default=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    scaffolds, = args
    mingap = opts.mingap

    # gaps() hands back three file names; the middle one is not needed here.
    splitfile, _oagpfile, cagpfile = gaps(
        [scaffolds, "--split", "--mingap={0}".format(mingap)])

    dd = splitfile + ".cdhit"
    clstrfile = dd + ".clstr"
    idsfile = dd + ".ids"

    # need_update() presumably compares file freshness so that finished
    # steps are not re-run -- TODO confirm against its definition.
    if need_update(splitfile, clstrfile):
        deduplicate([splitfile, "--pctid={0}".format(opts.pctid)])
    if need_update(clstrfile, idsfile):
        ids([clstrfile])

    agp = AGP(cagpfile)

    # The last whitespace-separated token on each line is the id to keep.
    # Use a context manager so the handle is closed promptly, and skip blank
    # lines, which would otherwise raise IndexError on `[-1]`.
    with open(idsfile) as fp:
        reps = set(row.split()[-1] for row in fp if row.strip())

    pf = scaffolds.rsplit(".", 1)[0]
    dedupagp = pf + ".dedup.agp"

    ndropped = ndroppedbases = 0
    # `with` guarantees the output file is flushed and closed even if the
    # loop raises part-way through.
    with open(dedupagp, "w") as fw:
        for a in agp:
            # Gap lines are always kept; components are kept only when their
            # id is among the ids read from the ids file.
            if not a.is_gap and a.component_id not in reps:
                span = a.component_span
                logging.debug("Drop component {0} ({1})".
                              format(a.component_id, span))
                ndropped += 1
                ndroppedbases += span
                continue
            # Same output as the Python 2 statement `print >> fw, a`, but
            # also valid on Python 3.
            fw.write(str(a) + "\n")

    logging.debug("Dropped components: {0}, Dropped bases: {1}".
                  format(ndropped, ndroppedbases))
    logging.debug("Deduplicated file written to `{0}`.".format(dedupagp))

    tidyagp = tidy([dedupagp, splitfile])
    dedupfasta = pf + ".dedup.fasta"
    build([tidyagp, dd, dedupfasta])

    return dedupfasta
def dedup(args):
    """
    %prog dedup scaffolds.fasta

    Remove redundant contigs with CD-HIT. This is run prior to
    assembly.sspace.embed().
    """
    # NOTE: the docstring above is passed to OptionParser and is therefore
    # runtime help text; developer documentation lives in comments instead.
    #
    # args: list of command-line tokens; exactly one positional argument
    #       (the scaffolds FASTA path) is accepted, plus the options
    #       registered by set_align() / set_mingap().
    # Returns the path of the deduplicated FASTA (`<prefix>.dedup.fasta`).
    # Exits via sys.exit() after printing help when the positional argument
    # count is not one.
    #
    # NOTE(review): an identical `dedup` appears to be defined earlier in
    # this file; if both are at module scope this one shadows it.
    from jcvi.formats.fasta import gaps
    from jcvi.apps.cdhit import deduplicate, ids

    p = OptionParser(dedup.__doc__)
    p.set_align(pctid=GoodPct)
    p.set_mingap(default=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    scaffolds, = args
    mingap = opts.mingap

    # gaps() hands back three file names; the middle one is not needed here.
    splitfile, _oagpfile, cagpfile = gaps(
        [scaffolds, "--split", "--mingap={0}".format(mingap)])

    dd = splitfile + ".cdhit"
    clstrfile = dd + ".clstr"
    idsfile = dd + ".ids"

    # need_update() presumably compares file freshness so that finished
    # steps are not re-run -- TODO confirm against its definition.
    if need_update(splitfile, clstrfile):
        deduplicate([splitfile, "--pctid={0}".format(opts.pctid)])
    if need_update(clstrfile, idsfile):
        ids([clstrfile])

    agp = AGP(cagpfile)

    # The last whitespace-separated token on each line is the id to keep.
    # Use a context manager so the handle is closed promptly, and skip blank
    # lines, which would otherwise raise IndexError on `[-1]`.
    with open(idsfile) as fp:
        reps = set(row.split()[-1] for row in fp if row.strip())

    pf = scaffolds.rsplit(".", 1)[0]
    dedupagp = pf + ".dedup.agp"

    ndropped = ndroppedbases = 0
    # `with` guarantees the output file is flushed and closed even if the
    # loop raises part-way through.
    with open(dedupagp, "w") as fw:
        for a in agp:
            # Gap lines are always kept; components are kept only when their
            # id is among the ids read from the ids file.
            if not a.is_gap and a.component_id not in reps:
                span = a.component_span
                logging.debug("Drop component {0} ({1})".
                              format(a.component_id, span))
                ndropped += 1
                ndroppedbases += span
                continue
            # Same output as the Python 2 statement `print >> fw, a`, but
            # also valid on Python 3.
            fw.write(str(a) + "\n")

    logging.debug("Dropped components: {0}, Dropped bases: {1}".
                  format(ndropped, ndroppedbases))
    logging.debug("Deduplicated file written to `{0}`.".format(dedupagp))

    tidyagp = tidy([dedupagp, splitfile])
    dedupfasta = pf + ".dedup.fasta"
    build([tidyagp, dd, dedupfasta])

    return dedupfasta