def mask(args):
    """
    %prog mask fastafile

    Mask the contaminants. By default, this will compare against UniVec_Core and
    Ecoli.fasta. Merge the contaminant results, and use `maskFastaFromBed`. Can
    perform FASTA tidy if requested.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it must stay in the "%prog <action> <args>" form.
    p = OptionParser(mask.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    assert op.exists(fastafile)
    # "foo.fasta" -> "foo.masked.fasta"
    outfastafile = fastafile.rsplit(".", 1)[0] + ".masked.fasta"

    # First screen: blast() with no --db, i.e. whatever database it defaults
    # to (UniVec_Core according to the usage text above). The return value is
    # used below as the path of a BED file of hits.
    vecbedfile = blast([fastafile])

    # Second screen: E. coli K-12 MG1655 genome, fetched as Ecoli.fasta.
    ecoliurl = (
        "ftp://ftp.ncbi.nih.gov/genomes/Bacteria/"
        "Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.fna"
    )
    ecolifile = download(ecoliurl, filename="Ecoli.fasta")
    assert op.exists(ecolifile)
    ecolibedfile = blast([fastafile, "--db={0}".format(ecolifile)])

    # Concatenating two BED files gives unsorted records, while bedtools merge
    # expects input sorted by sequence then start, so sort before merging.
    # `-c 4 -o distinct` replaces the `-nms` flag that current bedtools
    # releases no longer accept; it reports the distinct names of the merged
    # features. Hits within 100 bp of each other are merged (-d 100).
    cmd = "cat {0} {1}".format(vecbedfile, ecolibedfile)
    cmd += " | sort -k1,1 -k2,2n"
    cmd += " | mergeBed -c 4 -o distinct -d 100 -i stdin"
    cmd += " | maskFastaFromBed -fi {0} -bed stdin -fo {1}".format(
        fastafile, outfastafile
    )
    sh(cmd)

    tidy([outfastafile])
def mask(args):
    """
    %prog mask fastafile

    Mask the contaminants. By default, this will compare against UniVec_Core and
    Ecoli.fasta. Merge the contaminant results, and use `maskFastaFromBed`. Can
    perform FASTA tidy if requested.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    parser = OptionParser(mask.__doc__)
    parser.add_option(
        "--db",
        default=ECOLI_URL,
        help="Contaminant db other than Ecoli K12, will download if file starts with http://, https://, or ftp://",
    )
    opts, args = parser.parse_args(args)

    # Exactly one positional argument (the FASTA file) is required.
    if len(args) != 1:
        sys.exit(not parser.print_help())

    fastafile = args[0]
    contaminant_db = opts.db
    assert op.exists(fastafile)

    # "foo.fasta" -> "foo.masked.fasta"
    stem = fastafile.rsplit(".", 1)[0]
    masked_fasta = stem + ".masked.fasta"

    # Screen against blast()'s default database first.
    vector_bed = blast([fastafile])

    # Then screen against the contaminant database: a remote location is
    # fetched to Ecoli.fasta, anything else is used as given.
    if is_internet_file(contaminant_db):
        contaminant_fasta = download(
            contaminant_db, filename="Ecoli.fasta", handle_gzip=True
        )
    else:
        contaminant_fasta = contaminant_db
    assert op.exists(contaminant_fasta)
    contaminant_bed = blast([fastafile, "--db={0}".format(contaminant_fasta)])

    # Shell pipeline: concatenate both hit sets, sort, merge hits within
    # 100 bp of each other, then mask the merged intervals in the input FASTA.
    stages = [
        "cat {0} {1}".format(vector_bed, contaminant_bed),
        "sort -k1,1 -k2,2n",
        "mergeBed -c 4 -o distinct -d 100 -i stdin",
        "maskFastaFromBed -fi {0} -bed stdin -fo {1}".format(
            fastafile, masked_fasta
        ),
    ]
    sh(" | ".join(stages))

    return tidy([masked_fasta])
def scaffold(args):
    """
    %prog scaffold ctgfasta agpfile

    Build scaffolds based on ordering in the AGP file.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # Outputs, all derived from the contig FASTA name minus its extension:
    #   <pf>.phases   one "<bucket>\t<phase>" line per bucket
    #   <pf>.new.agp  AGP written via order_to_agp() for every bucket
    #   final.fasta   built from the new AGP, then passed to tidy()
    from jcvi.formats.agp import bed, order_to_agp, build
    from jcvi.formats.bed import Bed

    p = OptionParser(scaffold.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
            help="Keep IDs with same prefix together [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, agpfile = args
    sizes = Sizes(ctgfasta).mapping

    pf = ctgfasta.rsplit(".", 1)[0]
    phasefile = pf + ".phases"
    newagpfile = pf + ".new.agp"

    # bucket name -> list of partial orders, each a list of (accn, strand)
    scaffoldbuckets = defaultdict(list)

    bedfile = bed([agpfile, "--nogaps", "--outfile=tmp"])
    bb = Bed(bedfile)
    for s, partialorder in bb.sub_beds():
        name = partialorder[0].accn
        # With --prefix, group by the accession minus its last "_" suffix;
        # otherwise group by the key yielded by sub_beds().
        bname = name.rsplit("_", 1)[0] if opts.prefix else s
        scaffoldbuckets[bname].append([(b.accn, b.strand) for b in partialorder])

    # Now the buckets contain a mixture of singletons and partially resolved
    # scaffolds. Print the scaffolds first then remaining singletons.
    #
    # Both outputs are managed by `with` so they are flushed and closed even
    # on error, and are closed before build() reads the new AGP file.
    with open(phasefile, "w") as fwphase, open(newagpfile, "w") as fwagp:
        for bname, scaffolds in sorted(scaffoldbuckets.items()):
            ctgorder = []
            singletons = set()
            nscaffolds = 0
            for scaf in sorted(scaffolds):
                ctgorder.extend(scaf)
                if len(scaf) == 1:
                    singletons.add(scaf[0][0])
                else:
                    # Only multi-contig entries count as scaffolds. Counting
                    # every entry made nscaffolds >= 1 always, so the phase 3
                    # branch below could never be taken.
                    nscaffolds += 1
            nsingletons = len(singletons)

            # Phase 3: a lone contig; phase 2: exactly one ordered scaffold;
            # phase 1: any other mixture.
            if nsingletons == 1 and nscaffolds == 0:
                phase = 3
            elif nsingletons == 0 and nscaffolds == 1:
                phase = 2
            else:
                phase = 1

            msg = "{0}: Scaffolds={1} Singletons={2} Phase={3}".\
                format(bname, nscaffolds, nsingletons, phase)
            # .write() instead of `print >> stream` so this runs on Python 3
            # as well as Python 2.
            sys.stderr.write(msg + "\n")
            fwphase.write("\t".join((bname, str(phase))) + "\n")

            order_to_agp(bname, ctgorder, sizes, fwagp)

    # The intermediate BED file is no longer needed once the AGP is written.
    os.remove(bedfile)

    fastafile = "final.fasta"
    build([newagpfile, ctgfasta, fastafile])
    tidy([fastafile])