def __init__(self, data: TSPDataModel, work_dir=Work_dir, clean=True, verbose=False, precision=0, seed=666):
    """Run concorde on a TSP instance.

    Args:
        data (TSPDataModel): TSP instance with edge weights
        work_dir ([type], optional): Path to the work dir. Defaults to Work_dir.
        clean (bool, optional): Clean up intermediate results. Defaults to True.
        verbose (bool, optional): Show verbose messages. Defaults to False.
        precision (int, optional): Float precision of distance. Defaults to 0.
        seed (int, optional): Random seed. Defaults to 666.
    """
    # Record configuration on the instance.
    self.data = data
    self.work_dir = work_dir
    self.clean = clean
    self.verbose = verbose

    # Write the TSPLIB-format instance into the work dir, solve it with
    # concorde, and keep only the parsed tour.
    mkdir(work_dir)
    instance_file = op.join(work_dir, "data.tsp")
    self.print_to_tsplib(instance_file, precision=precision)
    _, solution_file = self.run_concorde(instance_file, seed=seed)
    self.tour = self.parse_output(solution_file)

    if clean:
        # Remove the work dir, plus residual files concorde drops in cwd.
        shutil.rmtree(work_dir)
        FileShredder(["data.sol", "data.res", "Odata.res"], verbose=False)
def mergeclean(args):
    """
    %prog mergeclean [*.bam|*.count]

    Clean redundant merged bam/count files. This usually happens after running
    formats.sam.merge() several times.
    """
    from itertools import groupby
    from jcvi.formats.base import FileShredder

    p = OptionParser(mergeclean.__doc__)
    p.set_sep(sep="_", help="Separator to group per prefix")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    files = sorted(args)  # sorted so groupby() sees each prefix contiguously
    sep = opts.sep
    # Group files by the text before the first separator; keep only the
    # most recently modified file in each group.
    key = lambda x: x.split(sep)[0]
    mtime = lambda x: os.stat(x).st_mtime
    for pf, fs in groupby(files, key=key):
        fs = list(fs)
        if len(fs) == 1:
            # Nothing redundant in a singleton group.
            continue
        newest_f = max(fs, key=mtime)
        print >> sys.stderr, "|".join(fs), "=>", newest_f
        # Shred everything in the group except the newest file.
        fs.remove(newest_f)
        FileShredder(fs)
def download(url, filename=None, debug=True, cookies=None):
    """Download `url` to a local file and return the local filename.

    If `filename` is not given, it is derived from the URL path (falling back
    to "index.html" for bare URLs). An existing file is never re-downloaded;
    a partial download left by an error or Ctrl-C is shredded so a retry
    starts clean.
    """
    from urlparse import urlsplit
    from subprocess import CalledProcessError
    from jcvi.formats.base import FileShredder

    scheme, netloc, path, query, fragment = urlsplit(url)
    filename = filename or op.basename(path)
    filename = filename.strip()

    if not filename:
        # URL has no usable basename (e.g. ends in "/").
        filename = "index.html"

    if op.exists(filename):
        if debug:
            # NOTE(review): logged at error level although this is an
            # informational skip, not a failure.
            msg = "File `{0}` exists. Download skipped.".format(filename)
            logging.error(msg)
    else:
        from jcvi.utils.ez_setup import get_best_downloader

        downloader = get_best_downloader()
        try:
            downloader(url, filename, cookies=cookies)
        except (CalledProcessError, KeyboardInterrupt) as e:
            # Remove the partial file so a later retry is not skipped above.
            print >> sys.stderr, e
            FileShredder([filename])

    return filename
def align(args):
    """
    %prog align database.fasta read1.fq [read2.fq]

    Wrapper for three modes of BWA - mem (default), aln, bwasw (long reads).
    """
    valid_modes = ("bwasw", "aln", "mem")
    p = OptionParser(align.__doc__)
    p.add_option("--mode", default="mem", choices=valid_modes, help="BWA mode [default: %default]")
    p.add_option("--readtype", choices=("pacbio", "pbread"), help="Read type in bwa-mem")
    p.set_cutoff(cutoff=800)
    p.set_sam_options()

    opts, args = p.parse_args(args)
    mode = opts.mode
    nargs = len(args)
    if nargs not in (2, 3):
        sys.exit(not p.print_help())

    # Pick the command builder for the (mode, single/paired) combination.
    tag = "bwa-{0}: ".format(mode)
    if nargs == 2:
        tag += "Single-end alignment"
        runner = {"bwasw": bwasw, "aln": samse}.get(mode, mem)
    else:
        assert mode != "bwasw", "Cannot use --bwasw with paired-end mode"
        tag += "Paired-end alignment"
        runner = sampe if mode == "aln" else mem

    logging.debug(tag)
    args[0] = get_abs_path(args[0])
    cmd, samfile = runner(args, opts)
    if cmd:
        cmd = output_bam(cmd, samfile)

    bam = opts.bam
    unmapped = opts.unmapped

    sh(cmd)
    if unmapped:
        # Extract unmapped reads, then discard the intermediate alignment.
        dbfile, readfile = args[:2]
        mopts = [samfile, "--unmapped"]
        if not bam:
            mopts += ["--sam"]
        mapped(mopts)
        FileShredder([samfile])

    return samfile, None
def refine(args):
    """
    %prog refine breakpoints.bed gaps.bed

    Find gaps within or near breakpoint region.
    """
    p = OptionParser(refine.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    breakpointsbed, gapsbed = args
    # Column count of the breakpoints file; used as the groupby key width
    # after intersectBed appends the gap columns.
    ncols = len(open(breakpointsbed).next().split())
    logging.debug("File {0} contains {1} columns.".format(
                  breakpointsbed, ncols))
    # -wao: write both A and B plus overlap length (0 and "." when no hit).
    cmd = "intersectBed -wao -a {0} -b {1}".format(breakpointsbed, gapsbed)
    pf = "{0}.{1}".format(breakpointsbed.split(".")[0], gapsbed.split(".")[0])
    ingapsbed = pf + ".bed"
    sh(cmd, outfile=ingapsbed)

    fp = open(ingapsbed)
    data = [x.split() for x in fp]

    nogapsbed = pf + ".nogaps.bed"
    largestgapsbed = pf + ".largestgaps.bed"
    nogapsfw = open(nogapsbed, "w")
    largestgapsfw = open(largestgapsbed, "w")
    # Rows sharing the first `ncols` fields belong to one breakpoint region.
    for b, gaps in groupby(data, key=lambda x: x[:ncols]):
        gaps = list(gaps)
        gap = gaps[0]
        if len(gaps) == 1 and gap[-1] == "0":
            # Region overlaps no gap at all ("." placeholder, overlap 0).
            assert gap[-2] == "."
            print >> nogapsfw, "\t".join(b)
            continue
        # Otherwise keep only the largest overlapping gap for this region.
        gaps = [(int(x[-1]), x) for x in gaps]
        maxgap = max(gaps)[1]
        print >> largestgapsfw, "\t".join(maxgap)
    nogapsfw.close()
    largestgapsfw.close()

    # For regions with no internal gap, fall back to the closest gap.
    closestgapsbed = pf + ".closestgaps.bed"
    # NOTE(review): this handle is never written to or closed here —
    # closestBed output goes through sh(outfile=...) below; looks dead.
    closestgapsfw = open(closestgapsbed, "w")
    cmd = "closestBed -a {0} -b {1} -d".format(nogapsbed, gapsbed)
    sh(cmd, outfile=closestgapsbed)
    refinedbed = pf + ".refined.bed"
    FileMerger([largestgapsbed, closestgapsbed], outfile=refinedbed).merge()

    # Clean-up
    toclean = [nogapsbed, largestgapsbed, closestgapsbed]
    FileShredder(toclean)

    return refinedbed
def _needle(fa, fb, needlefile, a, b, results):
    """Run one pairwise EMBOSS `needle` alignment job.

    Aligns the sequences in files `fa` and `fb`, appends a tab-joined
    "a, b, identity, score" line to the shared `results` list, and removes
    the intermediate files.
    """
    from Bio.Emboss.Applications import NeedleCommandline

    # Build and execute the needle command line.
    cline = NeedleCommandline(
        asequence=fa,
        bsequence=fb,
        gapopen=10,
        gapextend=0.5,
        outfile=needlefile,
    )
    stdout, stderr = cline()

    # Parse identity/score from the report before shredding the inputs.
    header = NeedleHeader(needlefile)
    FileShredder([fa, fb, needlefile], verbose=False)
    results.append("\t".join((a, b, header.identity, header.score)))
def __init__(self, edges, work_dir=Work_dir, clean=True, verbose=False,
             precision=0, seed=666):
    """Run concorde on a TSP instance described by `edges`.

    Args:
        edges: edge weights of the TSP instance, passed to print_to_tsplib()
        work_dir: path to the working directory. Defaults to Work_dir.
        clean (bool): clean up intermediate results. Defaults to True.
        verbose (bool): show verbose messages. Defaults to False.
        precision (int): float precision of distance. Defaults to 0.
        seed (int): random seed. Defaults to 666.
    """
    self.work_dir = work_dir
    self.clean = clean
    self.verbose = verbose

    # Write the TSPLIB input, run the solver, and parse the tour.
    mkdir(work_dir)
    tspfile = op.join(work_dir, "data.tsp")
    self.print_to_tsplib(edges, tspfile, precision=precision)
    retcode, outfile = self.run_concorde(tspfile, seed=seed)
    self.tour = self.parse_output(outfile)

    if clean:
        # Remove the work dir and the residual files concorde leaves in cwd.
        shutil.rmtree(work_dir)
        residual_output = ["data.sol", "data.res", "Odata.res"]
        FileShredder(residual_output, verbose=False)
def write_genes(self, output="gbout", individual=False, pep=True):
    """Write gene features of every record as BED, CDS and PEP files.

    Args:
        output (str): output prefix (combined mode) or directory name
            (individual mode). Defaults to "gbout".
        individual (bool): write one .bed/.cds/.pep trio per record inside
            `output/` instead of one combined trio. Defaults to False.
        pep (bool): keep the protein FASTA output; when False, the written
            .pep file(s) are shredded afterwards. Defaults to True.
    """
    if not individual:
        fwbed = must_open(output + ".bed", "w")
        fwcds = must_open(output + ".cds", "w")
        fwpep = must_open(output + ".pep", "w")

    # Track every pep file written: the original only shredded the LAST
    # per-record pep file in individual mode, leaving the rest behind.
    pepfiles = []
    for recid, rec in self.iteritems():
        if individual:
            mkdir(output)
            fwbed = must_open(op.join(output, recid + ".bed"), "w")
            fwcds = must_open(op.join(output, recid + ".cds"), "w")
            fwpep = must_open(op.join(output, recid + ".pep"), "w")

        pepfiles.append(fwpep.name)
        GenBank.write_genes_bed(rec, fwbed)
        GenBank.write_genes_fasta(rec, fwcds, fwpep)

        if individual:
            # Close per-record handles; the original leaked one trio
            # of open handles per record.
            fwbed.close()
            fwcds.close()
            fwpep.close()

    if not individual:
        fwbed.close()
        fwcds.close()
        fwpep.close()

    if not pep and pepfiles:
        # Shred every pep file written (deduplicated; in combined mode the
        # same name repeats once per record).
        FileShredder(sorted(set(pepfiles)))
def needle(args):
    """
    %prog needle pairs a.pep.fasta b.pep.fasta

    Take protein pairs and needle them.
    """
    from Bio.Emboss.Applications import NeedleCommandline
    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.base import FileShredder

    p = OptionParser(needle.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    pairsfile, apep, bpep = args
    afasta = Fasta(apep)
    bfasta = Fasta(bpep)

    fp = open(pairsfile)
    for row in fp:
        # The same two temp FASTA files are rewritten for every pair.
        fa = open(pairsfile + "_a.fasta", "w")
        fb = open(pairsfile + "_b.fasta", "w")
        a, b = row.split()
        a = afasta[a]
        b = bfasta[b]
        SeqIO.write([a], fa, "fasta")
        SeqIO.write([b], fb, "fasta")
        fa.close()
        fb.close()

        # Align the pair with EMBOSS needle and report identity/score
        # (one tab-separated line per pair on stdout).
        needlefile = pairsfile + "_ab.needle"
        needle_cline = NeedleCommandline(asequence=fa.name,
                                         bsequence=fb.name,
                                         gapopen=10, gapextend=0.5,
                                         outfile=needlefile)
        stdout, stderr = needle_cline()
        print >> sys.stderr, stdout + stderr
        #align = AlignIO.read(needlefile, "emboss")
        nh = NeedleHeader(needlefile)
        print "\t".join((a.id, b.id, nh.identity, nh.score))
        # Clean up the per-pair intermediates.
        FileShredder([fa.name, fb.name, needlefile])
def phytozome(args):
    """
    %prog phytozome species

    Retrieve genomes and annotations from phytozome using Globus API. Available
    species listed below. Use comma to give a list of species to download. For
    example:

    $ %prog phytozome Athaliana,Vvinifera,Osativa,Sbicolor,Slycopersicum

    The downloader will prompt you to enter Phytozome user name and password
    during downloading. Please register for a login at:
    https://phytozome.jgi.doe.gov/pz/portal.html.
    """
    from jcvi.apps.biomart import GlobusXMLParser

    p = OptionParser(phytozome.__doc__)
    p.add_option(
        "--version",
        default="12",
        choices=("9", "10", "11", "12", "12_unrestricted", "13"),
        help="Phytozome version",
    )
    p.add_option(
        "--assembly",
        default=False,
        action="store_true",
        help="Download assembly",
    )
    p.add_option(
        "--format",
        default=False,
        action="store_true",
        help="Format to CDS and BED for synteny inference",
    )
    p.set_downloader()
    opts, args = p.parse_args(args)

    downloader = opts.downloader
    directory_listing = ".phytozome_directory_V{}.xml".format(opts.version)
    # Get directory listing
    base_url = "http://genome.jgi.doe.gov"
    dlist = "{}/ext-api/downloads/get-directory?organism=PhytozomeV{}".format(
        base_url, opts.version
    )

    # Make sure we have a valid cookies
    cookies = get_cookies()
    if cookies is None:
        logging.error("Error fetching cookies ... cleaning up")
        FileShredder([directory_listing])
        sys.exit(1)

    # Proceed to use the cookies and download the species list
    try:
        download(
            dlist,
            filename=directory_listing,
            cookies=cookies,
            downloader=downloader,
        )
        g = GlobusXMLParser(directory_listing)
    # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit;
    # kept as-is to preserve the cleanup-then-exit behavior.
    except:
        logging.error("Error downloading directory listing ... cleaning up")
        FileShredder([directory_listing, cookies])
        sys.exit(1)

    genomes = g.get_genomes()
    valid_species = genomes.keys()
    species_tile = tile(valid_species)
    p.set_usage("\n".join((phytozome.__doc__, species_tile)))

    if len(args) != 1:
        sys.exit(not p.print_help())

    (species,) = args
    if species == "all":
        species = ",".join(valid_species)

    species = species.split(",")
    for s in species:
        res = download_species_phytozome(
            genomes,
            s,
            valid_species,
            base_url,
            cookies,
            assembly=opts.assembly,
            downloader=downloader,
        )
        if not res:
            # FIX: skip to the next species — the original fell through and
            # crashed on res.get() when nothing was downloaded (res is None).
            logging.error("No files downloaded")
            continue
        gff, fa = res.get("gff"), res.get("cds")
        if opts.format:
            format_bed_and_cds(s, gff, fa)
def refine(args):
    """
    %prog refine breakpoints.bed gaps.bed

    Find gaps within or near breakpoint region.

    For breakpoint regions with no gaps, there are two options:
    - Break in the middle of the region
    - Break at the closest gap (--closest)
    """
    p = OptionParser(refine.__doc__)
    p.add_option(
        "--closest",
        default=False,
        action="store_true",
        help="In case of no gaps, use closest",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    breakpointsbed, gapsbed = args
    # FIX: file objects have no .next() method on Python 3 (this function
    # already uses print(..., file=...)); use the next() builtin instead.
    ncols = len(next(open(breakpointsbed)).split())
    logging.debug("File {0} contains {1} columns.".format(breakpointsbed, ncols))
    # -wao: write both A and B plus overlap length (0 and "." when no hit).
    cmd = "intersectBed -wao -a {0} -b {1}".format(breakpointsbed, gapsbed)
    pf = "{0}.{1}".format(breakpointsbed.split(".")[0], gapsbed.split(".")[0])
    ingapsbed = pf + ".bed"
    sh(cmd, outfile=ingapsbed)

    fp = open(ingapsbed)
    data = [x.split() for x in fp]

    nogapsbed = pf + ".nogaps.bed"
    largestgapsbed = pf + ".largestgaps.bed"
    nogapsfw = open(nogapsbed, "w")
    largestgapsfw = open(largestgapsbed, "w")
    # Rows sharing the first `ncols` fields belong to one breakpoint region.
    for b, gaps in groupby(data, key=lambda x: x[:ncols]):
        gaps = list(gaps)
        gap = gaps[0]
        if len(gaps) == 1 and gap[-1] == "0":
            # Region overlaps no gap at all ("." placeholder, overlap 0).
            assert gap[-3] == "."
            print("\t".join(b), file=nogapsfw)
            continue
        # Otherwise keep only the largest overlapping gap for this region.
        gaps = [(int(x[-1]), x) for x in gaps]
        maxgap = max(gaps)[1]
        print("\t".join(maxgap), file=largestgapsfw)
    nogapsfw.close()
    largestgapsfw.close()

    beds = [largestgapsbed]
    toclean = [nogapsbed, largestgapsbed]
    if opts.closest:
        # Break at the gap closest to each gapless region.
        closestgapsbed = pf + ".closestgaps.bed"
        cmd = "closestBed -a {0} -b {1} -d".format(nogapsbed, gapsbed)
        sh(cmd, outfile=closestgapsbed)
        beds += [closestgapsbed]
        toclean += [closestgapsbed]
    else:
        # Break in the middle of each gapless region.
        pointbed = pf + ".point.bed"
        pbed = Bed()
        bed = Bed(nogapsbed)
        for b in bed:
            # FIX: integer division — BED coordinates must be ints; "/"
            # returns a float on Python 3.
            pos = (b.start + b.end) // 2
            b.start, b.end = pos, pos
            pbed.append(b)
        pbed.print_to_file(pointbed)
        beds += [pointbed]
        toclean += [pointbed]

    refinedbed = pf + ".refined.bed"
    FileMerger(beds, outfile=refinedbed).merge()

    # Clean-up
    FileShredder(toclean)

    return refinedbed
def insert(args):
    """
    %prog insert candidates.bed gaps.bed chrs.fasta unplaced.fasta

    Insert scaffolds into assembly.
    """
    from jcvi.formats.agp import mask, bed
    from jcvi.formats.sizes import agp

    p = OptionParser(insert.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    candidates, gapsbed, chrfasta, unplacedfasta = args
    # Pair each candidate breakpoint with a gap via refine().
    refinedbed = refine([candidates, gapsbed])
    sizes = Sizes(unplacedfasta).mapping
    cbed = Bed(candidates)
    corder = cbed.order
    gbed = Bed(gapsbed)
    gorder = gbed.order

    gpbed = Bed()
    gappositions = {}  # (chr, start, end) => gapid
    fp = open(refinedbed)
    gap_to_scf = defaultdict(list)
    seen = set()
    for row in fp:
        atoms = row.split()
        # Rows without paired gap columns are not placements; skip them.
        if len(atoms) <= 6:
            continue
        unplaced = atoms[3]
        strand = atoms[5]
        gapid = atoms[9]
        if gapid not in seen:
            seen.add(gapid)
            gi, gb = gorder[gapid]
            gpbed.append(gb)
            gappositions[(gb.seqid, gb.start, gb.end)] = gapid
        gap_to_scf[gapid].append((unplaced, strand))

    gpbedfile = "candidate.gaps.bed"
    gpbed.print_to_file(gpbedfile, sorted=True)

    # Mask the target gaps out of the chromosome AGP, then convert to BED.
    agpfile = agp([chrfasta])
    maskedagpfile = mask([agpfile, gpbedfile])
    maskedbedfile = maskedagpfile.rsplit(".", 1)[0] + ".bed"
    bed([maskedagpfile, "--outfile={0}".format(maskedbedfile)])

    mbed = Bed(maskedbedfile)
    finalbed = Bed()
    for b in mbed:
        sid = b.seqid
        key = (sid, b.start, b.end)
        if key not in gappositions:
            # Not a masked gap — pass the interval through unchanged.
            finalbed.add("{0}\n".format(b))
            continue

        # Replace the gap with the scaffold(s) assigned to it.
        gapid = gappositions[key]
        scfs = gap_to_scf[gapid]
        # For scaffolds placed in the same gap, sort according to positions
        scfs.sort(key=lambda x: corder[x[0]][1].start + corder[x[0]][1].end)
        for scf, strand in scfs:
            size = sizes[scf]
            finalbed.add("\t".join(str(x) for x in (scf, 0, size, sid, 1000, strand)))

    finalbedfile = "final.bed"
    finalbed.print_to_file(finalbedfile)

    # Clean-up
    toclean = [gpbedfile, agpfile, maskedagpfile, maskedbedfile]
    FileShredder(toclean)
def reindex(args):
    """
    %prog reindex gffile pep.fasta ref.pep.fasta

    Reindex the splice isoforms (mRNA) in input GFF file, preferably
    generated after PASA annotation update

    In the input GFF file, there can be several types of mRNA within a locus:
    * CDS matches reference, UTR extended, inherits reference mRNA ID
    * CDS (slightly) different from reference, inherits reference mRNA ID
    * Novel isoform added by PASA, have IDs like "LOCUS.1.1", "LOCUS.1.2"
    * Multiple mRNA collapsed due to shared structure, have IDs like
      "LOCUS.1-LOCUS.1.1"

    In the case of multiple mRNA which have inherited the same reference mRNA
    ID, break ties by comparing the new protein with the reference protein
    using EMBOSS `needle` to decide which mRNA retains ID and which is
    assigned a new ID.

    All mRNA identifiers should follow the AGI naming conventions.

    When reindexing the isoform identifiers, order mRNA based on:
    * decreasing transcript length
    * decreasing support from multiple input datasets used to run
      pasa.consolidate()
    """
    from jcvi.formats.gff import make_index
    from jcvi.formats.fasta import Fasta
    from jcvi.apps.emboss import needle
    from jcvi.formats.base import FileShredder
    from tempfile import mkstemp

    p = OptionParser(reindex.__doc__)
    p.add_option("--scores", type="str", \
        help="read from existing EMBOSS `needle` scores file")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    gffile, pep, refpep, = args
    gffdb = make_index(gffile)
    reffasta = Fasta(refpep)

    if not opts.scores:
        # Temp file listing (reference mRNA, candidate mRNA) pairs to needle.
        fh, pairsfile = mkstemp(prefix='pairs', suffix=".txt", dir=".")
        fw = must_open(pairsfile, "w")

    # conflict[locus][iso]: mRNA that claim reference isoform `iso`;
    # novel[locus]: mRNA needing a newly assigned isoform index.
    conflict, novel = AutoVivification(), {}
    for gene in gffdb.features_of_type('gene', order_by=('seqid', 'start')):
        geneid = atg_name(gene.id, retval='locus')
        novel[geneid] = []
        updated_mrna, hybrid_mrna = [], []
        for mrna in gffdb.children(gene, featuretype='mRNA',
                                   order_by=('seqid', 'start')):
            # Only consider AGI-style IDs without underscores; "-" in the ID
            # marks mRNA collapsed from several inputs ("hybrid").
            if re.match(atg_name_pat, mrna.id) is not None and "_" not in mrna.id:
                pf, mrnaid = parse_prefix(mrna.id)
                mlen = gffdb.children_bp(mrna, child_featuretype='exon')
                if "-" in mrna.id:
                    hybrid_mrna.append((mrna.id, mrna.start, mlen, len(pf)))
                else:
                    updated_mrna.append((mrna.id, mrna.start, mlen, len(pf)))

        # Sort by start, then decreasing length and decreasing support.
        for mrna in sorted(updated_mrna, key=lambda k: (k[1], -k[2], -k[3])):
            pf, mrnaid = parse_prefix(mrna[0])
            mstart, mlen = mrna[1], mrna[2]

            iso = atg_name(mrnaid, retval='iso')
            newiso = "{0}{1}".format(iso, re.sub(atg_name_pat, "", mrnaid))
            if iso == newiso:
                # Plain reference-style ID: potential conflict on `iso`.
                if iso not in conflict[geneid]:
                    conflict[geneid][iso] = []
                conflict[geneid][iso].append((mrna[0], iso, newiso, \
                    mstart, mlen, len(pf)))
            else:
                # PASA-added suffix (e.g. LOCUS.1.1): a novel isoform.
                novel[geneid].append((mrna[0], None, newiso, \
                    mstart, mlen, len(pf)))

        for mrna in sorted(hybrid_mrna, key=lambda k: (k[1], -k[2], -k[3])):
            pf, mrnaid = parse_prefix(mrna[0])
            mstart, mlen = mrna[1], mrna[2]

            # Decompose a collapsed "A-B" ID into its member isoforms.
            _iso, _newiso = [], []
            for id in sorted(mrnaid.split("-")):
                a = atg_name(id, retval='iso')
                b = "{0}{1}".format(a, re.sub(atg_name_pat, "", id))
                _iso.append(a)
                _newiso.append(b)

            _novel = None
            newiso = "-".join(str(x) for x in set(_newiso))
            for iso, niso in zip(_iso, _newiso):
                if iso == niso:
                    # At least one member keeps a reference-style ID:
                    # register as a conflict on that isoform.
                    if iso not in conflict[geneid]:
                        conflict[geneid][iso] = \
                            [(mrna[0], iso, newiso, mstart, mlen, len(pf))]
                    _novel = None
                    break
                _novel = True

            if _novel is not None:
                novel[geneid].append((mrna[0], None, newiso, \
                    mstart, mlen, len(pf)))

        if not opts.scores:
            # Emit (reference mRNA, candidate) pairs for needle scoring,
            # but only when the reference protein actually exists.
            for isoform in sorted(conflict[geneid]):
                mrnaid = "{0}.{1}".format(geneid, isoform)
                if mrnaid in reffasta.keys():
                    for mrna in conflict[geneid][isoform]:
                        print >> fw, "\t".join(str(x) for x in (mrnaid, mrna[0]))

    scoresfile = None
    if not opts.scores:
        fw.close()
        needle([pairsfile, refpep, pep])
        FileShredder([pairsfile], verbose=False)
        scoresfile = "{0}.scores".format(pairsfile.rsplit(".")[0])
    else:
        scoresfile = opts.scores

    scores = read_scores(scoresfile, sort=True, trimsuffix=False)

    # Resolve conflicts: best-scoring candidate keeps the reference ID
    # ("primary"); the rest are demoted to novel isoforms.
    primary = {}
    for geneid in conflict:
        primary[geneid] = []
        for iso in sorted(conflict[geneid]):
            conflict[geneid][iso].sort(key=lambda k: (k[3], -k[4], -k[5]))
            _iso = "{0}.{1}".format(geneid, iso)
            if _iso not in scores:
                novel[geneid].extend(conflict[geneid][iso])
                continue
            top_score = scores[_iso][0][1]
            result = next((i for i, v in enumerate(conflict[geneid][iso])
                           if v[0] == top_score), None)
            if result is not None:
                primary[geneid].append(conflict[geneid][iso][result])
                del conflict[geneid][iso][result]
            # Remaining (non-winning) candidates become novel isoforms.
            if geneid not in novel:
                novel[geneid] = []
            novel[geneid].extend(conflict[geneid][iso])
        novel[geneid].sort(key=lambda k: (k[3], -k[4], -k[5]))

    # Write the reindexed GFF: primary mRNA keep their isoform index,
    # novel mRNA are assigned the next free indices.
    fw = must_open(opts.outfile, 'w')
    for gene in gffdb.features_of_type('gene', order_by=('seqid', 'start')):
        geneid = gene.id
        print >> fw, gene
        seen = []
        if geneid in primary:
            all_mrna = primary[geneid]
            all_mrna.extend(novel[geneid])
            for iso, mrna in enumerate(all_mrna):
                _mrna = gffdb[mrna[0]]
                _iso = mrna[1]
                if mrna not in novel[geneid]:
                    # Primary mRNA: record its taken isoform index.
                    seen.append(int(mrna[1]))
                else:
                    # Novel mRNA: next index past those already taken.
                    mseen = 0 if len(seen) == 0 else max(seen)
                    _iso = (mseen + iso + 1) - len(seen)

                _mrnaid = "{0}.{1}".format(geneid, _iso)
                _mrna['ID'], _mrna['_old_ID'] = [_mrnaid], [_mrna.id]

                print >> fw, _mrna
                for c in gffdb.children(_mrna, order_by=('start')):
                    c['Parent'] = [_mrnaid]
                    print >> fw, c
        else:
            # Locus had no AGI-style mRNA; copy children through unchanged.
            for feat in gffdb.children(gene, order_by=('seqid', 'start')):
                print >> fw, feat

    fw.close()
def expand(args):
    """
    %prog expand bes.fasta reads.fastq

    Expand sequences using short reads. Useful, for example for getting BAC-end
    sequences. The template to use, in `bes.fasta` may just contain the junction
    sequences, then align the reads to get the 'flanks' for such sequences.
    """
    import math
    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.fastq import readlen, first, fasta
    from jcvi.formats.blast import Blast
    from jcvi.formats.base import FileShredder
    from jcvi.apps.bowtie import align, get_samfile
    from jcvi.apps.align import blast

    p = OptionParser(expand.__doc__)
    p.set_depth(depth=200)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bes, reads = args
    size = Fasta(bes).totalsize
    rl = readlen([reads])
    # Number of reads needed to cover template + both flanks at the
    # requested depth, rounded up to the nearest thousand.
    expected_size = size + 2 * rl
    nreads = expected_size * opts.depth / rl
    nreads = int(math.ceil(nreads / 1000.)) * 1000

    # Attract reads
    samfile, logfile = \
        align([bes, reads, "--reorder", "--mapped",
               "--firstN={0}".format(opts.firstN)])

    samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True)
    logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped))

    pf = mapped.split(".")[0]
    pf = pf.split("-")[0]
    bespf = bes.split(".")[0]
    reads = pf + ".expand.fastq"
    first([str(nreads), mapped, "-o", reads])

    # Perform mini-assembly
    fastafile = reads.rsplit(".", 1)[0] + ".fasta"
    qualfile = ""
    if need_update(reads, fastafile):
        fastafile, qualfile = fasta([reads])
    contigs = op.join(pf, "454LargeContigs.fna")
    if need_update(fastafile, contigs):
        # Newbler assembly of the attracted reads.
        cmd = "runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile)
        sh(cmd)
    assert op.exists(contigs)

    # Annotate contigs
    blastfile = blast([bes, contigs])
    mapping = {}
    for query, b in Blast(blastfile).iter_best_hit():
        mapping[query] = b

    f = Fasta(contigs, lazy=True)
    annotatedfasta = ".".join((pf, bespf, "fasta"))
    fw = open(annotatedfasta, "w")
    keys = list(Fasta(bes).iterkeys_ordered())  # keep an ordered list
    recs = []
    for key, v in f.iteritems_ordered():
        vid = v.id
        if vid not in mapping:
            continue
        b = mapping[vid]
        subject = b.subject
        # Orient each contig to match its best-hit template.
        rec = v.reverse_complement() if b.orientation == '-' else v
        rec.id = rid = "_".join((pf, vid, subject))
        rec.description = ""
        recs.append((keys.index(subject), rid, rec))

    # Output contigs in template order.
    recs = [x[-1] for x in sorted(recs)]
    SeqIO.write(recs, fw, "fasta")
    fw.close()

    FileShredder([samfile, logfile, mapped, reads, fastafile, qualfile,
                  blastfile, pf])
    logging.debug("Annotated seqs (n={0}) written to `{1}`.".\
            format(len(recs), annotatedfasta))

    return annotatedfasta
def maker(args):
    """
    %prog maker maker.gff3 genome.fasta

    Prepare EVM inputs by separating tracks from MAKER.
    """
    from jcvi.formats.base import SetFile, FileShredder

    A, T, P = "ABINITIO_PREDICTION", "TRANSCRIPT", "PROTEIN"
    # Default (evidence type, weight) for each recognized MAKER source.
    track_registry = {
        "maker": (A, 5),
        "augustus_masked": (A, 1),
        "snap_masked": (A, 1),
        "genemark": (A, 1),
        "est2genome": (T, 5),
        "est_gff": (T, 5),
        "protein2genome": (P, 5),
        "blastx": (P, 1),
    }

    p = OptionParser(maker.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gffile, fastafile = args

    # Collect the distinct source names (GFF column 2) present in the file.
    types = "type.ids"
    if need_update(gffile, types):
        cmd = "cut -f2 -s {0} | sort -u".format(gffile)
        sh(cmd, outfile=types)

    types = SetFile(types)
    reg = defaultdict(list)
    weightsfile = "weights.txt"
    contents = []
    for s in types:
        rs = s.split(":")[0]
        if rs not in track_registry:
            # Unrecognized source: not part of any EVM evidence file.
            continue
        evtype, weight = track_registry[rs]
        reg[evtype].append(s)
        contents.append("\t".join(str(x) for x in (evtype, s, weight)))
    write_file(weightsfile, "\n".join(sorted(contents)), meta="weights file")

    # Rebuild the three evidence GFF files from scratch (shred, then append).
    evs = [x + ".gff" for x in (A, T, P)]
    FileShredder(evs)
    for evtype, tracks in reg.items():
        for t in tracks:
            cmd = "grep '\t{0}' {1} | grep -v '_match\t' >> {2}.gff".format(t, gffile, evtype)
            sh(cmd)

    partition(evs)
    runfile = "run.sh"
    write_file(runfile, EVMRUN.format(*evs), meta="run script")