def index(args):
    """
    %prog index bedfile

    Compress frgscffile.sorted and index it using `tabix`.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    parser = OptionParser(index.__doc__)
    parser.add_option("--query",
                      help="Chromosome location [default: %default]")
    set_outfile(parser)

    opts, args = parser.parse_args(args)
    if len(args) != 1:
        sys.exit(not parser.print_help())

    bedfile = args[0]
    gzfile = bedfile + ".gz"
    tbifile = gzfile + ".tbi"

    # Step 1: refresh the bgzip archive (from a sorted copy) when out of date.
    if need_update(bedfile, gzfile):
        bedfile = sort([bedfile])
        sh("bgzip -c {0}".format(bedfile), outfile=gzfile)

    # Step 2: refresh the tabix index when it is older than the archive.
    if need_update(gzfile, tbifile):
        sh("tabix -p bed {0}".format(gzfile))

    # Step 3: run the optional region query, sending results to --outfile.
    if opts.query:
        sh("tabix {0} {1}".format(gzfile, opts.query), outfile=opts.outfile)
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to
    be CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta format,
    with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman
    Calculate synonymous mutation rates for gene pairs

    This does the following:
        1. Fetches a protein pair.
        2. Aligns the protein pair with clustalw
        3. Convert the output to Fasta format.
        4. Use this alignment info to align gene sequences using PAL2NAL
        5. Run PAML yn00 to calculate synonymous mutation rates.
    """
    # NOTE: the docstring above is reused verbatim as the usage text.
    parser = OptionParser(calc.__doc__)
    set_outfile(parser)

    opts, args = parser.parse_args(args)

    nargs = len(args)
    if nargs == 2:
        protein_file, dna_file = args
    elif nargs == 1:
        protein_file, dna_file = None, args[0]
    else:
        sys.stderr.write("Incorrect arguments\n")
        sys.exit(not parser.print_help())

    output_h = must_open(opts.outfile, "w")
    output_h.write("name,dS-yn,dN-yn,dS-ng,dN-ng\n")

    work_dir = op.join(os.getcwd(), "syn_analysis")
    mkdir(work_dir)

    # With a single input, derive the protein file from the CDS file.
    if not protein_file:
        protein_file = translate_dna(dna_file)

    prot_iterator = SeqIO.parse(open(protein_file), "fasta")
    dna_iterator = SeqIO.parse(open(dna_file), "fasta")

    # Each iterator appears twice so that every tuple takes two consecutive
    # records from it: one adjacent protein pair plus its two CDS records.
    quartets = zip(prot_iterator, prot_iterator, dna_iterator, dna_iterator)
    for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in quartets:
        sys.stderr.write("{0} {1} {2}\n".format(
            "--------", p_rec_1.name, p_rec_2.name))

        align_fasta = clustal_align_protein(p_rec_1, p_rec_2, work_dir)
        mrtrans_fasta = run_mrtrans(align_fasta, n_rec_1, n_rec_2, work_dir)
        if not mrtrans_fasta:
            continue

        ds_yn, dn_yn, ds_ng, dn_ng = find_synonymous(mrtrans_fasta, work_dir)
        if ds_yn is None:
            continue

        pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
        fields = (pair_name, ds_yn, dn_yn, ds_ng, dn_ng)
        output_h.write(",".join(str(x) for x in fields) + "\n")
        output_h.flush()

    # Clean-up
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1 syn_analysis")
def bcf(args):
    """
    %prog bcf fastafile bamfiles > bcffile

    Run mpileup on bam files.
    """
    from jcvi.apps.grid import Jobs

    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    parser = OptionParser(bcf.__doc__)
    set_outfile(parser)

    opts, args = parser.parse_args(args)
    if len(args) < 2:
        sys.exit(not parser.print_help())

    fastafile, bamfiles = args[0], args[1:]

    # Every BAM whose name lacks ".sorted." is handed to index(); each job
    # receives one positional argument: the list [bamfile, "--unique"].
    jargs = [[[bam, "--unique"]] for bam in bamfiles if ".sorted." not in bam]
    Jobs(index, args=jargs).run()

    # Rewrite all names to the *.sorted.bam form: strip an existing ".sorted"
    # first so the second replace does not double it.
    sorted_bams = [
        bam.replace(".sorted.bam", ".bam").replace(".bam", ".sorted.bam")
        for bam in bamfiles
    ]

    pieces = [
        "samtools mpileup -P ILLUMINA -E -ugDf",
        " {0} {1}".format(fastafile, " ".join(sorted_bams)),
        " | bcftools view -bcvg -",
    ]
    sh("".join(pieces), outfile=opts.outfile)
def extract(args):
    """
    %prog extract gffile

    --contigs: Extract particular contig(s) from the gff file. If multiple contigs are
    involved, use "," to separate, e.g. "contig_12,contig_150"
    --names: Provide a file with IDs, one each line
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(extract.__doc__)
    p.add_option("--contigs",
                 help="Extract features from certain contigs [default: %default]")
    p.add_option("--names",
                 help="Extract features with certain names [default: %default]")
    p.add_option("--fasta", default=False, action="store_true",
                 help="Write FASTA if available [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args

    # Both filters are optional; None means "do not filter on this criterion".
    contigID = set(opts.contigs.split(",")) if opts.contigs else None
    names = None
    if opts.names:
        with open(opts.names) as namesfp:
            names = set(x.strip() for x in namesfp)

    fw = must_open(opts.outfile, "w")
    with open(gffile) as fp:
        for row in fp:
            atoms = row.split()
            if not atoms:
                continue

            tag = atoms[0]
            if row[0] == "#":
                # Directive/comment lines pass through, except region
                # directives that name a contig outside the requested set.
                if not (tag == RegionTag and contigID
                        and atoms[1] not in contigID):
                    print >> fw, row.rstrip()
                # Feature lines end where the embedded FASTA section begins.
                if tag == FastaTag:
                    break
                continue

            b = GffLine(row)
            is_right_contig = (not contigID) or (tag in contigID)
            is_right_names = (not names) or \
                    (b.attributes["Name"][0] in names)

            if is_right_contig and is_right_names:
                print >> fw, row.rstrip()

    if not opts.fasta:
        return

    # FASTA records are selected by contig id. Without --contigs there is
    # nothing to select (this used to crash iterating over None).
    if not contigID:
        print >> sys.stderr, \
                "[warning] --fasta needs --contigs; no FASTA records written"
        return

    f = Fasta(gffile)
    for s in contigID:
        if s in f:
            SeqIO.write([f[s]], fw, "fasta")
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to
    be CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta format,
    with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman
    Calculate synonymous mutation rates for gene pairs

    This does the following:
        1. Fetches a protein pair.
        2. Aligns the protein pair with clustalw
        3. Convert the output to Fasta format.
        4. Use this alignment info to align gene sequences using PAL2NAL
        5. Run PAML yn00 to calculate synonymous mutation rates.
    """
    # NOTE: the docstring above is also the usage message shown by optparse.
    optparser = OptionParser(calc.__doc__)
    set_outfile(optparser)

    opts, args = optparser.parse_args(args)

    if len(args) not in (1, 2):
        sys.stderr.write("Incorrect arguments\n")
        sys.exit(not optparser.print_help())

    # One argument: CDS only. Two arguments: protein file then CDS file.
    dna_file = args[-1]
    protein_file = args[0] if len(args) == 2 else None

    output_h = must_open(opts.outfile, "w")
    output_h.write("name,dS-yn,dN-yn,dS-ng,dN-ng\n")

    work_dir = op.join(os.getcwd(), "syn_analysis")
    mkdir(work_dir)

    if not protein_file:
        protein_file = translate_dna(dna_file)

    prot_iterator = SeqIO.parse(open(protein_file), "fasta")
    dna_iterator = SeqIO.parse(open(dna_file), "fasta")

    # Listing each iterator twice draws two consecutive records per tuple,
    # i.e. adjacent records are compared as a pair.
    for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in \
            zip(prot_iterator, prot_iterator, dna_iterator, dna_iterator):

        banner = "{0} {1} {2}\n".format("--------", p_rec_1.name, p_rec_2.name)
        sys.stderr.write(banner)

        align_fasta = clustal_align_protein(p_rec_1, p_rec_2, work_dir)
        mrtrans_fasta = run_mrtrans(align_fasta, n_rec_1, n_rec_2, work_dir)
        if not mrtrans_fasta:
            continue

        rates = find_synonymous(mrtrans_fasta, work_dir)
        ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = rates
        if ds_subs_yn is None:
            continue

        pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
        columns = [pair_name, ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng]
        output_h.write(",".join(str(x) for x in columns) + "\n")
        output_h.flush()

    # Clean-up
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1 syn_analysis")
def join(args):
    """
    %prog join file1.txt file2.txt ..

    Join tabular files based on common column. --column specifies the column
    index to pivot on. Use comma to separate multiple values if the pivot column
    is different in each file. Maintain the order in the first file.
    """
    from jcvi.utils.iter import flatten

    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    parser = OptionParser(join.__doc__)
    parser.add_option("--column", default="0",
                      help="0-based column id, multiple values allowed [default: %default]")
    parser.add_option("--noheader", default=False, action="store_true",
                      help="Do not print header [default: %default]")
    set_outfile(parser)

    opts, args = parser.parse_args(args)
    nargs = len(args)

    if nargs < 2:
        sys.exit(not parser.print_help())

    # One pivot column per file: either listed explicitly with commas, or a
    # single value applied to every file.
    colspec = opts.column
    cc = [int(x) for x in colspec.split(",")] if "," in colspec \
            else [int(colspec)] * nargs

    assert len(cc) == nargs, "Column index number != File number"

    # Maintain the first file line order, and combine other files into it
    pivotfile = args[0]
    files = [DictFile(fname, keypos=col, valuepos=None, delimiter="\t")
             for fname, col in zip(args, cc)]
    otherfiles = files[1:]

    # Header: each file's basename repeated once per column it contributes.
    header = "\t".join(flatten([op.basename(x.filename)] * x.ncols
                               for x in files))

    fp = open(pivotfile)
    fw = must_open(opts.outfile, "w")
    if not opts.noheader:
        fw.write(header + "\n")

    pivotcol = cc[0]
    for row in fp:
        newrow = row.rstrip().split("\t")
        key = newrow[pivotcol]
        for d in otherfiles:
            # Keys missing from a file are padded with "na" placeholders.
            newrow.extend(d.get(key, ["na"] * d.ncols))
        fw.write("\t".join(newrow) + "\n")
def asn(args):
    """
    %prog asn asnfiles

    Mainly to get this block, and extract `str` field:

        general {
          db "TIGR" ,
          tag
            str "mtg2_12952" } ,
        genbank {
          accession "AC148996" ,
    """
    from jcvi.formats.base import must_open

    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    parser = OptionParser(asn.__doc__)
    set_outfile(parser)

    opts, args = parser.parse_args(args)
    if len(args) < 1:
        sys.exit(not parser.print_help())

    fw = must_open(opts.outfile, "w")
    for asnfile in args:
        in_general = in_genbank = False
        accession = name = None

        for line in open(asnfile):
            tokens = line.split()
            if not tokens:
                continue
            tag = tokens[0]

            # `general` block: the first quoted `str` value becomes the name.
            if tag == "general":
                in_general = True
            if in_general and tag == "str":
                if name is None:  # Only allow first assignment
                    name = line.split('"')[1]
                in_general = False

            # `genbank` block: the first quoted `accession` value is kept.
            if tag == "genbank":
                in_genbank = True
            if in_genbank and tag == "accession":
                if accession is None:
                    accession = line.split('"')[1]
                in_genbank = False

        assert accession and name
        fw.write("{0}\t{1}".format(accession, name) + "\n")
def merge(args):
    """
    %prog merge gffiles

    Merge several gff files into one. When only one file is given, it is assumed
    to be a file with a list of gff files.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(merge.__doc__)
    set_outfile(p)

    opts, args = p.parse_args(args)

    nargs = len(args)
    if nargs < 1:
        sys.exit(not p.print_help())

    if nargs == 1:
        # A single argument is a list file: one gff path per line.
        listfile, = args
        with open(listfile) as fp:
            gffiles = [x.strip() for x in fp]
    else:
        gffiles = args

    outfile = opts.outfile

    deflines = set()   # '#' lines already written, to avoid repeating them
    fw = must_open(outfile, "w")
    fastarecs = {}     # first FASTA record seen for each key
    for gffile in gffiles:
        with open(gffile) as fp:
            for row in fp:
                row = row.rstrip()
                # Blank lines carry no information and used to crash the
                # row[0] test below with an IndexError.
                if not row:
                    continue
                if row[0] == '#':
                    # The FASTA tag ends the feature section of this file.
                    if row == FastaTag:
                        break
                    if row in deflines:
                        continue
                    deflines.add(row)

                print >> fw, row

        f = Fasta(gffile, lazy=True)
        for key, rec in f.iteritems_ordered():
            # Direct dict membership; `.keys()` builds a list on Python 2.
            if key in fastarecs:
                continue
            fastarecs[key] = rec

    print >> fw, FastaTag
    SeqIO.write(fastarecs.values(), fw, "fasta")
def last(args):
    """
    %prog last old.fasta new.fasta

    Generate psl file using last. Calles apps.last() but with special
    parameters: -r5 -q95 -a0 -b95 -e500, which only reports alignments larger
    than 100 bp and >=95 % identity.
    """
    from jcvi.apps.last import main as lastapp

    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    parser = OptionParser(last.__doc__)
    parser.add_option("--distant", default=False, action="store_true",
                      help="Assume distant relations")
    parser.add_option("--minscore", default=100, type="int",
                      help="Filter alignments by how many bases match [default: %default]")
    parser.add_option("--minid", default=95, type="int",
                      help="Minimum sequence identity [default: %default]")
    set_outfile(parser)

    opts, args = parser.parse_args(args)
    if len(args) != 2:
        sys.exit(not parser.print_help())

    oldfasta, newfasta = args

    # Values for the -r, -q/-b and -e switches; with the default options
    # these come out as 5, 95 and 500, matching the usage text.
    r_value = 100 - opts.minid
    q_value = opts.minid
    e_value = opts.minscore * r_value

    lastargs = [
        oldfasta,
        newfasta,
        "--format=maf",
        "--outfile={0}".format(opts.outfile),
    ]
    # The strict parameter set is skipped for distant comparisons.
    if not opts.distant:
        lastargs.append(r'--params=-r{0} -q{1} -a0 -b{1} -e{2}'.format(
            r_value, q_value, e_value))

    lastapp(lastargs)
def bed(args):
    '''
    %prog bed gff_file [--options]

    Parses the start, stop locations of the selected features out of GFF and
    generate a bed file
    '''
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    parser = OptionParser(bed.__doc__)
    parser.add_option("--type", dest="type", default="gene",
                      help="Feature type to extract, use comma for multiple [default: %default]")
    parser.add_option("--key", dest="key", default="ID",
                      help="Key in the attributes to extract [default: %default]")
    set_outfile(parser)

    opts, args = parser.parse_args(args)
    if len(args) != 1:
        sys.exit(not parser.print_help())

    gffile = args[0]

    # The literal string "None" on the command line means "no key".
    key = None if opts.key == "None" else opts.key
    wanted_types = set(x.strip() for x in opts.type.split(","))

    records = Bed()
    for feature in Gff(gffile, key=key):
        if feature.type in wanted_types:
            records.append(feature.bedline)

    records.sort(key=records.key)
    records.print_to_file(opts.outfile)
def last(args):
    """
    %prog last old.fasta new.fasta

    Generate psl file using last. Calles apps.last() but with special
    parameters: -r5 -q95 -a0 -b95 -e500, which only reports alignments larger
    than 100 bp and >=95 % identity.
    """
    from jcvi.apps.last import main as lastapp

    # NOTE: the docstring above is also the usage message shown by optparse.
    optparser = OptionParser(last.__doc__)
    optparser.add_option("--distant", default=False, action="store_true",
                         help="Assume distant relations")
    optparser.add_option("--minscore", default=100, type="int",
                         help="Filter alignments by how many bases match [default: %default]")
    optparser.add_option("--minid", default=95, type="int",
                         help="Minimum sequence identity [default: %default]")
    set_outfile(optparser)

    opts, args = optparser.parse_args(args)
    if len(args) != 2:
        sys.exit(not optparser.print_help())

    oldfasta, newfasta = args
    lastargs = [oldfasta, newfasta, "--format=maf"]
    lastargs.append("--outfile={0}".format(opts.outfile))

    if not opts.distant:
        # -r, -q/-b and -e are derived from --minid and --minscore; the
        # defaults yield -r5 -q95 -b95 -e500 as stated in the usage text.
        r = 100 - opts.minid
        params = r'--params=-r{0} -q{1} -a0 -b{1} -e{2}'
        lastargs.append(params.format(r, opts.minid, opts.minscore * r))

    lastapp(lastargs)
def phase(args):
    """
    %prog phase genbankfiles

    Input has to be gb file. Search the `KEYWORDS` section to look for PHASE.
    Also look for "chromosome" and "clone" in the definition line.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    parser = OptionParser(phase.__doc__)
    set_outfile(parser)

    opts, args = parser.parse_args(args)
    if len(args) < 1:
        sys.exit(not parser.print_help())

    fw = must_open(opts.outfile, "w")
    for gbfile in args:
        for rec in SeqIO.parse(gbfile, "gb"):
            bac_phase, keywords = get_phase(rec)
            chromosome, clone = get_clone(rec)

            # One tab-separated line per record:
            # id, phase, ';'-joined keywords, chromosome, clone.
            columns = (rec.id, str(bac_phase), ";".join(keywords),
                       chromosome, clone)
            fw.write("\t".join(columns) + "\n")
def phase(args):
    """
    %prog phase genbankfiles

    Input has to be gb file. Search the `KEYWORDS` section to look for PHASE.
    Also look for "chromosome" and "clone" in the definition line.
    """
    # NOTE: the docstring above is also the usage message shown by optparse.
    optparser = OptionParser(phase.__doc__)
    set_outfile(optparser)

    opts, gbfiles = optparser.parse_args(args)
    if len(gbfiles) < 1:
        sys.exit(not optparser.print_help())

    out = must_open(opts.outfile, "w")
    for gbfile in gbfiles:
        for record in SeqIO.parse(gbfile, "gb"):
            bac_phase, keywords = get_phase(record)
            chrom, clone = get_clone(record)
            keyword_field = ";".join(keywords)

            # Columns: record id, phase, keywords, chromosome, clone.
            row = "\t".join(
                (record.id, str(bac_phase), keyword_field, chrom, clone))
            out.write(row + "\n")
def bed(args):
    '''
    %prog bed gff_file [--options]

    Parses the start, stop locations of the selected features out of GFF and
    generate a bed file
    '''
    # NOTE: the docstring above is also the usage message shown by optparse.
    optparser = OptionParser(bed.__doc__)
    optparser.add_option("--type", dest="type", default="gene",
                         help="Feature type to extract, use comma for multiple [default: %default]")
    optparser.add_option("--key", dest="key", default="ID",
                         help="Key in the attributes to extract [default: %default]")
    set_outfile(optparser)

    opts, args = optparser.parse_args(args)
    if len(args) != 1:
        sys.exit(not optparser.print_help())

    gffile, = args

    # Passing --key None (as text) disables the attribute key.
    key = opts.key
    if key == "None":
        key = None

    feature_types = set(t.strip() for t in opts.type.split(","))

    gff = Gff(gffile, key=key)
    bedlines = Bed()
    for g in gff:
        # Keep only the requested feature types.
        if g.type in feature_types:
            bedlines.append(g.bedline)

    bedlines.sort(key=bedlines.key)
    bedlines.print_to_file(opts.outfile)
def extendbed(args):
    """
    %prog extend agpfile componentfasta

    Extend the components to fill the component range. For example, a bed/gff3 file
    that was converted from the agp will contain only the BAC sequence intervals
    that are 'represented' - sometimes leaving the 5` and 3` out (those that
    overlap with adjacent sequences. This script fill up those ranges,
    potentially to make graphics for tiling path.
    """
    from jcvi.formats.sizes import Sizes

    # NOTE: the docstring above doubles as the usage text given to OptionParser,
    # so edits to it change what the user sees on a usage error.
    p = OptionParser(extendbed.__doc__)
    p.add_option("--nogaps", default=False, action="store_true",
                 help="Do not print bed lines for gaps [default: %default]")
    p.add_option("--bed12", default=False, action="store_true",
                 help="Produce bed12 formatted output [default: %default]")
    p.add_option("--gff", default=False, action="store_true",
                 help="Produce gff3 formatted output. By default, ignores " +\
                 " AGP gap lines. [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    # If output format is GFF3, ignore AGP gap lines.
    if opts.gff:
        opts.nogaps = True

    agpfile, fastafile = args
    agp = AGP(agpfile)
    fw = must_open(opts.outfile, "w")
    if opts.gff:
        print >> fw, "##gff-version 3"

    # component id -> AGP lines that use that component, in file order
    ranges = defaultdict(list)
    # Original (object_beg, object_end) of every AGP line, recorded before the
    # extension below changes them; one entry per line, gaps included, so the
    # list stays aligned with `agp` for the zip() in the output loop.
    thickCoords = []  # These are the coordinates before modify ranges
    # Make the first pass to record all the component ranges
    for a in agp:
        thickCoords.append((a.object_beg, a.object_end))
        if a.is_gap:
            continue
        ranges[a.component_id].append(a)

    # Modify the ranges
    # `sizes` maps component id -> length, taken from the FASTA file.
    sizes = Sizes(fastafile).mapping
    for accn, rr in ranges.items():
        alen = sizes[accn]

        # First line for this component: extend object_beg by the part of the
        # component that is not covered on that side. Which component end
        # faces that side depends on the orientation.
        a = rr[0]
        if a.orientation == "+":
            hang = a.component_beg - 1
        else:
            hang = alen - a.component_end
        a.object_beg -= hang

        # Last line for this component: extend object_end the same way on the
        # other side.
        a = rr[-1]
        if a.orientation == "+":
            hang = alen - a.component_end
        else:
            hang = a.component_beg - 1
        a.object_end += hang

    for a, (ts, te) in zip(agp, thickCoords):
        if opts.nogaps and a.is_gap:
            continue
        if opts.bed12:
            # Order matters here: `bedline` is read while the extended
            # coordinates are in place, then the original coordinates are
            # restored before `bedextra` is read.
            line = a.bedline
            a.object_beg, a.object_end = ts, te
            line += "\t" + a.bedextra
            print >> fw, line
        elif opts.gff:
            print >> fw, a.gffline()
        else:
            print >> fw, a.bedline
def bed(args):
    """
    %prog bed agpfile

    print out the tiling paths in bed/gff3 format
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    parser = OptionParser(bed.__doc__)
    parser.add_option("--gaps", default=False, action="store_true",
                      help="Only print bed lines for gaps [default: %default]")
    parser.add_option("--nogaps", default=False, action="store_true",
                      help="Do not print bed lines for gaps [default: %default]")
    parser.add_option("--bed12", default=False, action="store_true",
                      help="Produce bed12 formatted output [default: %default]")
    set_outfile(parser)

    gffgroup = OptionGroup(parser, "GFF specific parameters",
                           "Note: If not specified, output will be in `bed` format")
    gffgroup.add_option("--gff", default=False, action="store_true",
                        help="Produce gff3 formatted output. By default, ignores "
                             "AGP gap lines. [default: %default]")
    gffgroup.add_option("--source", default="MGSC",
                        help="Specify a gff3 source [default: `%default`]")
    gffgroup.add_option("--feature", default="golden_path_fragment",
                        help="Specify a gff3 feature type [default: `%default`]")
    gffgroup.add_option("--verifySO", default=False, action="store_true",
                        help="Verify gff3 feature type againt SO for validity. "
                             "Looks for `so.obo` in current folder. If not exists, "
                             "it downloads the obo file. [default: %default]")
    parser.add_option_group(gffgroup)

    opts, args = parser.parse_args(args)
    if len(args) != 1:
        sys.exit(not parser.print_help())

    # If output format is gff3, ignore AGP gap lines.
    if opts.gff:
        opts.nogaps = True

    # If 'verifySO' option is invoked, validate the SO term
    if opts.verifySO:
        validate_term(opts.feature)

    agpfile = args[0]
    agp = AGP(agpfile)
    fw = must_open(opts.outfile, "w")
    if opts.gff:
        fw.write("##gff-version 3" + "\n")

    for a in agp:
        # --nogaps drops gap lines; --gaps drops everything that is not a gap.
        if a.is_gap:
            if opts.nogaps:
                continue
        elif opts.gaps:
            continue

        # --bed12 takes precedence over --gff; plain bed is the fallback.
        if opts.bed12:
            line = a.bed12line
        elif opts.gff:
            line = a.gffline(gff_source=opts.source,
                             gff_feat_type=opts.feature)
        else:
            line = a.bedline
        fw.write(str(line) + "\n")

    fw.close()
    return fw.name
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(covfilter.__doc__)
    p.add_option("--pctid", dest="pctid", default=90, type="int",
                 help="Percentage identity cutoff [default: %default]")
    # Help text fixed: this option is the coverage cutoff, not identity.
    p.add_option("--pctcov", dest="pctcov", default=50, type="int",
                 help="Percentage coverage cutoff [default: %default]")
    p.add_option("--ids", dest="ids", default=None,
                 help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list", dest="list", default=False, action="store_true",
                 help="List the id% and cov% per gene [default: %default]")
    set_outfile(p, outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    from jcvi.algorithms.supermap import supermap

    blastfile, fastafile = args
    sizes = Sizes(fastafile).mapping

    # All statistics are computed on the query supermap of the BLAST file,
    # which is generated on first use.
    querysupermap = blastfile + ".query.supermap"
    if not op.exists(querysupermap):
        supermap(blastfile, filter="query")

    blastfile = querysupermap
    assert op.exists(blastfile)

    # Running totals over all queries
    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()   # every query with at least one hit
    valid = set()     # queries passing both cutoffs
    blast = BlastSlow(querysupermap)
    for query, blines in blast.iter_hits():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0

        for b in blines:
            # Inclusive interval length. The previous form,
            # abs(qstart - qstop + 1), undercounts by two whenever
            # qstart < qstop.
            this_covered += abs(b.qstart - b.qstop) + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps

        this_identity = 100. - (this_mismatches + this_gaps) * 100. \
                / this_alignlen
        this_coverage = this_covered * 100. / sizes[query]

        if opts.list:
            print("{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity,
                                                this_coverage))

        if this_identity >= opts.pctid and this_coverage >= opts.pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    # Summary goes to stderr so it does not mix with --list output on stdout.
    print >> sys.stderr, "Identity: {0} mismatches, {1} gaps, {2} alignlen".\
            format(mismatches, gaps, alignlen)
    total = len(sizes)
    print >> sys.stderr, "Total mapped: {0} ({1:.1f}% of {2})".\
            format(mapped_count, mapped_count * 100. / total, total)
    print >> sys.stderr, "Total valid {0}: {1} ({2:.1f}% of {3})".\
            format(cutoff_message, valid_count, valid_count * 100. / total,
                   total)
    print >> sys.stderr, "Average id = {0:.2f}%".\
            format(100 - (mismatches + gaps) * 100. / alignlen)

    queries_combined = sum(sizes[x] for x in queries)
    print >> sys.stderr, "Coverage: {0} covered, {1} total".\
            format(covered, queries_combined)
    print >> sys.stderr, "Average coverage = {0:.2f}%".\
            format(covered * 100. / queries_combined)

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for query_id in valid:
            print >> fw, query_id
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    # Write out the supermap lines that belong to a passing query.
    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast.iter_line():
        if b.query in valid:
            print >> fw, b
def bed12(args):
    """
    %prog bed12 gffile > bedfile

    Produce bed12 file for coding features. The exons will be converted to blocks.
    The CDS range will be shown between thickStart to thickEnd. For reference,
    bed format consists of the following fields:

    1. chrom
    2. chromStart
    3. chromEnd
    4. name
    5. score
    6. strand
    7. thickStart
    8. thickEnd
    9. itemRgb
    10. blockCount
    11. blockSizes
    12. blockStarts
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(bed12.__doc__)
    p.add_option("--parent", default="mRNA",
                 help="Top feature type [default: %default]")
    p.add_option("--block", default="exon",
                 help="Feature type for regular blocks [default: %default]")
    p.add_option("--thick", default="CDS",
                 help="Feature type for thick blocks [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    parent, block, thick = opts.parent, opts.block, opts.thick
    outfile = opts.outfile

    g = make_index(gffile)
    fw = must_open(outfile, "w")

    for f in g.features_of_type(parent):

        chrom = f.chrom
        chromStart = f.start - 1   # start is shifted down by one for bed
        chromEnd = f.stop
        name = f.id
        score = 0
        strand = f.strand
        thickStart = thickEnd = None
        blocks = []

        for c in g.children(name, 1):

            cstart, cend = c.start - 1, c.stop

            if c.featuretype == block:
                # blockStarts are relative to chromStart
                blockStart = cstart - chromStart
                blockSize = cend - cstart
                blocks.append((blockStart, blockSize))

            elif c.featuretype == thick:
                thickStart = cstart if thickStart is None \
                        else min(thickStart, cstart)
                thickEnd = cend if thickEnd is None \
                        else max(thickEnd, cend)

        # A bed12 line needs at least one block; previously zip(*blocks)
        # raised ValueError for a parent with no block children.
        if not blocks:
            print >> sys.stderr, \
                    "[warning] %s has no children with type %s" % (name, block)
            continue

        # No thick children: collapse the thick range onto chromStart instead
        # of printing the 1e15 / 0 search sentinels.
        if thickStart is None:
            thickStart = thickEnd = chromStart

        blocks.sort()
        blockStarts, blockSizes = zip(*blocks)
        blockCount = len(blocks)
        blockSizes = ",".join(str(x) for x in blockSizes) + ","
        blockStarts = ",".join(str(x) for x in blockStarts) + ","
        itemRgb = 0

        print >> fw, "\t".join(str(x) for x in (chrom, chromStart, chromEnd, \
                name, score, strand, thickStart, thickEnd, itemRgb,
                blockCount, blockSizes, blockStarts))
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in \
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "\
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("-a", "-A", dest="cpus", default=1, type="int",
                 help="parallelize job to multiple cpus [default: %default]")
    p.add_option("--format", default="BLASTN-", choices=supported_formats,
                 help="output format, one of {0} [default: %default]".\
                      format("|".join(supported_formats)))
    p.add_option("--path", dest="lastz_path", default=None,
                 help="specify LASTZ path")
    p.add_option("--mask", dest="mask", default=False, action="store_true",
                 help="treat lower-case letters as mask info [default: %default]")

    set_params(p)
    set_outfile(p)
    set_grid(p)

    opts, args = p.parse_args()

    if len(args) != 2:
        # print_help() returns None, so sys.exit(p.print_help()) exited with
        # status 0; negate it so a usage error exits with status 1.
        sys.exit(not p.print_help())

    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    grid = opts.grid
    if grid:
        print >>sys.stderr, "Running jobs on JCVI grid"

    extra = opts.extra
    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith("lastz"), "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    fmt = opts.format
    blastline = (fmt == "BLASTN-")

    # The axt, maf, etc. format can only be run on splitted database (i.e. one
    # FASTA record per file). The splitted files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        # Output files are prefixed with the query basename up to the first dot.
        apf = op.basename(afasta_fn).split(".")[0]
        args = []
        # bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format
        for seqid in bids:
            # One job per database record, addressed as <2bit file>/<record id>
            bfasta = "/".join((bfasta_2bit, seqid))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, seqid, fmt))
            args.append((bfasta, afasta_fn, outfile, \
                         lastz_bin, extra, mask, fmt, grid))

        if grid:
            cmds = [lastz_2bit(x) for x in args]
            g = Grid(cmds)
            g.run()
            g.writestatus()

        # NOTE(review): as written, the pool also runs when the grid branch
        # above was taken - confirm that is intended.
        pool = Pool(cpus)
        pool.map(lastz_2bit, args)

        return

    # BLASTN- output: workers share one output handle, guarded by a lock.
    lock = Lock()

    if grid:
        cmds = [lastz(k + 1, cpus, bfasta_fn, afasta_fn, out_fh, \
                      lock, lastz_bin, extra, mask, grid)
                for k in xrange(cpus)]

        mkdir(outdir)
        g = Grid(cmds, outfiles=[op.join(outdir, "out.{0}.lastz").\
                                 format(i) for i in range(len(cmds))])
        g.run()
        g.writestatus()

    else:
        args = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
                 lock, lastz_bin, extra, mask) for k in xrange(cpus)]
        g = Jobs(target=lastz, args=args)
        g.run()
def load(args):
    '''
    %prog load gff_file fasta_file [--options]

    Parses the selected features out of GFF, with subfeatures concatenated.
    For example, to get the CDS sequences, do this::

        $ %prog load athaliana.gff athaliana.fa --parents mRNA --children CDS
    '''
    from jcvi.formats.fasta import Seq, SeqRecord

    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(load.__doc__)
    p.add_option("--parents", dest="parents", default="mRNA",
                 help="list of features to extract, use comma to separate (e.g."
                 "'gene,mRNA') [default: %default]")
    p.add_option("--children", dest="children", default="CDS",
                 help="list of features to extract, use comma to separate (e.g."
                 "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]")
    p.add_option("--attribute",
                 help="The attribute field to extract [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None, so sys.exit(p.print_help()) exited with
        # status 0; negate it so a usage error exits with status 1.
        sys.exit(not p.print_help())

    gff_file, fasta_file = args

    g = make_index(gff_file)
    f = Fasta(fasta_file, index=False)
    fw = must_open(opts.outfile, "w")

    parents = set(opts.parents.split(','))
    children_list = set(opts.children.split(','))
    attr = opts.attribute

    for feat in get_parents(gff_file, parents):

        # Collect (sequence, child feature) pairs for the wanted child types.
        children = []
        for c in g.children(feat.id, 1):
            if c.featuretype not in children_list:
                continue
            child = f.sequence(dict(chr=c.chrom, start=c.start, stop=c.stop,
                                    strand=c.strand))
            children.append((child, c))

        if not children:
            print >>sys.stderr, "[warning] %s has no children with type %s" \
                    % (feat.id, ','.join(children_list))
            continue

        # sort children in incremental position
        children.sort(key=lambda x: x[1].start)
        # reverse children if negative strand
        if feat.strand == '-':
            children.reverse()
        feat_seq = ''.join(x[0] for x in children)

        # The description is the requested attribute, when the feature has it,
        # with double quotes stripped.
        description = ",".join(feat.attributes[attr]) \
                if attr and attr in feat.attributes else ""
        description = description.replace("\"", "")

        rec = SeqRecord(Seq(feat_seq), id=feat.id, description=description)
        SeqIO.write([rec], fw, "fasta")
        fw.flush()
def load(args):
    '''
    %prog load gff_file fasta_file [--options]

    Parses the selected features out of GFF, with subfeatures concatenated.
    For example, to get the CDS sequences, do this::

        $ %prog load athaliana.gff athaliana.fa --parents mRNA --children CDS
    '''
    from jcvi.formats.fasta import Seq, SeqRecord

    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(load.__doc__)
    p.add_option(
        "--parents", dest="parents", default="mRNA",
        help="list of features to extract, use comma to separate (e.g."
        "'gene,mRNA') [default: %default]")
    p.add_option(
        "--children", dest="children", default="CDS",
        help="list of features to extract, use comma to separate (e.g."
        "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]")
    p.add_option("--attribute",
                 help="The attribute field to extract [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # sys.exit(None) means status 0; `not None` is True, i.e. status 1,
        # which is what a usage error should report.
        sys.exit(not p.print_help())

    gff_file, fasta_file = args

    g = make_index(gff_file)
    f = Fasta(fasta_file, index=False)
    fw = must_open(opts.outfile, "w")

    parents = set(opts.parents.split(','))
    children_list = set(opts.children.split(','))
    attr = opts.attribute

    for feat in get_parents(gff_file, parents):

        # Fetch the sequence of every child of a requested type.
        children = []
        for c in g.children(feat.id, 1):
            if c.featuretype not in children_list:
                continue
            child = f.sequence(
                dict(chr=c.chrom, start=c.start, stop=c.stop,
                     strand=c.strand))
            children.append((child, c))

        if not children:
            print >>sys.stderr, "[warning] %s has no children with type %s" \
                    % (feat.id, ','.join(children_list))
            continue

        # sort children in incremental position
        children.sort(key=lambda x: x[1].start)
        # reverse children if negative strand
        if feat.strand == '-':
            children.reverse()
        feat_seq = ''.join(x[0] for x in children)

        # Use the requested attribute as the description when present;
        # double quotes are removed.
        description = ",".join(feat.attributes[attr]) \
                if attr and attr in feat.attributes else ""
        description = description.replace("\"", "")

        rec = SeqRecord(Seq(feat_seq), id=feat.id, description=description)
        SeqIO.write([rec], fw, "fasta")
        fw.flush()
def extract(args):
    """
    %prog extract gffile

    --contigs: Extract particular contig(s) from the gff file. If multiple contigs are
    involved, use "," to separate, e.g. "contig_12,contig_150"
    --names: Provide a file with IDs, one each line
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(extract.__doc__)
    p.add_option(
        "--contigs",
        help="Extract features from certain contigs [default: %default]")
    p.add_option(
        "--names",
        help="Extract features with certain names [default: %default]")
    p.add_option("--fasta", default=False, action="store_true",
                 help="Write FASTA if available [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args

    # Both filters are optional; None means "no filtering on this criterion".
    contigID = set(opts.contigs.split(",")) if opts.contigs else None
    names = None
    if opts.names:
        with open(opts.names) as namesfp:
            names = set(x.strip() for x in namesfp)

    fw = must_open(opts.outfile, "w")
    with open(gffile) as fp:
        for row in fp:
            atoms = row.split()
            if not atoms:
                continue

            tag = atoms[0]
            if row[0] == "#":
                # Keep directive/comment lines, dropping only region
                # directives for contigs outside the requested set.
                if not (tag == RegionTag and contigID
                        and atoms[1] not in contigID):
                    print >> fw, row.rstrip()
                # The FASTA tag marks the end of the feature lines.
                if tag == FastaTag:
                    break
                continue

            b = GffLine(row)
            is_right_contig = (not contigID) or (tag in contigID)
            is_right_names = (not names) or \
                    (b.attributes["Name"][0] in names)

            if is_right_contig and is_right_names:
                print >> fw, row.rstrip()

    if not opts.fasta:
        return

    # FASTA records are picked by contig id, so --fasta without --contigs has
    # nothing to pick (it used to raise TypeError iterating over None).
    if not contigID:
        print >> sys.stderr, \
                "[warning] --fasta needs --contigs; no FASTA records written"
        return

    f = Fasta(gffile)
    for s in contigID:
        if s in f:
            SeqIO.write([f[s]], fw, "fasta")
def main(args):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB, LASTAL and LASTEX.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    supported_formats = ("tab", "maf", "blast")

    p = OptionParser(main.__doc__)
    p.add_option("-a", "-A", dest="cpus", default=1, type="int",
                 help="parallelize job to multiple cpus [default: %default]")
    p.add_option("--path", help="specify LAST path")
    p.add_option("--format", default="blast", choices=supported_formats,
                 help="Output format, one of {0} [default: %default]".\
                      format("|".join(supported_formats)))
    p.add_option("--eval", default=False, action="store_true",
                 help="Use lastex to recalculate E-value [default: %default]")

    set_params(p)
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    if opts.eval and opts.cpus > 1:
        # Call form of raise: valid on Python 2 and 3, same exception/message.
        raise Exception("Option --eval cannnot work with multiple threads")

    # Executables are looked up under --path when given, else on $PATH.
    path = opts.path
    getpath = lambda x: op.join(path, x) if path else x
    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")
    lastex_bin = getpath("lastex")

    # The database name is the subject file name minus its last extension.
    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj",
               lastdb_bin=lastdb_bin)

    cpus = opts.cpus
    logging.debug("Dispatch job to {0} cpus".format(cpus))

    if opts.format == "maf":
        cmd = 'echo "##maf version=1"'
        sh(cmd)

    cmd = "{0} -u 0".format(lastal_bin)
    # lastal's -f value is the position of the format in supported_formats.
    f = supported_formats.index(opts.format)
    cmd += " -f {0}".format(f)
    cmd += " {0} -".format(subjectdb)

    extra = opts.extra
    if extra:
        cmd += " " + extra

    if opts.eval:
        querydb = query.rsplit(".", 1)[0]
        # Pass lastdb_bin here too so --path is honoured for the query
        # database, as it already is for the subject database above.
        run_lastdb(infile=query, outfile=querydb + ".prj",
                   lastdb_bin=lastdb_bin)

        cmd += " | {0} {1}.prj {2}.prj -".format(lastex_bin, subjectdb,
                                                 querydb)

    # Workers share one output handle, guarded by a lock.
    out_fh = must_open(opts.outfile, "w")
    lock = Lock()

    args = [(k + 1, cpus, out_fh, cmd, query, lock) \
            for k in xrange(cpus)]
    g = Jobs(target=last, args=args)
    g.run()
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so developer-facing notes are kept in comments rather than in it.
    #
    # Arguments are read from sys.argv. Two execution paths exist:
    #   * "BLASTN-" output: `lastz` is dispatched once per cpu (Jobs or Grid).
    #   * any other format: the database is converted with faToTwoBit and
    #     `lastz_2bit` is run once per database record, writing into "outdir".
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("-a", "-A", dest="cpus", default=1, type="int",
                 help="parallelize job to multiple cpus [default: %default]")
    p.add_option("--format", default="BLASTN-", choices=supported_formats,
                 help="output format, one of {0} [default: %default]".
                 format("|".join(supported_formats)))
    p.add_option("--path", dest="lastz_path", default=None,
                 help="specify LASTZ path")
    p.add_option(
        "--mask", dest="mask", default=False, action="store_true",
        help="treat lower-case letters as mask info [default: %default]")
    p.add_option(
        "--similar", default=False, action="store_true",
        help="Use options tuned for close comparison [default: %default]")
    set_params(p)
    set_outfile(p)
    set_grid(p)

    opts, args = p.parse_args()

    if len(args) != 2:
        # Non-zero exit status on a usage error, like the other entry points.
        sys.exit(not p.print_help())

    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    grid = opts.grid
    if grid:
        sys.stderr.write("Running jobs on JCVI grid\n")

    extra = opts.extra
    if opts.similar:
        # Tolerate a missing --params value instead of failing on None.
        extra = (extra or "") + similarOptions

    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith(
        "lastz"), "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    fmt = opts.format
    blastline = (fmt == "BLASTN-")

    # The axt, maf, etc. formats can only be run on a split database (i.e. one
    # FASTA record per file). The split files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        # Each task: (bfasta, afasta_fn, outfile, lastz_bin, extra, mask,
        #             format, grid)
        tasks = []
        for seqid in bids:
            # "<2bit file>/<record id>" addresses a single database record.
            bfasta = "/".join((bfasta_2bit, seqid))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, seqid, fmt))
            tasks.append((bfasta, afasta_fn, outfile,
                          lastz_bin, extra, mask, fmt, grid))

        if grid:
            cmds = [lastz_2bit(t) for t in tasks]
            g = Grid(cmds)
            g.run()
            g.writestatus()

        # NOTE(review): the pool also runs after a grid submission, exactly as
        # in the original; presumably lastz_2bit only builds the command when
        # the grid flag is set - confirm whether this should be an `else`.
        pool = Pool(cpus)
        pool.map(lastz_2bit, tasks)

        return

    lock = Lock()

    if grid:
        cmds = [lastz(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
                      lock, lastz_bin, extra, mask, grid)
                for k in range(cpus)]
        mkdir(outdir)
        outfiles = [op.join(outdir, "out.{0}.lastz").format(i)
                    for i, _ in enumerate(cmds)]
        g = Grid(cmds, outfiles=outfiles)
        g.run()
        g.writestatus()

    else:
        # One (1-based index, total, ...) tuple per worker; all workers share
        # the output handle and the lock.
        tasks = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
                  lock, lastz_bin, extra, mask) for k in range(cpus)]
        g = Jobs(target=lastz, args=tasks)
        g.run()