def omgprepare(args):
    """
    %prog omgprepare ploidy anchorsfile blastfile

    Prepare to run Sankoff's OMG algorithm to get orthologs.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    #
    # args: list of command-line tokens (options followed by three
    # positionals: ploidy file, anchors file, blast file).
    # Side effects: writes `<blast prefix>.cscore` (through cscore()) and
    # `<blast prefix>.weights` (tab-separated: query, subject, weight).
    from jcvi.formats.blast import cscore
    from jcvi.formats.base import DictFile

    p = OptionParser(omgprepare.__doc__)
    p.add_option("--norbh", action="store_true",
                 help="Disable RBH hits [default: %default]")
    p.add_option("--pctid", default=0, type="int",
                 help="Percent id cutoff for RBH hits [default: %default]")
    p.add_option("--cscore", default=90, type="int",
                 help="C-score cutoff for RBH hits [default: %default]")
    p.set_stripnames()
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ploidy, anchorfile, blastfile = args
    norbh = opts.norbh
    pctid = opts.pctid
    cs = opts.cscore
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    # Map the first column of every ploidy line to its 0-based line number
    with open(ploidy) as fp:
        genomeidx = dict((x.split()[0], i) for i, x in enumerate(fp))

    ploidy = DictFile(ploidy)

    geneinfo(qbed, qorder, genomeidx, ploidy)
    geneinfo(sbed, sorder, genomeidx, ploidy)

    pf = blastfile.rsplit(".", 1)[0]
    cscorefile = pf + ".cscore"
    # --cutoff=0 keeps every pair; --pct appends percent identity as column 4
    cscore([blastfile, "-o", cscorefile, "--cutoff=0", "--pct"])
    ac = AnchorFile(anchorfile)
    pairs = set((a, b) for a, b, i in ac.iter_pairs())
    logging.debug("Imported {0} pairs from `{1}`.".format(
        len(pairs), anchorfile))

    weightsfile = pf + ".weights"
    npairs = 0
    # Context managers make sure both handles are closed, even on error
    with open(cscorefile) as fp, open(weightsfile, "w") as fw:
        for row in fp:
            a, b, c, pct = row.split()
            c, pct = float(c), float(pct)
            c = int(c * 100)  # C-score rescaled to an integer percentage
            if (a, b) not in pairs:
                # Pair is not a syntenic anchor: only keep it when it passes
                # the RBH cutoffs, and then down-weight it.
                if norbh:
                    continue
                if c < cs:
                    continue
                if pct < pctid:
                    continue
                # Floor division keeps the weight an integer regardless of
                # whether true division is in effect.
                c //= 10  # This severely penalizes RBH against synteny

            print("\t".join((a, b, str(c))), file=fw)
            npairs += 1

    logging.debug("Write {0} pairs to `{1}`.".format(npairs, weightsfile))
def gatk(args):
    """
    %prog gatk bamfile reference.fasta

    Call SNPs based on GATK best practices.
    """
    # The docstring above is the usage text given to OptionParser.
    # Each step below registers (prerequisites, target, command) with a
    # MakeManager; the commands are only written out by mm.write() at the end.
    parser = OptionParser(gatk.__doc__)
    parser.add_option(
        "--indelrealign",
        default=False,
        action="store_true",
        help="Perform indel realignment",
    )
    parser.set_home("gatk")
    parser.set_home("picard")
    parser.set_phred()
    parser.set_cpus(cpus=24)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    bamfile, ref = args
    pf = bamfile.rsplit(".", 1)[0]
    mm = MakeManager()
    picard = "java -Xmx32g -jar {0}/picard.jar".format(opts.picard_home)
    tk = "java -Xmx32g -jar {0}/GenomeAnalysisTK.jar -R {1}".format(
        opts.gatk_home, ref
    )

    # Step 0 - build reference
    dictfile = ref.rsplit(".", 1)[0] + ".dict"
    build_dict = " ".join(
        [picard, "CreateSequenceDictionary", "R={0} O={1}".format(ref, dictfile)]
    )
    build_index = "samtools faidx {0}".format(ref)
    mm.add(ref, dictfile, (build_dict, build_index))

    # Step 1 - sort bam
    sortedbamfile = pf + ".sorted.bam"
    sort_cmd = " ".join(
        [
            picard,
            "SortSam",
            "INPUT={0} OUTPUT={1}".format(bamfile, sortedbamfile),
            "SORT_ORDER=coordinate CREATE_INDEX=true",
        ]
    )
    mm.add(bamfile, sortedbamfile, sort_cmd)

    # Step 2 - mark duplicates
    dedupbamfile = pf + ".dedup.bam"
    dedup_cmd = " ".join(
        [
            picard,
            "MarkDuplicates",
            "INPUT={0} OUTPUT={1}".format(sortedbamfile, dedupbamfile),
            "METRICS_FILE=dedup.log CREATE_INDEX=true",
        ]
    )
    mm.add(sortedbamfile, dedupbamfile, dedup_cmd)

    # Without --indelrealign the deduplicated BAM feeds the caller directly
    realignedbamfile = dedupbamfile
    if opts.indelrealign:
        # Step 3 - create indel realignment targets
        intervals = pf + ".intervals"
        targets_cmd = " ".join(
            [
                tk,
                "-T RealignerTargetCreator",
                "-I {0} -o {1}".format(dedupbamfile, intervals),
            ]
        )
        mm.add(dedupbamfile, intervals, targets_cmd)

        # Step 4 - indel realignment
        realignedbamfile = pf + ".realigned.bam"
        realign_cmd = " ".join(
            [
                tk,
                "-T IndelRealigner",
                "-targetIntervals {0}".format(intervals),
                "-I {0} -o {1}".format(dedupbamfile, realignedbamfile),
            ]
        )
        mm.add((dictfile, intervals), realignedbamfile, realign_cmd)

    # Step 5 - SNP calling
    vcf = pf + ".vcf"
    call_parts = [
        tk,
        "-T HaplotypeCaller",
        "-I {0}".format(realignedbamfile),
        "--genotyping_mode DISCOVERY",
        "-stand_emit_conf 10 -stand_call_conf 30",
        "-nct {0}".format(opts.cpus),
        "-o {0}".format(vcf),
    ]
    if opts.phred == "64":
        call_parts.append("--fix_misencoded_quality_scores")
    mm.add(realignedbamfile, vcf, " ".join(call_parts))

    # Step 6 - SNP filtering
    filtered_vcf = pf + ".filtered.vcf"
    filter_parts = [
        tk,
        "-T VariantFiltration",
        "-V {0}".format(vcf),
        '--filterExpression "DP < 10 || DP > 300 || QD < 2.0 || FS > 60.0 || MQ < 40.0"',
        '--filterName "LOWQUAL"',
        '--genotypeFilterExpression "isHomVar == 1"',
        '--genotypeFilterName "HOMOVAR"',
        '--genotypeFilterExpression "isHet == 1"',
        '--genotypeFilterName "HET"',
        "-o {0}".format(filtered_vcf),
    ]
    mm.add(vcf, filtered_vcf, " ".join(filter_parts))

    mm.write()
def embed(args):
    """
    %prog embed evidencefile scaffolds.fasta contigs.fasta

    Use SSPACE evidencefile to scaffold contigs into existing scaffold
    structure, as in `scaffolds.fasta`. Contigs.fasta were used by SSPACE
    directly to scaffold.

    Rules:
    1. Only update existing structure by embedding contigs small enough to fit.
    2. Promote singleton contigs only if they are big (>= min_length).
    """
    # The docstring above is the usage text given to OptionParser.
    # Side effects: writes `embedded.agp`, passes it to tidy(), and reports
    # every decision on stderr.
    p = OptionParser(embed.__doc__)
    p.set_mingap(default=10)
    p.add_option("--min_length", default=200, type="int",
                 help="Minimum length to consider [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    evidencefile, scaffolds, contigs = args
    min_length = opts.min_length
    splitfasta, oagp, cagp = gaps(
        [scaffolds, "--split", "--mingap={0}".format(opts.mingap)])

    agp = AGP(cagp)
    p = agp.graph  # reference graph (existing scaffold structure)

    ef = EvidenceFile(evidencefile, contigs)
    sizes = ef.sz
    q = ef.graph  # patch graph (SSPACE evidence)

    logging.debug("Reference graph: {0}".format(p))
    logging.debug("Patch graph: {0}".format(q))

    # Edits go to a copy so that iteration over `agp` is not disturbed
    newagp = deepcopy(agp)

    seen = set()
    deleted = set()
    for a in agp:
        if a.is_gap:
            continue

        name = a.component_id
        object_id = a.object  # renamed local: do not shadow builtin `object`
        if name in deleted:
            print("* Skip {0}, already embedded".format(name),
                  file=sys.stderr)
            continue

        seen.add(name)

        target_name, tag = get_target(p, name)
        path = q.get_path(name, target_name, tag=tag)
        path_size = sum([sizes[x.v] for x, t in path]) if path else None
        status = NO_UPDATE

        # Heuristic, the patch must not be too long
        if path and path_size > min_length and len(path) > 3:
            path = None

        if not path:
            print(name, target_name, path, path_size, status,
                  file=sys.stderr)
            continue

        backward = False
        for x, t in path:
            if x.v in seen:
                print("* Does not allow backward"
                      " patch on {0}".format(x.v), file=sys.stderr)
                backward = True
                break

        if backward:
            continue

        # Build the path plus the ends
        vv = q.get_node(name)
        path.appendleft((vv, tag))
        if tag == ">":
            path.reverse()
            status = INSERT_BEFORE
        elif target_name is None:
            status = INSERT_AFTER
        else:
            target = q.get_node(target_name)
            path.append((target, tag))
            status = INSERT_BETWEEN

        print(name, target_name, path, path_size, status, file=sys.stderr)

        # Trim the ends off from the constructed AGPLines
        lines = path_to_agp(q, path, object_id, sizes, status)
        if status == INSERT_BEFORE:
            lines = lines[:-1]
            td = newagp.insert_lines(name, lines,
                                     delete=True, verbose=True)
        elif status == INSERT_AFTER:
            lines = lines[1:]
            td = newagp.insert_lines(name, lines, after=True,
                                     delete=True, verbose=True)
        else:
            lines = lines[1:-1]
            td = newagp.update_between(name, target_name, lines,
                                       delete=True, verbose=True)
        deleted |= td
        seen |= td

    # Recruit big singleton contigs
    CUTOFF = opts.min_length
    for ctg, size in sizes.items():
        if ctg in seen:
            continue
        if size < CUTOFF:
            continue
        newagp.append(AGPLine.cline(ctg, ctg, sizes, "?"))

    # Write a new AGP file
    newagpfile = "embedded.agp"
    newagp.print_to_file(newagpfile, index=True)
    tidy([newagpfile, contigs])
def ace(args):
    """
    %prog ace bamfile fastafile

    convert bam format to ace format. This often allows the remapping to be
    assessed as a denovo assembly format. bam file needs to be indexed. also
    creates a .mates file to be used in amos/bambus, and .astat file to mark
    whether the contig is unique or repetitive based on A-statistics in Celera
    assembler.
    """
    # The docstring above is the usage text given to OptionParser.
    # Side effects: writes `<prefix>.ace`, plus `<prefix>.astat` when --astat
    # and `<prefix>.reads` when --readids are given.
    p = OptionParser(ace.__doc__)
    p.add_option("--splitdir", dest="splitdir", default="outRoot",
                 help="split the ace per contig to dir [default: %default]")
    p.add_option("--unpaired", dest="unpaired", default=False,
                 help="remove read pairs on the same contig [default: %default]")
    p.add_option("--minreadno", dest="minreadno", default=3, type="int",
                 help="minimum read numbers per contig [default: %default]")
    p.add_option("--minctgsize", dest="minctgsize", default=100, type="int",
                 help="minimum contig size per contig [default: %default]")
    p.add_option("--astat", default=False, action="store_true",
                 help="create .astat to list repetitiveness [default: %default]")
    p.add_option("--readids", default=False, action="store_true",
                 help="create file of mapped and unmapped ids [default: %default]")

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    astat = opts.astat
    readids = opts.readids

    f = Fasta(fastafile)
    prefix = bamfile.split(".")[0]
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))
    s = Samfile(bamfile, "rb")

    ncontigs = s.nreferences
    genomesize = sum(x for a, x in f.itersizes())
    logging.debug("Total {0} contigs with size {1} base".format(ncontigs,
                                                                 genomesize))
    qual = "20"  # default qual

    totalreads = sum(s.count(x) for x in s.references)
    logging.debug("Total {0} reads mapped".format(totalreads))

    fw = open(acefile, "w")
    astatfw = readsfw = None
    try:
        if astat:
            astatfw = open(astatfile, "w")
        if readids:
            readsfw = open(readsfile, "w")

        print("AS {0} {1}".format(ncontigs, totalreads), file=fw)
        print(file=fw)

        for contig in s.references:
            cseq = f[contig]
            nbases = len(cseq)

            mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
            nreads = len(mapped_reads)

            nsegments = 0
            print("CO {0} {1} {2} {3} U".format(contig, nbases, nreads,
                                                nsegments), file=fw)
            print(fill(str(cseq.seq)), file=fw)
            print(file=fw)

            if astat:
                # Keep the statistic in its own name: assigning it back to
                # `astat` would turn the option flag into a number, and a
                # value of 0.0 would disable output for all later contigs.
                astat_value = Astat(nbases, nreads, genomesize, totalreads)
                print("{0}\t{1:.1f}".format(contig, astat_value),
                      file=astatfw)

            text = fill([qual] * nbases, delimiter=" ", width=30)
            print("BQ\n{0}".format(text), file=fw)
            print(file=fw)

            rnames = []
            for a in mapped_reads:
                readname = a.qname
                rname = readname

                if readids:
                    print(readname, file=readsfw)
                rnames.append(rname)

                strand = "C" if a.is_reverse else "U"
                paddedstart = a.pos + 1  # 0-based to 1-based
                af = "AF {0} {1} {2}".format(rname, strand, paddedstart)
                print(af, file=fw)

            print(file=fw)

            for a, rname in zip(mapped_reads, rnames):
                aseq, npadded = cigar_to_seq(a)
                if aseq is None:
                    continue

                ninfos = 0
                ntags = 0
                alen = len(aseq)
                rd = "RD {0} {1} {2} {3}\n{4}".format(rname, alen, ninfos,
                                                      ntags, fill(aseq))
                qs = "QA 1 {0} 1 {0}".format(alen)

                print(rd, file=fw)
                print(file=fw)
                print(qs, file=fw)
                print(file=fw)
    finally:
        # Flush and release every output handle that was opened
        for handle in (fw, astatfw, readsfw):
            if handle is not None:
                handle.close()
def main():
    """
    %prog database.fa query.fa [options]

    Wrapper for NCBI BLAST+.
    """
    # The docstring above is the usage text given to OptionParser.
    # Reads sys.argv (parse_args() is called without arguments), makes sure
    # the database is formatted, then runs one `blastplus` job per query file.
    parser = OptionParser(main.__doc__)

    parser.add_option(
        "--format",
        default=" '6 qseqid sseqid pident length "
                "mismatch gapopen qstart qend sstart send evalue bitscore' ",
        help='0-11, learn more with "blastp -help". [default: %default]')
    parser.add_option(
        "--path", dest="blast_path", default=None,
        help="specify BLAST+ path including the program name")
    parser.add_option(
        "--prog", dest="blast_program", default="blastp",
        help="specify BLAST+ program to use. See complete list here: "
             "http://www.ncbi.nlm.nih.gov/books/NBK52640/#chapter1.Installation"
             " [default: %default]")
    parser.set_align(evalue=.01)
    parser.add_option(
        "--best", default=1, type="int",
        help="Only look for best N hits [default: %default]")
    parser.set_cpus()
    parser.add_option(
        "--nprocs", default=1, type="int",
        help="number of BLAST processes to run in parallel. "
             "split query.fa into `nprocs` chunks, "
             "each chunk uses -num_threads=`cpus`")
    parser.set_params()
    parser.set_outfile()

    opts, args = parser.parse_args()

    if len(args) != 2 or opts.blast_program is None:
        sys.exit(not parser.print_help())

    # Positional order is database first, query second
    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    extra = opts.extra
    blast_program = opts.blast_program

    # --path may name either the binary itself or the directory holding it
    blast_bin = opts.blast_path or blast_program
    if op.basename(blast_bin) != blast_program:
        blast_bin = op.join(blast_bin, blast_program)

    nprocs = opts.nprocs
    cpus = opts.cpus
    if nprocs > 1:
        logging.debug("Dispatch job to %d processes" % nprocs)
        outdir = "outdir"
        queries = split([afasta_fn, outdir, str(nprocs)]).names
    else:
        queries = [afasta_fn]

    if op.basename(blast_bin) in ("blastp", "blastx"):
        dbtype = "prot"
    else:
        dbtype = "nucl"

    # Pick the index file whose presence tells run_formatdb the db is ready
    db = bfasta_fn
    if dbtype == "prot":
        nin = db + ".pin"
    else:
        nin00 = db + ".00.nin"
        nin = nin00 if op.exists(nin00) else (db + ".nin")

    run_formatdb(infile=db, outfile=nin, dbtype=dbtype)

    lock = Lock()

    blast_cmd = (
        "{0} -db {1} -outfmt {2}"
        " -evalue {3} -max_target_seqs {4}"
        " -num_threads {5}"
    ).format(blast_bin, bfasta_fn, opts.format, opts.evalue, opts.best, cpus)
    if extra:
        blast_cmd += " " + extra.strip()

    args = [(out_fh, blast_cmd, query, lock) for query in queries]
    jobs = Jobs(target=blastplus, args=args)
    jobs.run()
def filter(args):
    """
    %prog filter test.blast

    Produce a new blast file and filter based on:
    - score: >= cutoff
    - pctid: >= cutoff
    - hitlen: >= cutoff
    - evalue: <= cutoff
    - ids: valid ids

    Use --inverse to obtain the complementary records for the criteria above.

    - noself: remove self-self hits
    """
    # The docstring above is the usage text given to OptionParser.
    # Returns the name of the filtered blast file that was written.
    p = OptionParser(filter.__doc__)
    p.add_option("--score", dest="score", default=0, type="int",
                 help="Score cutoff")
    p.set_align(pctid=95, hitlen=100, evalue=.01)
    p.add_option("--noself", default=False, action="store_true",
                 help="Remove self-self hits")
    p.add_option("--ids", help="Path to file with ids to retain")
    p.add_option("--inverse", default=False, action="store_true",
                 help="Similar to grep -v, inverse")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    if opts.ids:
        # Ids may be separated by whitespace or commas; `#` lines are comments
        ids = set()
        for row in must_open(opts.ids):
            if row[0] == "#":
                continue
            row = row.replace(",", "\t")
            ids.update(row.split())
    else:
        ids = None

    blastfile, = args
    inverse = opts.inverse
    outfile = opts.outfile
    fp = must_open(blastfile)

    score, pctid, hitlen, evalue, noself = \
        opts.score, opts.pctid, opts.hitlen, opts.evalue, opts.noself
    # Default output name encodes the cutoffs, e.g. `x.blast.P95L100`
    newblastfile = blastfile + ".P{0}L{1}".format(int(pctid), hitlen) if \
        outfile is None else outfile
    if inverse:
        newblastfile += ".inverse"
    fw = must_open(newblastfile, "w")
    for row in fp:
        if row[0] == '#':
            continue
        c = BlastLine(row)

        if ids:
            # Both ends of the hit must be in the id list to be retained
            noids = not (c.query in ids and c.subject in ids)
        else:
            noids = None

        remove = c.score < score or \
            c.pctid < pctid or \
            c.hitlen < hitlen or \
            c.evalue > evalue or \
            noids

        if inverse:
            remove = not remove

        # Self-hit removal is applied after inversion, so --inverse never
        # brings self-self hits back.
        remove = remove or (noself and c.query == c.subject)

        if not remove:
            print(row.rstrip(), file=fw)

    fw.close()

    return newblastfile
def cscore(args):
    """
    %prog cscore blastfile > cscoreOut

    See supplementary info for sea anemone genome paper, C-score formula:

        cscore(A,B) = score(A,B) / max(best score for A, best score for B)

    A C-score of one is the same as reciprocal best hit (RBH).

    Output file will be 3-column (query, subject, cscore). Use --cutoff to
    select a different cutoff.
    """
    # The docstring above is the usage text given to OptionParser.
    # The blast file is read twice: once to record the best score of every
    # sequence, once to compute the C-score of every hit.
    from jcvi.utils.cbook import gene_name

    p = OptionParser(cscore.__doc__)
    p.add_option("--cutoff", default=.9999, type="float",
                 help="Minimum C-score to report [default: %default]")
    p.add_option("--pct", default=False, action="store_true",
                 help="Also include pct as last column [default: %default]")
    p.add_option("--writeblast", default=False, action="store_true",
                 help="Also write filtered blast file [default: %default]")
    p.set_stripnames()
    p.set_outfile()

    opts, args = p.parse_args(args)
    ostrip = opts.strip_names
    writeblast = opts.writeblast
    outfile = opts.outfile

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args

    blast = Blast(blastfile)
    logging.debug("Register best scores ..")
    best_score = defaultdict(float)
    for b in blast:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        score = b.score
        if score > best_score[query]:
            best_score[query] = score
        if score > best_score[subject]:
            best_score[subject] = score

    blast = Blast(blastfile)
    pairs = {}
    cutoff = opts.cutoff
    for b in blast:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        score = b.score
        pctid = b.pctid
        s = score / max(best_score[query], best_score[subject])
        if s > cutoff:
            pair = (query, subject)
            # Keep only the highest-scoring hit of each (query, subject) pair
            if pair not in pairs or s > pairs[pair][0]:
                pairs[pair] = (s, pctid, b)

    fw = must_open(outfile, "w")
    if writeblast:
        fwb = must_open(outfile + ".filtered.blast", "w")
    pct = opts.pct
    for (query, subject), (s, pctid, b) in sorted(pairs.items()):
        # Named `fields` so the parsed argument list is not clobbered
        fields = [query, subject, "{0:.2f}".format(s)]
        if pct:
            fields.append("{0:.1f}".format(pctid))
        print("\t".join(fields), file=fw)
        if writeblast:
            print(b, file=fwb)
    fw.close()
    if writeblast:
        fwb.close()
def genestats(args):
    """
    %prog genestats gffile

    Print summary stats, including:
    - Number of genes
    - Number of single-exon genes
    - Number of multi-exon genes
    - Number of distinct exons
    - Number of genes with alternative transcript variants
    - Number of predicted transcripts
    - Mean number of distinct exons per gene
    - Mean number of transcripts per gene
    - Mean gene locus size (first to last exon)
    - Mean transcript size (UTR, CDS)
    - Mean exon size

    Stats modeled after barley genome paper Table 1.
    A physical, genetic and functional sequence assembly of the barley genome
    """
    # The docstring above is the usage text given to OptionParser.
    # Side effects: (re)writes the cache file `transcript.sizes` when it is
    # out of date, and writes the tabulated report to --outfile.
    p = OptionParser(genestats.__doc__)
    p.add_option("--groupby", default="conf_class",
                 help="Print separate stats groupby")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gff_file, = args
    gb = opts.groupby
    g = make_index(gff_file)

    tf = "transcript.sizes"
    if need_update(gff_file, tf):
        with open(tf, "w") as fw:
            for feat in g.features_of_type("mRNA"):
                fid = feat.id
                conf_class = feat.attributes.get(gb, "all")
                tsize = sum((c.stop - c.start + 1)
                            for c in g.children(fid, 1)
                            if c.featuretype == "exon")
                print("\t".join((fid, str(tsize), conf_class)), file=fw)

    tsizes = DictFile(tf, cast=int)
    conf_classes = DictFile(tf, valuepos=2)
    logging.debug("A total of {0} transcripts populated.".format(len(tsizes)))

    genes = []
    nskipped = 0
    for feat in g.features_of_type("gene"):
        fid = feat.id
        transcripts = [c.id for c in g.children(fid, 1)
                       if c.featuretype == "mRNA"]
        if not transcripts:
            # A gene without mRNA children has no transcript to take the
            # group from; skip it rather than fail on transcripts[0].
            nskipped += 1
            continue
        transcript_sizes = [tsizes[x] for x in transcripts]
        exons = set((c.chrom, c.start, c.stop) for c in g.children(fid, 2)
                    if c.featuretype == "exon")
        # The gene is assigned to the group of its first transcript
        conf_class = conf_classes[transcripts[0]]
        gs = GeneStats(feat, conf_class, transcript_sizes, exons)
        genes.append(gs)

    if nskipped:
        logging.debug("Skipped {0} genes without mRNA features."
                      .format(nskipped))

    r = {}  # Report
    distinct_groups = set(conf_classes.values())
    # Loop variable is `group`, not `g`, so the GFF index is not shadowed
    for group in distinct_groups:
        num_genes = num_single_exon_genes = num_multi_exon_genes = 0
        num_genes_with_alts = num_transcripts = num_exons = max_transcripts = 0
        cum_locus_size = cum_transcript_size = cum_exon_size = 0
        for gs in genes:
            if gs.conf_class != group:
                continue
            num_genes += 1
            if gs.num_exons == 1:
                num_single_exon_genes += 1
            else:
                num_multi_exon_genes += 1
            num_exons += gs.num_exons
            if gs.num_transcripts > 1:
                num_genes_with_alts += 1
            if gs.num_transcripts > max_transcripts:
                max_transcripts = gs.num_transcripts
            num_transcripts += gs.num_transcripts
            cum_locus_size += gs.locus_size
            cum_transcript_size += gs.cum_transcript_size
            cum_exon_size += gs.cum_exon_size

        if num_genes == 0:
            # A group that only occurs on non-first transcripts has no gene
            # of its own; there is nothing to average.
            continue

        mean_num_exons = num_exons * 1. / num_genes
        mean_num_transcripts = num_transcripts * 1. / num_genes
        mean_locus_size = cum_locus_size * 1. / num_genes
        mean_transcript_size = cum_transcript_size * 1. / num_transcripts
        # Genes may carry no exon features at all; avoid dividing by zero
        mean_exon_size = cum_exon_size * 1. / num_exons if num_exons else 0.

        r[("Number of genes", group)] = num_genes
        r[("Number of single-exon genes", group)] = \
            percentage(num_single_exon_genes, num_genes, mode=1)
        r[("Number of multi-exon genes", group)] = \
            percentage(num_multi_exon_genes, num_genes, mode=1)
        r[("Number of distinct exons", group)] = num_exons
        r[("Number of genes with alternative transcript variants", group)] = \
            percentage(num_genes_with_alts, num_genes, mode=1)
        r[("Number of predicted transcripts", group)] = num_transcripts
        r[("Mean number of distinct exons per gene", group)] = mean_num_exons
        r[("Mean number of transcripts per gene", group)] = \
            mean_num_transcripts
        r[("Max number of transcripts per gene", group)] = max_transcripts
        r[("Mean gene locus size (first to last exon)", group)] = \
            mean_locus_size
        r[("Mean transcript size (UTR, CDS)", group)] = mean_transcript_size
        r[("Mean exon size", group)] = mean_exon_size

    fw = must_open(opts.outfile, "w")
    print(tabulate(r), file=fw)
    fw.close()
def report(args):
    '''
    %prog report ksfile

    generate a report given a Ks result file (as produced by synonymous_calc.py).
    describe the median Ks, Ka values, as well as the distribution in stem-leaf plot
    '''
    # The docstring above is the usage text given to OptionParser.
    from jcvi.utils.cbook import SummaryStats
    from jcvi.graphics.histogram import stem_leaf_plot

    parser = OptionParser(report.__doc__)
    parser.add_option(
        "--pdf", default=False, action="store_true",
        help="Generate graphic output for the histogram [default: %default]")
    parser.add_option(
        "--components", default=1, type="int",
        help="Number of components to decompose peaks [default: %default]")
    add_plot_options(parser)
    opts, args, iopts = parser.set_image_options(args, figsize="5x5")

    if len(args) != 1:
        sys.exit(not parser.print_help())

    ks_file, = args
    data = KsFile(ks_file)
    ks_min, ks_max = opts.vmin, opts.vmax
    bins = opts.bins

    # Text report: one stem-and-leaf plot per Ks column (first field skipped)
    for field in fields.split(",")[1:]:
        values = [getattr(rec, field) for rec in data]
        if "ks" not in field:
            continue

        values = [v for v in values if ks_min <= v <= ks_max]

        st = SummaryStats(values)
        title = "".join([
            "{0} ({1}): ".format(descriptions[field], ks_file),
            "Median:{0:.3f} (1Q:{1:.3f}|3Q:{2:.3f}||".format(
                st.median, st.firstq, st.thirdq),
            "Mean:{0:.3f}|Std:{1:.3f}||N:{2})".format(
                st.mean, st.sd, st.size),
        ])

        # Only Ks columns reach this point, so the Ks binning always applies
        tbins = (0, ks_max, bins)
        if (ks_max * 1. / bins) < .1:
            digit = 2
        else:
            digit = 1
        stem_leaf_plot(values, *tbins, digit=digit, title=title)

    if not opts.pdf:
        return

    # Graphic report: histogram of the ng_ks column within [vmin, vmax]
    components = opts.components
    ng_values = [rec.ng_ks for rec in data]
    ng_values = [v for v in ng_values if ks_min <= v <= ks_max]

    fig = plt.figure(1, (iopts.w, iopts.h))
    ax = fig.add_axes([.12, .1, .8, .8])
    kp = KsPlot(ax, ks_max, opts.bins, legendp=opts.legendp)
    kp.add_data(ng_values, components, fill=opts.fill, fitted=opts.fit)
    kp.draw(title=opts.title)
def dotplot(args):
    """
    %prog dotplot map.csv ref.fasta

    Make dotplot between chromosomes and linkage maps.
    The input map is csv formatted, for example:

    ScaffoldID,ScaffoldPosition,LinkageGroup,GeneticPosition
    scaffold_2707,11508,1,0
    scaffold_2707,11525,1,1.2
    """
    # The docstring above is the usage text given to OptionParser.
    from jcvi.assembly.allmaps import CSVMapLine
    from jcvi.formats.sizes import Sizes
    from jcvi.utils.natsort import natsorted
    from jcvi.graphics.base import shorten
    from jcvi.graphics.dotplot import plt, savefig, markup, normalize_axes, \
        downsample, plot_breaks_and_labels, thousands

    parser = OptionParser(dotplot.__doc__)
    parser.set_outfile(outfile=None)
    opts, args, iopts = parser.set_image_options(
        args, figsize="8x8", style="dark", dpi=90, cmap="copper")

    if len(args) != 2:
        sys.exit(not parser.print_help())

    csvfile, fastafile = args
    sizes = natsorted(Sizes(fastafile).mapping.items())

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # the whole canvas
    ax = fig.add_axes([.1, .1, .8, .8])  # the dot plot

    markers = [CSVMapLine(row) for row in must_open(csvfile)]
    seen = set(m.seqid for m in markers)

    # X-axis is the genome assembly
    ctgs, ctg_sizes = zip(*sizes)
    xsize = sum(ctg_sizes)
    qb = list(np.cumsum(ctg_sizes))
    q_offsets = [0] + qb
    qbreaks = list(zip(ctgs, q_offsets, qb))
    qstarts = dict(zip(ctgs, q_offsets))

    # Y-axis is the map
    def by_lg(marker):
        return marker.lg

    # groupby() needs its input sorted by the same key
    markers.sort(key=by_lg)
    lg_max_cm = dict((lg, max([m.cm for m in members]))
                     for lg, members in groupby(markers, key=by_lg))
    lgs, lg_sizes = zip(*natsorted(lg_max_cm.items()))
    ysize = sum(lg_sizes)
    sb = list(np.cumsum(lg_sizes))
    s_offsets = [0] + sb
    sbreaks = list(zip([("LG" + lg) for lg in lgs], s_offsets, sb))
    sstarts = dict(zip(lgs, s_offsets))

    # Re-code all the scatter dots
    data = []
    for m in markers:
        if m.seqid not in qstarts:
            continue
        data.append((qstarts[m.seqid] + m.pos, sstarts[m.lg] + m.cm, 'g'))
    npairs = downsample(data)

    xs, ys, colors = zip(*data)
    ax.scatter(xs, ys, c=colors, edgecolors="none", s=2, lw=0)

    # Flip X-Y label
    gy, gx = op.basename(csvfile).split(".")[:2]
    gx = shorten(gx, maxchar=30)
    gy = shorten(gy, maxchar=30)
    xlim, ylim = plot_breaks_and_labels(fig, root, ax, gx, gy,
                                        xsize, ysize, qbreaks, sbreaks)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    title = "Alignment: {} vs {}".format(gx, gy) + \
        " ({} markers)".format(thousands(npairs))
    root.set_title(markup(title), x=.5, y=.96, color="k")
    logging.debug(title)
    normalize_axes(root)

    image_name = opts.outfile
    if not image_name:
        image_name = csvfile.rsplit(".", 1)[0] + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    fig.clear()
def ld(args):
    """
    %prog ld map

    Calculate pairwise linkage disequilibrium given MSTmap.
    """
    # The docstring above is the usage text given to OptionParser.
    # Side effects: writes (or reuses) `<map>.subsample.bed` and
    # `<map>.subsample.matrix`, then saves a heatmap image.
    import numpy as np
    from random import sample

    from jcvi.algorithms.matrix import symmetrize

    p = OptionParser(ld.__doc__)
    p.add_option("--subsample", default=1000, type="int",
                 help="Subsample markers to speed up [default: %default]")
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 1:
        sys.exit(not p.print_help())

    mstmap, = args
    subsample = opts.subsample
    data = MSTMap(mstmap)

    markerbedfile = mstmap + ".subsample.bed"
    ldmatrix = mstmap + ".subsample.matrix"
    # Take random subsample while keeping marker order
    if subsample < data.nmarkers:
        # range() works on both Python 2 and 3 and is a valid population
        # for random.sample(); sorting the indices preserves marker order.
        data = [data[x] for x in
                sorted(sample(range(len(data)), subsample))]
    else:
        logging.debug("Use all markers, --subsample ignored")

    nmarkers = len(data)
    if need_update(mstmap, (ldmatrix, markerbedfile)):
        with open(markerbedfile, "w") as fw:
            print("\n".join(x.bedline for x in data), file=fw)
        logging.debug("Write marker set of size {0} to file `{1}`."
                      .format(nmarkers, markerbedfile))

        # Fill one triangle pair by pair, then mirror it with symmetrize()
        M = np.zeros((nmarkers, nmarkers), dtype=float)
        for i, j in combinations(range(nmarkers), 2):
            a = data[i]
            b = data[j]
            M[i, j] = calc_ldscore(a.genotype, b.genotype)

        M = symmetrize(M)

        logging.debug("Write LD matrix to file `{0}`.".format(ldmatrix))
        M.tofile(ldmatrix)
    else:
        # Reuse the cached matrix; its dimension comes from the cached bed
        nmarkers = len(Bed(markerbedfile))
        M = np.fromfile(ldmatrix, dtype="float").reshape(nmarkers, nmarkers)
        logging.debug("LD matrix `{0}` exists ({1}x{1})."
                      .format(ldmatrix, nmarkers))

    from jcvi.graphics.base import plt, savefig, Rectangle, draw_cmap

    plt.rcParams["axes.linewidth"] = 0

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    ax = fig.add_axes([.1, .1, .8, .8])  # the heatmap

    ax.matshow(M, cmap=iopts.cmap)

    # Plot chromosomes breaks
    bed = Bed(markerbedfile)
    xsize = len(bed)
    extent = (0, nmarkers)
    chr_labels = []
    ignore_size = 20

    for (seqid, beg, end) in bed.get_breaks():
        # Chromosomes spanning fewer than `ignore_size` markers get neither
        # a separator line nor a label
        ignore = abs(end - beg) < ignore_size
        pos = (beg + end) / 2
        chr_labels.append((seqid, pos, ignore))
        if ignore:
            continue
        ax.plot((end, end), extent, "w-", lw=1)
        ax.plot(extent, (end, end), "w-", lw=1)

    # Plot chromosome labels
    for label, pos, ignore in chr_labels:
        # Convert a marker index into canvas coordinates of the heatmap axes
        pos = .1 + pos * .8 / xsize
        if not ignore:
            root.text(pos, .91, label,
                      ha="center", va="bottom", rotation=45, color="grey")
            root.text(.09, pos, label,
                      ha="right", va="center", color="grey")

    ax.set_xlim(extent)
    ax.set_ylim(extent)
    ax.set_axis_off()

    draw_cmap(root, "Pairwise LD (r2)", 0, 1, cmap=iopts.cmap)

    root.add_patch(Rectangle((.1, .1), .8, .8, fill=False, ec="k", lw=2))
    m = mstmap.split(".")[0]
    root.text(.5, .06, "Linkage Disequilibrium between {0} markers".format(m),
              ha="center")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = m + ".subsample" + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def prepare(args):
    """
    %prog prepare *.fastq

    Scan input fastq files (see below) and write SOAP config files based
    on inputfiles. Use "--scaffold contigs.fasta" to perform scaffolding.
    """
    # The docstring above (plus FastqNamings) is the usage text given to
    # OptionParser.
    # Side effects: writes `soap.config`, `soap.gc.config` and `run.sh`, and
    # echoes every config block to stderr.
    from jcvi.formats.base import write_file

    parser = OptionParser(prepare.__doc__ + FastqNamings)
    parser.add_option("-K", default=45, type="int", help="K-mer size")
    parser.add_option(
        "--assemble_1st_rank_only",
        default=False,
        action="store_true",
        help="Assemble the first rank only, other libs asm_flags=2",
    )
    parser.add_option("--scaffold", help="Only perform scaffolding")
    parser.add_option("--gapclose", help="Only perform gap closure")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) < 1:
        sys.exit(not parser.print_help())

    fnames = args
    K = opts.K
    for fname in fnames:
        assert op.exists(fname), "File `{0}` not found.".format(fname)

    a1st = opts.assemble_1st_rank_only

    cfgfile = "soap.config"
    gc_cfgfile = "soap.gc.config"
    fw = open(cfgfile, "w")
    fw_gc = open(gc_cfgfile, "w")

    libs = get_libs(fnames)
    rank = 0
    max_rd_len = max(readlen([f]) for f in fnames)

    block = "max_rd_len={0}\n".format(max_rd_len)
    for stream in (sys.stderr, fw, fw_gc):
        print(block, file=stream)

    # Collect singletons first
    singletons = []
    for lib, fs in libs:
        if lib.size == 0:
            singletons.extend(fs)

    for lib, fs in libs:
        size = lib.size
        if size == 0:
            continue

        rank += 1
        # With --assemble_1st_rank_only every library after the first is
        # forced to asm_flags=2
        if rank > 1 and a1st:
            asm_flags = 2
        else:
            asm_flags = lib.asm_flags

        entries = [
            "[LIB]\n",
            "avg_ins={0}\n".format(size),
            "reverse_seq={0}\n".format(lib.reverse_seq),
            "asm_flags={0}\n".format(asm_flags),
            "rank={0}\n".format(rank),
        ]
        if lib.reverse_seq:
            pair_num_cutoff = 3
            entries.append("pair_num_cutoff={0}\n".format(pair_num_cutoff))
        entries.append("map_len=35\n")

        for f in fs:
            # The read number is taken from the file name (`.1.` or `.2.`)
            if ".1." in f:
                tag = "q1"
            elif ".2." in f:
                tag = "q2"
            entries.append("{0}={1}\n".format(tag, f))

        if rank == 1:
            # Singleton files are attached to the first ranked library
            for s in singletons:
                tag = "q" if is_fastq(s) else "f"
                entries.append(tag + "={0}\n".format(s))

        block = "".join(entries)
        print(block, file=sys.stderr)
        print(block, file=fw)

        if asm_flags > 2:
            print(block, file=fw_gc)

    runfile = "run.sh"
    scaffold = opts.scaffold
    bb = 63 if K <= 63 else 127
    binary = "SOAPdenovo-{0}mer".format(bb)
    header = SOAPHEADER.format(opts.cpus, K, binary)
    if opts.gapclose:
        gapclose = opts.gapclose
        outfile = gapclose.rsplit(".", 1)[0] + ".closed.fasta"
        template = header + GCRUNG.format(gapclose, outfile)
    elif scaffold:
        template = header + SCFRUN % scaffold
    else:
        template = header + SOAPRUN

    write_file(runfile, template)
    fw.close()
    fw_gc.close()
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    # The docstring above is the usage text given to OptionParser.
    # Reads sys.argv (parse_args() is called without arguments).
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("--format", default="BLASTN-", choices=supported_formats,
                 help="Output format [default: %default]")
    p.add_option("--path", dest="lastz_path", default=None,
                 help="specify LASTZ path")
    p.add_option("--mask", dest="mask", default=False, action="store_true",
                 help="treat lower-case letters as mask info [default: %default]")
    p.add_option("--similar", default=False, action="store_true",
                 help="Use options tuned for close comparison [default: %default]")
    p.set_cpus(cpus=32)
    p.set_params()
    p.set_outfile()
    opts, args = p.parse_args()

    if len(args) != 2:
        # `not p.print_help()` gives a non-zero exit status on a usage
        # error, as every other command in this file does.
        sys.exit(not p.print_help())

    # Positional order is database first, query second
    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    extra = opts.extra
    if opts.similar:
        extra += similarOptions

    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith("lastz"), "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    fmt = opts.format  # local name avoids shadowing builtin `format`
    blastline = (fmt == "BLASTN-")

    # The axt, maf, etc. format can only be run on splitted database (i.e. one
    # FASTA record per file). The splitted files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        args = []
        # bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format
        for seqid in bids:
            bfasta = "/".join((bfasta_2bit, seqid))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, seqid, fmt))
            args.append((bfasta, afasta_fn, outfile,
                         lastz_bin, extra, mask, fmt))

        pool = Pool(cpus)
        pool.map(lastz_2bit, args)

        return

    lock = Lock()

    # One job per cpu; each job receives its 1-based index and the total
    args = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
             lock, lastz_bin, extra, mask) for k in range(cpus)]
    g = Jobs(target=lastz, args=args)
    g.run()
def ortholog(args):
    """
    %prog ortholog species_a species_b

    Run a sensitive pipeline to find orthologs between two species a and b.
    The pipeline runs LAST and generate .lifted.anchors.

    `--full` mode would assume 1-to-1 quota synteny blocks as the backbone of
    such predictions. Extra orthologs will be recruited from reciprocal best
    match (RBH).
    """
    from jcvi.apps.align import last as last_main
    from jcvi.compara.blastfilter import main as blastfilter_main
    from jcvi.compara.quota import main as quota_main
    from jcvi.compara.synteny import scan, mcscan, liftover
    from jcvi.formats.blast import cscore, filter

    # NOTE: the docstring is the usage text; pipeline notes live in comments.
    # Every stage is skipped when need_update() reports its output as current.
    parser = OptionParser(ortholog.__doc__)
    parser.add_option("--dbtype", default="nucl",
                      choices=("nucl", "prot"),
                      help="Molecule type of subject database")
    parser.add_option("--full", default=False, action="store_true",
                      help="Run in full mode, including blocks and RBH")
    parser.add_option("--cscore", default=0.7, type="float",
                      help="C-score cutoff [default: %default]")
    parser.add_option("--dist", default=20, type="int",
                      help="Extent of flanking regions to search")
    parser.add_option("--quota", help="Quota align parameter")
    parser.add_option("--nostdpf", default=False, action="store_true",
                      help="Do not standardize contig names")
    parser.add_option("--no_strip_names", default=False, action="store_true",
                      help="Do not strip alternative splicing "
                           "(e.g. At5g06540.1 -> At5g06540)")
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    a, b = args
    dbtype = opts.dbtype
    suffix = ".cds" if dbtype == "nucl" else ".pep"
    abed, afasta = a + ".bed", a + suffix
    bbed, bfasta = b + ".bed", b + suffix
    ccscore = opts.cscore
    quota = opts.quota
    dist = "--dist={0}".format(opts.dist)
    # Appended as the last argument of each tool that accepts the flag
    keep_names = ["--no_strip_names"] if opts.no_strip_names else []

    aprefix = afasta.split(".")[0]
    bprefix = bfasta.split(".")[0]
    pprefix = ".".join((aprefix, bprefix))
    qprefix = ".".join((bprefix, aprefix))

    # Stage 1: LAST alignment
    last = pprefix + ".last"
    if need_update((afasta, bfasta), last):
        last_main([bfasta, afasta], dbtype)

    # Self comparison: continue from the filtered `.P98L0.inverse` file
    if a == b:
        lastself = last + ".P98L0.inverse"
        if need_update(last, lastself):
            filter([last, "--hitlen=0", "--pctid=98", "--inverse", "--noself"])
        last = lastself

    # Stage 2: C-score filtering
    filtered_last = last + ".filtered"
    if need_update(last, filtered_last):
        blastfilter_main(
            [last, "--cscore={0}".format(ccscore)] + keep_names)

    anchors = pprefix + ".anchors"
    lifted_anchors = pprefix + ".lifted.anchors"
    pdf = pprefix + ".pdf"

    if not opts.full:
        # Quick mode: scan with liftover, optional quota screen, dotplot
        if need_update(filtered_last, lifted_anchors):
            scan_args = [filtered_last, anchors, dist,
                         "--liftover={0}".format(last)]
            scan(scan_args + keep_names)
        if quota:
            quota_main(
                [lifted_anchors, "--quota={0}".format(quota), "--screen"])
        if need_update(anchors, pdf):
            from jcvi.graphics.dotplot import dotplot_main
            dargs = [anchors]
            if opts.nostdpf:
                dargs += ["--nostdpf", "--skipempty"]
            dotplot_main(dargs)
        return

    # Full mode: 1x1 quota blocks as backbone, then RBH-based orthologs
    if need_update(filtered_last, anchors):
        scan([filtered_last, anchors, dist] + keep_names)

    ooanchors = pprefix + ".1x1.anchors"
    if need_update(anchors, ooanchors):
        quota_main([anchors, "--quota=1:1", "--screen"])

    lifted_anchors = pprefix + ".1x1.lifted.anchors"
    if need_update((last, ooanchors), lifted_anchors):
        liftover([last, ooanchors, dist] + keep_names)

    pblocks = pprefix + ".1x1.blocks"
    qblocks = qprefix + ".1x1.blocks"
    if need_update(lifted_anchors, [pblocks, qblocks]):
        for bed, blocks in ((abed, pblocks), (bbed, qblocks)):
            mcscan([bed, lifted_anchors, "--iter=1", "-o", blocks])

    rbh = pprefix + ".rbh"
    if need_update(last, rbh):
        cscore([last, "-o", rbh])

    portho = pprefix + ".ortholog"
    qortho = qprefix + ".ortholog"
    if need_update([pblocks, qblocks, rbh], [portho, qortho]):
        for blocks, ortho in ((pblocks, portho), (qblocks, qortho)):
            make_ortholog(blocks, rbh, ortho)
def pad(args):
    """
    %prog pad blastfile cdtfile --qbed q.pad.bed --sbed s.pad.bed

    Test and reconstruct candidate PADs.
    """
    from jcvi.formats.cdt import CDT

    # NOTE: the docstring is the usage text; notes for maintainers are comments.
    # Returns the list of (query partition, subject partition, score) tuples
    # whose score in the make_arrays() matrix reaches -log(1e-30).
    parser = OptionParser(pad.__doc__)
    parser.set_beds()
    parser.add_option(
        "--cutoff",
        default=0.3,
        type="float",
        help="The clustering cutoff to call similar",
    )
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    blastfile, cdtfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, parser, opts)

    cdt = CDT(cdtfile)
    qparts = list(cdt.iter_partitions(cutoff=opts.cutoff))
    sparts = list(cdt.iter_partitions(cutoff=opts.cutoff, gtr=False))

    # Map every member of a partition to the index of that partition
    qid = {}
    for index, part in enumerate(qparts):
        for member in part:
            qid[member] = index
    sid = {}
    for index, part in enumerate(sparts):
        for member in part:
            sid[member] = index

    # Without writing files, conversion from PAD to merged PAD is done in memory
    for feature in qbed:
        feature.seqid = qid[feature.seqid]
    for feature in sbed:
        feature.seqid = sid[feature.seqid]

    qnames = range(len(qparts))
    snames = range(len(sparts))

    logmp = make_arrays(blastfile, qbed, sbed, qnames, snames)
    nrows, ncols = logmp.shape

    pvalue_cutoff = 1e-30
    threshold = -log(pvalue_cutoff)

    # Keep every cell that is not below the threshold
    significant = [
        (qparts[i], sparts[j], logmp[i, j])
        for i in range(nrows)
        for j in range(ncols)
        if not logmp[i, j] < threshold
    ]

    for qpart, spart, score in significant:
        print("|".join(qpart), "|".join(spart), score)

    logging.debug(
        "Collected {0} PAR comparisons significant at (P < {1}).".format(
            len(significant), pvalue_cutoff))

    return significant
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to
    be CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta format,
    with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman, Jingping Li
    Calculate synonymous mutation rates for gene pairs

    This does the following:
        1. Fetches a protein pair.
        2. Aligns the protein pair with clustalw (default) or muscle.
        3. Convert the output to Fasta format.
        4. Use this alignment info to align gene sequences using PAL2NAL
        5. Run PAML yn00 to calculate synonymous mutation rates.
    """
    from jcvi.formats.fasta import translate

    # NOTE: the docstring is the usage text handed to OptionParser.
    p = OptionParser(calc.__doc__)
    p.add_option("--longest", action="store_true",
                 help="Get longest ORF, only works if no pep file, "
                      "e.g. ESTs [default: %default]")
    p.add_option(
        "--msa",
        default="clustalw",
        choices=("clustalw", "muscle"),
        help="software used to align the proteins [default: %default]")
    p.add_option("--workdir", default=os.getcwd(), help="Work directory")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print("Incorrect arguments", file=sys.stderr)
        sys.exit(not p.print_help())

    output_h = must_open(opts.outfile, "w")
    print(fields, file=output_h)
    work_dir = op.join(opts.workdir, "syn_analysis")
    mkdir(work_dir)

    # No protein file given: derive one by translating the CDS file
    if not protein_file:
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args += ["--longest"]
        dna_file, protein_file = translate(translate_args)

    # Handles are closed on exit, including when an aligner raises
    with open(protein_file) as prot_fh, open(dna_file) as dna_fh:
        prot_iterator = SeqIO.parse(prot_fh, "fasta")
        dna_iterator = SeqIO.parse(dna_fh, "fasta")
        # Passing each iterator twice makes zip() consume adjacent records
        # as one pair
        for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in \
                zip(prot_iterator, prot_iterator, dna_iterator, dna_iterator):

            print("--------", p_rec_1.name, p_rec_2.name, file=sys.stderr)
            if opts.msa == "clustalw":
                align_fasta = clustal_align_protein((p_rec_1, p_rec_2), work_dir)
            elif opts.msa == "muscle":
                align_fasta = muscle_align_protein((p_rec_1, p_rec_2), work_dir)
            mrtrans_fasta = run_mrtrans(align_fasta, (n_rec_1, n_rec_2), work_dir)
            if mrtrans_fasta:
                ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = \
                    find_synonymous(mrtrans_fasta, work_dir)
                if ds_subs_yn is not None:
                    pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
                    output_h.write("%s\n" % (",".join(
                        str(x) for x in (pair_name, ds_subs_yn, dn_subs_yn,
                                         ds_subs_ng, dn_subs_ng))))
                    output_h.flush()

    # Clean-up
    # NOTE(review): "syn_analysis" is removed relative to the current
    # directory, while work_dir is created under --workdir; confirm whether a
    # non-default --workdir is expected to be cleaned as well.
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1 syn_analysis")
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    # NOTE: the docstring is the usage text handed to OptionParser.
    # The function stages the inputs under data/, writes the Protocol XML and
    # a `run.sh` script; it does not launch the Jelly.py stages itself.
    p = OptionParser(patch.__doc__)
    p.add_option("--cleanfasta", default=False, action="store_true",
                 help="Clean FASTA to remove description [default: %default]")
    p.add_option("--highqual", default=False, action="store_true",
                 help="Reads are of high quality [default: %default]")
    p.set_home("pbjelly")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args
    cmd = op.join(opts.pbjelly_home, "setup.sh")
    # Always define `setup`: it is the first line of run.sh below, even when
    # fakeQuals.py is already on PATH and the environment needs no sourcing now
    setup = "source {0}".format(cmd)
    if not which("fakeQuals.py"):
        sh(setup)

    # Check environment
    try:
        import networkx
        networkx.version  # attribute access is the check itself
    except Exception:
        logging.error("You need networkx==1.1 to run PBJELLY")
        return

    try:
        import argparse
    except ImportError:
        logging.error("You need Python2.7 or at least argparse lib")
        return

    pf = ref.rsplit(".", 1)[0]
    pr, px = reads.rsplit(".", 1)
    # Remove description line
    if opts.cleanfasta:
        oref = pf + ".f.fasta"
        oreads = pr + ".f.fasta"
        format([ref, oref])
        format([reads, oreads])
        ref, reads = oref, oreads

    # Check if the FASTA has qual
    ref, refq = fake_quals(ref)
    # Reads with a FASTQ-like extension are copied as they are
    convert_reads = px not in ("fq", "fastq", "txt")
    if convert_reads:
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))
    else:
        readsfiles = reads

    # Make directory structure
    dref, dreads = "data/reference", "data/reads"
    sh("mkdir -p {0}".format(dref))
    sh("mkdir -p {0}".format(dreads))
    sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    sh("cp {0} {1}/".format(readsfiles, dreads))
    cwd = os.getcwd()

    outputDir = cwd
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))
    p = Protocol(outputDir, reference, reads, highqual=opts.highqual)
    p.write_xml()

    # Make sure we have the patched version of Extraction.py
    # See discussion <http://seqanswers.com/forums/showthread.php?t=27599>
    # This check has been removed

    # Build the pipeline
    runsh = [setup]
    for action in "setup|mapping|support|extraction".split("|"):
        runsh.append("Jelly.py {0} Protocol.xml".format(action))

    #pcmds = """find assembly -name "ref*" -exec echo \\
    #    "Assembly.py {} \\
    #    > {}/assembly.out 2> {}/assembly.err" \; > commands.list"""
    #runsh.append(pcmds)

    runsh.append("Jelly.py assembly Protocol.xml")
    runsh.append("cp assembly/assembly_chunk0.sh commands.list")
    runsh.append("parallel < commands.list")
    runsh.append("Jelly.py output Protocol.xml")

    runfile = "run.sh"
    contents = "\n".join(runsh)
    write_file(runfile, contents, meta="run script")
def subset(args):
    """
    %prog subset pairsfile ksfile1 ksfile2 ... -o pairs.ks

    Subset some pre-calculated ks ka values (in ksfile) according to pairs
    in tab delimited pairsfile/anchorfile.
    """
    # NOTE: the docstring is the usage text handed to OptionParser.
    # Returns the output file name given by the outfile option.
    p = OptionParser(subset.__doc__)
    p.add_option("--noheader", action="store_true",
                 help="don't write ksfile header line [default: %default]")
    p.add_option("--block", action="store_true",
                 help="preserve block structure in input [default: %default]")
    p.set_stripnames()
    p.set_outfile()

    opts, args = p.parse_args(args)

    # One pairs file plus at least one ks file, as the usage line documents
    if len(args) < 2:
        sys.exit(not p.print_help())

    pairsfile, ksfiles = args[0], args[1:]
    noheader = opts.noheader
    block = opts.block
    if block:
        noheader = True
    outfile = opts.outfile

    # Later ks files override earlier ones for the same pair name
    ksvals = {}
    for ksfile in ksfiles:
        ksvals.update(dict((line.name, line) for line in
                           KsFile(ksfile, strip_names=opts.strip_names)))

    fw = must_open(outfile, "w")

    if not noheader:
        print(fields, file=fw)

    i = j = 0  # i: records written, j: pairs missing from the ks files
    with open(pairsfile) as fp:
        for row in fp:
            if row[0] == '#':
                if block:
                    print(row.strip(), file=fw)
                continue
            a, b = row.split()[:2]
            name = ";".join((a, b))
            if name not in ksvals:
                # The ks files may list the pair in the opposite order
                name = ";".join((b, a))
                if name not in ksvals:
                    j += 1
                    print("\t".join((a, b, ".", ".")), file=fw)
                    continue
            ksline = ksvals[name]
            if block:
                print("\t".join(str(x) for x in (a, b, ksline.ks)), file=fw)
            else:
                ksline.name = ";".join((a, b))
                print(ksline, file=fw)
            i += 1
    fw.close()

    logging.debug("{0} pairs not found in ksfiles".format(j))
    logging.debug("{0} ks records written to `{1}`".format(i, outfile))
    return outfile
def completeness(args):
    """
    %prog completeness blastfile ref.fasta > outfile

    Print statistics for each gene, the coverage of the alignment onto the best hit,
    as an indicator for completeness of the gene model. For example, one might
    BLAST sugarcane ESTs against sorghum annotations as reference, to find
    full-length transcripts.
    """
    from jcvi.utils.range import range_minmax
    from jcvi.utils.cbook import SummaryStats

    # NOTE: the docstring is the usage text handed to OptionParser.
    p = OptionParser(completeness.__doc__)
    p.add_option(
        "--ids",
        help="Save ids that are over 50% complete [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    idsfile = opts.ids

    f = Sizes(fastafile).mapping

    blast = BlastSlow(blastfile)
    valid = []
    data = []
    cutoff = 50
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        ranges = [(x.sstart, x.sstop) for x in blines]
        # The first line of each query group supplies the subject name
        first = blines[0]
        query, subject = first.query, first.subject

        rmin, rmax = range_minmax(ranges)
        subject_len = f[subject]

        nterminal_dist = rmin - 1
        cterminal_dist = subject_len - rmax
        covered = (rmax - rmin + 1) * 100 / subject_len
        if covered > cutoff:
            valid.append(query)

        data.append((nterminal_dist, cterminal_dist, covered))
        print("\t".join(
            str(x) for x in
            (query, subject, nterminal_dist, cterminal_dist, covered)))

    if data:
        nd, cd, cv = zip(*data)
        m = "Total: {0}, Coverage > {1}%: {2}\n".\
            format(len(data), cutoff, len(valid))
        m += "N-terminal: {0}\n".format(SummaryStats(nd))
        m += "C-terminal: {0}\n".format(SummaryStats(cd))
        m += "Coverage: {0}".format(SummaryStats(cv))
        print(m, file=sys.stderr)
    else:
        # zip(*[]) cannot be unpacked into three columns; report instead
        logging.debug("No hits found in `{0}`.".format(blastfile))

    if idsfile:
        with open(idsfile, "w") as fw:
            print("\n".join(valid), file=fw)
        logging.debug("A total of {0} ids (cov > {1} %) written to `{2}`.".
                      format(len(valid), cutoff, idsfile))
def prepare(args):
    """
    %prog prepare *.fastq

    Generate run.sh script to run clc_novo_assemble.
    """
    from itertools import groupby
    from jcvi.assembly.base import FastqNamings, Library

    # NOTE: the docstring (plus FastqNamings) is the usage text.
    parser = OptionParser(prepare.__doc__ + FastqNamings)
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) < 1:
        sys.exit(not parser.print_help())

    fnames = args
    for fname in fnames:
        assert op.exists(fname), "File `{0}` not found.".format(fname)

    def library_name(path):
        # First two dash-separated tokens of the name before the first dot
        stem = op.basename(path).split(".")[0]
        return "-".join(stem.split("-")[:2])

    libs = []
    for name, members in groupby(fnames, key=library_name):
        libs.append((Library(name), sorted(members)))
    libs.sort(key=lambda entry: entry[0].size)

    singletons = []
    pairs = []

    write_file("license.properties", CLCLICENSE, skipcheck=True)

    for lib, members in libs:
        size = lib.size
        stddev = lib.stddev

        # Libraries reporting size 0 go to the unpaired (-q) list
        if size == 0:
            singletons += members
            continue

        for fname in members:
            reverse_seq = 0 if ".corr." in fname else lib.reverse_seq
            fb = "bf" if reverse_seq else "fb"
            minsize, maxsize = size - 2 * stddev, size + 2 * stddev
            pair_opt = "-p {0} ss {1} {2} ".format(fb, minsize, maxsize)

            if ".1." in fname:
                # One "-i" entry with a ".?." wildcard stands for both mates
                wildcard = fname.replace(".1.", ".?.")
                pairs.append(pair_opt + "-i {0}".format(wildcard))
            elif ".2." not in fname:
                pairs.append(pair_opt + fname)
            # ".2." files are skipped: covered by the ".?." entry of their mate

    pieces = [
        "clc_novo_assemble --cpus {0} -o contigs.fasta \\\n".format(opts.cpus),
        "\t-q {0} \\\n".format(" ".join(singletons)),
        "\n".join("\t{0} \\".format(entry) for entry in pairs),
    ]
    cmd = "".join(pieces)

    runfile = "run.sh"
    write_file(runfile, cmd, meta="run script")
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    from jcvi.algorithms.supermap import supermap
    from jcvi.utils.range import range_union

    # NOTE: the docstring is the usage text handed to OptionParser.
    # Side effects visible here: a summary is printed to stderr and written to
    # `<blastfile>.covfilter.log`; with --ids the passing ids are written; with
    # an outfile the BLAST lines of passing queries are written.
    allowed_iterby = ("query", "query_sbjct")

    p = OptionParser(covfilter.__doc__)
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--scov", default=False, action="store_true",
                 help="Subject coverage instead of query [default: %default]")
    p.add_option("--supermap", action="store_true",
                 help="Use supermap instead of union")
    p.add_option("--ids", dest="ids", default=None,
                 help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list", dest="list", default=False, action="store_true",
                 help="List the id% and cov% per gene [default: %default]")
    p.add_option(
        "--iterby",
        dest="iterby",
        default="query",
        choices=allowed_iterby,
        help="Choose how to iterate through BLAST [default: %default]")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    pctid = opts.pctid
    pctcov = opts.pctcov
    union = not opts.supermap
    scov = opts.scov
    sz = Sizes(fastafile)
    sizes = sz.mapping
    iterby = opts.iterby
    qspair = iterby == "query_sbjct"

    if not union:
        querysupermap = blastfile + ".query.supermap"
        if not op.exists(querysupermap):
            supermap(blastfile, filter="query")

        blastfile = querysupermap

    assert op.exists(blastfile)

    def _pct(numerator, denominator):
        # Percentage that yields 0 instead of raising on an empty denominator
        return numerator * 100. / denominator if denominator else 0.

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(blastfile)
    iterator = blast.iter_hits_pair if qspair else blast.iter_hits

    covidstore = {}
    for query, blines in iterator():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0
        this_identity = 0

        ranges = []
        for b in blines:
            if scov:
                s, start, stop = b.subject, b.sstart, b.sstop
            else:
                s, start, stop = b.query, b.qstart, b.qstop
            cov_id = s

            if b.pctid < pctid:
                continue

            if start > stop:
                start, stop = stop, start
            this_covered += stop - start + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps
            ranges.append(("1", start, stop))

        if ranges:
            this_identity = 100. - (this_mismatches + this_gaps) * 100. \
                / this_alignlen

        if union:
            # Overlapping hits are counted once
            this_covered = range_union(ranges)

        this_coverage = this_covered * 100. / sizes[cov_id]
        covidstore[query] = (this_identity, this_coverage)
        if this_identity >= pctid and this_coverage >= pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    if opts.list:
        if qspair:
            # Index each (query, subject) key under both of its members
            allpairs = defaultdict(list)
            for (q, s) in covidstore:
                allpairs[q].append((q, s))
                allpairs[s].append((q, s))

            for id, size in sz.iter_sizes():
                if id not in allpairs:
                    print("\t".join((id, "na", "0", "0")))
                else:
                    for qs in allpairs[id]:
                        this_identity, this_coverage = covidstore[qs]
                        print("{0}\t{1:.1f}\t{2:.1f}".format(
                            "\t".join(qs), this_identity, this_coverage))
        else:
            for query, size in sz.iter_sizes():
                this_identity, this_coverage = covidstore.get(query, (0, 0))
                print("{0}\t{1:.1f}\t{2:.1f}".format(
                    query, this_identity, this_coverage))

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    m = "Identity: {0} mismatches, {1} gaps, {2} alignlen\n".\
        format(mismatches, gaps, alignlen)
    total = len(sizes.keys())
    m += "Total mapped: {0} ({1:.1f}% of {2})\n".\
        format(mapped_count, _pct(mapped_count, total), total)
    m += "Total valid {0}: {1} ({2:.1f}% of {3})\n".\
        format(cutoff_message, valid_count, _pct(valid_count, total), total)
    # No alignment passed the identity filter: report 0 rather than divide by 0
    average_id = 100 - _pct(mismatches + gaps, alignlen) if alignlen else 0.
    m += "Average id = {0:.2f}%\n".format(average_id)

    queries_combined = sz.totalsize
    m += "Coverage: {0} covered, {1} total\n".\
        format(covered, queries_combined)
    m += "Average coverage = {0:.2f}%".\
        format(_pct(covered, queries_combined))

    logfile = blastfile + ".covfilter.log"
    fw = open(logfile, "w")
    for f in (sys.stderr, fw):
        print(m, file=f)
    fw.close()

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for id in valid:
            print(id, file=fw)
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".
                      format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast:
        query = (b.query, b.subject) if qspair else b.query
        if query in valid:
            print(b, file=fw)
def align(args):
    """
    %prog align reference fastqfiles

    Use `clc_ref_assemble` to map the read files to a reference. Use a non-zero
    -s option to turn on paired end mode.
    """
    # NOTE: the docstring is the usage text handed to OptionParser.
    # Returns the tuple (outfile, None) after running the mapper through sh().
    p = OptionParser(align.__doc__)
    p.add_option("-o", dest="outfile", default=None,
                 help="Output prefix.cas file [default: %default]")
    p.add_option("-s", dest="size", default=0, type="int",
                 help="Use paired end mapping with insert [default: %default]")
    p.add_option(
        "--short",
        default=False,
        action="store_true",
        help="Use `clc_ref_assemble_short` as the mapper [default: %default]")
    p.add_option("--orientations", default="fb",
                 help="The reads have the orientations [default: %default]")
    p.add_option(
        "--fraction",
        default=0.5,
        help="Fraction of the read that must match [default: %default]")
    p.add_option("--similarity", default=0.95,
                 help="Similarity of the matching region [default: %default]")
    p.set_params()
    p.set_cpus()

    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    write_file("license.properties", CLCLICENSE, skipcheck=True)

    ref = args[0]
    assert op.exists(ref)
    fastqfiles = args[1:]
    size = opts.size
    orientations = opts.orientations
    assert orientations in ("fb", "bf", "ff", "bb")

    cmd = "clc_ref_assemble_short" if opts.short else "clc_ref_assemble_long"
    readprefix = op.basename(fastqfiles[0]).split(".", 1)[0]
    refprefix = op.basename(ref).split(".", 1)[0]
    outfile = opts.outfile or "{0}.{1}".format(readprefix, refprefix)
    if not outfile.endswith(".cas"):
        outfile += ".cas"

    cmd += " --cpus {0}".format(opts.cpus)
    cmd += " -d {0} -o {1} -q ".format(ref, outfile)
    fastqs = " ".join(fastqfiles)

    if size == 0:
        cmd += fastqs
    else:
        assert len(fastqfiles) == 2
        # Floor division keeps the insert bounds integral in the command line
        # regardless of the interpreter's `/` semantics
        stddev = size // 4
        lb, ub = size - stddev, size + stddev
        cmd += " -p {0} ss {1} {2} -i {3} ".format(orientations, lb, ub, fastqs)

    if opts.extra:
        cmd += " " + opts.extra

    if not opts.short:
        cmd += " -l {0} -s {1}".format(opts.fraction, opts.similarity)

    sh(cmd)
    return outfile, None
def minimac(args):
    """
    %prog batchminimac input.txt

    Use MINIMAC3 to impute vcf on all chromosomes.
    """
    # NOTE: the docstring is the usage text handed to OptionParser.
    # Nothing is executed here: every step is registered on a MakeManager,
    # which is written out at the end.
    parser = OptionParser(minimac.__doc__)
    parser.set_home("shapeit")
    parser.set_home("minimac")
    parser.set_outfile()
    parser.set_chr()
    parser.set_ref()
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    txtfile = args[0]
    ref = opts.ref
    mm = MakeManager()
    pf = txtfile.split(".")[0]
    is_vcf = txtfile.endswith(".vcf")

    def add_concat(vcfs, target):
        # A merged file is only registered when there is more than one part
        if len(vcfs) > 1:
            cmd = "vcf-concat {0} | bgzip > {1}".format(" ".join(vcfs), target)
            mm.add(vcfs, target, cmd)

    allrawvcf = []
    alloutvcf = []
    for x in opts.chr.split(","):
        px = CM[x]
        chrvcf = pf + ".{0}.vcf".format(px)

        # Per-chromosome input: split an existing VCF, or convert 23andme text
        if is_vcf:
            cmd = ("vcftools --vcf {0} --chr {1}"
                   " --out {2}.{3} --recode"
                   " && mv {2}.{3}.recode.vcf {4}").format(
                       txtfile, x, pf, px, chrvcf)
        else:  # 23andme
            cmd = ("python -m jcvi.formats.vcf from23andme {0} {1}"
                   " --ref {2}").format(txtfile, x, ref)
        mm.add(txtfile, chrvcf, cmd)

        chrvcf_hg38 = pf + ".{0}.23andme.hg38.vcf".format(px)
        minimac_liftover(mm, chrvcf, chrvcf_hg38, opts)
        allrawvcf.append(chrvcf_hg38)

        # Imputation: X has its own routine, Y/MT are passed through
        minimacvcf = "{0}.{1}.minimac.dose.vcf".format(pf, px)
        if x == "X":
            minimac_X(mm, x, chrvcf, opts)
        elif x in ("Y", "MT"):
            cmd = "python -m jcvi.variation.impute passthrough {0} {1}".format(
                chrvcf, minimacvcf)
            mm.add(chrvcf, minimacvcf, cmd)
        else:
            minimac_autosome(mm, x, chrvcf, opts)

        # keep the best line for multi-allelic markers
        uniqvcf = "{0}.{1}.minimac.uniq.vcf".format(pf, px)
        cmd = "python -m jcvi.formats.vcf uniq {0} > {1}".format(
            minimacvcf, uniqvcf)
        mm.add(minimacvcf, uniqvcf, cmd)

        minimacvcf_hg38 = "{0}.{1}.minimac.hg38.vcf".format(pf, px)
        minimac_liftover(mm, uniqvcf, minimacvcf_hg38, opts)
        alloutvcf.append(minimacvcf_hg38)

    add_concat(allrawvcf, pf + ".all.23andme.hg38.vcf.gz")
    add_concat(alloutvcf, pf + ".all.minimac.hg38.vcf.gz")

    mm.write()
def jellyfish(args):
    """
    %prog jellyfish [*.fastq|*.fasta]

    Run jellyfish to dump histogram to be used in kmer.histogram().
    """
    from jcvi.apps.base import getfilesize
    from jcvi.utils.cbook import human_size

    # NOTE: the docstring is the usage text handed to OptionParser.
    p = OptionParser(jellyfish.__doc__)
    p.add_option("-K", default=23, type="int",
                 help="K-mer size [default: %default]")
    p.add_option("--coverage", default=40, type="int",
                 help="Expected sequence coverage [default: %default]")
    p.add_option("--prefix", default="jf",
                 help="Database prefix [default: %default]")
    p.add_option("--nohist", default=False, action="store_true",
                 help="Do not print histogram [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    K = opts.K
    coverage = opts.coverage

    totalfilesize = sum(getfilesize(x) for x in fastqfiles)
    fq = fastqfiles[0]
    pf = opts.prefix
    # Compression is decided from the first file only
    gzip = fq.endswith(".gz")

    # Floor division: the value is formatted into the `-s` argument and must
    # stay an integer regardless of the interpreter's `/` semantics
    hashsize = totalfilesize // coverage
    logging.debug("Total file size: {0}, hashsize (-s): {1}".
                  format(human_size(totalfilesize,
                                    a_kilobyte_is_1024_bytes=True), hashsize))

    jfpf = "{0}-K{1}".format(pf, K)
    jfdb = jfpf
    inputs = " ".join(fastqfiles)

    cmd = "jellyfish count -t {0} -C -o {1}".format(opts.cpus, jfpf)
    cmd += " -s {0} -m {1}".format(hashsize, K)
    if gzip:
        cmd = "gzip -dc {0} | ".format(inputs) + cmd + " /dev/fd/0"
    else:
        cmd += " " + inputs

    # Pass the list of files, not the space-joined string, so that each input
    # is compared against the database individually
    if need_update(fastqfiles, jfdb):
        sh(cmd)

    if opts.nohist:
        return

    jfhisto = jfpf + ".histogram"
    cmd = "jellyfish histo -t 64 {0} -o {1}".format(jfdb, jfhisto)

    if need_update(jfdb, jfhisto):
        sh(cmd)
def frommaf(args):
    """
    %prog frommaf maffile

    Convert to four-column tabular format from MAF.
    """
    # NOTE: the docstring is the usage text handed to OptionParser.
    # Writes `<maf prefix>.vcf`. Only rows whose first column is an integer
    # are converted; it is rewritten as "chrNN". With --validate, each written
    # REF allele is compared against the given FASTA.
    p = OptionParser(frommaf.__doc__)
    p.add_option("--validate", help="Validate coordinates against FASTA")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (maf,) = args
    snpfile = maf.rsplit(".", 1)[0] + ".vcf"
    total = 0
    # Constant VCF columns shared by every record
    id = "."
    qual = 20
    filter = "PASS"
    info = "DP=20"
    with open(maf) as fp, open(snpfile, "w") as fw:
        print("##fileformat=VCFv4.0", file=fw)
        print("#CHROM POS ID REF ALT QUAL FILTER INFO".replace(" ", "\t"),
              file=fw)
        for row in fp:
            atoms = row.split()
            # Blank or truncated rows cannot supply chrom/pos/ref/alt
            if len(atoms) < 4:
                continue
            c, pos, ref, alt = atoms[:4]
            if not is_number(c, int):
                continue
            c = "chr{0:02d}".format(int(c))
            pos = int(pos)
            print(
                "\t".join(
                    str(x)
                    for x in (c, pos, id, ref, alt, qual, filter, info)),
                file=fw,
            )
            total += 1

    validate = opts.validate
    if not validate:
        return

    from jcvi.utils.cbook import percentage

    f = Fasta(validate)
    nsnps = 0
    with open(snpfile) as fp:
        for row in fp:
            if row[0] == "#":
                continue

            c, pos, id, ref, alt, qual, filter, info = row.split("\t")
            pos = int(pos)
            feat = dict(chr=c, start=pos, stop=pos)
            s = f.sequence(feat)
            s = str(s)
            assert s == ref, \
                "Validation error: {0} is {1} (expect: {2})".format(
                    feat, s, ref
                )
            nsnps += 1
            if nsnps % 50000 == 0:
                logging.debug(
                    "SNPs parsed: {0}".format(percentage(nsnps, total)))
    logging.debug(
        "A total of {0} SNPs validated and written to `{1}`.".format(
            nsnps, snpfile)
    )
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring is the usage text handed to OptionParser.
    # Without --pdf the function returns the result of asciiplot(); with --pdf
    # it saves `<histfile prefix>.pdf` through savefig().
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf", default=False, action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage", default=0, type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    N = int(N)
    KMERYL, KSOAP, KALLPATHS = range(3)
    kformats = ("Meryl", "Soap", "AllPaths")
    kformat = KMERYL

    ascii = not opts.pdf
    peaks = not opts.nopeaks
    hist = {}
    totalKmers = 0
    data = []

    with open(histfile) as fp:
        # Guess the format of the Kmer histogram
        for row in fp:
            if row.startswith("# 1:"):
                kformat = KALLPATHS
                break
            if len(row.split()) == 1:
                kformat = KSOAP
                break
        fp.seek(0)

        logging.debug("Guessed format: {0}".format(kformats[kformat]))

        for rowno, row in enumerate(fp):
            if row[0] == '#':
                continue
            if kformat == KSOAP:
                # Single-column file: the line number is the multiplicity
                K = rowno + 1
                counts = int(row.strip())
            else:  # meryl histogram
                K, counts = row.split()[:2]
                K, counts = int(K), int(counts)

            Kcounts = K * counts
            totalKmers += Kcounts
            hist[K] = Kcounts
            data.append((K, counts))

    covmax = 1000000
    ks = KmerSpectrum(data)
    ks.analyze(K=N, covmax=covmax)

    Total_Kmers = int(totalKmers)
    coverage = opts.coverage
    # An explicit --coverage overrides the peak found by the spectrum analysis
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = Total_Kmers * 1. / Kmer_coverage / 1e6

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    counts = sorted((a, b) for a, b in hist.items()
                    if opts.vmin <= a <= opts.vmax)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii:
        return asciiplot(x, y, title=title)

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    # Mark the extrema reported by the spectrum analysis
    t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
    tcounts = [(x, y) for x, y in counts if x in t]
    x, y = zip(*tcounts)
    plt.plot(x, y, 'ko', lw=2, mec='k', mfc='w')
    tcounts = dict(tcounts)

    if peaks:
        ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
        ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    tc = "gray"
    axt = ax.transAxes
    ax.text(.95, .95, Total_Kmers_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .9, Kmer_coverage_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .85, Genome_size_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .8, Repetitive_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .75, SNPrate_msg, color=tc, transform=axt, ha="right")

    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6  # headroom for the text annotations

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)
def adjgraph(args):
    """
    %prog adjgraph adjacency.txt subgraph.txt

    Construct adjacency graph for graphviz. The file may look like sample below.
    The lines with numbers are chromosomes with gene order information.

    genome 0
    chr 0
    -1 -13 -16 3 4 -6126 -5 17 -6 7 18 5357 8 -5358 5359 -9 -10 -11 5362 5360
    chr 1
    138 6133 -5387 144 -6132 -139 140 141 146 -147 6134 145 -170 -142 -143
    """
    import pygraphviz as pgv
    from jcvi.utils.iter import pairwise
    from jcvi.formats.base import SetFile

    # NOTE: the docstring is the usage text handed to OptionParser.
    # Writes "graph.dot" (all edges) and "subgraph.dot" (edges touching a gene
    # listed in the subgraph file) into the current directory.
    p = OptionParser(adjgraph.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    infile, subgraph = args
    subgraph = SetFile(subgraph)
    subgraph = set(x.strip("-") for x in subgraph)

    G = pgv.AGraph(strict=False)  # allow multi-edge
    SG = pgv.AGraph(strict=False)

    palette = ("green", "magenta", "tomato", "peachpuff")
    genome_id = -1
    key = 0
    with open(infile) as fp:
        for row in fp:
            if row.strip() == "":
                continue

            atoms = row.split()
            tag = atoms[0]
            if tag in ("ChrNumber", "chr"):
                continue

            if tag == "genome":
                genome_id += 1
                # Cycle colours so a fifth genome does not index past the end
                gcolor = palette[genome_id % len(palette)]
                continue

            # Each gene contributes two nodes (L and R ends); a leading "-"
            # means the gene is traversed in the reverse direction
            nodeseq = []
            for gene in atoms:
                name = gene.strip("-")
                nodeL, nodeR = name + "L", name + "R"
                if gene[0] == "-":  # negative strand
                    nodeseq += [nodeR, nodeL]
                else:
                    nodeseq += [nodeL, nodeR]

            for a, b in pairwise(nodeseq):
                G.add_edge(a, b, key, color=gcolor)
                key += 1

                na, nb = a[:-1], b[:-1]
                if na not in subgraph and nb not in subgraph:
                    continue

                SG.add_edge(a, b, key, color=gcolor)

    G.graph_attr.update(dpi="300")

    with open("graph.dot", "w") as fw:
        G.write(fw)

    with open("subgraph.dot", "w") as fw:
        SG.write(fw)
def cluster(args):
    """
    %prog cluster blastfile anchorfile --qbed qbedfile --sbed sbedfile

    Cluster the segments and form PAD. This is the method described in Tang et
    al. (2010) PNAS paper. The anchorfile defines a list of synteny blocks,
    based on which the genome on one or both axis can be chopped up into pieces
    and clustered.
    """
    from jcvi.utils.range import Range

    p = OptionParser(cluster.__doc__)
    p.set_beds()
    p.add_option("--minsize", default=10, type="int",
                 help="Only segment using blocks >= size")
    p.add_option("--path", default="~/scratch/bin",
                 help="Path to the CLUSTER 3.0 binary")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, anchorfile = args
    qbed, sbed, qorder, sorder, _ = check_beds(blastfile, p, opts)

    minsize = opts.minsize
    ac = AnchorFile(anchorfile)
    qranges, sranges = [], []
    qextra = [x[1:] for x in qbed.get_breaks()]
    sextra = [x[1:] for x in sbed.get_breaks()]

    # One Range per synteny block on each axis, spanning the min..max gene
    # rank of the block; both ranges share the same 1-based block id.
    blocks = ac.iter_blocks(minsize=minsize)
    for block_id, block in enumerate(blocks, start=1):
        q, s = list(zip(*block))[:2]
        q = [qorder[x][0] for x in q]
        s = [sorder[x][0] for x in s]
        minq, maxq = min(q), max(q)
        mins, maxs = min(s), max(s)

        qranges.append(Range("0", minq, maxq, maxq - minq, block_id))
        sranges.append(Range("0", mins, maxs, maxs - mins, block_id))

    qpads = list(get_segments(qranges, qextra))
    spads = list(get_segments(sranges, sextra))

    suffix = ".pad.bed"
    qpf = _pad_prefix(opts.qbed)
    spf = _pad_prefix(opts.sbed)
    qpadfile = qpf + suffix
    spadfile = spf + suffix
    _, qpadnames = write_PAD_bed(qpadfile, qpf, qpads, qbed)
    _, spadnames = write_PAD_bed(spadfile, spf, spads, sbed)

    qpadbed, spadbed = Bed(qpadfile), Bed(spadfile)

    logmp = make_arrays(blastfile, qpadbed, spadbed, qpadnames, spadnames)
    m, _ = logmp.shape

    # Tab-delimited matrix: header row of subject PAD names, then one row per
    # query PAD with values formatted to one decimal place.
    matrixfile = ".".join((qpf, spf, "logmp.txt"))
    with open(matrixfile, "w") as fw:
        header = ["o"] + spadnames
        print("\t".join(header), file=fw)
        for i in range(m):
            row = [qpadnames[i]] + ["{0:.1f}".format(x) for x in logmp[i, :]]
            print("\t".join(row), file=fw)

    # Run CLUSTER 3.0 (Pearson correlation, average linkage)
    cmd = op.join(opts.path, "cluster")
    cmd += " -g 2 -e 2 -m a -f {0}".format(matrixfile)
    pf = matrixfile.rsplit(".", 1)[0]
    cdtfile = pf + ".cdt"
    if need_update(matrixfile, cdtfile):
        sh(cmd)


def _pad_prefix(bedfile):
    """
    Return `bedfile` with everything after the first dot of its file name
    removed, keeping the directory part intact.

    Splitting the whole path on "." breaks on dotted directory components:
    "./x.bed" would give "" and "../x.bed" would give "" as well. Plain names
    and ordinary sub-directories ("x.bed", "data/x.bed") yield the same prefix
    as a whole-path split.
    """
    dirname, basename = op.split(bedfile)
    return op.join(dirname, basename.split(".")[0])
def segment(args):
    """
    %prog segment loss.ids bedfile

    Merge adjacent gene loss into segmental loss.

    Then based on the segmental loss, estimate amount of DNA loss in base pairs.
    Two estimates can be given:
    - conservative: just within the start and end of a single gene
    - aggressive: extend the deletion track to the next gene

    The real deletion size is within these estimates.
    """
    from jcvi.formats.base import SetFile

    p = OptionParser(segment.__doc__)
    p.add_option(
        "--chain",
        default=1,
        type="int",
        help="Allow next N genes to be chained",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    idsfile, bedfile = args
    bed = Bed(bedfile)
    order = bed.order
    ids = SetFile(idsfile)
    losses = Grouper()
    skip = opts.chain
    nfeatures = len(bed)
    for i, feature in enumerate(bed):
        a = feature.accn
        if a not in ids:
            continue

        # Register the lost gene on its own first. Doing this inside the
        # look-ahead loop would silently drop a lost gene that is the last
        # BED entry (no successor to look at) or any lost gene when --chain
        # is 0, since the loop body would never run for it.
        losses.join(a, a)

        # Chain with lost genes among the next `skip` BED entries
        for j in range(i + 1, min(i + 1 + skip, nfeatures)):
            b = bed[j].accn
            if b in ids:
                losses.join(a, b)

    losses = list(losses)
    singletons = [x for x in losses if len(x) == 1]
    segments = [x for x in losses if len(x) > 1]
    ns, nm, nt = len(singletons), len(segments), len(losses)
    assert ns + nm == nt

    # Summary for all segments
    for group in sorted(singletons) + sorted(segments):
        fields = (
            "|".join(sorted(group)),
            len(group),
            estimate_size(group, bed, order),
        )
        print("\t".join(str(f) for f in fields))

    # Find longest segment stretch
    if segments:
        mx, maxsegment = max((len(x), x) for x in segments)
        print("Longest stretch: run of {0} genes".format(mx), file=sys.stderr)
        print(" {0}".format("|".join(sorted(maxsegment))), file=sys.stderr)

    # "a" sizes use the default estimate, "b" sizes the non-conservative one.
    # Sums over an empty list are 0, so no special case is needed when there
    # are no segments or no singletons.
    seg_asize = sum(estimate_size(x, bed, order) for x in segments)
    seg_bsize = sum(
        estimate_size(x, bed, order, conservative=False) for x in segments
    )
    sing_asize = sum(estimate_size(x, bed, order) for x in singletons)
    sing_bsize = sum(
        estimate_size(x, bed, order, conservative=False) for x in singletons
    )

    total_asize = sing_asize + seg_asize
    total_bsize = sing_bsize + seg_bsize
    print(
        "Singleton ({0}): {1} - {2} bp".format(ns, sing_asize, sing_bsize),
        file=sys.stderr,
    )
    print(
        "Segment ({0}): {1} - {2} bp".format(nm, seg_asize, seg_bsize), file=sys.stderr
    )
    print(
        "Total ({0}): {1} - {2} bp".format(nt, total_asize, total_bsize),
        file=sys.stderr,
    )
    print(
        "Average ({0}): {1} bp".format(nt, (total_asize + total_bsize) / 2),
        file=sys.stderr,
    )
def enrich(args):
    """
    %prog enrich omgfile groups ntaxa > enriched.omg

    Enrich OMG output by pulling genes missed by OMG.
    """
    p = OptionParser(enrich.__doc__)
    p.add_option("--ghost", default=False, action="store_true",
                 help="Add ghost homologs already used [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    omgfile, groupsfile, ntaxa = args
    ntaxa = int(ntaxa)
    ghost = opts.ghost

    # Get gene pair => weight mapping
    weights = get_edges()
    info = get_info()
    # Get gene => taxon mapping
    info = dict((k, v.split()[5]) for k, v in info.items())

    groups = Grouper()
    with open(groupsfile) as fp:
        for row in fp:
            members = row.strip().split(",")
            groups.join(*members)

    logging.debug("Imported {0} families with {1} members.".\
                    format(len(groups), groups.num_members))

    # First pass over the OMG file: collect every gene already placed
    seen = set()
    omggroups = Grouper()
    with open(omgfile) as fp:
        for row in fp:
            genes, idxs = row.split()
            genes = genes.split(",")
            seen.update(genes)
            omggroups.join(*genes)

    nmembers = omggroups.num_members
    logging.debug("Imported {0} OMG families with {1} members.".\
                    format(len(omggroups), nmembers))
    assert nmembers == len(seen)

    alltaxa = set(str(x) for x in range(ntaxa))
    recruited = []  # every addition, in order; its length is reported below
    recruited_set = set()  # same genes, for O(1) membership tests

    # Second pass: try to fill in the taxa missing from each OMG row
    with open(omgfile) as fp:
        for row in fp:
            genes, idxs = row.split()
            genes = genes.split(",")
            anchor = genes[0]

            idxs = set(idxs.split(","))
            missing_taxa = alltaxa - idxs
            if not missing_taxa:
                print(row.rstrip())
                continue

            # Candidates come from the family of the row's first gene; unless
            # --ghost is on, genes already placed in any row are excluded.
            leftover = groups[anchor]
            if not ghost:
                leftover = set(leftover) - seen

            if not leftover:
                print(row.rstrip())
                continue

            leftover_sorted_by_taxa = dict(
                (k, [x for x in leftover if info[x] == k])
                for k in missing_taxa
            )

            # NOTE(review): product() yields nothing when any missing taxon
            # has no candidate, so a row is only enriched if *all* of its
            # missing taxa can be filled at once - confirm this is intended.
            solutions = []
            for solution in product(*leftover_sorted_by_taxa.values()):
                # Support from existing members of the row
                score = sum(
                    weights.get((cand, gene), 0)
                    for cand in solution
                    for gene in genes
                )
                if score == 0:
                    continue
                # Plus mutual support among the candidates themselves
                score += sum(
                    weights.get((x, y), 0)
                    for x, y in combinations(solution, 2)
                )
                solutions.append((score, solution))

            if not solutions:
                print(row.rstrip())
                continue

            _, best_addition = max(solutions)
            genes.extend(best_addition)
            recruited.extend(best_addition)
            recruited_set.update(best_addition)

            # Re-sort the row by taxon and rebuild the parallel columns
            genes = sorted([(info[x], x) for x in genes])
            idxs, genes = zip(*genes)

            if ghost:
                # decorate additions so it's clear that they were added
                genes = [
                    "|{0}|".format(g) if g in recruited_set and g in seen else g
                    for g in genes
                ]

            print("\t".join((",".join(genes), ",".join(idxs))))
            if not ghost:
                seen.update(best_addition)

    logging.debug("Recruited {0} new genes.".format(len(recruited)))