def write_csv(header, contents, sep=",", filename="stdout", thousands=False, tee=False):
    """
    Write csv that are aligned with the column headers.

    >>> header = ["x_value", "y_value"]
    >>> contents = [(1, 100), (2, 200)]
    >>> write_csv(header, contents)
    x_value, y_value
          1,     100
          2,     200

    Args:
        header: sequence of column names; a falsy value omits the header row.
        contents: iterable of rows, each with the same number of cells.
        sep: column separator; a single space is appended to it on output.
        filename: destination handed to `must_open(filename, "w")`.
        thousands: if true, integer-like cells are cast to int and values
            >= 1000 are rendered through `jcvi.utils.cbook.thousands`.
        tee: if true and `filename` is not "stdout", each line is also
            printed to standard output.

    Raises:
        AssertionError: if the rows (header included) differ in length.
    """
    from jcvi.formats.base import must_open, is_number
    from jcvi.utils.cbook import thousands as th

    fw = must_open(filename, "w")

    # Always build a fresh list: the caller's `contents` must not be
    # overwritten with stringified rows when no header is given.
    rows = ([header] if header else []) + list(contents)
    if not rows:
        # Nothing to write (no header, no data)
        return

    cols = len(rows[0])
    for row in rows:
        assert len(row) == cols, "All rows must have the same number of columns"

    # Stringify the contents
    cells = []
    for row in rows:
        if thousands:
            row = [int(x) if is_number(x, cast=int) else x for x in row]
            row = [
                th(x) if (is_number(x, cast=int) and x >= 1000) else x for x in row
            ]
        cells.append([str(x) for x in row])

    # Each column is as wide as its widest cell
    colwidths = [max(len(r[i]) for r in cells) for i in range(cols)]
    sep += " "
    for r in cells:
        formatted = sep.join(x.rjust(cw) for x, cw in zip(r, colwidths))
        # fw.write() instead of the Python 2-only `print >> fw`
        fw.write(formatted + "\n")
        if tee and filename != "stdout":
            print(formatted)
def load_csv(header, contents, sep=",", thousands=False, align=True):
    """
    Format rows as csv lines, optionally right-aligned to the column widths.

    Args:
        header: sequence of column names; a falsy value omits the header row.
        contents: iterable of rows, each with the same number of cells.
        sep: column separator; a single space is appended to it on output.
        thousands: if true, integer-like cells are cast to int and values
            >= 1000 are rendered through `jcvi.utils.cbook.thousands`.
        align: if true, right-justify every cell to its column's widest cell.

    Returns:
        A list of formatted strings, one per row (header first if given).
        An empty list when there is neither a header nor any content.

    Raises:
        AssertionError: if the rows (header included) differ in length.
    """
    if thousands:
        # Only the thousands formatting needs the jcvi helpers
        from jcvi.formats.base import is_number
        from jcvi.utils.cbook import thousands as th

    # Always build a fresh list: the caller's `contents` must not be
    # overwritten with stringified rows when no header is given.
    rows = ([header] if header else []) + list(contents)
    if not rows:
        return []

    cols = len(rows[0])
    for row in rows:
        assert len(row) == cols, "All rows must have the same number of columns"

    # Stringify the contents
    cells = []
    for row in rows:
        if thousands:
            row = [int(x) if is_number(x, cast=int) else x for x in row]
            row = [
                th(x) if (is_number(x, cast=int) and x >= 1000) else x for x in row
            ]
        cells.append([str(x) for x in row])

    # Each column is as wide as its widest cell
    colwidths = [max(len(r[i]) for r in cells) for i in range(cols)]
    sep += " "
    formatted_contents = []
    for r in cells:
        if align:
            r = [x.rjust(cw) for x, cw in zip(r, colwidths)]
        formatted_contents.append(sep.join(r))
    return formatted_contents
def random(args):
    """
    %prog random bedfile number_of_features

    Extract a random subset of features. Number of features can be an integer
    number, or a fractional number in which case a random fraction (for example
    0.1 = 10% of all features) will be extracted.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    from random import sample

    from jcvi.formats.base import flexible_cast

    parser = OptionParser(random.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    bedfile, N = args
    assert is_number(N)

    features = Bed(bedfile)
    count = flexible_cast(N)
    if count < 1:
        # A value below 1 is a fraction of all features, not a count
        count = int(round(count * len(features)))

    picked = sample(features, count)
    subset = Bed()
    subset.extend(picked)

    # Output name keeps N exactly as the user typed it
    stem = bedfile.rsplit(".", 1)[0]
    outfile = stem + ".{0}.bed".format(N)
    subset.print_to_file(outfile)
    logging.debug("Write {0} features to `{1}`".format(count, outfile))
def guess_method(tag):
    """
    Classify `tag` as a list of job ids or as a pattern.

    Returns "jobid" when every comma-separated piece of `tag` passes
    `is_number`, otherwise "pattern".
    """
    from jcvi.formats.base import is_number

    if all(is_number(jobid) for jobid in tag.split(",")):
        return "jobid"
    return "pattern"
def gffline(self, type='match', source='default'):
    """
    Render this feature as one tab-separated, nine-column line.

    `type` and `source` fill the corresponding columns. The score and strand
    columns fall back to "." when the value is missing; the score also falls
    back when it is not numeric.
    """
    score = self.score
    if not score or not is_number(score):
        score = "."

    strand = self.strand if self.strand else "."

    # start is emitted as start + 1 (presumably 0-based -> 1-based; confirm
    # against the class that owns this method)
    fields = (
        self.seqid,
        source,
        type,
        str(self.start + 1),
        str(self.end),
        score,
        strand,
        '.',
        'ID=' + self.accn,
    )
    return "\t".join(fields)
def blat(args):
    """
    %prog blat map1.txt ref.fasta

    Make ALLMAPS input csv based on sequences. The tab-delimited txt file
    include: name, LG, position, sequence.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    from jcvi.formats.base import is_number
    from jcvi.formats.blast import best as blast_best, bed as blast_bed
    from jcvi.apps.align import blat as blat_align

    p = OptionParser(blat.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    maptxt, ref = args
    pf = maptxt.rsplit(".", 1)[0]

    # Step 1: dump the marker sequences to FASTA, skipping rows whose
    # position is not numeric. `with` closes both handles even on error.
    fastafile = pf + ".fasta"
    with open(maptxt) as fp, open(fastafile, "w") as fw:
        for row in fp:
            name, lg, pos, seq = row.split()
            if not is_number(pos):
                continue
            print(">{0}\n{1}\n".format(name, seq), file=fw)

    # Step 2: align the markers to the reference, keep best hits, convert to BED
    blatfile = blat_align([ref, fastafile])
    bestfile = blast_best([blatfile])
    bedfile = blast_bed([bestfile])
    b = Bed(bedfile).order

    # Step 3: write one csv row per marker that has a placement in `b`
    pf = ".".join(
        (op.basename(maptxt).split(".")[0], op.basename(ref).split(".")[0])
    )
    csvfile = pf + ".csv"
    with open(maptxt) as fp, open(csvfile, "w") as fw:
        for row in fp:
            name, lg, pos, seq = row.split()
            if name not in b:
                continue
            # b[name] is a pair; only the second element is needed here
            _, bb = b[name]
            scaffold, scaffold_pos = bb.seqid, bb.start
            print(
                ",".join(str(x) for x in (scaffold, scaffold_pos, lg, pos)),
                file=fw,
            )
def blat(args):
    """
    %prog blat map1.txt ref.fasta

    Make ALLMAPS input csv based on sequences. The tab-delimited txt file
    include: name, LG, position, sequence.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    from jcvi.formats.base import is_number
    from jcvi.formats.blast import best as blast_best, bed as blast_bed
    from jcvi.apps.align import blat as blat_align

    p = OptionParser(blat.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    maptxt, ref = args
    pf = maptxt.rsplit(".", 1)[0]

    # Step 1: dump the marker sequences to FASTA, skipping rows whose
    # position is not numeric. `with` closes both handles even on error.
    fastafile = pf + ".fasta"
    with open(maptxt) as fp, open(fastafile, "w") as fw:
        for row in fp:
            name, lg, pos, seq = row.split()
            if not is_number(pos):
                continue
            print(">{0}\n{1}\n".format(name, seq), file=fw)

    # Step 2: align the markers to the reference, keep best hits, convert to BED
    blatfile = blat_align([ref, fastafile])
    bestfile = blast_best([blatfile])
    bedfile = blast_bed([bestfile])
    b = Bed(bedfile).order

    # Step 3: write one csv row per marker that has a placement in `b`
    pf = ".".join(
        (op.basename(maptxt).split(".")[0], op.basename(ref).split(".")[0])
    )
    csvfile = pf + ".csv"
    with open(maptxt) as fp, open(csvfile, "w") as fw:
        for row in fp:
            name, lg, pos, seq = row.split()
            if name not in b:
                continue
            # b[name] is a pair; only the second element is needed here
            _, bb = b[name]
            scaffold, scaffold_pos = bb.seqid, bb.start
            print(
                ",".join(str(x) for x in (scaffold, scaffold_pos, lg, pos)),
                file=fw,
            )
def read_meme(fi):
    """
    Collect one record per 'MOTIF' block found in the file `fi`.

    Each block header is split on single spaces: the second token is taken as
    the motif id and the third, when present, as its score (converted to
    float if numeric, otherwise left as a string; '' when absent).

    Returns:
        A list of `[motif_id, width, score]` lists, in file order.

    Raises:
        ValueError: if a block header has fewer than two space-separated tokens.
    """
    mtfs = []
    # `with` guarantees the handle is closed, also when parsing fails
    with open(fi, 'r') as fhi:
        for head, content in read_block(fhi, 'MOTIF'):
            ps = head.split(' ')
            _, mid = ps[:2]
            score = ps[2] if len(ps) >= 3 else ''
            if is_number(score):
                score = float(score)
            # NOTE(review): the "- 2" presumably discounts two non-matrix
            # lines in each block -- confirm against the MEME file layout
            width = len(content) - 2
            mtfs.append([mid, width, score])
    return mtfs
def rename_seqid(seqid):
    """
    Shorten a sequence id.

    Keeps only the part after the last underscore, abbreviates
    "supercont"/"scaffold" to "s" and "contig" to "c", and, when what remains
    is an integer, returns it as "c<int>". Otherwise the abbreviated string
    is returned as is.
    """
    tail = seqid.split("_")[-1]
    # Replacement order matters and mirrors the original chain
    for long_name, short_name in (
        ("supercont", "s"),
        ("contig", "c"),
        ("scaffold", "s"),
    ):
        tail = tail.replace(long_name, short_name)

    if is_number(tail, int):
        return "c{}".format(int(tail))
    return tail
def frommaf(args):
    """
    %prog frommaf maffile

    Convert to four-column tabular format from MAF.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    # The file actually written is a minimal VCF (`<maffile stem>.vcf`) built
    # from the first four whitespace-separated columns of each input row.
    p = OptionParser(frommaf.__doc__)
    p.add_option("--validate", help="Validate coordinates against FASTA")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (maf,) = args
    snpfile = maf.rsplit(".", 1)[0] + ".vcf"

    total = 0
    # Constant values written into every record
    snp_id = "."
    qual = 20
    filter_status = "PASS"
    info = "DP=20"

    # `with` closes both handles even if a row fails to parse
    with open(maf) as fp, open(snpfile, "w") as fw:
        print("##fileformat=VCFv4.0", file=fw)
        print("#CHROM POS ID REF ALT QUAL FILTER INFO".replace(" ", "\t"), file=fw)
        for row in fp:
            atoms = row.split()
            c, pos, ref, alt = atoms[:4]
            # Rows whose first column is not an integer are skipped
            if not is_number(c, int):
                continue
            c = "chr{0:02d}".format(int(c))
            pos = int(pos)
            print(
                "\t".join(
                    str(x)
                    for x in (c, pos, snp_id, ref, alt, qual, filter_status, info)
                ),
                file=fw,
            )
            total += 1

    validate = opts.validate
    if not validate:
        return

    from jcvi.utils.cbook import percentage

    # Re-read the file just written and compare each REF with the FASTA
    f = Fasta(validate)
    nsnps = 0
    with open(snpfile) as fp:
        for row in fp:
            if row[0] == "#":
                continue
            # Only the first four of the eight columns are needed here
            c, pos, _, ref = row.split("\t")[:4]
            pos = int(pos)
            feat = dict(chr=c, start=pos, stop=pos)
            s = str(f.sequence(feat))
            assert s == ref, "Validation error: {0} is {1} (expect: {2})".format(
                feat, s, ref
            )
            nsnps += 1
            if nsnps % 50000 == 0:
                logging.debug("SNPs parsed: {0}".format(percentage(nsnps, total)))

    logging.debug(
        "A total of {0} SNPs validated and written to `{1}`.".format(nsnps, snpfile)
    )