def cnsfix(args): """ %prog cnsfix consensus-fix.out.FAILED > blacklist.ids Parse consensus-fix.out to extract layouts for fixed unitigs. This will mark all the failed fragments detected by utgcnsfix and pop them out of the existing unitigs. """ from jcvi.formats.base import read_block p = OptionParser(cnsfix.__doc__) opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) cnsfixout, = args fp = open(cnsfixout) utgs = [] saves = [] for header, contents in read_block(fp, "Evaluating"): contents = list(contents) utg = header.split()[2] utgs.append(utg) # Look for this line: # save fragment idx=388 ident=206054426 for next pass for c in contents: if not c.startswith("save"): continue ident = c.split()[3].split("=")[-1] saves.append(ident) print "\n".join(saves)
def dreme2tsv(args):
    """Convert DREME text output into a tab-separated motif table."""
    fin = must_open(args.fi)
    anchor = fin.tell()

    # First pass: pull the positive/negative sequence counts from the header,
    # then rewind so the motif blocks can be re-read.
    pos_count, neg_count = '', ''
    for raw in fin:
        fields = re.split(r" +", raw)
        if len(fields) < 2:
            continue
        if fields[1].startswith('positives'):
            pos_count = fields[2]
        elif fields[1].startswith('negatives'):
            neg_count = fields[2]
            break
    fin.seek(anchor)

    fout = must_open(args.fo, 'w')
    fout.write("mid\tre\tseq\tseq_rc\tpos\tneg\tpos_c\tneg_c\tpval\teval\n")

    # Second pass: one block per MOTIF header; emit one row per word line.
    for head, body in read_block(fin, 'MOTIF'):
        if not head:
            break
        _, _, motif_id = head.split(" ")
        for raw in body:
            if not raw.startswith("#"):
                continue
            stripped = raw.replace("#", '').strip()
            fields = re.split(r" +", stripped)
            if fields[0] in ['Word', 'Stopping', 'Running']:
                continue
            tag = ''
            if fields[0] == 'BEST':
                tag = 'RE'
                fields = fields[1:]
            seq, seq_rc, pos, neg, pval, evalue = fields
            row = [motif_id, tag, seq, seq_rc, pos, neg,
                   pos_count, neg_count, pval, evalue]
            fout.write("\t".join(row) + '\n')
    fin.close()
def __init__(self, filename):
    """Parse a CD-HIT .clstr file; each '>' block becomes a list of ClstrLine."""
    super(ClstrFile, self).__init__(filename)
    assert filename.endswith(".clstr")

    fp = open(filename)
    for _, members in read_block(fp, ">"):
        self.append([ClstrLine(m) for m in members])
def __init__(self, filename=None, ctgsizes=None):
    """
    Parse an OO (contig order/orientation) file.

    Each block starts with a '>scaffold' header, followed by lines of
    `contig orientation` where orientation is "BE" (forward) or "EB"
    (reverse). Contig sizes are looked up in `ctgsizes`.
    """
    super(OO, self).__init__(filename)
    # Always define `contigs`, even for the empty (filename=None) object;
    # previously an early return left the attribute unset.
    self.contigs = set()

    if filename is None:
        return

    from jcvi.formats.base import read_block

    fp = open(filename)
    prefix = "contig_"
    for header, block in read_block(fp, ">"):
        header = header[1:]  # Trim the '>'
        header = header.split()[0]
        for b in block:
            ctg, orientation = b.split()
            # Normalize names like "contig_123" down to "123".
            if ctg.startswith(prefix):
                ctg = ctg[len(prefix):]

            assert orientation in ("BE", "EB")
            strand = "+" if orientation == "BE" else "-"
            ctgsize = ctgsizes[ctg]
            self.add(header, ctg, ctgsize, strand)
            self.contigs.add(ctg)
def parse_names(lstfile):
    """
    Parse the alternative `lstfile` format: two sections headed by
    [Sequence] and [Manuscript], each followed by comma-separated author
    names. Return a list of formatted name blocks, one per section.
    """
    from jcvi.formats.base import read_block

    fp = open(lstfile)
    all_authors = []
    for header, seq in read_block(fp, "["):
        joined = " ".join(seq)
        authors = []
        for name in joined.split(","):
            name = name.strip()
            if not name:
                continue
            # Strip digits (e.g. affiliation markers) from the name.
            name = string.translate(name, None, string.digits)
            authors.append(name)
        all_authors.append(authors)

    out = []
    for authors in all_authors:
        blocks = []
        for name in authors:
            last, first, initials = get_name_parts(name)
            blocks.append(NameTemplate.format(last=last, first=first,
                                              initials=initials, suffix=""))
        out.append(",\n".join(blocks))
    return out
def dust2bed(args):
    """
    %prog dust2bed fastafile

    Use dustmasker to find low-complexity regions (LCRs) in the genome,
    then export the intervals as a BED file.
    """
    from jcvi.formats.base import read_block

    p = OptionParser(dust2bed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    interval = fastafile + ".iv"
    if need_update(fastafile, interval):
        cmd = "dustmasker -in {0}".format(fastafile)
        sh(cmd, outfile=interval)

    fp = open(interval)
    bedfile = fastafile.rsplit(".", 1)[0] + ".dust.bed"
    fw = must_open(bedfile, "w")
    nlines = 0
    nbases = 0
    for header, block in read_block(fp, ">"):
        header = header.strip(">")
        for b in block:
            # dustmasker interval lines look like "<start> - <end>".
            start, end = b.split(" - ")
            start, end = int(start), int(end)
            print >> fw, "\t".join(str(x) for x in (header, start, end))
            nlines += 1
            nbases += end - start
    # Close (and flush) the BED output before reporting; previously the
    # handle was left open.
    fw.close()
    logging.debug("A total of {0} DUST intervals ({1} bp) exported to `{2}`".\
            format(nlines, nbases, bedfile))
def add_score(args):
    """
    Rewrite a MEME-format motif file, replacing the third field of each
    MOTIF header with the score threshold parsed from the STREME XML
    (args.fx). Matrix rows (lines starting with 0/1) are re-indented with
    a single leading space; everything before the first MOTIF is copied
    through verbatim.
    """
    mtfs = read_streme_xml(args.fx)
    fhi = open(args.fi, 'r')
    fho = open(args.fo, 'w')
    try:
        # Copy the preamble, stopping (and rewinding) at the first MOTIF
        # header so read_block() can pick it up.
        start = "MOTIF"
        while True:
            pos = fhi.tell()
            line = fhi.readline()
            if not line:
                break
            if line.startswith(start):
                fhi.seek(pos)
                break
            fho.write(line)

        for i, (head, content) in enumerate(read_block(fhi, 'MOTIF')):
            pre, mid, alt = head.split(' ')
            assert mtfs[i]['id'] == mid, 'motifs not in sync'
            new_head = f"{pre} {mid} {mtfs[i]['score_threshold']}"
            fho.write(new_head + "\n")
            for line in content:
                if line.startswith("0") or line.startswith("1"):
                    fho.write(" " + line + "\n")
                else:
                    fho.write(line + "\n")
    finally:
        # Previously the handles leaked if the sync assert fired.
        fhi.close()
        fho.close()
def iter_scaffold(self):
    """Yield (scaffold_name, [EvidenceLine, ...]) for each '>' block."""
    sizes = self.sizes
    fp = open(self.filename)
    for header, rows in read_block(fp, ">"):
        # Header looks like ">scaffold|size|tigs"; only the name is yielded.
        scaffold, size, tigs = header[1:].split("|")
        evidence = [EvidenceLine(row, sizes) for row in rows if row.strip()]
        yield scaffold, evidence
def iter_chain(self):
    """
    Yield ChainLine objects from a UCSC chain file, skipping leading '#'
    comment lines.
    """
    fp = open(self.filename)
    # Skip header comments. NOTE(review): the previous version consumed the
    # first non-comment line outright, which would swallow a "chain" header
    # that immediately follows the comments; rewind so read_block() sees it.
    while True:
        pos = fp.tell()
        row = fp.readline()
        if not row or row[0] != '#':
            fp.seek(pos)
            break
    for chain, lines in read_block(fp, "chain"):
        lines = list(lines)
        yield ChainLine(chain, lines)
def iter_records(self):
    """Yield ContigLine records: '##' starts a new contig, '#' adds a read."""
    current = None
    for a, b in read_block(self.fp, "#"):
        if a.startswith('##'):
            # New contig header: flush the previous record first.
            if current:
                yield current
            current = ContigLine(a)
        else:
            current.reads.append(ReadLine(a, current.id))
    if current:  # emit the trailing record
        yield current
def __init__(self, filename):
    """Parse a bin map: 'group ' headers, then `marker position` rows."""
    super(BinMap, self).__init__(filename)

    fp = open(filename)
    for header, rows in read_block(fp, "group "):
        lg = header.split()[-1]
        markers = []
        for row in rows:
            # Skip blank lines and ';' comment lines.
            if row.strip() == '' or row[0] == ';':
                continue
            marker, pos = row.split()
            # Positions are stored as integers in 1/1000 map units.
            markers.append((marker, int(float(pos) * 1000)))
        self[lg] = markers
def read_trees(tree):
    """
    Read a tree file where each '#' header carries query-string style
    metadata (label, outgroup) followed by the newick text.

    Returns a list of (label, outgroup, newick) tuples.
    """
    from urlparse import parse_qs
    from jcvi.formats.base import read_block

    fp = open(tree)
    trees = []
    for header, tx in read_block(fp, "#"):
        meta = parse_qs(header[1:])
        label = meta["label"][0].strip("\"")
        outgroup = meta["outgroup"]
        trees.append((label, outgroup, "".join(tx)))
    return trees
def read_trees(tree):
    """
    Read a tree file where each '#' header carries query-string style
    metadata (label, outgroup, optional color) followed by the newick text.

    Returns a list of (label, outgroup, color, newick) tuples.
    """
    from six.moves.urllib.parse import parse_qs
    from jcvi.formats.base import read_block

    fp = open(tree)
    trees = []
    for header, tx in read_block(fp, "#"):
        meta = parse_qs(header[1:])
        label = meta["label"][0].strip("\"")
        outgroup = meta["outgroup"]
        color, = meta.get("color", ["k"])  # default to black
        trees.append((label, outgroup, color, "".join(tx)))
    return trees
def read_meme(fi):
    """
    Parse a MEME motif file and return [motif_id, width, score] triples.

    Width is inferred from the block length minus two header/footer lines;
    score is the optional third field of the MOTIF header, converted to
    float when numeric.
    """
    fhi = open(fi, 'r')
    mtfs = []
    for head, content in read_block(fhi, 'MOTIF'):
        fields = head.split(' ')
        pre, mid = fields[:2]
        score = fields[2] if len(fields) >= 3 else ''
        if is_number(score):
            score = float(score)
        width = len(content) - 2
        mtfs.append([mid, width, score])
    return mtfs
def __iter__(self):
    # Iterate FpcRecord objects parsed from an FPC physical-map file.
    # Each record is a block that starts with `bac_tag`; field values in
    # the block are double-quoted, hence the recurring split('\"')[1].
    line = self._handle.readline()
    if not line.startswith(bac_tag):
        # Skip any preamble until the first BAC block.
        read_until(self._handle, bac_tag)
    for header, seq in read_block(self._handle, bac_tag):
        rec = FpcRecord()
        assert header.startswith(bac_tag)
        rec.bac_name = header.split('\"')[1]
        for line in seq:
            if line.startswith("Map"):
                rec.ctg_name = line.split('\"')[1]
                # The Map line appears to also carry the placement range,
                # e.g. ... Left <pos> Right <pos> -- TODO confirm format.
                if "Left" in line:
                    rec.map_left = line.split("Left")[1].split()[0]
                    rec.map_left = int(float(rec.map_left))
                if "Right" in line:
                    rec.map_right = line.split("Right")[1].split()[0]
                    rec.map_right = int(float(rec.map_right))
            if line.startswith("Gel_number"):
                rec.gel_number = line.split()[-1]
            if line.startswith("Fp_number"):
                rec.fp_number = line.split()[-1]
            if line.startswith("Bands"):
                rec.bands = line.split()[-1]
            if line.startswith("match_to_cosmid"):
                rec.cosmid = line.split('\"')[1]
                rec.cosmid_match = line.split("_match_")[0]
            if line.startswith("Positive"):
                rec.probes.append(line.split('\"')[1])
            if line.startswith("Fpc_remark"):
                rec.remark.append(line.split('\"')[1])
            if line.startswith("Creation_date"):
                rec.cre_date = line.split('_date')[1].strip()
            if line.startswith("Modified_date"):
                rec.mod_date = line.split('_date')[1].strip()
        yield rec
def coge(args):
    """
    %prog coge cogefile

    Convert CoGe file to anchors file.
    """
    p = OptionParser(coge.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    cogefile, = args
    fp = must_open(cogefile)
    # Derive output names from the uncompressed file name.
    cogefile = cogefile.replace(".gz", "")
    ksfile = cogefile + ".ks"
    anchorsfile = cogefile + ".anchors"
    fw_ks = must_open(ksfile, "w")
    fw_ac = must_open(anchorsfile, "w")

    tag = "###"
    # Both outputs use "###" as the block separator.
    print >> fw_ks, tag
    for header, lines in read_block(fp, tag):
        print >> fw_ac, tag
        lines = list(lines)
        for line in lines:
            if line[0] == '#':
                continue
            ks, ka, achr, a, astart, astop, bchr, \
                b, bstart, bstop, ev, ss = line.split()
            # Gene names appear to be the 4th "||"-delimited field -- TODO confirm.
            a = a.split("||")[3]
            b = b.split("||")[3]
            print >> fw_ac, "\t".join((a, b, ev))
            print >> fw_ks, ",".join((";".join((a, b)), ks, ka, ks, ka))

    fw_ks.close()
    fw_ac.close()
def summary(args):
    """
    %prog summary cdhit.clstr

    Parse cdhit.clstr file to get distribution of cluster sizes.
    """
    from jcvi.graphics.histogram import loghistogram

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    clstrfile, = args
    assert clstrfile.endswith(".clstr")

    fp = open(clstrfile)
    # One data point per cluster: the number of member sequences.
    sizes = [len(members) for _, members in read_block(fp, ">")]
    loghistogram(sizes)
def iter_chain(self):
    """Yield a ChainLine for every 'chain' block in the file."""
    fp = open(self.filename)
    for chain, rows in read_block(fp, "chain"):
        yield ChainLine(chain, list(rows))
def iter_blocks(self, minsize=0):
    """Yield '#'-delimited blocks as lists of whitespace-split rows,
    keeping only blocks with at least `minsize` rows."""
    fp = open(self.filename)
    for _, rows in read_block(fp, "#"):
        parsed = [row.split() for row in rows]
        if len(parsed) >= minsize:
            yield parsed