Example #1
def cnsfix(args):
    """
    %prog cnsfix consensus-fix.out.FAILED > blacklist.ids

    Parse consensus-fix.out to extract layouts for fixed unitigs. This will
    mark all the failed fragments detected by utgcnsfix and pop them out of the
    existing unitigs.
    """
    from jcvi.formats.base import read_block

    p = OptionParser(cnsfix.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    cnsfixout, = args
    fp = open(cnsfixout)
    utgs = []
    saves = []
    for header, contents in read_block(fp, "Evaluating"):
        contents = list(contents)
        utg = header.split()[2]
        utgs.append(utg)
        # Look for this line:
        #   save fragment idx=388 ident=206054426 for next pass
        for c in contents:
            if not c.startswith("save"):
                continue
            ident = c.split()[3].split("=")[-1]
            saves.append(ident)
    print "\n".join(saves)
Example #2
def dreme2tsv(args):
    fh = must_open(args.fi)

    start = fh.tell()
    pos_c, neg_c = '', ''
    for line in fh:
        ps = re.split(r" +", line)
        if len(ps) >= 2 and ps[1].startswith('positives'):
            pos_c = ps[2]
        elif len(ps) >= 2 and ps[1].startswith('negatives'):
            neg_c = ps[2]
            break
    fh.seek(start)

    i = 0
    fho = must_open(args.fo, 'w')
    fho.write("mid\tre\tseq\tseq_rc\tpos\tneg\tpos_c\tneg_c\tpval\teval\n")
    for head, lines in read_block(fh, 'MOTIF'):
        if not head: break
        i += 1
        mtf, conseq, mid = head.split(" ")
        for line in lines:
            if line.startswith("#"):
                line = line.replace("#", '').strip()
                ps = re.split(r" +", line)
                if ps[0] in ['Word', 'Stopping', 'Running']: continue
                tag = ''
                if ps[0] == 'BEST':
                    tag = 'RE'
                    ps = ps[1:]
                seq, seq_rc, pos, neg, pval, eval = ps
                fho.write("\t".join([
                    mid, tag, seq, seq_rc, pos, neg, pos_c, neg_c, pval, eval
                ]) + '\n')
    fh.close()
Example #3
    def __init__(self, filename):
        super(ClstrFile, self).__init__(filename)
        assert filename.endswith(".clstr")

        fp = open(filename)
        for clstr, members in read_block(fp, ">"):
            self.append([ClstrLine(x) for x in members])
Example #4
    def __init__(self, filename):
        super(ClstrFile, self).__init__(filename)
        assert filename.endswith(".clstr")

        fp = open(filename)
        for clstr, members in read_block(fp, ">"):
            self.append([ClstrLine(x) for x in members])
Example #5
File: agp.py Project: bennyyu/jcvi
    def __init__(self, filename=None, ctgsizes=None):
        super(OO, self).__init__(filename)

        if filename is None:
            return

        from jcvi.formats.base import read_block

        fp = open(filename)
        prefix = "contig_"
        self.contigs = set()
        for header, block in read_block(fp, ">"):
            header = header[1:]  # Trim the '>'
            header = header.split()[0]
            for b in block:
                ctg, orientation = b.split()
                if ctg.startswith(prefix):
                    ctg = ctg[len(prefix):]

                assert orientation in ("BE", "EB")

                strand = "+" if orientation == "BE" else "-"
                ctgsize = ctgsizes[ctg]
                self.add(header, ctg, ctgsize, strand)
                self.contigs.add(ctg)
Example #6
def parse_names(lstfile):
    """
    Parse the alternative `lstfile` format. In this format there are two
    sections, starting with [Sequence] and [Manuscript] respectively, each
    followed by author names separated by commas.
    """
    from jcvi.formats.base import read_block

    fp = open(lstfile)
    all_authors = []
    for header, seq in read_block(fp, "["):
        seq = " ".join(seq)
        authors = []
        for au in seq.split(","):
            au = au.strip()
            if not au:
                continue
            au = string.translate(au, None, string.digits)
            #au = au.replace("-", '')
            authors.append(au)
        all_authors.append(authors)

    out = []
    for authors in all_authors:
        blocks = []
        for au in authors:
            last, first, initials = get_name_parts(au)
            suffix = ""
            nameblock = NameTemplate.format(last=last, first=first,
                    initials=initials, suffix=suffix)
            blocks.append(nameblock)
        bigblock = ",\n".join(blocks)
        out.append(bigblock)

    return out
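One portability note on the snippet above: string.translate(au, None, string.digits) only exists on Python 2. On Python 3, the same digit stripping (dropping the numeric affiliation markers after each author name) can be written with str.translate:

import string

au = "Smith J.2"  # hypothetical author entry with a trailing affiliation digit
strip_digits = str.maketrans("", "", string.digits)
print(au.translate(strip_digits))  # -> "Smith J."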
Example #7
    def __init__(self, filename=None, ctgsizes=None):
        super(OO, self).__init__(filename)

        if filename is None:
            return

        from jcvi.formats.base import read_block

        fp = open(filename)
        prefix = "contig_"
        self.contigs = set()
        for header, block in read_block(fp, ">"):
            header = header[1:]  # Trim the '>'
            header = header.split()[0]
            for b in block:
                ctg, orientation = b.split()
                if ctg.startswith(prefix):
                    ctg = ctg[len(prefix):]

                assert orientation in ("BE", "EB")

                strand = "+" if orientation == "BE" else "-"
                ctgsize = ctgsizes[ctg]
                self.add(header, ctg, ctgsize, strand)
                self.contigs.add(ctg)
Example #8
def dust2bed(args):
    """
    %prog dust2bed fastafile

    Use dustmasker to find low-complexity regions (LCRs) in the genome.
    """
    from jcvi.formats.base import read_block

    p = OptionParser(dust2bed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    interval = fastafile + ".iv"
    if need_update(fastafile, interval):
        cmd = "dustmasker -in {0}".format(fastafile)
        sh(cmd, outfile=interval)

    fp = open(interval)
    bedfile = fastafile.rsplit(".", 1)[0] + ".dust.bed"
    fw = must_open(bedfile, "w")
    nlines = 0
    nbases = 0
    for header, block in read_block(fp, ">"):
        header = header.strip(">")
        for b in block:
            start, end = b.split(" - ")
            start, end = int(start), int(end)
            print >> fw, "\t".join(str(x) for x in (header, start, end))
            nlines += 1
            nbases += end - start
    logging.debug("A total of {0} DUST intervals ({1} bp) exported to `{2}`".\
                    format(nlines, nbases, bedfile))
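For reference, the interval file parsed above looks roughly like the sample below: a '>' header per sequence followed by 'start - end' pairs (this sample is reconstructed from the parsing code, not taken from real dustmasker output). Each pair becomes one BED line:

from io import StringIO

fake_interval = """>chr1
100 - 150
300 - 420
"""

header = None
for line in StringIO(fake_interval):
    line = line.rstrip()
    if line.startswith(">"):
        header = line.strip(">")
    elif line:
        start, end = (int(x) for x in line.split(" - "))
        print("\t".join(str(x) for x in (header, start, end)))
# prints:
#   chr1    100     150
#   chr1    300     420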
Example #9
def add_score(args):
    mtfs = read_streme_xml(args.fx)
    fhi = open(args.fi, 'r')
    fho = open(args.fo, 'w')
    #
    start = "MOTIF"
    while 1:
        pos = fhi.tell()
        line = fhi.readline()
        if not line:
            break
        if line.startswith(start):
            fhi.seek(pos)
            break
        else:
            fho.write(line)
    #
    i = 0
    for head, content in read_block(fhi, 'MOTIF'):
        pre, mid, alt = head.split(' ')
        assert mtfs[i]['id'] == mid, 'motifs not in sync'
        new_head = f"{pre} {mid} {mtfs[i]['score_threshold']}"
        fho.write(new_head + "\n")
        for line in content:
            if line.startswith("0") or line.startswith("1"):
                fho.write(" " + line + "\n")
            else:
                fho.write(line + "\n")
        i += 1
    fhi.close()
    fho.close()
Example #10
def cnsfix(args):
    """
    %prog cnsfix consensus-fix.out.FAILED > blacklist.ids

    Parse consensus-fix.out to extract layouts for fixed unitigs. This will
    mark all the failed fragments detected by utgcnsfix and pop them out of the
    existing unitigs.
    """
    from jcvi.formats.base import read_block

    p = OptionParser(cnsfix.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    cnsfixout, = args
    fp = open(cnsfixout)
    utgs = []
    saves = []
    for header, contents in read_block(fp, "Evaluating"):
        contents = list(contents)
        utg = header.split()[2]
        utgs.append(utg)
        # Look for this line:
        #   save fragment idx=388 ident=206054426 for next pass
        for c in contents:
            if not c.startswith("save"):
                continue
            ident = c.split()[3].split("=")[-1]
            saves.append(ident)
    print "\n".join(saves)
Example #11
    def iter_scaffold(self):
        filename = self.filename
        sizes = self.sizes
        fp = open(filename)
        for header, lines in read_block(fp, ">"):
            scaffold, size, tigs = header[1:].split("|")
            lines = [EvidenceLine(x, sizes) for x in lines if x.strip()]
            yield scaffold, lines
Example #12
    def iter_scaffold(self):
        filename = self.filename
        sizes = self.sizes
        fp = open(filename)
        for header, lines in read_block(fp, ">"):
            scaffold, size, tigs = header[1:].split("|")
            lines = [EvidenceLine(x, sizes) for x in lines if x.strip()]
            yield scaffold, lines
Example #13
    def iter_chain(self):
        fp = open(self.filename)
        for row in fp:
            if row[0] != '#':
                break

        for chain, lines in read_block(fp, "chain"):
            lines = list(lines)
            yield ChainLine(chain, lines)
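The blocks consumed here are UCSC chain records: a 'chain' header carrying the score plus target/query names, sizes, strands and coordinates, followed by 'size dt dq' alignment lines and a final ungapped block size. A small hand-made (and internally consistent) chain block, pulling out just a few header fields:

from io import StringIO

fake_chain = """chain 1000 chr1 1000 + 0 35 chr2 900 + 100 130 1
10 5 0
20
"""

for line in StringIO(fake_chain):
    fields = line.split()
    if fields and fields[0] == "chain":
        score, tname, tstart, tend = fields[1], fields[2], fields[5], fields[6]
        qname, qstart, qend = fields[7], fields[10], fields[11]
        print(score, tname, tstart, tend, qname, qstart, qend)
# prints: 1000 chr1 0 35 chr2 100 130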
Example #14
    def iter_chain(self):
        fp = open(self.filename)
        for row in fp:
            if row[0] != '#':
                break

        for chain, lines in read_block(fp, "chain"):
            lines = list(lines)
            yield ChainLine(chain, lines)
Example #15
    def iter_records(self):
        c = None
        for a, b in read_block(self.fp, "#"):
            if a[:2] == '##':
                if c:
                    yield c
                c = ContigLine(a)
            else:
                c.reads.append(ReadLine(a, c.id))
        if c:  # last one
            yield c
Example #16
    def iter_records(self):
        c = None
        for a, b in read_block(self.fp, "#"):
            if a[:2] == '##':
                if c:
                    yield c
                c = ContigLine(a)
            else:
                c.reads.append(ReadLine(a, c.id))
        if c:  # last one
            yield c
Example #17
    def __init__(self, filename):
        super(BinMap, self).__init__(filename)

        fp = open(filename)
        for header, seq in read_block(fp, "group "):
            lg = header.split()[-1]
            self[lg] = []
            for s in seq:
                if s.strip() == '' or s[0] == ';':
                    continue
                marker, pos = s.split()
                pos = int(float(pos) * 1000)
                self[lg].append((marker, pos))
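Judging from the parser, the input is a linkage-map text with one 'group' block per linkage group, each listing 'marker position' pairs (positions in map units such as cM) and allowing ';' comment lines. A small hand-made sample and the equivalent of the parsing above, without the jcvi dependency:

from io import StringIO

fake_map = """group LG1
m1 0.000
m2 1.500
; lines starting with ';' are comments
group LG2
m3 2.250
"""

binmap = {}
lg = None
for line in StringIO(fake_map):
    line = line.rstrip()
    if line.startswith("group "):
        lg = line.split()[-1]
        binmap[lg] = []
    elif line and line[0] != ";":
        marker, pos = line.split()
        binmap[lg].append((marker, int(float(pos) * 1000)))  # scale to integer units
print(binmap)  # {'LG1': [('m1', 0), ('m2', 1500)], 'LG2': [('m3', 2250)]}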
Example #18
def read_trees(tree):
    from urlparse import parse_qs
    from jcvi.formats.base import read_block

    trees = []

    fp = open(tree)
    for header, tx in read_block(fp, "#"):
        header = parse_qs(header[1:])
        label = header["label"][0].strip("\"")
        outgroup = header["outgroup"]
        trees.append((label, outgroup, "".join(tx)))

    return trees
Example #19
File: tree.py Project: fw1121/jcvi
def read_trees(tree):
    from urlparse import parse_qs
    from jcvi.formats.base import read_block

    trees = []

    fp = open(tree)
    for header, tx in read_block(fp, "#"):
        header = parse_qs(header[1:])
        label = header["label"][0].strip("\"")
        outgroup = header["outgroup"]
        trees.append((label, outgroup, "".join(tx)))

    return trees
Example #20
def read_trees(tree):
    from six.moves.urllib.parse import parse_qs
    from jcvi.formats.base import read_block

    trees = []

    fp = open(tree)
    for header, tx in read_block(fp, "#"):
        header = parse_qs(header[1:])
        label = header["label"][0].strip("\"")
        outgroup = header["outgroup"]
        color, = header.get("color", ["k"])
        trees.append((label, outgroup, color, "".join(tx)))

    return trees
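The '#' headers consumed by read_trees are query-string style parameter lines carrying label, outgroup and optionally color, which is why parse_qs is used. A quick illustration on a hypothetical header, mirroring the lookups above:

from urllib.parse import parse_qs

# Hypothetical header line (after stripping the leading '#'):
header = 'label="tree1"&outgroup=Atha&color=g'
params = parse_qs(header)
label = params["label"][0].strip('"')
outgroup = params["outgroup"]
color, = params.get("color", ["k"])
print(label, outgroup, color)  # -> tree1 ['Atha'] g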
Example #21
File: fimo.py Project: orionzhou/nf
def read_meme(fi):
    mtfs = []
    fhi = open(fi, 'r')
    for head, content in read_block(fhi, 'MOTIF'):
        ps = head.split(' ')
        pre, mid = ps[:2]
        score = ''
        if len(ps) >= 3:
            score = ps[2]
        #mtf = mid.split("-")[1]
        if is_number(score):
            score = float(score)
        width = len(content) - 2
        mtfs.append([mid, width, score])
        #print(mid,'\t',width)
    return mtfs
Example #22
File: fpc.py Project: zjwang6/jcvi
    def __iter__(self):
        line = self._handle.readline()
        if not line.startswith(bac_tag):
            read_until(self._handle, bac_tag)

        for header, seq in read_block(self._handle, bac_tag):

            rec = FpcRecord()
            assert header.startswith(bac_tag)
            rec.bac_name = header.split('\"')[1]

            for line in seq:
                if line.startswith("Map"):
                    rec.ctg_name = line.split('\"')[1]
                    if "Left" in line:
                        rec.map_left = line.split("Left")[1].split()[0]
                        rec.map_left = int(float(rec.map_left))
                    if "Right" in line:
                        rec.map_right = line.split("Right")[1].split()[0]
                        rec.map_right = int(float(rec.map_right))
                if line.startswith("Gel_number"):
                    rec.gel_number = line.split()[-1]
                if line.startswith("Fp_number"):
                    rec.fp_number = line.split()[-1]
                if line.startswith("Bands"):
                    rec.bands = line.split()[-1]
                if line.startswith("match_to_cosmid"):
                    rec.cosmid = line.split('\"')[1]
                    rec.cosmid_match = line.split("_match_")[0]
                if line.startswith("Positive"):
                    rec.probes.append(line.split('\"')[1])
                if line.startswith("Fpc_remark"):
                    rec.remark.append(line.split('\"')[1])
                if line.startswith("Creation_date"):
                    rec.cre_date = line.split('_date')[1].strip()
                if line.startswith("Modified_date"):
                    rec.mod_date = line.split('_date')[1].strip()

            yield rec
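Most fields of an FpcRecord are pulled out of double-quoted strings, or out of the tokens following 'Left'/'Right'. The line layouts below are hand-made to illustrate the splitting, and are assumed from the parsing code rather than from the FPC specification:

header = 'BAC : "b0123F04"'                               # hypothetical BAC header line
map_line = 'Map "ctg77" Ends Left 101.000 Right 143.000'  # hypothetical Map line

bac_name = header.split('"')[1]                                 # -> b0123F04
ctg_name = map_line.split('"')[1]                               # -> ctg77
map_left = int(float(map_line.split("Left")[1].split()[0]))     # -> 101
map_right = int(float(map_line.split("Right")[1].split()[0]))   # -> 143
print(bac_name, ctg_name, map_left, map_right)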
Example #23
File: fpc.py Project: bennyyu/jcvi
    def __iter__(self):
        line = self._handle.readline()
        if not line.startswith(bac_tag):
            read_until(self._handle, bac_tag)

        for header, seq in read_block(self._handle, bac_tag):

            rec = FpcRecord()
            assert header.startswith(bac_tag)
            rec.bac_name = header.split('\"')[1]

            for line in seq:
                if line.startswith("Map"):
                    rec.ctg_name = line.split('\"')[1]
                    if "Left" in line:
                        rec.map_left = line.split("Left")[1].split()[0]
                        rec.map_left = int(float(rec.map_left))
                    if "Right" in line:
                        rec.map_right = line.split("Right")[1].split()[0]
                        rec.map_right = int(float(rec.map_right))
                if line.startswith("Gel_number"):
                    rec.gel_number = line.split()[-1]
                if line.startswith("Fp_number"):
                    rec.fp_number = line.split()[-1]
                if line.startswith("Bands"):
                    rec.bands = line.split()[-1]
                if line.startswith("match_to_cosmid"):
                    rec.cosmid = line.split('\"')[1]
                    rec.cosmid_match = line.split("_match_")[0]
                if line.startswith("Positive"):
                    rec.probes.append(line.split('\"')[1])
                if line.startswith("Fpc_remark"):
                    rec.remark.append(line.split('\"')[1])
                if line.startswith("Creation_date"):
                    rec.cre_date = line.split('_date')[1].strip()
                if line.startswith("Modified_date"):
                    rec.mod_date = line.split('_date')[1].strip()

            yield rec
Example #24
def coge(args):
    """
    %prog coge cogefile

    Convert CoGe file to anchors file.
    """
    p = OptionParser(coge.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    cogefile, = args
    fp = must_open(cogefile)
    cogefile = cogefile.replace(".gz", "")
    ksfile = cogefile + ".ks"
    anchorsfile = cogefile + ".anchors"
    fw_ks = must_open(ksfile, "w")
    fw_ac = must_open(anchorsfile, "w")

    tag = "###"
    print >> fw_ks, tag
    for header, lines in read_block(fp, tag):
        print >> fw_ac, tag
        lines = list(lines)
        for line in lines:
            if line[0] == '#':
                continue
            ks, ka, achr, a, astart, astop, bchr, \
                    b, bstart, bstop, ev, ss = line.split()
            a = a.split("||")[3]
            b = b.split("||")[3]
            print >> fw_ac, "\t".join((a, b, ev))
            print >> fw_ks, ",".join((";".join((a, b)), ks, ka, ks, ka))

    fw_ks.close()
    fw_ac.close()
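Each data line in the CoGe dump carries twelve whitespace-separated fields, and the gene identifiers sit inside '||'-delimited composite fields; a.split("||")[3] above keeps only the name. A quick check on hypothetical composite fields (the assumption that the name occupies the fourth slot comes from the code, not from CoGe documentation):

a = "1||8863||10123||AT1G01020||-1||CDS"
b = "3||11215||12010||Os03g0100100||1||CDS"
print("\t".join((a.split("||")[3], b.split("||")[3])))  # -> AT1G01020  Os03g0100100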
Example #25
def coge(args):
    """
    %prog coge cogefile

    Convert CoGe file to anchors file.
    """
    p = OptionParser(coge.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    cogefile, = args
    fp = must_open(cogefile)
    cogefile = cogefile.replace(".gz", "")
    ksfile = cogefile + ".ks"
    anchorsfile = cogefile + ".anchors"
    fw_ks = must_open(ksfile, "w")
    fw_ac = must_open(anchorsfile, "w")

    tag = "###"
    print >> fw_ks, tag
    for header, lines in read_block(fp, tag):
        print >> fw_ac, tag
        lines = list(lines)
        for line in lines:
            if line[0] == '#':
                continue
            ks, ka, achr, a, astart, astop, bchr, \
                    b, bstart, bstop, ev, ss = line.split()
            a = a.split("||")[3]
            b = b.split("||")[3]
            print >> fw_ac, "\t".join((a, b, ev))
            print >> fw_ks, ",".join((";".join((a, b)), ks, ka, ks, ka))

    fw_ks.close()
    fw_ac.close()
Example #26
def summary(args):
    """
    %prog summary cdhit.clstr

    Parse cdhit.clstr file to get distribution of cluster sizes.
    """
    from jcvi.graphics.histogram import loghistogram

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    clstrfile, = args
    assert clstrfile.endswith(".clstr")

    fp = open(clstrfile)
    data = []
    for clstr, members in read_block(fp, ">"):
        size = len(members)
        data.append(size)

    loghistogram(data)
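The .clstr file written by CD-HIT groups member sequences under '>Cluster N' headers, so len(members) above is simply the cluster size. A tiny example input and the same size counting, without the jcvi dependency:

from io import StringIO

fake_clstr = """>Cluster 0
0\t1096nt, >seq0001... *
1\t1080nt, >seq0042... at +/98.33%
>Cluster 1
0\t850nt, >seq0100... *
"""

sizes, members = [], None
for line in StringIO(fake_clstr):
    if line.startswith(">"):
        if members is not None:
            sizes.append(len(members))
        members = []
    else:
        members.append(line)
if members is not None:
    sizes.append(len(members))
print(sizes)  # -> [2, 1]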
Example #27
def summary(args):
    """
    %prog summary cdhit.clstr

    Parse cdhit.clstr file to get distribution of cluster sizes.
    """
    from jcvi.graphics.histogram import loghistogram

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    clstrfile, = args
    assert clstrfile.endswith(".clstr")

    fp = open(clstrfile)
    data = []
    for clstr, members in read_block(fp, ">"):
        size = len(members)
        data.append(size)

    loghistogram(data)
Example #28
    def iter_chain(self):
        fp = open(self.filename)
        for chain, lines in read_block(fp, "chain"):
            lines = list(lines)
            yield ChainLine(chain, lines)
Example #29
    def iter_blocks(self, minsize=0):
        fp = open(self.filename)
        for header, lines in read_block(fp, "#"):
            lines = [x.split() for x in lines]
            if len(lines) >= minsize:
                yield lines
Example #30
    def iter_blocks(self, minsize=0):
        fp = open(self.filename)
        for header, lines in read_block(fp, "#"):
            lines = [x.split() for x in lines]
            if len(lines) >= minsize:
                yield lines