Пример #1
0
def write_csv(header, contents, sep=",", filename="stdout", thousands=False, tee=False):
    """
    Write rows as csv, right-justified so the columns line up with the headers.

    >>> header = ["x_value", "y_value"]
    >>> contents = [(1, 100), (2, 200)]
    >>> write_csv(header, contents)
    x_value, y_value
          1,     100
          2,     200

    Args:
        header: sequence of column names; a falsy value omits the header row.
        contents: iterable of rows, each holding one cell per column. It is
            never modified.
        sep: column separator; a single space is appended to it.
        filename: output target, opened with `must_open(filename, "w")`.
        thousands: when true, integer-like cells are cast to int and those
            >= 1000 are passed through `jcvi.utils.cbook.thousands`.
        tee: when true and `filename` is not "stdout", every line is also
            echoed to standard output.

    Raises:
        AssertionError: if a row's length differs from the first row's.
    """
    import sys

    from jcvi.formats.base import must_open, is_number
    from jcvi.utils.cbook import thousands as th

    fw = must_open(filename, "w")

    # Build a fresh list so the caller's `contents` is never rewritten in
    # place (it used to be aliased whenever `header` was empty).
    rows = [header] if header else []
    rows.extend(contents)
    if not rows:
        # Nothing to write; avoids indexing into an empty table.
        return

    cols = len(rows[0])
    for row in rows:
        assert len(row) == cols, "expected {0} columns, got {1}".format(
            cols, len(row)
        )

    def stringify(cell):
        # Integer-like cells become ints; large ones get thousands formatting.
        if thousands and is_number(cell, cast=int):
            cell = int(cell)
            if cell >= 1000:
                cell = th(cell)
        return str(cell)

    table = [[stringify(cell) for cell in row] for row in rows]
    colwidths = [max(len(cell) for cell in column) for column in zip(*table)]

    sep += " "
    for row in table:
        formatted = sep.join(cell.rjust(w) for cell, w in zip(row, colwidths))
        # Plain .write() works on both Python 2 and 3, unlike `print >> fw`.
        fw.write(formatted + "\n")
        if tee and filename != "stdout":
            sys.stdout.write(formatted + "\n")
Пример #2
0
def load_csv(header, contents, sep=",", thousands=False, align=True):
    """
    Format tabular data as a list of csv-style strings, one per row.

    Args:
        header: sequence of column names; a falsy value omits the header row.
        contents: iterable of rows, each holding one cell per column. It is
            never modified.
        sep: column separator; a single space is appended to it.
        thousands: when true, integer-like cells are cast to int and those
            >= 1000 are passed through `jcvi.utils.cbook.thousands`.
        align: when true, cells are right-justified to the widest cell of
            their column; otherwise cells are joined as-is.

    Returns:
        A list of formatted strings (header first, if given). Empty input
        yields an empty list.

    Raises:
        AssertionError: if a row's length differs from the first row's.
    """
    # Build a fresh list so the caller's `contents` is never rewritten in
    # place (it used to be aliased whenever `header` was empty).
    rows = [header] if header else []
    rows.extend(contents)
    if not rows:
        return []

    cols = len(rows[0])
    for row in rows:
        assert len(row) == cols, "expected {0} columns, got {1}".format(
            cols, len(row)
        )

    if thousands:
        # Imported lazily: only the thousands formatting needs these helpers.
        from jcvi.formats.base import is_number
        from jcvi.utils.cbook import thousands as th

        def stringify(cell):
            # Integer-like cells become ints; large ones get formatted.
            if is_number(cell, cast=int):
                cell = int(cell)
                if cell >= 1000:
                    cell = th(cell)
            return str(cell)
    else:
        stringify = str

    table = [[stringify(cell) for cell in row] for row in rows]

    sep += " "
    if not align:
        return [sep.join(row) for row in table]

    colwidths = [max(len(cell) for cell in column) for column in zip(*table)]
    return [
        sep.join(cell.rjust(w) for cell, w in zip(row, colwidths))
        for row in table
    ]
Пример #3
0
def load_csv(header, contents, sep=",", thousands=False, align=True):
    """
    Format tabular data as a list of csv-style strings, one per row.

    Args:
        header: sequence of column names; a falsy value omits the header row.
        contents: iterable of rows, each holding one cell per column. It is
            never modified.
        sep: column separator; a single space is appended to it.
        thousands: when true, integer-like cells are cast to int and those
            >= 1000 are passed through `jcvi.utils.cbook.thousands`.
        align: when true, cells are right-justified to the widest cell of
            their column; otherwise cells are joined as-is.

    Returns:
        A list of formatted strings (header first, if given). Empty input
        yields an empty list.

    Raises:
        AssertionError: if a row's length differs from the first row's.
    """
    # Copy into a new list: the caller's `contents` must not be rewritten in
    # place (it used to be aliased whenever `header` was empty).
    rows = [header] if header else []
    rows.extend(contents)
    if not rows:
        return []

    cols = len(rows[0])
    for row in rows:
        assert len(row) == cols, "expected {0} columns, got {1}".format(
            cols, len(row)
        )

    if thousands:
        # Imported lazily: only the thousands formatting needs these helpers.
        from jcvi.formats.base import is_number
        from jcvi.utils.cbook import thousands as th

        def stringify(cell):
            # Integer-like cells become ints; large ones get formatted.
            if is_number(cell, cast=int):
                cell = int(cell)
                if cell >= 1000:
                    cell = th(cell)
            return str(cell)
    else:
        stringify = str

    table = [[stringify(cell) for cell in row] for row in rows]

    sep += " "
    if not align:
        return [sep.join(row) for row in table]

    colwidths = [max(len(cell) for cell in column) for column in zip(*table)]
    return [
        sep.join(cell.rjust(w) for cell, w in zip(row, colwidths))
        for row in table
    ]
Пример #4
0
Файл: bed.py Проект: yangjl/jcvi
def random(args):
    """
    %prog random bedfile number_of_features

    Extract a random subset of features. Number of features can be an integer
    number, or a fractional number in which case a random fraction (for example
    0.1 = 10% of all features) will be extracted.
    """
    from random import sample
    from jcvi.formats.base import flexible_cast

    # The docstring above doubles as the usage text shown by the parser.
    parser = OptionParser(random.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    bedfile, N = args
    assert is_number(N)

    features = Bed(bedfile)
    count = flexible_cast(N)
    if count < 1:
        # A value below 1 is a fraction of the features in the file.
        count = int(round(count * len(features)))

    picked = sample(features, count)
    subset = Bed()
    subset.extend(picked)

    # Output name: input minus its last extension, plus ".<N>.bed".
    prefix = bedfile.rsplit(".", 1)[0]
    outfile = prefix + ".{0}.bed".format(N)
    subset.print_to_file(outfile)
    logging.debug("Write {0} features to `{1}`".format(count, outfile))
Пример #5
0
def guess_method(tag):
    """
    Classify a comma-separated tag.

    Returns "jobid" when every comma-separated piece of `tag` passes
    `is_number`, and "pattern" as soon as one piece does not.
    """
    from jcvi.formats.base import is_number

    every_piece_numeric = all(is_number(piece) for piece in tag.split(","))
    return "jobid" if every_piece_numeric else "pattern"
Пример #6
0
 def gffline(self, type='match', source='default'):
     """
     Render this feature as one tab-delimited, nine-column row.

     Columns: seqid, source, type, start + 1, end, score, strand, '.',
     and 'ID=' followed by accn. A missing or non-numeric score and a
     missing strand are both written as ".".
     """
     if self.score and is_number(self.score):
         score = self.score
     else:
         score = "."
     strand = self.strand or "."
     fields = [
         self.seqid,
         source,
         type,
         str(self.start + 1),
         str(self.end),
         score,
         strand,
         '.',
         'ID=' + self.accn,
     ]
     return "\t".join(fields)
Пример #7
0
def guess_method(tag):
    """
    Decide whether `tag` is a list of job ids or a name pattern.

    `tag` is split on commas; the result is "jobid" only if each piece
    passes `is_number`, otherwise "pattern".
    """
    from jcvi.formats.base import is_number

    pieces = tag.split(",")
    if all(is_number(piece) for piece in pieces):
        return "jobid"
    return "pattern"
Пример #8
0
Файл: bed.py Проект: yangjl/jcvi
 def gffline(self, type='match', source='default'):
     """
     Build a tab-separated row with nine fields for this feature.

     Fields, in order: seqid, source, type, start + 1, end, score, strand,
     a literal '.', and 'ID=' + accn. The score falls back to "." when it
     is empty or fails `is_number`; the strand falls back to "." when empty.
     """
     has_numeric_score = self.score and is_number(self.score)
     score = self.score if has_numeric_score else "."
     strand = self.strand if self.strand else "."
     begin = str(self.start + 1)
     finish = str(self.end)
     attributes = 'ID=' + self.accn
     columns = (self.seqid, source, type, begin, finish,
                score, strand, '.', attributes)
     return "\t".join(columns)
Пример #9
0
def blat(args):
    """
    %prog blat map1.txt ref.fasta

    Make ALLMAPS input csv based on sequences. The tab-delimited txt file
    include: name, LG, position, sequence.
    """
    from jcvi.formats.base import is_number
    from jcvi.formats.blast import best as blast_best, bed as blast_bed
    from jcvi.apps.align import blat as blat_align

    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(blat.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    maptxt, ref = args
    pf = maptxt.rsplit(".", 1)[0]
    fastafile = pf + ".fasta"

    # Step 1: dump every marker sequence to FASTA. Rows whose position is not
    # numeric are skipped. The `with` block guarantees both handles are closed
    # (and the FASTA flushed) before the aligner reads it.
    with open(maptxt) as fp, open(fastafile, "w") as fw:
        for row in fp:
            name, lg, pos, seq = row.split()
            if not is_number(pos):
                continue
            print(">{0}\n{1}\n".format(name, seq), file=fw)

    # Step 2: align, keep best hits, convert to BED, and index by name.
    blatfile = blat_align([ref, fastafile])
    bestfile = blast_best([blatfile])
    bedfile = blast_bed([bestfile])
    b = Bed(bedfile).order

    pf = ".".join((op.basename(maptxt).split(".")[0],
                   op.basename(ref).split(".")[0]))
    csvfile = pf + ".csv"

    # Step 3: re-read the map and emit one csv row per marker that has an
    # entry in the BED index: scaffold, scaffold position, LG, map position.
    with open(maptxt) as fp, open(csvfile, "w") as fw:
        for row in fp:
            name, lg, pos, seq = row.split()
            if name not in b:
                continue
            _, bb = b[name]
            scaffold, scaffold_pos = bb.seqid, bb.start
            print(",".join(str(x) for x in
                           (scaffold, scaffold_pos, lg, pos)), file=fw)
Пример #10
0
def blat(args):
    """
    %prog blat map1.txt ref.fasta

    Make ALLMAPS input csv based on sequences. The tab-delimited txt file
    include: name, LG, position, sequence.
    """
    from jcvi.formats.base import is_number
    from jcvi.formats.blast import best as blast_best, bed as blast_bed
    from jcvi.apps.align import blat as blat_align

    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(blat.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    maptxt, ref = args
    pf = maptxt.rsplit(".", 1)[0]
    fastafile = pf + ".fasta"

    # Write the marker sequences as FASTA, skipping rows whose position is
    # not numeric. Context managers close both handles even on error, so the
    # FASTA is fully flushed before the aligner opens it.
    with open(maptxt) as fp, open(fastafile, "w") as fw:
        for row in fp:
            name, lg, pos, seq = row.split()
            if not is_number(pos):
                continue
            print(">{0}\n{1}\n".format(name, seq), file=fw)

    # Align, keep best hits, convert to BED, and index the features by name.
    blatfile = blat_align([ref, fastafile])
    bestfile = blast_best([blatfile])
    bedfile = blast_bed([bestfile])
    b = Bed(bedfile).order

    pf = ".".join(
        (op.basename(maptxt).split(".")[0], op.basename(ref).split(".")[0]))
    csvfile = pf + ".csv"

    # Second pass over the map: one csv row per marker found in the BED
    # index, as scaffold, scaffold position, LG, map position.
    with open(maptxt) as fp, open(csvfile, "w") as fw:
        for row in fp:
            name, lg, pos, seq = row.split()
            if name not in b:
                continue
            _, bb = b[name]
            scaffold, scaffold_pos = bb.seqid, bb.start
            print(",".join(str(x) for x in
                           (scaffold, scaffold_pos, lg, pos)), file=fw)
Пример #11
0
def read_meme(fi):
    """
    Collect one `[motif_id, width, score]` entry per 'MOTIF' block in a file.

    Args:
        fi: path of the file to read.

    Returns:
        A list of three-item lists. `motif_id` is the second space-separated
        token of the block header. `score` is the third token converted to
        float when it passes `is_number`, the raw token otherwise, or '' when
        the header has fewer than three tokens. `width` is the number of
        content items in the block minus 2.
    """
    mtfs = []
    # The context manager closes the handle once every block is consumed;
    # previously it was left open.
    with open(fi, 'r') as fhi:
        for head, content in read_block(fhi, 'MOTIF'):
            ps = head.split(' ')
            # Unpacking two items keeps the ValueError on one-token headers.
            _, mid = ps[:2]
            score = ps[2] if len(ps) >= 3 else ''
            if is_number(score):
                score = float(score)
            # NOTE(review): the "- 2" presumably discounts two non-matrix
            # lines in each block -- confirm against read_block's output.
            width = len(content) - 2
            mtfs.append([mid, width, score])
    return mtfs
Пример #12
0
def rename_seqid(seqid):
    """
    Shorten a sequence identifier.

    Keeps only the text after the last underscore, abbreviates "supercont"
    and "scaffold" to "s" and "contig" to "c", and, if what remains passes
    `is_number(..., int)`, returns it as "c<integer>".
    """
    short = seqid.rsplit("_", 1)[-1]
    abbreviations = (("supercont", "s"), ("contig", "c"), ("scaffold", "s"))
    for long_form, abbrev in abbreviations:
        short = short.replace(long_form, abbrev)
    if is_number(short, int):
        return "c{}".format(int(short))
    return short
Пример #13
0
def frommaf(args):
    """
    %prog frommaf maffile

    Convert to four-column tabular format from MAF.
    """
    # The docstring above doubles as the usage text shown by the parser.
    # NOTE(review): the output written below is a VCFv4.0 file, not a
    # four-column table -- the usage text may be stale.
    p = OptionParser(frommaf.__doc__)
    p.add_option("--validate", help="Validate coordinates against FASTA")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (maf,) = args
    snpfile = maf.rsplit(".", 1)[0] + ".vcf"
    total = 0
    # Constant VCF columns; named so they do not shadow the builtins
    # `id` and `filter`.
    snp_id = "."
    qual = 20
    filter_status = "PASS"
    info = "DP=20"

    # Context managers close both handles even if a row fails to parse, and
    # make sure the VCF is flushed before the optional validation re-reads it.
    with open(maf) as fp, open(snpfile, "w") as fw:
        print("##fileformat=VCFv4.0", file=fw)
        print("#CHROM POS ID REF ALT QUAL FILTER INFO".replace(" ", "\t"), file=fw)
        for row in fp:
            atoms = row.split()
            c, pos, ref, alt = atoms[:4]
            # Rows whose first field is not an integer are skipped.
            if not is_number(c, int):
                continue
            c = "chr{0:02d}".format(int(c))
            pos = int(pos)
            print(
                "\t".join(
                    str(x)
                    for x in (c, pos, snp_id, ref, alt, qual, filter_status, info)
                ),
                file=fw,
            )
            total += 1

    validate = opts.validate
    if not validate:
        return

    from jcvi.utils.cbook import percentage

    # Validation: every REF in the VCF just written must equal the FASTA
    # sequence at that single position.
    f = Fasta(validate)
    nsnps = 0
    with open(snpfile) as fp:
        for row in fp:
            if row[0] == "#":
                continue

            # Unpacking exactly eight fields keeps the error on malformed rows.
            c, pos, _id, ref, alt, _qual, _filter, _info = row.split("\t")
            pos = int(pos)
            feat = dict(chr=c, start=pos, stop=pos)
            s = f.sequence(feat)
            s = str(s)
            assert s == ref, "Validation error: {0} is {1} (expect: {2})".format(
                feat, s, ref
            )
            nsnps += 1
            if nsnps % 50000 == 0:
                logging.debug("SNPs parsed: {0}".format(percentage(nsnps, total)))
    logging.debug(
        "A total of {0} SNPs validated and written to `{1}`.".format(nsnps, snpfile)
    )