Exemplo n.º 1
def filterdata(args):
    """
    %prog filterdata data.bin samples.ids STR.ids allele_freq remove.ids final.ids

    Filter subset of data after dropping remove.ids.
    """
    # The docstring above doubles as the command-line usage text handed to
    # OptionParser, so its wording is kept verbatim.
    #
    # args: list of command-line tokens; exactly six positionals are expected.
    # Outputs (current directory): final.mask.tsv, filtered.bin, final.data.tsv.
    p = OptionParser(filterdata.__doc__)
    # NOTE(review): opts.cpus is read below, so a cpus option has to be
    # registered on the parser; set_cpus() is assumed to be the project
    # helper that does this -- confirm against the OptionParser class.
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 6:
        sys.exit(not p.print_help())

    binfile, sampleids, strids, af, remove, final = args
    df, m, samples, loci = read_binfile(binfile, sampleids, strids)
    with open(remove) as fp:
        remove = [x.strip() for x in fp]
    removes = set(remove)  # constant-time membership tests below
    with open(final) as fp:
        final = [x.strip() for x in fp]
    assert len(loci) == len(remove) + len(final)

    # One whitespace-separated "<name> <counts>" record per line
    percentiles = {}
    with open(af) as fp:
        for row in fp:
            sname, counts = row.split()
            countsd = af_to_counts(counts)
            percentiles[sname] = counts_to_percentile(countsd)

    # One task per locus that survives the filter: (column index, column
    # values, percentile table for that locus)
    run_args = []
    for i, sname in enumerate(loci):
        if sname in removes:
            continue
        run_args.append((i, m[:, i], percentiles[sname]))

    cpus = min(opts.cpus, len(run_args))
    pool = Pool(processes=cpus)
    try:
        # .get() blocks until all tasks finish and returns the results in
        # the same order as run_args
        res = pool.map_async(convert_to_percentile, run_args).get()
    finally:
        pool.close()
        pool.join()

    # Write mask (P-value) matrix
    ii, pvalues = zip(*res)
    m = np.vstack(pvalues).T
    write_csv("final.mask.tsv", m, samples, final)

    df.drop(remove, inplace=True, axis=1)
    df.columns = final

    # Save a copy of the raw numpy array
    filtered_bin = "filtered.bin"
    # .values replaces the deprecated DataFrame.as_matrix()
    m = df.values
    m[m < 0] = -1
    m.tofile(filtered_bin)
    logging.debug("Binary matrix written to `{}`".format(filtered_bin))

    # Write data output
    df.to_csv("final.data.tsv", sep="\t", index_label="SampleKey")
Exemplo n.º 2
Arquivo: str.py Projeto: qiao-xin/jcvi
def filterdata(args):
    """
    %prog filterdata data.bin samples.ids STR.ids allele_freq remove.ids final.ids

    Filter subset of data after dropping remove.ids.
    """
    # The docstring above doubles as the command-line usage text handed to
    # OptionParser, so its wording is kept verbatim.
    #
    # args: list of command-line tokens; exactly six positionals are expected.
    # Outputs (current directory): final.mask.tsv, filtered.bin, final.data.tsv.
    p = OptionParser(filterdata.__doc__)
    # NOTE(review): opts.cpus is read below, so a cpus option has to be
    # registered on the parser; set_cpus() is assumed to be the project
    # helper that does this -- confirm against the OptionParser class.
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 6:
        sys.exit(not p.print_help())

    binfile, sampleids, strids, af, remove, final = args
    df, m, samples, loci = read_binfile(binfile, sampleids, strids)
    with open(remove) as fp:
        remove = [x.strip() for x in fp]
    removes = set(remove)  # constant-time membership tests below
    with open(final) as fp:
        final = [x.strip() for x in fp]
    assert len(loci) == len(remove) + len(final)

    # One whitespace-separated "<name> <counts>" record per line
    percentiles = {}
    with open(af) as fp:
        for row in fp:
            sname, counts = row.split()
            countsd = af_to_counts(counts)
            percentiles[sname] = counts_to_percentile(countsd)

    # One task per locus that survives the filter: (column index, column
    # values, percentile table for that locus)
    run_args = []
    for i, sname in enumerate(loci):
        if sname in removes:
            continue
        run_args.append((i, m[:, i], percentiles[sname]))

    cpus = min(opts.cpus, len(run_args))
    pool = Pool(processes=cpus)
    try:
        # .get() blocks until all tasks finish and returns the results in
        # the same order as run_args
        res = pool.map_async(convert_to_percentile, run_args).get()
    finally:
        pool.close()
        pool.join()

    # Write mask (P-value) matrix
    ii, pvalues = zip(*res)
    m = np.vstack(pvalues).T
    write_csv("final.mask.tsv", m, samples, final)

    df.drop(remove, inplace=True, axis=1)
    df.columns = final

    # Save a copy of the raw numpy array
    filtered_bin = "filtered.bin"
    # .values replaces the deprecated DataFrame.as_matrix()
    m = df.values
    m[m < 0] = -1
    m.tofile(filtered_bin)
    logging.debug("Binary matrix written to `{}`".format(filtered_bin))

    # Write data output
    df.to_csv("final.data.tsv", sep="\t", index_label="SampleKey")
Exemplo n.º 3
def write_mask(cpus, samples, final_columns, run_args, filename="mask.tsv"):
    """Run convert_to_percentile over run_args in parallel and write the mask.

    cpus is the number of worker processes, run_args the list of per-task
    arguments. Each task result is unpacked as a pair; the second members are
    stacked, transposed and passed to write_csv together with samples and
    final_columns. filename is the output path given to write_csv.
    """
    pool = Pool(processes=cpus)
    try:
        # Block until every task has completed. Reading a callback-filled
        # list right after map_async() returns races with the workers and
        # usually sees it empty; .get() also returns a flat, input-ordered
        # list, so no un-nesting is needed.
        res = pool.map_async(convert_to_percentile, run_args).get()
    finally:
        pool.close()
        pool.join()

    # Write mask (P-value) matrix
    ii, pvalues = zip(*res)
    m = np.vstack(pvalues).T
    write_csv(filename, m, samples, final_columns)
Exemplo n.º 4
def write_mask(cpus, samples, final_columns, run_args, filename="mask.tsv"):
    """Run convert_to_percentile over run_args in parallel and write the mask.

    cpus is the number of worker processes, run_args the list of per-task
    arguments. Each task result is unpacked as a pair; the second members are
    stacked, transposed and passed to write_csv together with samples and
    final_columns. filename is the output path given to write_csv.
    """
    pool = Pool(processes=cpus)
    try:
        # Block until every task has completed. Reading a callback-filled
        # list right after map_async() returns races with the workers and
        # usually sees it empty; .get() also returns a flat, input-ordered
        # list, so no un-nesting is needed.
        res = pool.map_async(convert_to_percentile, run_args).get()
    finally:
        pool.close()
        pool.join()

    # Write mask (P-value) matrix
    ii, pvalues = zip(*res)
    m = np.vstack(pvalues).T
    write_csv(filename, m, samples, final_columns)
Exemplo n.º 5
def count(args):
    """
    %prog count *.gz

    Count reads based on FASTQC results. FASTQC needs to be run on all the input
    data given before running this command.
    """
    # The docstring above is the usage text passed to OptionParser; keep its
    # wording verbatim.
    #
    # args: command-line tokens; at least one input filename is required.
    # Prints one table row per input file to stderr.
    from jcvi.utils.table import loadtable, write_csv

    p = OptionParser(count.__doc__)
    # opts.dir and opts.human are read below, so both options are registered
    p.add_option("--dir",
                 help="Sub-directory where FASTQC was run [default: %default]")
    p.add_option("--human", default=False, action="store_true",
                 help="Human friendly numbers [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    filenames = args
    subdir = opts.dir
    header = "Filename|Total Sequences|Sequence length|Total Bases".split("|")
    rows = []
    human = opts.human
    for f in filenames:
        # e.g. reads.fastq.gz -> reads_fastqc
        folder = f.replace(".gz", "").rsplit(".", 1)[0] + "_fastqc"
        if subdir:
            folder = op.join(subdir, folder)
        summaryfile = op.join(folder, "fastqc_data.txt")

        fqcdata = FastQCdata(summaryfile, human=human)
        rows.append([fqcdata[x] for x in header])

    # Explicit write instead of the Python-2-only `print >>` statement
    sys.stderr.write("{0}\n".format(loadtable(header, rows)))
Exemplo n.º 6
    def __init__(self, filename, delimiter=','):
        """Load a layout file, writing a default one first if it is missing.

        filename is the layout file to read. When it does not exist, a
        template listing every file returned by iglob(".", "*.ks") is written
        to that path via write_csv before reading. delimiter is forwarded to
        LayoutLine for each non-comment row.
        """
        super(Layout, self).__init__(filename)
        if not op.exists(filename):
            ksfiles = iglob(".", "*.ks")
            header = "Ks file|ncomponents|label|color|marker".split("|")
            contents = []
            for ksfile in ksfiles:
                # Default label is the file name without its extension
                leg = op.basename(ksfile).rsplit(".", 1)[0]
                if leg.count(".") == 1:
                    leg = leg.replace(".", " *vs.* ")
                contents.append((ksfile, "1", leg, "", ""))
            write_csv(header, contents, comment=True, filename=filename)

        with open(filename) as fp:
            for row in fp:
                # Skip comment rows
                if row[0] == '#':
                    continue
                self.append(LayoutLine(row, delimiter=delimiter))

Exemplo n.º 7
Arquivo: ks.py Projeto: ascendo/jcvi
    def __init__(self, filename, delimiter=','):
        """Load a layout file, writing a default one first if it is missing.

        filename is the layout file to read. When it does not exist, a
        template listing every file returned by iglob(".", "*.ks") is written
        to that path via write_csv before reading. delimiter is forwarded to
        LayoutLine for each non-comment row.
        """
        super(Layout, self).__init__(filename)
        if not op.exists(filename):
            ksfiles = iglob(".", "*.ks")
            header = "Ks file|ncomponents|label|color|marker".split("|")
            contents = []
            for ksfile in ksfiles:
                # Default label is the file name without its extension
                leg = op.basename(ksfile).rsplit(".", 1)[0]
                if leg.count(".") == 1:
                    leg = leg.replace(".", " *vs.* ")
                contents.append((ksfile, "1", leg, "", ""))
            write_csv(header, contents, comment=True, filename=filename)

        with open(filename) as fp:
            for row in fp:
                # Skip comment rows
                if row[0] == '#':
                    continue
                self.append(LayoutLine(row, delimiter=delimiter))

Exemplo n.º 8
def count(args):
    """
    %prog count *.gz

    Count reads based on FASTQC results. FASTQC needs to be run on all the input
    data given before running this command.
    """
    # The docstring above is the usage text passed to OptionParser; keep its
    # wording verbatim.
    #
    # args: command-line tokens; at least one input filename is required.
    # Prints the table to stderr and also writes it through write_csv.
    from jcvi.utils.table import loadtable, write_csv

    p = OptionParser(count.__doc__)
    # opts.dir is read below, so the option is registered here
    p.add_option("--dir",
                help="Sub-directory where FASTQC was run [default: %default]")
    p.add_option("--human", default=False, action="store_true",
                help="Human friendly numbers [default: %default]")
    # NOTE(review): opts.sep / opts.align / opts.outfile are read below, so
    # they must be registered; set_table() and set_outfile() are assumed to
    # be the project helpers that do this -- confirm against OptionParser.
    p.set_table()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    filenames = args
    subdir = opts.dir
    header = "Filename|Total Sequences|Sequence length|Total Bases".split("|")
    rows = []
    human = opts.human
    for f in filenames:
        # e.g. reads.fastq.gz -> reads_fastqc
        folder = f.replace(".gz", "").rsplit(".", 1)[0] + "_fastqc"
        if subdir:
            folder = op.join(subdir, folder)
        summaryfile = op.join(folder, "fastqc_data.txt")

        fqcdata = FastQCdata(summaryfile, human=human)
        rows.append([fqcdata[x] for x in header])

    # Explicit write instead of the Python-2-only `print >>` statement
    sys.stderr.write("{0}\n".format(loadtable(header, rows)))
    write_csv(header, rows, sep=opts.sep,
              filename=opts.outfile, align=opts.align)
Exemplo n.º 9
Arquivo: agp.py Projeto: bennyyu/jcvi
def summary(args):
    """
    %prog summary agpfile

    print a table of scaffold statistics, number of BACs, no of scaffolds,
    scaffold N50, scaffold L50, actual sequence, PSMOL NNNs, PSMOL-length, % of
    PSMOL sequenced.
    """
    # The docstring above is the usage text passed to OptionParser; keep its
    # wording verbatim.
    #
    # args: command-line tokens; exactly one positional (the AGP file).
    from jcvi.utils.table import write_csv

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    # Wrong argument count: show usage and exit, as the sibling commands do
    if len(args) != 1:
        sys.exit(not p.print_help())

    agpfile, = args
    header = "Chromosome #_Distinct #_Components #_Scaffolds " \
             "Scaff_N50 Scaff_L50 Length".split()

    agp = AGP(agpfile)
    data = list(agp.summary_all())
    write_csv(header, data, sep=" ")
Exemplo n.º 10
def summary(args):
    """
    %prog summary agpfile

    print a table of scaffold statistics, number of BACs, no of scaffolds,
    scaffold N50, scaffold L50, actual sequence, PSMOL NNNs, PSMOL-length, % of
    PSMOL sequenced.
    """
    # The docstring above is the usage text passed to OptionParser; keep its
    # wording verbatim.
    #
    # args: command-line tokens; exactly one positional (the AGP file).
    from jcvi.utils.table import write_csv

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    # Wrong argument count: show usage and exit, as the sibling commands do
    if len(args) != 1:
        sys.exit(not p.print_help())

    agpfile, = args
    header = "Chromosome #_Distinct #_Components #_Scaffolds " \
             "Scaff_N50 Scaff_L50 Length".split()

    agp = AGP(agpfile)
    data = list(agp.summary_all())
    write_csv(header, data, sep=" ")
Exemplo n.º 11
def stats(args):
    """
    %prog stats folder

    Generate table summarizing .stats files.
    """
    # The docstring above is the usage text passed to OptionParser; keep its
    # wording verbatim.
    #
    # args: command-line tokens; exactly one positional (the folder to scan).
    p = OptionParser(stats.__doc__)
    # NOTE(review): opts.outfile is read at the end, so an outfile option
    # must be registered; set_outfile() is assumed to be the project helper
    # that does this -- confirm against OptionParser.
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    statsfiles = iglob(folder, "*.stats")
    after_equal = lambda x: x.split("=")[-1]
    header = "Library Assembled_reads Contigs".split()
    contents = []
    # label=M0096 total=7443 cnts=948 mean=7.851 std=35.96
    for statsfile in statsfiles:
        with open(statsfile) as fp:
            for row in fp:
                # Stop at the first summary line; `row` keeps it afterwards
                if row.startswith("label="):
                    break
        label, total, cnts = row.split()[:3]
        label = after_equal(label)
        reads = int(after_equal(total))
        contigs = int(after_equal(cnts))
        contents.append((label, reads, contigs))

    # Aggregate rows are computed from the per-library rows only
    all_labels, all_reads, all_contigs = zip(*contents)
    contents.append(("SUM", sum(all_reads), sum(all_contigs)))
    contents.append(("AVERAGE (per sample)",
                     int(np.mean(all_reads)), int(np.mean(all_contigs))))
    contents.append(("MEDIAN (per sample)",
                     int(np.median(all_reads)), int(np.median(all_contigs))))
    write_csv(header, contents, filename=opts.outfile)
Exemplo n.º 12
def ystr(args):
    """
    %prog ystr chrY.vcf

    Print out Y-STR info given VCF. Marker name extracted from tabfile.
    """
    # The docstring above is the usage text passed to OptionParser; keep its
    # wording verbatim.
    #
    # args: command-line tokens; exactly one positional (the VCF file).
    from jcvi.utils.table import write_csv

    p = OptionParser(ystr.__doc__)
    # NOTE(review): opts.lobstr_home is read below, so that option must be
    # registered; set_home("lobstr") is assumed to be the project helper
    # that does this -- confirm against OptionParser.
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    si = STRFile(opts.lobstr_home, db="hg38-named")
    register = si.register

    header = "Marker|Reads|Ref|Genotype|Motif".split("|")
    contents = []
    fp = must_open(vcffile)
    reader = vcf.Reader(fp)
    simple_register = {}  # marker name -> genotype (int, or "a|b" string)
    for record in reader:
        name = register[(record.CHROM, record.POS)]
        info = record.INFO
        ref = int(float(info["REF"]))
        # Fall back to the reference value when RPA is absent
        rpa = info.get("RPA", ref)
        if isinstance(rpa, list):
            rpa = "|".join(str(int(float(x))) for x in rpa)
        ru = info["RU"]
        simple_register[name] = rpa
        for sample in record.samples:
            contents.append((name, sample["ALLREADS"], ref, rpa, ru))

    # Multi-part markers
    a, b, c = "DYS389I", "DYS389B.1", "DYS389B"
    if a in simple_register and b in simple_register:
        simple_register[c] = int(simple_register[a]) + int(simple_register[b])

    # Multi-copy markers
    mm = ["DYS385", "DYS413", "YCAII"]
    for m in mm:
        ma, mb = m + 'a', m + 'b'
        if ma not in simple_register or mb not in simple_register:
            # Incomplete pair: drop whichever copy is present and move on,
            # otherwise the comparison below raises KeyError
            simple_register.pop(ma, None)
            simple_register.pop(mb, None)
            continue
        # Keep the pair ordered so that the 'a' copy holds the smaller value
        if simple_register[ma] > simple_register[mb]:
            simple_register[ma], simple_register[mb] = \
                    simple_register[mb], simple_register[ma]

    write_csv(header, contents, sep=" ")
    build_yhrd_link(simple_register, panel=YHRD_YFILER)
    build_yhrd_link(simple_register, panel=YHRD_YFILERPLUS)
    build_yhrd_link(simple_register, panel=USYSTR_ALL)
Exemplo n.º 13
def prepare(args):
    """
    %prog prepare "B. oleracea" *.fastq

    Scan input fastq files (see below) and create `in_groups.csv` and
    `in_libs.csv`. The species name does not really matter.
    """
    # The docstring above (plus FastqNamings) is the usage text passed to
    # OptionParser; keep its wording verbatim.
    #
    # args: command-line tokens; the first is the organism name, the rest are
    # fastq files (when none are given, *.fastq* in the cwd is used).
    from jcvi.utils.table import write_csv
    from jcvi.formats.base import write_file
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("--corr", default=False, action="store_true",
                 help="Extra parameters for corrected data [default: %default]")
    p.add_option("--norun", default=False, action="store_true",
                 help="Don't write `run.sh` script [default: %default]")
    p.add_option("--ploidy", default="2", choices=("1", "2"),
                 help="Ploidy [default: %default]")
    # NOTE(review): opts.cpus is read when formatting the run script, so a
    # cpus option must be registered; set_cpus() is assumed to be the project
    # helper that does this -- confirm against OptionParser.
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    organism_name = args[0]
    # Project name = upper-cased initials of the organism name
    project_name = "".join(x[0] for x in organism_name.split()).upper()
    fnames = sorted(glob("*.fastq*") if len(args) == 1 else args[1:])
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    offset = guessoffset([fnames[0]])
    phred64 = offset == 64

    # All inputs must share the quality offset of the first file
    assert all(guessoffset([x]) == offset for x in fnames[1:])

    groupheader = "group_name library_name file_name".split()
    libheader = "library_name project_name organism_name type paired "\
        "frag_size frag_stddev insert_size insert_stddev read_orientation "\
        "genomic_start genomic_end".split()
    groupcontents = []
    libs = []  # unique library names, in first-seen order
    for file_name in fnames:
        group_name = op.basename(file_name).split(".")[0]
        library_name = "-".join(group_name.split("-")[:2])

        # Handle paired files and convert to wildcard
        if ".1." in file_name:
            file_name = file_name.replace(".1.", ".?.")
        elif ".2." in file_name:
            # The mate is already covered by the ".?." wildcard entry
            continue

        groupcontents.append((group_name, library_name, file_name))
        if library_name not in libs:
            libs.append(library_name)

    libcontents = []
    for library_name in libs:
        lib = Library(library_name)
        lib_type = lib.type
        size = lib.size or ""
        stddev = lib.stddev or ""

        # Size/stddev go into the frag_* or the insert_* columns, never both
        is_fragment = lib_type == "fragment"
        frag_size = size if is_fragment else ""
        frag_stddev = stddev if is_fragment else ""
        insert_size = "" if is_fragment else size
        insert_stddev = "" if is_fragment else stddev
        genomic_start, genomic_end = "", ""
        libcontents.append((library_name, project_name, organism_name,
                            lib_type, lib.paired, frag_size, frag_stddev,
                            insert_size, insert_stddev, lib.read_orientation,
                            genomic_start, genomic_end))

    write_csv(groupheader, groupcontents, filename="in_groups.csv", tee=True)
    logging.debug("`in_groups.csv` created (# of groups = {0}).".
                  format(len(groupcontents)))

    write_csv(libheader, libcontents, filename="in_libs.csv", tee=True)
    logging.debug("`in_libs.csv` created (# of libs = {0}).".
                  format(len(libcontents)))

    runfile = "run.sh"

    extra = ""
    if opts.corr:
        extra += "FE_NUM_CYCLES=1 EC_K=28 FE_QUAL_CEIL_RADIUS=0"

    if not opts.norun:
        contents = ALLPATHSRUN.format(opts.ploidy, opts.cpus, phred64, extra)
        write_file(runfile, contents)
Exemplo n.º 14
def plot(args):
    """
    %prog plot tagged.new.bed chr1

    Plot gene identifiers along a particular chromosome, often to illustrate the
    gene id assignment procedure.
    """
    # The docstring above is the usage text passed to OptionParser; keep its
    # wording verbatim.
    #
    # args: command-line tokens; two positionals (tagged bed file, chromosome).
    # Saves "<chromosome>.identifiers.<format>"; with --log also writes the
    # plotted points to "<chromosome>.new" and "<chromosome>.old".
    from jcvi.graphics.base import plt, savefig
    from jcvi.graphics.chromosome import ChromosomeMap

    p = OptionParser(plot.__doc__)
    p.add_option("--firstn", type="int", help="Only plot the first N genes")
    p.add_option("--ymax", type="int", help="Y-axis max value")
    # opts.log is read below, so the option is registered here
    p.add_option("--log", action="store_true",
                 help="Write plotting data [default: %default]")
    opts, args, iopts = p.set_image_options(args, figsize="6x4")

    if len(args) != 2:
        sys.exit(not p.print_help())

    taggedbed, chrom = args
    bed = Bed(taggedbed)
    beds = list(bed.sub_bed(chrom))
    old, new = [], []
    i = 0  # counts only the genes that are kept
    for b in beds:
        accn = b.extra[0]
        if "te" in accn:
            continue

        accn, tag = accn.split("|")
        if tag == "OVERLAP":
            continue

        _, rank = atg_name(accn)
        # Every kept gene lands in exactly one list (asserted below)
        if tag == "NEW":
            new.append((i, rank))
        else:
            old.append((i, rank))
        i += 1

    ngenes = i
    assert ngenes == len(new) + len(old)

    logging.debug("Imported {0} ranks on {1}.".format(ngenes, chrom))
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    xstart, xend = .2, .8
    ystart, yend = .2, .8
    pad = .02

    ngenes = opts.firstn or ngenes
    ymax = opts.ymax or 500000

    title = "Assignment of Medtr identifiers"
    # NOTE(review): the "first N genes" wording is keyed on --ymax rather
    # than --firstn; kept as in the original, confirm the intent.
    if opts.ymax:
        subtitle = "{0}, first {1} genes".format(chrom, ngenes)
    else:
        subtitle = "{0}, {1} genes ({2} new)".format(chrom, ngenes, len(new))

    chr_map = ChromosomeMap(fig, root, xstart, xend, ystart, yend, pad, 0,
                            ymax, 5, title, subtitle)

    ax = chr_map.axes

    if opts.log:
        from jcvi.utils.table import write_csv
        header = ["x", "y"]
        write_csv(header, new, filename=chrom + ".new")
        write_csv(header, old, filename=chrom + ".old")

    # zip(*[]) cannot be unpacked into two names, so skip empty series
    if new:
        x, y = zip(*new)
        ax.plot(x, y, "b,")
    if old:
        x, y = zip(*old)
        ax.plot(x, y, "r,")

    # Legends
    ymid = (ystart + yend) / 2
    y = ymid + pad
    root.plot([.2], [y], "r.", lw=2)
    root.text(.2 + pad, y, "Existing Medtr ids", va="center", size=10)
    y = ymid - pad
    root.plot([.2], [y], "b.", lw=2)
    root.text(.2 + pad, y, "Newly instantiated ids", va="center", size=10)

    ax.set_xlim(0, ngenes)
    ax.set_ylim(0, ymax)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)

    image_name = chrom + ".identifiers." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Exemplo n.º 15
def ystr(args):
    """
    %prog ystr chrY.vcf

    Print out Y-STR info given VCF. Marker name extracted from tabfile.
    """
    # The docstring above is the usage text passed to OptionParser; keep its
    # wording verbatim.
    #
    # args: command-line tokens; exactly one positional (the VCF file).
    from jcvi.utils.table import write_csv

    p = OptionParser(ystr.__doc__)
    # NOTE(review): opts.lobstr_home is read below, so that option must be
    # registered; set_home("lobstr") is assumed to be the project helper
    # that does this -- confirm against OptionParser.
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    si = STRFile(opts.lobstr_home, db="hg38-named")
    register = si.register

    header = "Marker|Reads|Ref|Genotype|Motif".split("|")
    contents = []
    fp = must_open(vcffile)
    reader = vcf.Reader(fp)
    simple_register = {}  # marker name -> genotype (int, or "a|b" string)
    for record in reader:
        name = register[(record.CHROM, record.POS)]
        info = record.INFO
        ref = int(float(info["REF"]))
        # Fall back to the reference value when RPA is absent
        rpa = info.get("RPA", ref)
        if isinstance(rpa, list):
            rpa = "|".join(str(int(float(x))) for x in rpa)
        ru = info["RU"]
        simple_register[name] = rpa
        for sample in record.samples:
            contents.append((name, sample["ALLREADS"], ref, rpa, ru))

    # Multi-part markers
    a, b, c = "DYS389I", "DYS389B.1", "DYS389B"
    if a in simple_register and b in simple_register:
        simple_register[c] = int(simple_register[a]) + int(simple_register[b])

    # Multi-copy markers
    mm = ["DYS385", "DYS413", "YCAII"]
    for m in mm:
        ma, mb = m + 'a', m + 'b'
        if ma not in simple_register or mb not in simple_register:
            # Incomplete pair: drop whichever copy is present and move on,
            # otherwise the comparison below raises KeyError
            simple_register.pop(ma, None)
            simple_register.pop(mb, None)
            continue
        # Keep the pair ordered so that the 'a' copy holds the smaller value
        if simple_register[ma] > simple_register[mb]:
            simple_register[ma], simple_register[mb] = \
                    simple_register[mb], simple_register[ma]

    write_csv(header, contents, sep=" ")
    # Single-argument print(...) behaves the same on Python 2 and 3
    # NOTE(review): nothing is emitted under the [YSEARCH] header in the
    # visible code -- confirm whether a link builder call belongs here.
    print("[YSEARCH]")
    print("[YFILER]")
    build_yhrd_link(simple_register, panel=YHRD_YFILER)
    print("[YFILERPLUS]")
    build_yhrd_link(simple_register, panel=YHRD_YFILERPLUS)
    print("[YSTR-ALL]")
    build_yhrd_link(simple_register, panel=USYSTR_ALL)
Exemplo n.º 16
def plot(args):
    """
    %prog plot tagged.new.bed chr1

    Plot gene identifiers along a particular chromosome, often to illustrate the
    gene id assignment procedure.
    """
    # The docstring above is the usage text passed to OptionParser; keep its
    # wording verbatim.
    #
    # args: command-line tokens; two positionals (tagged bed file, chromosome).
    # Saves "<chromosome>.identifiers.<format>"; with --log also writes the
    # plotted points to "<chromosome>.new" and "<chromosome>.old".
    from jcvi.graphics.base import plt, savefig
    from jcvi.graphics.chromosome import ChromosomeMap

    p = OptionParser(plot.__doc__)
    p.add_option("--firstn", type="int", help="Only plot the first N genes")
    p.add_option("--ymax", type="int", help="Y-axis max value")
    p.add_option("--log", action="store_true",
                help="Write plotting data [default: %default]")
    opts, args, iopts = p.set_image_options(args, figsize="6x4")

    if len(args) != 2:
        sys.exit(not p.print_help())

    taggedbed, chrom = args
    bed = Bed(taggedbed)
    beds = list(bed.sub_bed(chrom))
    old, new = [], []
    i = 0  # counts only the genes that are kept
    for b in beds:
        accn = b.extra[0]
        if "te" in accn:
            continue

        accn, tag = accn.split("|")
        if tag == "OVERLAP":
            continue

        _, rank = atg_name(accn)
        # Every kept gene lands in exactly one list (asserted below)
        if tag == "NEW":
            new.append((i, rank))
        else:
            old.append((i, rank))
        i += 1

    ngenes = i
    assert ngenes == len(new) + len(old)

    logging.debug("Imported {0} ranks on {1}.".format(ngenes, chrom))
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    xstart, xend = .2, .8
    ystart, yend = .2, .8
    pad = .02

    ngenes = opts.firstn or ngenes
    ymax = opts.ymax or 500000

    title = "Assignment of Medtr identifiers"
    # NOTE(review): the "first N genes" wording is keyed on --ymax rather
    # than --firstn; kept as in the original, confirm the intent.
    if opts.ymax:
        subtitle = "{0}, first {1} genes".format(chrom, ngenes)
    else:
        subtitle = "{0}, {1} genes ({2} new)".format(chrom, ngenes, len(new))

    chr_map = ChromosomeMap(fig, root, xstart, xend, ystart, yend, pad, 0,
                            ymax, 5, title, subtitle)

    ax = chr_map.axes

    if opts.log:
        from jcvi.utils.table import write_csv
        header = ["x", "y"]
        write_csv(header, new, filename=chrom + ".new")
        write_csv(header, old, filename=chrom + ".old")

    # zip(*[]) cannot be unpacked into two names, so skip empty series
    if new:
        x, y = zip(*new)
        ax.plot(x, y, "b,")
    if old:
        x, y = zip(*old)
        ax.plot(x, y, "r,")

    # Legends
    ymid = (ystart + yend) / 2
    y = ymid + pad
    root.plot([.2], [y], "r.", lw=2)
    root.text(.2 + pad, y, "Existing Medtr ids", va="center", size=10)
    y = ymid - pad
    root.plot([.2], [y], "b.", lw=2)
    root.text(.2 + pad, y, "Newly instantiated ids", va="center", size=10)

    ax.set_xlim(0, ngenes)
    ax.set_ylim(0, ymax)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)

    image_name = chrom + ".identifiers." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Exemplo n.º 17
def prepare(args):
    """
    %prog prepare "B. oleracea" *.fastq

    Scan input fastq files (see below) and create `in_groups.csv` and
    `in_libs.csv`. The species name does not really matter.
    """
    # The docstring above (plus FastqNamings) is the usage text passed to
    # OptionParser; keep its wording verbatim.
    #
    # args: command-line tokens; the first is the organism name, the rest are
    # fastq files (when none are given, *.fastq* in the cwd is used).
    from jcvi.utils.table import write_csv
    from jcvi.formats.base import check_exists

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("--norun", default=False, action="store_true",
                 help="Don't write `run.sh` script [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    organism_name = args[0]
    # Project name = upper-cased initials of the organism name
    project_name = "".join(x[0] for x in organism_name.split()).upper()
    fnames = sorted(glob("*.fastq*") if len(args) == 1 else args[1:])
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    groupheader = "group_name library_name file_name".split()
    libheader = "library_name project_name organism_name type paired "\
        "frag_size frag_stddev insert_size insert_stddev read_orientation "\
        "genomic_start genomic_end".split()
    groupcontents = []
    libs = []  # unique library names, in first-seen order
    for file_name in fnames:
        group_name = op.basename(file_name).split(".")[0]
        library_name = "-".join(group_name.split("-")[:2])

        # Handle paired files and convert to wildcard
        if ".1." in file_name:
            file_name = file_name.replace(".1.", ".?.")
        elif ".2." in file_name:
            # The mate is already covered by the ".?." wildcard entry
            continue

        groupcontents.append((group_name, library_name, file_name))
        if library_name not in libs:
            libs.append(library_name)

    libcontents = []
    for library_name in libs:
        lib = Library(library_name)
        lib_type = lib.type
        size = lib.size or ""
        stddev = lib.stddev or ""

        # Size/stddev go into the frag_* or the insert_* columns, never both
        is_fragment = lib_type == "fragment"
        frag_size = size if is_fragment else ""
        frag_stddev = stddev if is_fragment else ""
        insert_size = "" if is_fragment else size
        insert_stddev = "" if is_fragment else stddev
        genomic_start, genomic_end = "", ""
        libcontents.append((library_name, project_name, organism_name,
                            lib_type, lib.paired, frag_size, frag_stddev,
                            insert_size, insert_stddev, lib.read_orientation,
                            genomic_start, genomic_end))

    write_csv(groupheader, groupcontents, filename="in_groups.csv", tee=True)
    logging.debug("`in_groups.csv` created (# of groups = {0}).".
                  format(len(groupcontents)))

    write_csv(libheader, libcontents, filename="in_libs.csv", tee=True)
    logging.debug("`in_libs.csv` created (# of libs = {0}).".
                  format(len(libcontents)))

    runfile = "run.sh"
    if not opts.norun and check_exists(runfile):
        # Close (and flush) the script before reporting it as written
        with open(runfile, "w") as fw:
            fw.write("{0}\n".format(ALLPATHSRUN))
        logging.debug("Run script written to `{0}`.".format(runfile))
Exemplo n.º 18
def prepare(args):
    """
    %prog prepare "B. oleracea" *.fastq

    Scan input fastq files (see below) and create `in_groups.csv` and
    `in_libs.csv`. The species name does not really matter.
    """
    # The docstring above (plus FastqNamings) is the usage text passed to
    # OptionParser; keep its wording verbatim.
    #
    # args: command-line tokens; the first is the organism name, the rest are
    # fastq files (when none are given, *.fastq* in the cwd is used).
    from jcvi.utils.table import write_csv
    from jcvi.formats.base import check_exists

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("--norun", default=False, action="store_true",
                 help="Don't write `run.sh` script [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    organism_name = args[0]
    # Project name = upper-cased initials of the organism name
    project_name = "".join(x[0] for x in organism_name.split()).upper()
    fnames = sorted(glob("*.fastq*") if len(args) == 1 else args[1:])
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    groupheader = "group_name library_name file_name".split()
    libheader = "library_name project_name organism_name type paired "\
        "frag_size frag_stddev insert_size insert_stddev read_orientation "\
        "genomic_start genomic_end".split()
    groupcontents = []
    libs = []  # unique library names, in first-seen order
    for file_name in fnames:
        group_name = op.basename(file_name).split(".")[0]
        library_name = "-".join(group_name.split("-")[:2])

        # Handle paired files and convert to wildcard
        if ".1." in file_name:
            file_name = file_name.replace(".1.", ".?.")
        elif ".2." in file_name:
            # The mate is already covered by the ".?." wildcard entry
            continue

        groupcontents.append((group_name, library_name, file_name))
        if library_name not in libs:
            libs.append(library_name)

    libcontents = []
    for library_name in libs:
        lib = Library(library_name)
        lib_type = lib.type
        size = lib.size or ""
        stddev = lib.stddev or ""

        # Size/stddev go into the frag_* or the insert_* columns, never both
        is_fragment = lib_type == "fragment"
        frag_size = size if is_fragment else ""
        frag_stddev = stddev if is_fragment else ""
        insert_size = "" if is_fragment else size
        insert_stddev = "" if is_fragment else stddev
        genomic_start, genomic_end = "", ""
        libcontents.append((library_name, project_name, organism_name,
                            lib_type, lib.paired, frag_size, frag_stddev,
                            insert_size, insert_stddev, lib.read_orientation,
                            genomic_start, genomic_end))

    write_csv(groupheader, groupcontents, filename="in_groups.csv", tee=True)
    logging.debug("`in_groups.csv` created (# of groups = {0}).".
                  format(len(groupcontents)))

    write_csv(libheader, libcontents, filename="in_libs.csv", tee=True)
    logging.debug("`in_libs.csv` created (# of libs = {0}).".
                  format(len(libcontents)))

    runfile = "run.sh"
    if not opts.norun and check_exists(runfile):
        # Close (and flush) the script before reporting it as written
        with open(runfile, "w") as fw:
            fw.write("{0}\n".format(ALLPATHSRUN))
        logging.debug("Run script written to `{0}`.".format(runfile))