예제 #1
0
파일: meraculous.py 프로젝트: Hensonmw/jcvi
def prepare(args):
    """
    %prog prepare genomesize *.fastq

    Prepare MERACULOUS configuration file. Genome size should be entered in Mb.
    """
    # The docstring above is concatenated with FastqNamings and handed to
    # OptionParser, so it is user-facing text; developer notes belong in
    # comments rather than in the docstring.
    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("-K", default=51, type="int", help="K-mer size")
    p.set_cpus(cpus=32)
    opts, args = p.parse_args(args)

    # Need the genome size plus at least one read file.
    if len(args) < 2:
        sys.exit(not p.print_help())

    # The command line takes Mb; the config value is Mb / 1000 (i.e. Gb).
    genomesize = float(args[0]) / 1000
    fnames = args[1:]
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    # Comment preamble of the params file.
    s = comment_banner("Meraculous params file") + "\n"
    s += comment_banner("Basic parameters") + "\n"
    s += "# Describe the libraries ( one line per library )\n"
    # `header` comes from module scope; its whitespace is collapsed to
    # single spaces here.
    s += "# " + " ".join(header.split()) + "\n"

    libs = get_libs(fnames)
    lib_seqs = []
    rank = 0  # incremented only for libraries that are actually emitted
    for lib, fs in libs:
        # Libraries with size 0 get no line in the config.
        if lib.size == 0:
            continue
        rank += 1
        library_name = lib.library_name
        # The dash-less name is what gets passed on as the library name.
        name = library_name.replace("-", "")
        # Glob patterns for the ".1." and ".2." files of this library.
        wildcard = "{0}*.1.*,{0}*.2.*".format(library_name)
        # Use the longest read length found across the library's files.
        rl = max(readlen([x]) for x in fs)
        lib_seq = lib.get_lib_seq(wildcard, name, rl, rank)
        lib_seqs.append(lib_seq)

    s += "\n" + "\n".join(load_csv(None, lib_seqs, sep=" ")) + "\n"
    params = [("genome_size", genomesize),
              ("is_diploid", 0),
              ("mer_size", opts.K),
              ("num_prefix_blocks", 1),
              ("no_read_validation", 0),
              ("local_num_procs", opts.cpus)]
    s += "\n" + "\n".join(load_csv(None, params, sep=" ")) + "\n"

    cfgfile = "meraculous.config"
    write_file(cfgfile, s, tee=True)

    # Companion shell script that launches Meraculous on the config above.
    s = "~/export/meraculous/bin/run_meraculous.sh -c {0}"\
                .format(cfgfile)
    runsh = "run.sh"
    write_file(runsh, s)
예제 #2
0
파일: meraculous.py 프로젝트: zjwang6/jcvi
def prepare(args):
    """
    %prog prepare genomesize *.fastq

    Prepare MERACULOUS configuration file. Genome size should be entered in Mb.
    """
    # NOTE: this docstring is reused as the option parser's usage text (see
    # the OptionParser call below), so implementation notes are kept in
    # comments rather than in the docstring.
    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("-K", default=51, type="int", help="K-mer size")
    p.set_cpus(cpus=32)
    opts, args = p.parse_args(args)

    # Positional arguments: genome size first, then one or more read files.
    if len(args) < 2:
        sys.exit(not p.print_help())

    # Convert the Mb value from the command line by dividing by 1000.
    genomesize = float(args[0]) / 1000
    fnames = args[1:]
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    # Banner and column-header comments at the top of the config.
    s = comment_banner("Meraculous params file") + "\n"
    s += comment_banner("Basic parameters") + "\n"
    s += "# Describe the libraries ( one line per library )\n"
    # `header` is defined at module level; runs of whitespace are normalized
    # to single spaces.
    s += "# " + " ".join(header.split()) + "\n"

    libs = get_libs(fnames)
    lib_seqs = []
    rank = 0  # running rank over the libraries that are written out
    for lib, fs in libs:
        # Skip libraries whose size is 0; they do not consume a rank.
        if lib.size == 0:
            continue
        rank += 1
        library_name = lib.library_name
        name = library_name.replace("-", "")
        # Patterns matching the ".1." and ".2." files of this library.
        wildcard = "{0}*.1.*,{0}*.2.*".format(library_name)
        # Longest read length among the files of this library.
        rl = max(readlen([x]) for x in fs)
        lib_seq = lib.get_lib_seq(wildcard, name, rl, rank)
        lib_seqs.append(lib_seq)

    s += "\n" + "\n".join(load_csv(None, lib_seqs, sep=" ")) + "\n"
    params = [("genome_size", genomesize), ("is_diploid", 0),
              ("mer_size", opts.K), ("num_prefix_blocks", 1),
              ("no_read_validation", 0), ("local_num_procs", opts.cpus)]
    s += "\n" + "\n".join(load_csv(None, params, sep=" ")) + "\n"

    cfgfile = "meraculous.config"
    write_file(cfgfile, s, tee=True)

    # One-line run script pointing Meraculous at the generated config.
    s = "~/export/meraculous/bin/run_meraculous.sh -c {0}"\
                .format(cfgfile)
    runsh = "run.sh"
    write_file(runsh, s)
예제 #3
0
파일: sspace.py 프로젝트: biologyguy/jcvi
def write_libraries(fastqs, aligner=None):
    """
    Write `libraries.txt` with one space-separated line per library returned
    by `get_libs(fastqs)` and return the file name.

    Each line holds: `lib<N>` (1-based), the aligner name when one is given,
    the library's file names, the library size, the constant 0.75, and the
    orientation code ("RF" when the library's read orientation is "outward",
    otherwise "FR").
    """
    from jcvi.assembly.base import get_libs

    libs = get_libs(fastqs)
    assert libs

    libtxt = "libraries.txt"
    lines = []
    for index, (lib, fns) in enumerate(libs, start=1):
        joined_files = " ".join(fns)
        if lib.read_orientation == "outward":
            orientation = "RF"
        else:
            orientation = "FR"

        fields = ["lib{0}".format(index)]
        # The optional aligner column sits right after the library label.
        if aligner:
            fields.append(aligner)
        # NOTE(review): 0.75 is a fixed column value; presumably the allowed
        # insert-size deviation, confirm against the SSPACE documentation.
        fields.extend([joined_files, lib.size, 0.75, orientation])

        lines.append(" ".join(str(field) for field in fields))

    write_file(libtxt, "\n".join(lines), tee=True)
    return libtxt
예제 #4
0
파일: soap.py 프로젝트: rrane/jcvi
def prepare(args):
    """
    %prog prepare *.fastq

    Scan input fastq files (see below) and write SOAP config files based
    on inputfiles. Use "--scaffold contigs.fasta" to perform scaffolding.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes live in comments instead.
    from jcvi.formats.base import write_file

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("-K", default=45, type="int", help="K-mer size [default: %default]")
    p.add_option(
        "--assemble_1st_rank_only",
        default=False,
        action="store_true",
        help="Assemble the first rank only, other libs asm_flags=2 [default: %default]",
    )
    p.add_option("--scaffold", help="Only perform scaffolding [default: %default]")
    p.add_option("--gapclose", help="Only perform gap closure [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    a1st = opts.assemble_1st_rank_only

    cfgfile = "soap.config"
    gc_cfgfile = "soap.gc.config"

    # Both config files are managed by `with` so they are closed even when
    # something below raises.
    with open(cfgfile, "w") as fw, open(gc_cfgfile, "w") as fw_gc:
        libs = get_libs(fnames)
        rank = 0
        max_rd_len = max(readlen([f]) for f in fnames)

        # `stream.write(text + "\n")` emits what `print >> stream, text` did,
        # and behaves the same on Python 2 and Python 3.
        block = "max_rd_len={0}\n".format(max_rd_len)
        for stream in (sys.stderr, fw, fw_gc):
            stream.write(block + "\n")

        # Collect singletons first: files of every library whose size is 0.
        singletons = []
        for lib, fs in libs:
            if lib.size == 0:
                singletons += fs

        for lib, fs in libs:
            size = lib.size
            if size == 0:
                continue

            # Rank counts only the sized libraries.
            rank += 1
            block = "[LIB]\n"
            block += "avg_ins={0}\n".format(size)
            block += "reverse_seq={0}\n".format(lib.reverse_seq)
            # With --assemble_1st_rank_only, every library after the first
            # is forced to asm_flags=2.
            asm_flags = 2 if (rank > 1 and a1st) else lib.asm_flags
            block += "asm_flags={0}\n".format(asm_flags)
            block += "rank={0}\n".format(rank)
            if lib.reverse_seq:
                pair_num_cutoff = 3
                block += "pair_num_cutoff={0}\n".format(pair_num_cutoff)
            block += "map_len=35\n"

            # NOTE(review): a file name containing neither ".1." nor ".2."
            # leaves `tag` unset (or reuses the previous file's tag). This
            # is unchanged from the original; confirm get_libs never yields
            # such names for sized libraries.
            for f in fs:
                if ".1." in f:
                    tag = "q1"
                elif ".2." in f:
                    tag = "q2"
                block += "{0}={1}\n".format(tag, f)

            # Singleton files are attached to the first-ranked library only.
            if rank == 1:
                for s in singletons:
                    block += "q={0}\n".format(s)

            sys.stderr.write(block + "\n")
            fw.write(block + "\n")

            # Only libraries with asm_flags > 2 go into the gap-closure config.
            if asm_flags > 2:
                fw_gc.write(block + "\n")

    runfile = "run.sh"
    scaffold = opts.scaffold
    header = SOAPHEADER.format(opts.cpus, opts.K)
    if opts.gapclose:
        gapclose = opts.gapclose
        # Output name: input with its last extension replaced.
        outfile = gapclose.rsplit(".", 1)[0] + ".closed.fasta"
        template = header + GCRUNG.format(gapclose, outfile)
    else:
        template = header + (SCFRUN % scaffold if scaffold else SOAPRUN)

    write_file(runfile, template, meta="run script")
예제 #5
0
def prepare(args):
    """
    %prog prepare *.fastq

    Scan input fastq files (see below) and write SOAP config files based
    on inputfiles. Use "--scaffold contigs.fasta" to perform scaffolding.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes live in comments instead.
    from jcvi.formats.base import write_file

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("-K",
                 default=45,
                 type="int",
                 help="K-mer size [default: %default]")
    p.add_option(
        "--assemble_1st_rank_only",
        default=False,
        action="store_true",
        help=
        "Assemble the first rank only, other libs asm_flags=2 [default: %default]"
    )
    p.add_option("--scaffold",
                 help="Only perform scaffolding [default: %default]")
    p.add_option("--gapclose",
                 help="Only perform gap closure [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    K = opts.K
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    a1st = opts.assemble_1st_rank_only

    cfgfile = "soap.config"
    gc_cfgfile = "soap.gc.config"

    # Both config files are managed by `with` so they are closed even when
    # something below raises.
    with open(cfgfile, "w") as fw, open(gc_cfgfile, "w") as fw_gc:
        libs = get_libs(fnames)
        rank = 0
        max_rd_len = max(readlen([f]) for f in fnames)

        # The global read-length line goes to the console and both configs.
        block = "max_rd_len={0}\n".format(max_rd_len)
        for stream in (sys.stderr, fw, fw_gc):
            print(block, file=stream)

        # Collect singletons first: files of every library whose size is 0.
        singletons = []
        for lib, fs in libs:
            if lib.size == 0:
                singletons += fs

        for lib, fs in libs:
            size = lib.size
            if size == 0:
                continue

            # Rank counts only the sized libraries.
            rank += 1
            block = "[LIB]\n"
            block += "avg_ins={0}\n".format(size)
            block += "reverse_seq={0}\n".format(lib.reverse_seq)
            # With --assemble_1st_rank_only, every library after the first
            # is forced to asm_flags=2.
            asm_flags = 2 if (rank > 1 and a1st) else lib.asm_flags
            block += "asm_flags={0}\n".format(asm_flags)
            block += "rank={0}\n".format(rank)
            if lib.reverse_seq:
                pair_num_cutoff = 3
                block += "pair_num_cutoff={0}\n".format(pair_num_cutoff)
            block += "map_len=35\n"

            # NOTE(review): a file name containing neither ".1." nor ".2."
            # leaves `tag` unset (or reuses the previous file's tag). This
            # is unchanged from the original; confirm get_libs never yields
            # such names for sized libraries.
            for f in fs:
                if ".1." in f:
                    tag = "q1"
                elif ".2." in f:
                    tag = "q2"
                block += "{0}={1}\n".format(tag, f)

            # Singleton files are attached to the first-ranked library only;
            # the key is "q" when is_fastq() accepts the file, else "f".
            if rank == 1:
                for s in singletons:
                    tag = "q" if is_fastq(s) else "f"
                    block += tag + "={0}\n".format(s)

            print(block, file=sys.stderr)
            print(block, file=fw)

            # Only libraries with asm_flags > 2 go into the gap-closure config.
            if asm_flags > 2:
                print(block, file=fw_gc)

    runfile = "run.sh"
    scaffold = opts.scaffold
    # Pick the binary by K: the 63mer build for K <= 63, else the 127mer one.
    bb = 63 if K <= 63 else 127
    binary = "SOAPdenovo-{0}mer".format(bb)
    header = SOAPHEADER.format(opts.cpus, K, binary)
    if opts.gapclose:
        gapclose = opts.gapclose
        # Output name: input with its last extension replaced.
        outfile = gapclose.rsplit(".", 1)[0] + ".closed.fasta"
        template = header + GCRUNG.format(gapclose, outfile)
    else:
        template = header + (SCFRUN % scaffold if scaffold else SOAPRUN)

    write_file(runfile, template)