Exemplo n.º 1
0
def prepare(args):
    """
    %prog prepare *.fastq

    Generate run.sh script to run clc_novo_assemble.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text
    # (together with FastqNamings), so it stays in its user-facing form and
    # implementation notes are kept in comments instead.
    #
    # args: list of command-line arguments; every positional argument is
    #       treated as an input file name and must exist on disk.
    #
    # Side effects: writes `license.properties` (via write_file) and, when
    # check_exists() allows it, `run.sh` in the current directory.
    from itertools import groupby

    from jcvi.formats.base import check_exists
    from jcvi.assembly.base import FastqNamings, Library

    p = OptionParser(prepare.__doc__ + FastqNamings)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    def library_name(x):
        # Library name = first two "-"-separated fields of the part of the
        # basename that precedes the first ".".
        return "-".join(op.basename(x).split(".")[0].split("-")[:2])

    # groupby() only merges *adjacent* items with equal keys, so the input is
    # sorted by the same key first; otherwise a library whose files are not
    # contiguous on the command line would be split into several groups.
    libs = [(Library(x), sorted(fs)) for x, fs in
            groupby(sorted(fnames, key=library_name), key=library_name)]

    libs.sort(key=lambda x: x[0].size)
    singletons = []
    pairs = []

    write_file("license.properties", CLCLICENSE)

    for lib, fs in libs:
        size = lib.size
        stddev = lib.stddev

        # Libraries reporting size 0 are passed as plain `-q` inputs.
        if size == 0:
            singletons += fs
            continue

        # The accepted distance window is mean +/- 2 stddev; it is the same
        # for every file of the library, so compute it once.
        minsize, maxsize = size - 2 * stddev, size + 2 * stddev

        for f in fs:
            # Files with ".corr." in their name are always given "fb".
            reverse_seq = 0 if ".corr." in f else lib.reverse_seq
            fb = "bf" if reverse_seq else "fb"
            pair_opt = "-p {0} ss {1} {2} ".format(fb, minsize, maxsize)

            if ".1." in f:
                # One wildcard entry stands for both mates (.1. and .2.).
                f = f.replace(".1.", ".?.")
                pairs.append(pair_opt + "-i {0}".format(f))
            elif ".2." in f:
                # Already covered by the wildcard of the matching ".1." file.
                continue
            else:
                pairs.append(pair_opt + f)

    cmd = "clc_novo_assemble --cpus 32 -o contigs.fasta \\\n"
    cmd += "\t-q {0} \\\n".format(" ".join(singletons))
    cmd += "\n".join("\t{0} \\".format(x) for x in pairs)

    runfile = "run.sh"
    # check_exists() decides whether the script may be written; presumably it
    # guards against overwriting an existing file -- confirm against
    # jcvi.formats.base.
    if check_exists(runfile):
        # `with` guarantees the script is flushed and closed.
        with open(runfile, "w") as fw:
            fw.write("#!/bin/bash\n\n")
            fw.write(cmd + "\n")
        logging.debug("Run script written to `{0}`.".format(runfile))
Exemplo n.º 2
0
def prepare(args):
    """
    %prog prepare *.fastq

    Scan input fastq files (see below) and write SOAP config files based
    on inputfiles.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text
    # (together with FastqNamings), so it stays in its user-facing form and
    # implementation notes are kept in comments instead.
    #
    # args: list of command-line arguments; every positional argument is
    #       treated as an input file name and must exist on disk.
    #
    # Side effects: writes `soap.config` (one [LIB] block per library with a
    # non-zero size, also echoed to stderr) and, when check_exists() allows
    # it, `run.sh` containing SOAPRUN.
    from jcvi.formats.base import check_exists

    p = OptionParser(prepare.__doc__ + FastqNamings)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    def library_name(x):
        # Library name = first two "-"-separated fields of the part of the
        # basename that precedes the first ".".
        return "-".join(op.basename(x).split(".")[0].split("-")[:2])

    # groupby() only merges *adjacent* items with equal keys, so the input is
    # sorted by the same key first; otherwise a library whose files are not
    # contiguous on the command line would be split into several groups.
    libs = [(Library(x), sorted(fs)) for x, fs in
            groupby(sorted(fnames, key=library_name), key=library_name)]

    # Ascending size: size-0 libraries come first, and `rank` then follows
    # increasing size.
    libs.sort(key=lambda x: x[0].size)

    cfgfile = "soap.config"
    rank = 0
    singletons = []
    with open(cfgfile, "w") as fw:
        for lib, fs in libs:
            size = lib.size
            if size == 0:
                # Accumulate (rather than overwrite) so that files from
                # several size-0 libraries are all kept.
                singletons += fs
                continue

            rank += 1
            # The first file of the library decides reverse_seq for the whole
            # block; ".corr." files are always written with reverse_seq=0.
            first = fs[0]
            reverse_seq = 0 if ".corr." in first else lib.reverse_seq

            block = "[LIB]\n"
            block += "avg_ins={0}\n".format(size)
            block += "reverse_seq={0}\n".format(reverse_seq)
            block += "asm_flags={0}\n".format(lib.asm_flags)
            block += "rank={0}\n".format(rank)

            # Pending singletons are attached to the first sized library.
            if singletons:
                fs += singletons
                singletons = []

            for f in fs:
                if ".1." in f:
                    tag = "q1"
                elif ".2." in f:
                    tag = "q2"
                else:
                    tag = "q"
                block += "{0}={1}\n".format(tag, f)

            # Echo to stderr and persist; the extra newline leaves a blank
            # line between consecutive [LIB] blocks.
            sys.stderr.write(block + "\n")
            fw.write(block + "\n")

    if singletons:
        # No sized library was available to carry these files.
        logging.warning("{0} unpaired file(s) were not written to `{1}` "
                        "(no library with a non-zero size found).".
                        format(len(singletons), cfgfile))

    runfile = "run.sh"
    # check_exists() decides whether the script may be written; presumably it
    # guards against overwriting an existing file -- confirm against
    # jcvi.formats.base.
    if check_exists(runfile):
        with open(runfile, "w") as fw:
            fw.write(SOAPRUN + "\n")
        logging.debug("Run script written to `{0}`.".format(runfile))
Exemplo n.º 3
0
def prepare(args):
    """
    %prog prepare *.fastq

    Scan input fastq files (see below) and write SOAP config files based
    on inputfiles.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text
    # (together with FastqNamings), so it stays in its user-facing form and
    # implementation notes are kept in comments instead.
    #
    # args: list of command-line arguments; every positional argument is
    #       treated as an input file name and must exist on disk.
    #
    # Side effects: writes `soap.config` (one [LIB] block per library with a
    # non-zero size, also echoed to stderr) and, when check_exists() allows
    # it, `run.sh` containing SOAPRUN.
    from jcvi.formats.base import check_exists

    p = OptionParser(prepare.__doc__ + FastqNamings)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    def library_name(x):
        # Library name = first two "-"-separated fields of the part of the
        # basename that precedes the first ".".
        return "-".join(op.basename(x).split(".")[0].split("-")[:2])

    # groupby() only merges *adjacent* items with equal keys, so the input is
    # sorted by the same key first; otherwise a library whose files are not
    # contiguous on the command line would be split into several groups.
    libs = [(Library(x), sorted(fs)) for x, fs in
            groupby(sorted(fnames, key=library_name), key=library_name)]

    # Ascending size: size-0 libraries come first, and `rank` then follows
    # increasing size.
    libs.sort(key=lambda x: x[0].size)

    cfgfile = "soap.config"
    rank = 0
    singletons = []
    with open(cfgfile, "w") as fw:
        for lib, fs in libs:
            size = lib.size
            if size == 0:
                # Accumulate (rather than overwrite) so that files from
                # several size-0 libraries are all kept.
                singletons += fs
                continue

            rank += 1
            # The first file of the library decides reverse_seq for the whole
            # block; ".corr." files are always written with reverse_seq=0.
            first = fs[0]
            reverse_seq = 0 if ".corr." in first else lib.reverse_seq

            block = "[LIB]\n"
            block += "avg_ins={0}\n".format(size)
            block += "reverse_seq={0}\n".format(reverse_seq)
            block += "asm_flags={0}\n".format(lib.asm_flags)
            block += "rank={0}\n".format(rank)

            # Pending singletons are attached to the first sized library.
            if singletons:
                fs += singletons
                singletons = []

            for f in fs:
                if ".1." in f:
                    tag = "q1"
                elif ".2." in f:
                    tag = "q2"
                else:
                    tag = "q"
                block += "{0}={1}\n".format(tag, f)

            # Echo to stderr and persist; the extra newline leaves a blank
            # line between consecutive [LIB] blocks.
            sys.stderr.write(block + "\n")
            fw.write(block + "\n")

    if singletons:
        # No sized library was available to carry these files.
        logging.warning("{0} unpaired file(s) were not written to `{1}` "
                        "(no library with a non-zero size found).".
                        format(len(singletons), cfgfile))

    runfile = "run.sh"
    # check_exists() decides whether the script may be written; presumably it
    # guards against overwriting an existing file -- confirm against
    # jcvi.formats.base.
    if check_exists(runfile):
        with open(runfile, "w") as fw:
            fw.write(SOAPRUN + "\n")
        logging.debug("Run script written to `{0}`.".format(runfile))
Exemplo n.º 4
0
def prepare(args):
    """
    %prog prepare "B. oleracea" *.fastq

    Scan input fastq files (see below) and create `in_groups.csv` and
    `in_libs.csv`. The species name does not really matter.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text
    # (together with FastqNamings), so it stays in its user-facing form and
    # implementation notes are kept in comments instead.
    #
    # args: list of command-line arguments. args[0] is the organism name;
    #       the remaining arguments are input files. When only the organism
    #       name is given, every `*.fastq*` file in the current directory is
    #       used instead.
    #
    # Side effects: writes `in_groups.csv` and `in_libs.csv` (via write_csv)
    # and, unless --norun is given and when check_exists() allows it,
    # `run.sh` containing ALLPATHSRUN.
    from jcvi.utils.table import write_csv
    from jcvi.formats.base import check_exists

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("--norun", default=False, action="store_true",
                 help="Don't write `run.sh` script [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    organism_name = args[0]
    # Project name = upper-cased initials of the organism name,
    # e.g. "B. oleracea" -> "BO".
    project_name = "".join(x[0] for x in organism_name.split()).upper()
    fnames = sorted(glob("*.fastq*") if len(args) == 1 else args[1:])
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    groupheader = "group_name library_name file_name".split()
    libheader = "library_name project_name organism_name type paired "\
        "frag_size frag_stddev insert_size insert_stddev read_orientation "\
        "genomic_start genomic_end".split()
    groupcontents = []
    libs = []  # unique library names, in order of first appearance
    for file_name in fnames:
        group_name = op.basename(file_name).split(".")[0]
        library_name = "-".join(group_name.split("-")[:2])

        # Handle paired files and convert to wildcard
        if ".1." in file_name:
            file_name = file_name.replace(".1.", ".?.")
        elif ".2." in file_name:
            # Already covered by the wildcard of the matching ".1." file.
            continue

        groupcontents.append((group_name, library_name, file_name))
        if library_name not in libs:
            libs.append(library_name)

    libcontents = []
    for library_name in libs:
        lib = Library(library_name)
        lib_type = lib.type  # renamed local: avoid shadowing builtin `type`

        # Falsy size/stddev values are written as empty cells.
        size = lib.size or ""
        stddev = lib.stddev or ""

        # "fragment" libraries fill the frag_* columns; every other type
        # fills the insert_* columns.
        is_fragment = lib_type == "fragment"
        frag_size = size if is_fragment else ""
        frag_stddev = stddev if is_fragment else ""
        insert_size = "" if is_fragment else size
        insert_stddev = "" if is_fragment else stddev
        genomic_start, genomic_end = "", ""
        libcontents.append((library_name, project_name, organism_name,
                            lib_type, lib.paired, frag_size, frag_stddev,
                            insert_size, insert_stddev, lib.read_orientation,
                            genomic_start, genomic_end))

    write_csv(groupheader, groupcontents, filename="in_groups.csv", tee=True)
    # The message now names the file that is actually written.
    logging.debug("`in_groups.csv` created (# of groups = {0}).".
                  format(len(groupcontents)))

    write_csv(libheader, libcontents, filename="in_libs.csv", tee=True)
    logging.debug("`in_libs.csv` created (# of libs = {0}).".
                  format(len(libcontents)))

    runfile = "run.sh"
    # check_exists() decides whether the script may be written; presumably it
    # guards against overwriting an existing file -- confirm against
    # jcvi.formats.base.
    if not opts.norun and check_exists(runfile):
        # `with` guarantees the script is flushed and closed.
        with open(runfile, "w") as fw:
            fw.write(ALLPATHSRUN + "\n")
        logging.debug("Run script written to `{0}`.".format(runfile))
Exemplo n.º 5
0
def prepare(args):
    """
    %prog prepare "B. oleracea" *.fastq

    Scan input fastq files (see below) and create `in_groups.csv` and
    `in_libs.csv`. The species name does not really matter.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text
    # (together with FastqNamings), so it stays in its user-facing form and
    # implementation notes are kept in comments instead.
    #
    # args: list of command-line arguments. args[0] is the organism name;
    #       the remaining arguments are input files. When only the organism
    #       name is given, every `*.fastq*` file in the current directory is
    #       used instead.
    #
    # Side effects: writes `in_groups.csv` and `in_libs.csv` (via write_csv)
    # and, unless --norun is given and when check_exists() allows it,
    # `run.sh` containing ALLPATHSRUN.
    from jcvi.utils.table import write_csv
    from jcvi.formats.base import check_exists

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("--norun", default=False, action="store_true",
                 help="Don't write `run.sh` script [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    organism_name = args[0]
    # Project name = upper-cased initials of the organism name,
    # e.g. "B. oleracea" -> "BO".
    project_name = "".join(x[0] for x in organism_name.split()).upper()
    fnames = sorted(glob("*.fastq*") if len(args) == 1 else args[1:])
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    groupheader = "group_name library_name file_name".split()
    libheader = "library_name project_name organism_name type paired "\
        "frag_size frag_stddev insert_size insert_stddev read_orientation "\
        "genomic_start genomic_end".split()
    groupcontents = []
    libs = []  # unique library names, in order of first appearance
    for file_name in fnames:
        group_name = op.basename(file_name).split(".")[0]
        library_name = "-".join(group_name.split("-")[:2])

        # Handle paired files and convert to wildcard
        if ".1." in file_name:
            file_name = file_name.replace(".1.", ".?.")
        elif ".2." in file_name:
            # Already covered by the wildcard of the matching ".1." file.
            continue

        groupcontents.append((group_name, library_name, file_name))
        if library_name not in libs:
            libs.append(library_name)

    libcontents = []
    for library_name in libs:
        lib = Library(library_name)
        lib_type = lib.type  # renamed local: avoid shadowing builtin `type`

        # Falsy size/stddev values are written as empty cells.
        size = lib.size or ""
        stddev = lib.stddev or ""

        # "fragment" libraries fill the frag_* columns; every other type
        # fills the insert_* columns.
        is_fragment = lib_type == "fragment"
        frag_size = size if is_fragment else ""
        frag_stddev = stddev if is_fragment else ""
        insert_size = "" if is_fragment else size
        insert_stddev = "" if is_fragment else stddev
        genomic_start, genomic_end = "", ""
        libcontents.append((library_name, project_name, organism_name,
                            lib_type, lib.paired, frag_size, frag_stddev,
                            insert_size, insert_stddev, lib.read_orientation,
                            genomic_start, genomic_end))

    write_csv(groupheader, groupcontents, filename="in_groups.csv", tee=True)
    # The message now names the file that is actually written.
    logging.debug("`in_groups.csv` created (# of groups = {0}).".
                  format(len(groupcontents)))

    write_csv(libheader, libcontents, filename="in_libs.csv", tee=True)
    logging.debug("`in_libs.csv` created (# of libs = {0}).".
                  format(len(libcontents)))

    runfile = "run.sh"
    # check_exists() decides whether the script may be written; presumably it
    # guards against overwriting an existing file -- confirm against
    # jcvi.formats.base.
    if not opts.norun and check_exists(runfile):
        # `with` guarantees the script is flushed and closed.
        with open(runfile, "w") as fw:
            fw.write(ALLPATHSRUN + "\n")
        logging.debug("Run script written to `{0}`.".format(runfile))