Exemplo n.º 1
0
Arquivo: gmap.py Projeto: fw1121/jcvi
def align(args):
    """
    %prog align database.fasta read1.fq read2.fq

    Wrapper for `gsnap` single-end or paired-end, depending on the number of
    args.
    """
    # The docstring above is passed to OptionParser below (presumably as the
    # usage text), so it is kept verbatim; maintainer notes live in comments.
    #
    # args:    command-line style list: the database FASTA, then one read file
    #          (single-end) or two read files (paired-end), plus any options.
    # returns: (gsnapfile, logfile), both named after get_prefix().
    # raises:  AssertionError if the database or any read file does not exist.
    from jcvi.formats.fasta import join
    from jcvi.formats.fastq import guessoffset
    from jcvi.projects.tgbs import snp

    p = OptionParser(align.__doc__)
    p.add_option("--join", default=False, action="store_true",
                 help="Join sequences with padded 50Ns")
    p.add_option("--rnaseq", default=False, action="store_true",
                 help="Input is RNA-seq reads, turn splicing on")
    p.add_option("--snp", default=False, action="store_true",
                 help="Call SNPs after GSNAP")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) == 2:
        logging.debug("Single-end alignment")
    elif len(args) == 3:
        logging.debug("Paired-end alignment")
    else:
        sys.exit(not p.print_help())

    dbfile, readfile = args[0:2]
    # One entry for single-end, two for paired-end.
    readfiles = list(args[1:])
    if opts.join:
        dbfile = join([dbfile, "--gapsize=50", "--newid=chr1"])

    # Raised explicitly instead of via `assert` so the check is not stripped
    # by `python -O`; AssertionError is kept for callers that already catch
    # it. The mate file of a paired-end run is validated as well.
    missing = [x for x in [dbfile] + readfiles if not op.exists(x)]
    if missing:
        raise AssertionError(
            "Input file(s) not found: {0}".format(", ".join(missing)))

    prefix = get_prefix(readfile, dbfile)
    logfile = prefix + ".log"
    gsnapfile = prefix + ".gsnap"
    # Every read file counts as an input, so a refreshed mate file also
    # triggers a re-run (previously only the first read file was considered).
    if not need_update(tuple([dbfile] + readfiles), gsnapfile):
        logging.error("`{0}` exists. `gsnap` already run.".format(gsnapfile))
    else:
        dbdir, dbname = check_index(dbfile)
        cmd = "gsnap -D {0} -d {1}".format(dbdir, dbname)
        cmd += " -B 5 -m 0.1 -i 2 -n 3"  # memory, mismatch, indel penalty, nhits
        if opts.rnaseq:
            cmd += " -N 1"
        cmd += " -t {0}".format(opts.cpus)
        cmd += " --gmap-mode none --nofails"
        if readfile.endswith(".gz"):
            cmd += " --gunzip"
        # Best effort: when the offset cannot be guessed the flag is simply
        # left out, but the reason is now logged instead of silently dropped.
        try:
            offset = guessoffset([readfile])
        except AssertionError:
            logging.debug("Cannot guess quality offset of `{0}`, "
                          "--quality-protocol not set.".format(readfile))
        else:
            protocol = "sanger" if offset == 33 else "illumina"
            cmd += " --quality-protocol {0}".format(protocol)
        cmd += " " + " ".join(readfiles)
        sh(cmd, outfile=gsnapfile, errfile=logfile)

    # SNP calling runs on the gsnap output even when the alignment was reused.
    if opts.snp:
        snp([gsnapfile, "--cpus={0}".format(opts.cpus)])

    return gsnapfile, logfile
Exemplo n.º 2
0
def assemble(args):
    """
    %prog assemble sffdir

    Assemble each BAC separately using newbler.
    """
    # The docstring above is handed to OptionParser, so it is kept verbatim.
    #
    # args: command-line style list holding exactly one positional value, the
    #       directory that contains the `*.sff` files, plus any options.
    # Work happens in two fixed directories under the current directory:
    # `newbler` (one assembly sub-directory per input) and `fasta`.
    from jcvi.formats.fasta import join

    parser = OptionParser(assemble.__doc__)
    parser.add_option(
        "--overwrite", default=False, action="store_true",
        help="Overwrite the separate BAC assembly [default: %default]")
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    sffdir = args[0]
    asmdir = "newbler"
    fastadir = "fasta"
    for workdir in (asmdir, fastadir):
        mkdir(workdir, overwrite=opts.overwrite)

    template = "runAssembly -cpu 8 -o {0} {1}"
    for sffpath in glob("{0}/*.sff".format(sffdir)):
        # The assembly name is the second dot-separated field of the file
        # name, lower-cased.
        name = op.basename(sffpath).split(".")[1].lower()
        target = op.join(asmdir, name)
        if op.exists(target):
            logging.debug("`{0}` exists. Ignored.".format(target))
            continue

        sh(template.format(target, sffpath))

        contigs = op.join(target, "454LargeContigs.fna")
        if op.exists(contigs):
            joined = op.join(fastadir, "{0}.fasta".format(name))
            join([contigs, joined,
                  "--newid={0}".format(name), "--minctgsize=200"])
        else:
            # No contig file means newbler did not produce an assembly.
            logging.error(
                "File `{0}` not found (newbler failure).".format(contigs))
Exemplo n.º 3
0
def scaffold(args):
    """
    %prog scaffold ctgfasta reads1.fasta mapping1.bed
                            reads2.fasta mapping2.bed ...

    Run BAMBUS on set of contigs, reads and read mappings.
    """
    # The docstring above is handed to OptionParser, so it is kept verbatim.
    #
    # args: command-line style list: the contig FASTA followed by one or more
    #       (reads FASTA, mapping BED) pairs, plus any options.
    # All intermediate and final files use fixed `bambus.*` / `final.*` names
    # in the current directory.
    from jcvi.formats.base import FileMerger
    from jcvi.formats.bed import mates
    from jcvi.formats.contig import frombed
    from jcvi.formats.fasta import join
    from jcvi.utils.iter import grouper

    parser = OptionParser(scaffold.__doc__)
    parser.add_option("--conf",
                      help="BAMBUS configuration file [default: %default]")
    parser.add_option(
        "--prefix", default=False, action="store_true",
        help="Only keep links between IDs with same prefix [default: %default]")
    opts, args = parser.parse_args(args)

    # One contig file plus complete pairs means an odd count of at least 3.
    if len(args) < 3 or len(args) % 2 == 0:
        sys.exit(not parser.print_help())

    ctgfasta, libargs = args[0], args[1:]
    trios = []
    for readsfasta, bedfile in grouper(2, libargs):
        stem = bedfile.rsplit(".", 1)[0]
        matefile = stem + ".mates"
        matebedfile = matefile + ".bed"
        if need_update(bedfile, [matefile, matebedfile]):
            mates_args = [bedfile, "--lib", "--nointra"]
            if opts.prefix:
                mates_args.append("--prefix")
            # mates() reports the names of the files it wrote.
            matefile, matebedfile = mates(mates_args)
        trios.append((readsfasta, matebedfile, matefile))

    # Merge per-library files column-wise: all reads, all beds, all mates.
    merged = ("bambus.reads.fasta", "bambus.bed", "bambus.mates")
    bbfasta, bbbed, bbmate = merged
    for column, target in zip(zip(*trios), merged):
        FileMerger(column, outfile=target).merge(checkexists=True)

    ctgfile = "bambus.contig"
    idsfile = "bambus.ids"  # presumably written by frombed() -- TODO confirm
    frombed_inputs = [bbbed, ctgfasta, bbfasta]
    if need_update(frombed_inputs, ctgfile):
        frombed(frombed_inputs)

    # Split the contigs into those listed in the ids file and the rest.
    inputfasta = "bambus.contigs.fasta"
    singletonfasta = "bambus.singletons.fasta"
    select = "faSomeRecords {0} {1} ".format(ctgfasta, idsfile)
    sh(select + inputfasta)
    sh(select + singletonfasta + " -exclude")

    # Run bambus
    run = "bambus"
    bambus_cmd = "goBambus -c {0} -m {1} -o {2}".format(ctgfile, bbmate, run)
    if opts.conf:
        bambus_cmd += " -C {0}".format(opts.conf)
    sh(bambus_cmd)

    sh("untangle -e {0}.evidence.xml -s {0}.out.xml -o {0}.untangle.xml".
       format(run))

    final = "final"
    sh(("printScaff -e {0}.evidence.xml -s {0}.untangle.xml -l {0}.lib "
        "-merge -detail -oo -sum -o {1}").format(run, final))

    # Join the selected contigs following the `.oo` file from printScaff.
    join([inputfasta, "--oo={0}".format(final + ".oo")])
Exemplo n.º 4
0
def align(args):
    """
    %prog align database.fasta read1.fq read2.fq

    Wrapper for `gsnap` single-end or paired-end, depending on the number of
    args.
    """
    # The docstring above is passed to OptionParser below (presumably as the
    # usage text), so it is kept verbatim; maintainer notes live in comments.
    #
    # args:    command-line style list: the database FASTA, then one read file
    #          (single-end) or two read files (paired-end), plus any options.
    # returns: (gsnapfile, logfile), both named after get_prefix().
    # raises:  AssertionError if the database or any read file does not exist.
    from jcvi.formats.fasta import join
    from jcvi.formats.fastq import guessoffset
    from jcvi.projects.tgbs import snp

    p = OptionParser(align.__doc__)
    p.add_option("--join",
                 default=False,
                 action="store_true",
                 help="Join sequences with padded 50Ns")
    p.add_option("--rnaseq",
                 default=False,
                 action="store_true",
                 help="Input is RNA-seq reads, turn splicing on")
    p.add_option("--snp",
                 default=False,
                 action="store_true",
                 help="Call SNPs after GSNAP")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) == 2:
        logging.debug("Single-end alignment")
    elif len(args) == 3:
        logging.debug("Paired-end alignment")
    else:
        sys.exit(not p.print_help())

    dbfile, readfile = args[0:2]
    # One entry for single-end, two for paired-end.
    readfiles = list(args[1:])
    if opts.join:
        dbfile = join([dbfile, "--gapsize=50", "--newid=chr1"])

    # Raised explicitly instead of via `assert` so the check is not stripped
    # by `python -O`; AssertionError is kept for callers that already catch
    # it. The mate file of a paired-end run is validated as well.
    missing = [x for x in [dbfile] + readfiles if not op.exists(x)]
    if missing:
        raise AssertionError(
            "Input file(s) not found: {0}".format(", ".join(missing)))

    prefix = get_prefix(readfile, dbfile)
    logfile = prefix + ".log"
    gsnapfile = prefix + ".gsnap"
    # Every read file counts as an input, so a refreshed mate file also
    # triggers a re-run (previously only the first read file was considered).
    if not need_update(tuple([dbfile] + readfiles), gsnapfile):
        logging.error("`{0}` exists. `gsnap` already run.".format(gsnapfile))
    else:
        dbdir, dbname = check_index(dbfile)
        cmd = "gsnap -D {0} -d {1}".format(dbdir, dbname)
        cmd += " -B 5 -m 0.1 -i 2 -n 3"  # memory, mismatch, indel penalty, nhits
        if opts.rnaseq:
            cmd += " -N 1"
        cmd += " -t {0}".format(opts.cpus)
        cmd += " --gmap-mode none --nofails"
        if readfile.endswith(".gz"):
            cmd += " --gunzip"
        # Best effort: when the offset cannot be guessed the flag is simply
        # left out, but the reason is now logged instead of silently dropped.
        try:
            offset = guessoffset([readfile])
        except AssertionError:
            logging.debug("Cannot guess quality offset of `{0}`, "
                          "--quality-protocol not set.".format(readfile))
        else:
            protocol = "sanger" if offset == 33 else "illumina"
            cmd += " --quality-protocol {0}".format(protocol)
        cmd += " " + " ".join(readfiles)
        sh(cmd, outfile=gsnapfile, errfile=logfile)

    # SNP calling runs on the gsnap output even when the alignment was reused.
    if opts.snp:
        snp([gsnapfile, "--cpus={0}".format(opts.cpus)])

    return gsnapfile, logfile
Exemplo n.º 5
0
def scaffold(args):
    """
    %prog scaffold ctgfasta reads1.fasta mapping1.bed
                            reads2.fasta mapping2.bed ...

    Run BAMBUS on set of contigs, reads and read mappings.
    """
    # The docstring above is handed to OptionParser, so it is kept verbatim.
    #
    # args: command-line style list: the contig FASTA followed by one or more
    #       (reads FASTA, mapping BED) pairs, plus any options.
    # All intermediate and final files use fixed `bambus.*` / `final.*` names
    # in the current directory.
    from jcvi.formats.base import FileMerger
    from jcvi.formats.bed import mates
    from jcvi.formats.contig import frombed
    from jcvi.formats.fasta import join
    from jcvi.utils.iter import grouper

    p = OptionParser(scaffold.__doc__)
    p.set_rclip(rclip=1)
    p.add_option("--conf", help="BAMBUS configuration file [default: %default]")
    p.add_option("--prefix", default=False, action="store_true",
            help="Only keep links between IDs with same prefix [default: %default]")
    opts, args = p.parse_args(args)

    # One contig file plus complete pairs means an odd count of at least 3.
    nargs = len(args)
    if nargs < 3 or nargs % 2 != 1:
        sys.exit(not p.print_help())

    rclip = opts.rclip
    # NOTE(review): no option registered in this function defines `cutoff`.
    # Unless `set_rclip` adds it, reading `opts.cutoff` directly raises
    # AttributeError, so it is read defensively and `--cutoff` is forwarded
    # to mates() only when a value is actually available.
    cutoff = getattr(opts, "cutoff", None)

    ctgfasta = args[0]
    duos = list(grouper(args[1:], 2))
    trios = []
    for fastafile, bedfile in duos:
        prefix = bedfile.rsplit(".", 1)[0]
        matefile = prefix + ".mates"
        matebedfile = matefile + ".bed"
        if need_update(bedfile, [matefile, matebedfile]):
            matesopt = [bedfile, "--lib", "--nointra",
                        "--rclip={0}".format(rclip)]
            if cutoff is not None:
                matesopt.append("--cutoff={0}".format(cutoff))
            if opts.prefix:
                matesopt.append("--prefix")
            # mates() reports the names of the files it wrote.
            matefile, matebedfile = mates(matesopt)
        trios.append((fastafile, matebedfile, matefile))

    # Merge the readfasta, bedfile and matefile
    bbfasta, bbbed, bbmate = "bambus.reads.fasta", "bambus.bed", "bambus.mates"

    for files, outfile in zip(zip(*trios), (bbfasta, bbbed, bbmate)):
        FileMerger(files, outfile=outfile).merge(checkexists=True)

    ctgfile = "bambus.contig"
    idsfile = "bambus.ids"  # presumably written by frombed() -- TODO confirm
    frombedInputs = [bbbed, ctgfasta, bbfasta]
    if need_update(frombedInputs, ctgfile):
        frombed(frombedInputs)

    # Split the contigs into those listed in the ids file and the rest.
    inputfasta = "bambus.contigs.fasta"
    singletonfasta = "bambus.singletons.fasta"
    cmd = "faSomeRecords {0} {1} ".format(ctgfasta, idsfile)
    sh(cmd + inputfasta)
    sh(cmd + singletonfasta + " -exclude")

    # Run bambus
    prefix = "bambus"
    cmd = "goBambus -c {0} -m {1} -o {2}".format(ctgfile, bbmate, prefix)
    if opts.conf:
        cmd += " -C {0}".format(opts.conf)
    sh(cmd)

    cmd = "untangle -e {0}.evidence.xml -s {0}.out.xml -o {0}.untangle.xml".\
            format(prefix)
    sh(cmd)

    final = "final"
    cmd = "printScaff -e {0}.evidence.xml -s {0}.untangle.xml -l {0}.lib " \
          "-merge -detail -oo -sum -o {1}".format(prefix, final)
    sh(cmd)

    # Join the selected contigs following the `.oo` file from printScaff.
    oofile = final + ".oo"
    join([inputfasta, "--oo={0}".format(oofile)])