Пример #1
0
def mask(args):
    """
    %prog mask fastafile

    Mask the contaminants. By default, this will compare against UniVec_Core and
    Ecoli.fasta. Merge the contaminant results, and use `maskFastaFromBed`. The
    masked FASTA is always tidied afterwards.
    """
    # NOTE: the docstring above is also the command-line usage text (it is
    # handed to OptionParser), so developer notes are kept in comments.
    #
    # args: list of command-line tokens; exactly one positional FASTA path.
    # Writes "<fastafile without its last extension>.masked.fasta" and returns
    # whatever `tidy` returns for that file (previously the value was dropped).
    try:
        from shlex import quote  # Python 3.3+
    except ImportError:
        from pipes import quote  # Python 2 fallback

    p = OptionParser(mask.__doc__)
    _, args = p.parse_args(args)

    if len(args) != 1:
        # print_help() returns None, so this exits with a non-zero status.
        sys.exit(not p.print_help())

    fastafile, = args
    if not op.exists(fastafile):
        # Explicit check instead of `assert` so it still runs under
        # `python -O`; AssertionError is kept so callers see the same type.
        raise AssertionError("FASTA file not found: {0}".format(fastafile))

    outfastafile = fastafile.rsplit(".", 1)[0] + ".masked.fasta"

    # First screen: blast with its default database.
    vecbedfile = blast([fastafile])

    # Second screen: blast against the E. coli K-12 MG1655 genome.
    ecoliurl = \
    "ftp://ftp.ncbi.nih.gov/genomes/Bacteria/Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.fna"
    ecolifile = download(ecoliurl, filename="Ecoli.fasta")
    if not op.exists(ecolifile):
        raise AssertionError(
            "Contaminant database not found: {0}".format(ecolifile))
    ecolibedfile = blast([fastafile, "--db={0}".format(ecolifile)])

    # The pipeline is run through a shell, so every file name is quoted to
    # survive spaces and shell metacharacters. Plain names are left unchanged
    # by quote(), so the command is identical for ordinary paths.
    cmd = "cat {0} {1}".format(quote(vecbedfile), quote(ecolibedfile))
    cmd += " | mergeBed -nms -d 100 -i stdin"
    cmd += " | maskFastaFromBed -fi {0} -bed stdin -fo {1}".format(
        quote(fastafile), quote(outfastafile))
    sh(cmd)

    return tidy([outfastafile])
Пример #2
0
def mask(args):
    """
    %prog mask fastafile

    Mask the contaminants. By default, this will compare against UniVec_Core and
    Ecoli.fasta. Merge the contaminant results, and use `maskFastaFromBed`. The
    masked FASTA is always tidied afterwards.
    """
    # NOTE: this docstring is passed to OptionParser and therefore shows up as
    # the CLI help; keep implementation notes in comments like this one.
    #
    # Input : `args`, a list of CLI tokens holding exactly one FASTA path.
    # Output: "<input minus last extension>.masked.fasta" on disk; the return
    #         value is whatever `tidy` gives back for that file.
    try:
        from shlex import quote  # Python 3.3+
    except ImportError:
        from pipes import quote  # Python 2 fallback

    p = OptionParser(mask.__doc__)
    _, args = p.parse_args(args)

    if len(args) != 1:
        # `not None` is True, i.e. exit status 1 after showing the help.
        sys.exit(not p.print_help())

    fastafile, = args
    if not op.exists(fastafile):
        # Not an `assert`: those vanish under `python -O`. The exception type
        # is unchanged so existing handlers keep working.
        raise AssertionError("FASTA file not found: {0}".format(fastafile))

    outfastafile = fastafile.rsplit(".", 1)[0] + ".masked.fasta"

    # Screen 1: blast with its default database.
    vecbedfile = blast([fastafile])

    # Screen 2: blast against the downloaded E. coli K-12 MG1655 genome.
    ecoliurl = \
    "ftp://ftp.ncbi.nih.gov/genomes/Bacteria/Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.fna"
    ecolifile = download(ecoliurl, filename="Ecoli.fasta")
    if not op.exists(ecolifile):
        raise AssertionError(
            "Contaminant database not found: {0}".format(ecolifile))
    ecolibedfile = blast([fastafile, "--db={0}".format(ecolifile)])

    # Quote each path because the command string goes through a shell (it
    # contains pipes). quote() is a no-op for names made of safe characters.
    q_vec, q_ecoli = quote(vecbedfile), quote(ecolibedfile)
    q_in, q_out = quote(fastafile), quote(outfastafile)
    cmd = "cat {0} {1}".format(q_vec, q_ecoli)
    cmd += " | mergeBed -nms -d 100 -i stdin"
    cmd += " | maskFastaFromBed -fi {0} -bed stdin -fo {1}".format(q_in, q_out)
    sh(cmd)

    return tidy([outfastafile])
Пример #3
0
def mask(args):
    """
    %prog mask fastafile

    Mask the contaminants. By default, this will compare against UniVec_Core and
    Ecoli.fasta. Merge the contaminant results, and use `maskFastaFromBed`. The
    masked FASTA is always tidied afterwards.
    """
    # NOTE: the docstring above is reused as the CLI usage text (it is passed
    # to OptionParser), so implementation notes are kept in comments.
    #
    # args: list of CLI tokens; one positional FASTA path plus optional --db.
    # Writes "<fastafile without its last extension>.masked.fasta" and returns
    # the result of `tidy` on that file.
    try:
        from shlex import quote  # Python 3.3+
    except ImportError:
        from pipes import quote  # Python 2 fallback

    p = OptionParser(mask.__doc__)
    p.add_option(
        "--db",
        default=ECOLI_URL,
        help=
        "Contaminant db other than Ecoli K12, will download if file starts with http://, https://, or ftp://",
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # print_help() returns None, so the exit status is non-zero.
        sys.exit(not p.print_help())

    (fastafile, ) = args
    db = opts.db
    if not op.exists(fastafile):
        # Explicit check instead of `assert` so it is not stripped by
        # `python -O`; the exception type is kept for existing callers.
        raise AssertionError("FASTA file not found: {0}".format(fastafile))

    outfastafile = fastafile.rsplit(".", 1)[0] + ".masked.fasta"

    # First screen: blast with its default database.
    vecbedfile = blast([fastafile])

    # Second screen: the --db contaminant database, fetched when it is a URL.
    # NOTE(review): a non-default --db URL is still saved as "Ecoli.fasta";
    # if `download` reuses an existing local file this could silently pick up
    # a stale database -- confirm against `download`'s behavior.
    ecolifile = (download(db, filename="Ecoli.fasta", handle_gzip=True)
                 if is_internet_file(db) else db)
    if not op.exists(ecolifile):
        raise AssertionError(
            "Contaminant database not found: {0}".format(ecolifile))
    ecolibedfile = blast([fastafile, "--db={0}".format(ecolifile)])

    # The command is executed by a shell (it uses pipes), so paths are quoted
    # to tolerate spaces and metacharacters. quote() leaves plain names as-is.
    cmd = "cat {0} {1}".format(quote(vecbedfile), quote(ecolibedfile))
    cmd += " | sort -k1,1 -k2,2n"
    cmd += " | mergeBed -c 4 -o distinct -d 100 -i stdin"
    cmd += " | maskFastaFromBed -fi {0} -bed stdin -fo {1}".format(
        quote(fastafile), quote(outfastafile))
    sh(cmd)

    return tidy([outfastafile])
Пример #4
0
def scaffold(args):
    """
    %prog scaffold ctgfasta agpfile

    Build scaffolds based on ordering in the AGP file.
    """
    # NOTE: the docstring above is also the CLI usage text (it is passed to
    # OptionParser), so implementation notes are kept in comments.
    #
    # args: list of CLI tokens -- contig FASTA path, AGP path, optional
    # --prefix. Side effects: writes "<pf>.phases" and "<pf>.new.agp" (pf is
    # ctgfasta without its last extension), removes the temporary BED file,
    # then builds and tidies "final.fasta". Returns None.
    from jcvi.formats.agp import bed, order_to_agp, build
    from jcvi.formats.bed import Bed

    p = OptionParser(scaffold.__doc__)
    p.add_option("--prefix",
                 default=False,
                 action="store_true",
                 help="Keep IDs with same prefix together [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None, so the exit status is non-zero.
        sys.exit(not p.print_help())

    ctgfasta, agpfile = args
    sizes = Sizes(ctgfasta).mapping

    pf = ctgfasta.rsplit(".", 1)[0]
    phasefile = pf + ".phases"
    newagpfile = pf + ".new.agp"

    # Both outputs are managed by `with` so they are flushed and closed even
    # on error (the phase file used to be left open), and before `build`
    # reads the new AGP file below.
    with open(phasefile, "w") as fwphase, open(newagpfile, "w") as fwagp:
        scaffoldbuckets = defaultdict(list)

        bedfile = bed([agpfile, "--nogaps", "--outfile=tmp"])
        bb = Bed(bedfile)
        for s, partialorder in bb.sub_beds():
            name = partialorder[0].accn
            # With --prefix, group by the ID up to its last "_"; otherwise
            # group by the key that sub_beds() yields.
            bname = name.rsplit("_", 1)[0] if opts.prefix else s
            scaffoldbuckets[bname].append([(b.accn, b.strand)
                                           for b in partialorder])

        # Now the buckets contain a mixture of singletons and partially
        # resolved scaffolds; each bucket is emitted in sorted order.
        for bname, scaffolds in sorted(scaffoldbuckets.items()):
            ctgorder = []
            singletons = set()
            for scaf in sorted(scaffolds):
                ctgorder.extend(scaf)
                if len(scaf) == 1:
                    singletons.add(scaf[0][0])
            nscaffolds = len(scaffolds)  # all entries, singletons included
            nsingletons = len(singletons)

            # A bucket holding exactly one entry is phase 3 when that entry
            # is a lone contig and phase 2 when it is a multi-contig
            # scaffold; anything else is phase 1. The former test
            # `nsingletons == 1 and nscaffolds == 0` could never be true
            # because nscaffolds also counts the singleton.
            if nscaffolds == 1:
                phase = 3 if nsingletons == 1 else 2
            else:
                phase = 1

            msg = "{0}: Scaffolds={1} Singletons={2} Phase={3}".\
                format(bname, nscaffolds, nsingletons, phase)
            # .write() behaves the same on Python 2 and 3, unlike `print >>`.
            sys.stderr.write(msg + "\n")
            fwphase.write("\t".join((bname, str(phase))) + "\n")

            order_to_agp(bname, ctgorder, sizes, fwagp)

    os.remove(bedfile)

    fastafile = "final.fasta"
    build([newagpfile, ctgfasta, fastafile])
    tidy([fastafile])
Пример #5
0
def scaffold(args):
    """
    %prog scaffold ctgfasta agpfile

    Build scaffolds based on ordering in the AGP file.
    """
    # NOTE: this docstring is handed to OptionParser and shown as CLI help,
    # so the developer-facing notes are written as comments.
    #
    # Input : `args`, CLI tokens -- contig FASTA, AGP file, optional --prefix.
    # Output: "<pf>.phases" (tab-separated bucket name and phase) and
    #         "<pf>.new.agp", where pf is ctgfasta minus its last extension;
    #         then "final.fasta" is built and tidied. Returns None.
    from jcvi.formats.agp import bed, order_to_agp, build
    from jcvi.formats.bed import Bed

    p = OptionParser(scaffold.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
            help="Keep IDs with same prefix together [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # `not None` is True, i.e. exit status 1 after printing the help.
        sys.exit(not p.print_help())

    ctgfasta, agpfile = args
    sizes = Sizes(ctgfasta).mapping

    pf = ctgfasta.rsplit(".", 1)[0]
    phasefile = pf + ".phases"
    newagpfile = pf + ".new.agp"

    # Context managers guarantee both files are closed (the phase file was
    # previously never closed) before `build` consumes the AGP file.
    with open(phasefile, "w") as fwphase, open(newagpfile, "w") as fwagp:
        scaffoldbuckets = defaultdict(list)

        bedfile = bed([agpfile, "--nogaps", "--outfile=tmp"])
        for s, partialorder in Bed(bedfile).sub_beds():
            first_accn = partialorder[0].accn
            # --prefix groups IDs sharing everything before the last "_".
            bname = first_accn.rsplit("_", 1)[0] if opts.prefix else s
            members = [(b.accn, b.strand) for b in partialorder]
            scaffoldbuckets[bname].append(members)

        # Now the buckets contain a mixture of singletons and partially
        # resolved scaffolds; each bucket is written out in sorted order.
        for bname, scaffolds in sorted(scaffoldbuckets.items()):
            ctgorder = []
            singletons = set()
            for scaf in sorted(scaffolds):
                ctgorder.extend(scaf)
                if len(scaf) == 1:
                    singletons.add(scaf[0][0])
            nscaffolds = len(scaffolds)  # counts singleton entries as well
            nsingletons = len(singletons)

            # One lone contig -> phase 3; one multi-contig scaffold ->
            # phase 2; any mixture -> phase 1. The original phase-3 test
            # required nscaffolds == 0, which cannot happen for a non-empty
            # bucket, so phase 3 was never assigned.
            if nscaffolds != 1:
                phase = 1
            elif nsingletons == 1:
                phase = 3
            else:
                phase = 2

            msg = "{0}: Scaffolds={1} Singletons={2} Phase={3}".\
                format(bname, nscaffolds, nsingletons, phase)
            # Stream .write() works on both Python 2 and 3; `print >>` fails
            # at runtime on Python 3.
            sys.stderr.write(msg + "\n")
            fwphase.write("\t".join((bname, str(phase))) + "\n")

            order_to_agp(bname, ctgorder, sizes, fwagp)

    os.remove(bedfile)

    fastafile = "final.fasta"
    build([newagpfile, ctgfasta, fastafile])
    tidy([fastafile])