Exemplo n.º 1
0
def anchor(args):
    """
    %prog anchor map.bed markers.blast > anchored.bed

    Anchor scaffolds based on map.
    """
    from jcvi.formats.blast import bed

    p = OptionParser(anchor.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    mapbed, blastfile = args
    bedfile = bed([blastfile])
    markersbed = Bed(bedfile)
    markers = markersbed.order

    mapbed = Bed(mapbed, sorted=False)
    for b in mapbed:
        m = b.accn
        if m not in markers:
            continue

        i, mb = markers[m]
        new_accn = "{0}:{1}-{2}".format(mb.seqid, mb.start, mb.end)
        b.accn = new_accn
        print b
Exemplo n.º 2
0
def paste(args):
    """
    %prog paste flanks.bed flanks_vs_assembly.blast backbone.fasta

    Paste in good sequences in the final assembly.
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser,
    # so it is kept verbatim.
    from jcvi.formats.bed import uniq

    parser = OptionParser(paste.__doc__)
    parser.add_option(
        "--maxsize",
        default=300000,
        type="int",
        help="Maximum size of patchers to be replaced",
    )
    parser.add_option("--prefix", help="Prefix of the new object")
    parser.set_rclip(rclip=1)
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    pbed, blastfile, bbfasta = args
    flank_order = Bed(pbed).order

    first_bedfile, second_bedfile = blast_to_twobeds(
        blastfile,
        flank_order,
        log=True,
        rclip=opts.rclip,
        maxsize=opts.maxsize,  # Max DNA size to replace gap
        flipbeds=True,
    )
    # Only the first of the two returned bed files is passed through uniq().
    first_bedfile = uniq([first_bedfile])

    # NOTE(review): the first ("before") file is passed as the first argument
    # of shuffle_twobeds; presumably this pairs with flipbeds=True -- confirm.
    shuffle_twobeds(
        Bed(first_bedfile), Bed(second_bedfile), bbfasta, prefix=opts.prefix
    )
Exemplo n.º 3
0
def check_beds(hintfile, p, opts):
    """
    Load the query and subject BED files for a pairwise comparison.

    If `opts.qbed` and `opts.sbed` are not both set, they are derived from the
    first two dot-separated fields of the basename of `hintfile` (joined with
    its directory and a `.bed` suffix) and stored back on `opts`. When the
    basename has fewer than two such fields, an error message plus the parser
    help is printed and the program exits.

    Returns a tuple (qbed, sbed, qorder, sorder, is_self), where `is_self` is
    True when both options name the same file.
    """
    wd, hintfile = op.split(hintfile)
    if not (opts.qbed and opts.sbed):
        try:
            # Unpacking fails with ValueError when there is no "." to split on.
            q, s = hintfile.split(".", 2)[:2]
        except ValueError:
            # Catch only the unpacking failure: a bare `except:` would also
            # swallow KeyboardInterrupt and hide unrelated programming errors.
            print >> sys.stderr, "Options --qbed and --sbed are required"
            sys.exit(not p.print_help())
        opts.qbed = op.join(wd, q + ".bed")
        opts.sbed = op.join(wd, s + ".bed")
        logging.debug("Assuming --qbed={0} --sbed={1}".\
                     format(opts.qbed, opts.sbed))

    qbed_file, sbed_file = opts.qbed, opts.sbed
    # is this a self-self blast?
    is_self = (qbed_file == sbed_file)
    if is_self:
        logging.debug("Looks like self-self comparison.")

    qbed = Bed(opts.qbed)
    sbed = Bed(opts.sbed)
    qorder = qbed.order
    sorder = sbed.order

    return qbed, sbed, qorder, sorder, is_self
Exemplo n.º 4
0
def subset(args):
    """
    %prog subset blastfile qbedfile sbedfile

    Extract blast hits between given query and subject chrs.

    If --qchrs or --schrs is not given, then all chrs from q/s genome will
    be included. However one of --qchrs and --schrs must be specified.
    Otherwise the script will do nothing.
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser,
    # so it is kept verbatim.
    p = OptionParser(subset.__doc__)
    p.add_option("--qchrs",
                 default=None,
                 help="query chrs to extract, comma sep [default: %default]")
    p.add_option("--schrs",
                 default=None,
                 help="subject chrs to extract, comma sep [default: %default]")
    p.add_option("--convert",
                 default=False,
                 action="store_true",
                 help="convert accns to chr_rank [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blastfile, qbedfile, sbedfile = args
    qchrs = opts.qchrs
    schrs = opts.schrs
    if not (qchrs or schrs):
        # Explicit check instead of `assert`: asserts are stripped under -O,
        # and the other usage errors in this command exit the same way.
        sys.exit(not p.print_help())
    convert = opts.convert

    # Parse each BED file once; both the seqid list and the order come from it.
    qbed = Bed(qbedfile)
    sbed = Bed(sbedfile)

    # The output name encodes the requested chromosomes:
    # <blastfile>.[<qchrs>.][<schrs>.]blast
    outfile = blastfile + "."
    if qchrs:
        outfile += qchrs + "."
        qchrs = set(qchrs.split(","))
    else:
        qchrs = set(qbed.seqids)
    if schrs:
        schrs = set(schrs.split(","))
        # Skip the subject part when it would merely repeat the query part
        # of a self-comparison.
        if qbedfile != sbedfile or qchrs != schrs:
            outfile += ",".join(schrs) + "."
    else:
        schrs = set(sbed.seqids)
    outfile += "blast"

    qo = qbed.order
    so = sbed.order

    fw = must_open(outfile, "w")
    try:
        for b in Blast(blastfile):
            q, s = b.query, b.subject
            # order entries are (rank, bed line) pairs
            qi, qline = qo[q]
            si, sline = so[s]
            if qline.seqid in qchrs and sline.seqid in schrs:
                if convert:
                    b.query = qline.seqid + "_" + "{0:05d}".format(qi)
                    b.subject = sline.seqid + "_" + "{0:05d}".format(si)
                print >> fw, b
    finally:
        # Close the output even if a hit references an unknown gene.
        fw.close()
    logging.debug("Subset blastfile written to `{0}`".format(outfile))
Exemplo n.º 5
0
def patcher(args):
    """
    %prog patcher backbone.bed other.bed

    Given optical map alignment, prepare the patchers. Use --backbone to suggest
    which assembly is the major one, and the patchers will be extracted from
    another assembly.
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser,
    # so it is kept verbatim.
    from jcvi.formats.bed import uniq

    p = OptionParser(patcher.__doc__)
    # NOTE(review): --backbone is accepted but its value is never consulted
    # in this function (the predicate that used it was never called).
    p.add_option("--backbone",
                 default="OM",
                 help="Prefix of the backbone assembly [default: %default]")
    p.add_option("--object",
                 default="object",
                 help="New object name [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    backbonebed, otherbed = args
    backbonebed = uniq([backbonebed])
    otherbed = uniq([otherbed])

    pf = backbonebed.split(".")[0]

    # Make a uniq bed keeping backbone at redundant intervals
    cmd = "intersectBed -v -wa"
    cmd += " -a {0} -b {1}".format(otherbed, backbonebed)
    outfile = otherbed.rsplit(".", 1)[0] + ".not." + backbonebed
    sh(cmd, outfile=outfile)

    uniqbed = Bed()
    uniqbedfile = pf + ".merged.bed"
    uniqbed.extend(Bed(backbonebed))
    uniqbed.extend(Bed(outfile))
    uniqbed.print_to_file(uniqbedfile, sorted=True)

    # Condense adjacent intervals, allow some chaining.
    # Consecutive records are grouped by the seqid parsed from their accn.
    key = lambda x: range_parse(x.accn).seqid

    bed_fn = pf + ".patchers.bed"
    # `with` guarantees the patchers file is closed even if merging fails.
    with open(bed_fn, "w") as bed_fw:
        for _, sb in groupby(uniqbed, key=key):
            seqid, start, end, strand = merge_ranges(list(sb))
            print >> bed_fw, "\t".join(str(x) for x in \
                    (seqid, start, end, opts.object, 1000, strand))
Exemplo n.º 6
0
def geneinfo(args):
    """
    %prog geneinfo pineapple.20141004.bed liftover.bed pineapple.20150413.bed \
                   note.txt interproscan.txt

    Build gene info table from various sources. The three beds contain
    information on the original scaffolds, linkage groups, and final selected
    loci (after removal of TEs and split loci). The final two text files contain
    AHRD and domain data.
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser,
    # so it is kept verbatim.
    p = OptionParser(geneinfo.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 5:
        sys.exit(not p.print_help())

    scfbed, liftoverbed, lgbed, note, ipr = args
    note = DictFile(note, delimiter="\t")
    scfbed = Bed(scfbed)
    lgorder = Bed(lgbed).order
    liftover = Bed(liftoverbed).order
    header = ("Accession Scaffold-position LG-position "
              "Description Interpro-domain Interpro-description "
              "GO-term KEGG".split())
    ipr = read_interpro(ipr)

    # Genes present in the final loci go to master.txt, all others to
    # master-removed.txt.
    fw_clean = must_open("master.txt", "w")
    fw_removed = must_open("master-removed.txt", "w")

    try:
        for fw in (fw_clean, fw_removed):
            print("\t".join(header), file=fw)

        for b in scfbed:
            accession = b.accn
            scaffold_position = b.tag
            if accession in liftover:
                lg_position = liftover[accession][-1].tag
            else:
                lg_position = "split"
            fw = fw_clean if accession in lgorder else fw_removed
            description = note[accession]
            # Genes without InterPro data get empty columns.
            interpro = interpro_description = go = kegg = ""
            if accession in ipr:
                interpro, interpro_description, go, kegg = ipr[accession]
            print(
                "\t".join((
                    accession,
                    scaffold_position,
                    lg_position,
                    description,
                    interpro,
                    interpro_description,
                    go,
                    kegg,
                )),
                file=fw,
            )
    finally:
        # Close BOTH outputs; previously only whichever handle `fw` last
        # pointed to was closed and the other one was left open.
        fw_clean.close()
        fw_removed.close()
Exemplo n.º 7
0
def scaffold(args):
    """
    %prog scaffold scaffold.fasta synteny.blast synteny.sizes synteny.bed
                         physicalmap.blast physicalmap.sizes physicalmap.bed

    As evaluation of scaffolding, visualize external line of evidences:
    * Plot synteny to an external genome
    * Plot alignments to physical map
    * Plot alignments to genetic map (TODO)

    Each trio defines one panel to be plotted. blastfile defines the matchings
    between the evidences vs scaffolds. Then the evidence sizes, and evidence
    bed to plot dot plots.

    This script will plot a dot in the dot plot in the corresponding location
    the plots are one contig/scaffold per plot.
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser,
    # so it is kept verbatim.
    from jcvi.graphics.base import set_image_options
    from jcvi.utils.iter import grouper

    p = OptionParser(scaffold.__doc__)
    p.add_option("--cutoff", type="int", default=1000000,
            help="Plot scaffolds with size larger than [default: %default]")
    p.add_option("--highlights",
            help="A set of regions in BED format to highlight [default: %default]")
    opts, args, iopts = set_image_options(p, args, figsize="14x8", dpi=150)

    # One scaffold file followed by any number of (blast, sizes, bed) trios.
    if len(args) < 4 or len(args) % 3 != 1:
        sys.exit(not p.print_help())

    highlights = opts.highlights
    scafsizes = Sizes(args[0])
    trios = list(grouper(3, args[1:]))
    trios = [(a, Sizes(b), Bed(c)) for a, b, c in trios]
    if highlights:
        hlbed = Bed(highlights)

    for scaffoldID, scafsize in scafsizes.iter_sizes():
        if scafsize < opts.cutoff:
            continue
        logging.debug("Loading {0} (size={1})".format(scaffoldID,
            thousands(scafsize)))

        # Write a one-line sizes file for this scaffold, load it, then let
        # close(clean=True) dispose of it.
        tmpname = scaffoldID + ".sizes"
        with open(tmpname, "w") as tmp:
            tmp.write("{0}\t{1}".format(scaffoldID, scafsize))

        tmpsizes = Sizes(tmpname)
        tmpsizes.close(clean=True)

        # Always bind subhighlights: without --highlights the name used to be
        # undefined and the call below raised UnboundLocalError.
        # NOTE(review): None is assumed to mean "no highlights" for
        # plot_one_scaffold -- confirm against its signature.
        subhighlights = list(hlbed.sub_bed(scaffoldID)) if highlights else None

        imagename = ".".join((scaffoldID, opts.format))
        plot_one_scaffold(scaffoldID, tmpsizes, None, trios, imagename, iopts,
                          highlights=subhighlights)
Exemplo n.º 8
0
def tips(args):
    """
    %prog tips patchers.bed complements.bed original.fasta backbone.fasta

    Append telomeric sequences based on patchers and complements.
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser,
    # so it is kept verbatim.
    parser = OptionParser(tips.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 4:
        sys.exit(not parser.print_help())

    pbedfile, cbedfile, sizesfile, bbfasta = args

    pbed = Bed(pbedfile, sorted=False)
    cbed = Bed(cbedfile, sorted=False)

    # Consecutive complement records sharing a seqid, keyed by that seqid.
    complements = {
        seqid: list(group)
        for seqid, group in groupby(cbed, key=lambda x: x.seqid)
    }

    sizes = Sizes(sizesfile).mapping
    bbsizes = Sizes(bbfasta).mapping
    tbeds = []

    for name, group in groupby(pbed, key=lambda x: x.accn):
        group = list(group)
        first, last = group[0], group[-1]
        # An end is left alone (None) when the patcher already reaches the
        # first base / the full length of its source sequence.
        start_id = None if first.start == 1 else first.seqid
        end_id = None if last.end == sizes[last.seqid] else last.seqid
        print(name, start_id, end_id, file=sys.stderr)

        if start_id:
            head = complements[start_id][0]
            head.accn = name
            tbeds.append(head)

        # The whole backbone object itself sits between the two tips.
        fields = (name, 0, bbsizes[name], name, 1000, "+")
        tbeds.append(BedLine("\t".join(str(x) for x in fields)))

        if end_id:
            tail = complements[end_id][-1]
            tail.accn = name
            tbeds.append(tail)

    tbed = Bed()
    tbed.extend(tbeds)
    tbed.print_to_file("tips.bed")
Exemplo n.º 9
0
def liftover(args):
    """
    %prog liftover agpfile bedfile

    Given coordinates in components, convert to the coordinates in chromosomes.
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser,
    # so it is kept verbatim.
    p = OptionParser(liftover.__doc__)
    p.add_option("--prefix",
                 default=False,
                 action="store_true",
                 help="Prepend prefix to accn names [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # `not p.print_help()` is True => non-zero exit status on a usage
        # error, matching the other commands; the bare print_help() result
        # (None) made the process exit with status 0.
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile).order
    bed = Bed(bedfile)
    newbed = Bed()
    for b in bed:
        component = b.seqid
        if component not in agp:
            # Features on sequences absent from the AGP pass through as-is.
            newbed.append(b)
            continue

        _, a = agp[component]

        assert a.component_beg < a.component_end
        arange = a.component_beg, a.component_end
        assert b.start < b.end
        brange = b.start, b.end

        # Keep only the part of the feature inside the used component span;
        # features entirely outside it are dropped.
        st = range_intersect(arange, brange)
        if not st:
            continue
        start, end = st
        assert start <= end

        if a.orientation == '-':
            # Reverse-oriented component: coordinates are mirrored.
            d = a.object_end + a.component_beg
            s, t = d - end, d - start
        else:
            d = a.object_beg - a.component_beg
            s, t = d + start, d + end

        name = b.accn.replace(" ", "_")
        if opts.prefix:
            name = component + "_" + name
        bline = "\t".join(str(x) for x in (a.object, s - 1, t, name))
        newbed.append(BedLine(bline))

    newbed.print_to_file(sorted=True)
Exemplo n.º 10
0
def check_beds(hintfile, p, opts, sorted=True):
    """
    Load the query and subject BED files for a pairwise comparison.

    The file names are resolved by get_bed_filenames(); `sorted` is forwarded
    to both Bed constructors. Returns (qbed, sbed, qorder, sorder, is_self),
    where `is_self` is True when the two resolved file names are equal.
    """
    qbed_file, sbed_file = get_bed_filenames(hintfile, p, opts)
    # Identical file names on both sides indicate a self-self blast.
    is_self = qbed_file == sbed_file
    if is_self:
        logging.debug("Looks like self-self comparison.")

    # NOTE(review): the files are loaded from opts.qbed/opts.sbed rather than
    # the names returned above; presumably get_bed_filenames() keeps the two
    # in sync -- confirm.
    qbed = Bed(opts.qbed, sorted=sorted)
    sbed = Bed(opts.sbed, sorted=sorted)

    return qbed, sbed, qbed.order, sbed.order, is_self
Exemplo n.º 11
0
def breakpoint(args):
    """
    %prog breakpoint blastfile bedfile

    Identify breakpoints where collinearity ends. `blastfile` contains mapping
    from markers (query) to scaffolds (subject). `bedfile` contains marker
    locations in the related species.
    """
    from jcvi.formats.blast import bed
    from jcvi.utils.range import range_interleave

    p = OptionParser(breakpoint.__doc__)
    p.add_option("--xdist",
                 type="int",
                 default=20,
                 help="xdist (in related genome) cutoff [default: %default]")
    p.add_option("--ydist",
                 type="int",
                 default=200000,
                 help="ydist (in current genome) cutoff [default: %default]")
    p.add_option("-n",
                 type="int",
                 default=5,
                 help="number of markers in a block [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, bedfile = args
    order = Bed(bedfile).order
    blastbedfile = bed([blastfile])
    bbed = Bed(blastbedfile)
    key = lambda x: x[1]
    for scaffold, bs in bbed.sub_beds():
        blocks = get_blocks(scaffold,
                            bs,
                            order,
                            xdist=opts.xdist,
                            ydist=opts.ydist,
                            N=opts.n)
        sblocks = []
        for block in blocks:
            xx, yy = zip(*block)
            sblocks.append((scaffold, min(yy), max(yy)))
        iblocks = range_interleave(sblocks)
        for ib in iblocks:
            ch, start, end = ib
            print "{0}\t{1}\t{2}".format(ch, start - 1, end)
Exemplo n.º 12
0
def synfind(args):
    """
    %prog synfind all.last *.bed

    Prepare input for SynFind.
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser,
    # so it is kept verbatim.
    p = OptionParser(synfind.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    lastfile = args[0]
    bedfiles = args[1:]

    # Step 1: copy the LAST file, dropping hits of a sequence against itself.
    # `with` closes both handles (the input handle used to be leaked).
    filteredlast = lastfile + ".filtered"
    with open(lastfile) as fp, open(filteredlast, "w") as fw:
        for row in fp:
            b = BlastLine(row)
            if b.query == b.subject:
                continue
            print(b, file=fw)
    logging.debug("Filtered LAST file written to `{0}`".format(filteredlast))

    # Step 2: concatenate all bed files, prefixing each file's seqids with a
    # letter (A for the first file, B for the second, ...).
    allbed = "all.bed"
    with open(allbed, "w") as fw:
        for i, bedfile in enumerate(bedfiles):
            prefix = chr(ord('A') + i)
            for b in Bed(bedfile):
                b.seqid = prefix + b.seqid
                print(b, file=fw)
    logging.debug("Bed file written to `{0}`".format(allbed))
Exemplo n.º 13
0
def prepare_synteny(tourfile, lastfile, odir, p, opts):
    """
    Set up the working directory and input files of synteny plots for movie().

    Resolves the query/subject bed files, (re)creates `odir` and changes into
    it, writes an anchors file derived from `lastfile`, and symlinks the
    subject bed file there. `tourfile` is not used here.

    Returns (anchorsfile, qbedfile, contig_to_beds); `qbedfile` is absolute.
    """
    qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts)
    # Absolute paths stay valid after the chdir below.
    qbedfile, sbedfile = op.abspath(qbedfile), op.abspath(sbedfile)

    contig_to_beds = dict(Bed(qbedfile, sorted=False).sub_beds())

    # Subplots and movie live in their own directory.
    mkdir(odir, overwrite=True)
    os.chdir(odir)
    logging.debug("Change into subdir `{}`".format(odir))

    # Anchors file: named after the first two dot-separated fields of the
    # lastfile basename, one "query<TAB>subject<TAB>score" line per hit.
    # NOTE(review): lastfile is opened after the chdir, so a relative path
    # would be resolved against odir -- confirm callers pass an absolute path.
    stem = ".".join(op.basename(lastfile).split(".", 2)[:2])
    anchorsfile = stem + ".anchors"
    fw = open(anchorsfile, "w")
    for hit in Blast(lastfile):
        fields = (gene_name(hit.query), gene_name(hit.subject),
                  str(int(hit.score)))
        print >> fw, "\t".join(fields)
    fw.close()

    # Make the subject bed available inside odir under its own basename.
    symlink(sbedfile, op.basename(sbedfile))

    return anchorsfile, qbedfile, contig_to_beds
Exemplo n.º 14
0
def insertion(args):
    """
    %prog insertion mic.mac.bed

    Find IES based on mapping MIC reads to MAC genome. Output a bedfile with
    'lesions' (stack of broken reads) in the MAC genome.
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser,
    # so it is kept verbatim.
    parser = OptionParser(insertion.__doc__)
    parser.add_option("--mindepth", default=6, type="int",
                      help="Minimum depth to call an insertion")
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    bedfile, = args
    mindepth = opts.mindepth
    bed = Bed(bedfile)
    fw = must_open(opts.outfile, "w")
    for seqid, feats in bed.sub_beds():
        # How many features start / end at each coordinate.
        left_ends = Counter([x.start for x in feats])
        right_ends = Counter([x.end for x in feats])

        # Keep the coordinates shared by at least `mindepth` features.
        selected = [
            (seqid, pos, "LE-{0}".format(pos), depth)
            for pos, depth in left_ends.items()
            if depth >= mindepth
        ]
        selected += [
            (seqid, pos, "RE-{0}".format(pos), depth)
            for pos, depth in right_ends.items()
            if depth >= mindepth
        ]
        selected.sort()

        # One single-base interval per selected coordinate.
        for seqid, pos, label, depth in selected:
            tagged = "{0}-r{1}".format(label, depth)
            print >> fw, "\t".join((seqid, str(pos - 1), str(pos), tagged))
Exemplo n.º 15
0
def rename(args):
    """
    %prog rename map markers.blast > renamed.map

    Rename markers according to the new mapping locations.
    """
    from jcvi.formats.blast import bed

    p = OptionParser(rename.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    mstmap, blastfile = args
    bedfile = bed([blastfile])
    markersbed = Bed(bedfile)
    markers = markersbed.order

    data = MSTMap(mstmap)
    header = data.header
    header = [header[0]] + ["seqid", "start"] + header[1:]
    print "\t".join(header)
    for b in data:
        m, geno = b.id, b.genotype
        if m not in markers:
            continue

        i, mb = markers[m]
        print "\t".join(str(x) for x in \
                (m, mb.seqid, mb.start, "\t".join(list(geno))))
Exemplo n.º 16
0
def prepare(bedfile):
    """
    Strip method tags from gene names and write two derived bed files.

    Every record's accn is split on ";". A leading "liftOver:", "GMAP:" or
    bare ":" prefix is removed from each name. Names already seen earlier in
    the file are reported and skipped. `<prefix>.a.bed` receives one line per
    surviving name; `<prefix>.b.bed` receives one line per input record with
    the surviving names re-joined by ";".
    """
    stem = bedfile.rsplit(".", 1)[0]
    abedfile = stem + ".a.bed"
    bbedfile = stem + ".b.bed"
    fwa = open(abedfile, "w")
    fwb = open(bbedfile, "w")

    seen = set()
    for b in Bed(bedfile):
        kept = []
        for accn in b.accn.split(";"):
            # Only the recognised method tags are stripped; any other
            # "xxx:" prefix stays part of the name.
            method, sep, rest = accn.partition(":")
            if sep and method in ("liftOver", "GMAP", ""):
                accn = rest

            if accn in seen:
                logging.error("Duplicate id {0} found. Ignored.".format(accn))
                continue

            kept.append(accn)
            # The record is reused as a template for the per-name line.
            b.accn = accn
            print >> fwa, b
            seen.add(accn)

        b.accn = ";".join(kept)
        print >> fwb, b

    fwa.close()
    fwb.close()
Exemplo n.º 17
0
def condense(args):
    """
    %prog condense OM.bed

    Merge split alignments in OM bed.
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser,
    # so it is kept verbatim.
    from itertools import groupby
    from jcvi.assembly.patch import merge_ranges

    parser = OptionParser(condense.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    (bedfile,) = args
    records = Bed(bedfile, sorted=False)

    # Consecutive records with identical (seqid, start, end) form one group.
    for _, group in groupby(records, key=lambda x: (x.seqid, x.start, x.end)):
        group = list(group)
        representative = group[0]
        seqid, start, end, _strand = merge_ranges(group)

        # The first record of the group is printed, relabelled with the
        # merged range.
        representative.accn = "{0}:{1}-{2}".format(seqid, start, end)
        print(representative)
Exemplo n.º 18
0
def gaps(args):
    """
    %prog gaps OM.bed fastafile

    Create patches around OM gaps.
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser,
    # so it is kept verbatim.
    from jcvi.formats.bed import uniq

    parser = OptionParser(gaps.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    # fastafile is required on the command line but not used below.
    ombed, fastafile = args
    bed = Bed(uniq([ombed]))

    def plus_range(r):
        # (seqid, start, end, strand) tuple with a fixed "+" strand.
        return (r.seqid, r.start, r.end, "+")

    for a, b in pairwise(bed):
        # Each record carries two ranges: its own coordinates and the range
        # encoded in its accn.
        om_a, om_b = plus_range(a), plus_range(b)
        ch_a = plus_range(range_parse(a.accn))
        ch_b = plus_range(range_parse(b.accn))

        om_dist, _ = range_distance(om_a, om_b, distmode="ee")
        ch_dist, _ = range_distance(ch_a, ch_b, distmode="ee")

        # Report the pair only when at least one distance is positive.
        if om_dist <= 0 and ch_dist <= 0:
            continue

        print(a)
        print(b)
        print(om_dist, ch_dist)
Exemplo n.º 19
0
def fuse(args):
    """
    %prog fuse *.bed *.anchors

    Fuse gene orders based on anchors file.
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser,
    # so it is kept verbatim.
    from jcvi.algorithms.graph import BiGraph

    parser = OptionParser(fuse.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) < 1:
        sys.exit(not parser.print_help())

    # Inputs are told apart purely by file extension.
    bedfiles = [x for x in args if x.endswith(".bed")]
    anchorfiles = [x for x in args if x.endswith(".anchors")]

    # TODO: Use Markov clustering to sparsify the edges
    # Every anchor pair joins its two genes into the same family.
    families = Grouper()
    for anchorfile in anchorfiles:
        for a, b, _block_id in AnchorFile(anchorfile).iter_pairs():
            families.join(a, b)

    allowed = set(families.keys())
    logging.debug("Total families: {}, Gene members: {}".format(
        len(families), len(allowed)))

    # TODO: Use C++ implementation of BiGraph() when available
    # For now just serialize this to the disk
    for bedfile in bedfiles:
        # Only genes that belong to some family are loaded.
        print_edges(Bed(bedfile, include=allowed), families)
Exemplo n.º 20
0
def comparebed(args):
    """
    %prog comparebed AP.chr.bed infer.bed

    Compare the scaffold links indicated in two bed files.
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser,
    # so it is kept verbatim.
    parser = OptionParser(comparebed.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    first, second = [Bed(path) for path in args]
    # The comparison is run in both directions.
    query_links(first, second)
    query_links(second, first)
Exemplo n.º 21
0
def layout(args):
    """
    %prog layout omgfile taxa

    Build column formatted gene lists after omgparse(). Use species list
    separated by comma in place of taxa, e.g. "BR,BO,AN,CN"
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser,
    # so it is kept verbatim.
    p = OptionParser(layout.__doc__)
    p.add_option("--sort",
                 help="Sort layout file based on bedfile [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    omgfile, taxa = args
    listfile = omgfile.rsplit(".", 1)[0] + ".list"
    taxa = taxa.split(",")
    ntaxa = len(taxa)

    data = []
    # `with` closes both handles; the input handle was previously never closed.
    with open(omgfile) as fp, open(listfile, "w") as fw:
        for row in fp:
            # Each line holds a comma-separated gene list and a parallel
            # comma-separated list of taxon indices.
            genes, idxs = row.split()
            row = ["."] * ntaxa
            genes = genes.split(",")
            ixs = [int(x) for x in idxs.split(",")]
            for gene, idx in zip(genes, ixs):
                row[idx] = gene
            txs = ",".join(taxa[x] for x in ixs)
            print >> fw, "\t".join(("\t".join(row), txs))
            data.append(row)

    # Transpose rows into per-taxon columns to count distinct genes.
    coldata = list(zip(*data))
    ngenes = []
    for i, tx in enumerate(taxa):
        # An empty input yields no columns at all; count zero genes instead
        # of failing with IndexError.
        column = coldata[i] if coldata else ()
        genes = [x for x in column if x != '.']
        genes = set(x.strip("|") for x in genes)
        ngenes.append((len(genes), tx))

    details = ", ".join("{0} {1}".format(a, b) for a, b in ngenes)
    total = sum(a for a, b in ngenes)
    s = "A list of {0} orthologous families that collectively".format(
        len(data))
    s += " contain a total of {0} genes ({1})".format(total, details)
    print >> sys.stderr, s

    # Sort the list file in place on its last column (the taxa combination).
    lastcolumn = ntaxa + 1
    cmd = "sort -k{0},{0} {1} -o {1}".format(lastcolumn, listfile)
    sh(cmd)

    logging.debug("List file written to `{0}`.".format(listfile))
    sort = opts.sort
    if sort:
        thread = Bed(sort)
        sort_layout(thread, listfile)
Exemplo n.º 22
0
def chimera(args):
    """
    %prog chimera bedfile

    Scan the bed file to break scaffolds that multi-maps.
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser,
    # so it is kept verbatim.
    parser = OptionParser(chimera.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    bedfile, = args
    selected = select_bed(Bed(bedfile))
    chimerabed = "chimera.bed"
    fw = open(chimerabed, "w")

    # scaffold => set of chromosomes it has records on
    mapped = defaultdict(set)
    for b in selected:
        mapped[range_parse(b.accn).seqid].add(b.seqid)

    nchimera = 0
    for scaffold, chrs in sorted(mapped.items()):
        if len(chrs) == 1:
            continue

        print >> sys.stderr, "=" * 80
        print >> sys.stderr, "{0} mapped to multiple locations: {1}".\
                format(scaffold, ",".join(sorted(chrs)))

        # Collect (and echo) every record that belongs to this scaffold.
        ranges = []
        for b in selected:
            rr = range_parse(b.accn)
            if rr.seqid != scaffold:
                continue
            print >> sys.stderr, b
            ranges.append(rr)

        # Identify breakpoints: the span between each pair of neighbouring
        # ranges on the scaffold.
        ranges.sort(key=lambda x: (x.seqid, x.start, x.end))
        for left, right in pairwise(ranges):
            seqid = left.seqid
            if seqid != right.seqid:
                continue

            # Overlapping neighbours would give start > end; order the pair.
            start, end = sorted((left.end, right.start))

            chimeraline = "\t".join(str(x) for x in (seqid, start, end))
            print >> fw, chimeraline
            print >> sys.stderr, chimeraline
            nchimera += 1

    fw.close()
    logging.debug("A total of {0} junctions written to `{1}`.".\
                  format(nchimera, chimerabed))
Exemplo n.º 23
0
def check_beds(p, opts):
    """
    Load the query and subject BED files named by --qbed and --sbed.

    Both options are mandatory: if either is missing, an error message and
    the parser help are printed and the program exits.

    Returns (qbed, sbed, qorder, sorder, is_self), where `is_self` is True
    when both options name the same file.
    """
    qbed_file, sbed_file = opts.qbed, opts.sbed
    if not (qbed_file and sbed_file):
        print >> sys.stderr, "Options --qbed and --sbed are required"
        sys.exit(not p.print_help())

    # Identical file names on both sides indicate a self-self blast.
    is_self = qbed_file == sbed_file
    if is_self:
        logging.debug("Looks like self-self comparison.")

    qbed = Bed(opts.qbed)
    sbed = Bed(opts.sbed)

    return qbed, sbed, qbed.order, sbed.order, is_self
Exemplo n.º 24
0
def frombed(args):
    """
    %prog frombed bedfile contigfasta readfasta

    Convert read placement to contig format. This is useful before running BAMBUS.
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser,
    # so it is kept verbatim.
    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.bed import Bed
    from jcvi.utils.cbook import fill

    p = OptionParser(frombed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, contigfasta, readfasta = args
    prefix = bedfile.rsplit(".", 1)[0]
    contigfile = prefix + ".contig"
    idsfile = prefix + ".ids"

    contigfasta = Fasta(contigfasta)
    readfasta = Fasta(readfasta)

    bed = Bed(bedfile)
    checksum = "00000000 checksum."

    # `with` flushes and closes both outputs before they are reported as
    # written; previously neither file was ever closed.
    with open(idsfile, "w") as fw_ids, open(contigfile, "w") as fw:
        for ctg, reads in bed.sub_beds():
            ctgseq = contigfasta[ctg]
            # Contig header line, followed by the wrapped contig sequence.
            ctgline = "##{0} {1} {2} bases, {3}".format(\
                    ctg, len(reads), len(ctgseq), checksum)

            print >> fw_ids, ctg
            print >> fw, ctgline
            print >> fw, fill(ctgseq.seq)

            for b in reads:
                read = b.accn
                strand = b.strand
                readseq = readfasta[read]
                rc = " [RC]" if strand == "-" else ""
                readlen = len(readseq)
                # Minus-strand reads list their range in reverse order.
                rstart, rend = 1, readlen
                if strand == "-":
                    rstart, rend = rend, rstart

                readrange = "{{{0} {1}}}".format(rstart, rend)
                conrange = "<{0} {1}>".format(b.start, b.end)
                readline = "#{0}(0){1} {2} bases, {3} {4} {5}".format(\
                        read, rc, readlen, checksum, readrange, conrange)
                print >> fw, readline
                print >> fw, fill(readseq.seq)

    logging.debug("Mapped contigs written to `{0}`.".format(contigfile))
    logging.debug("Contig IDs written to `{0}`.".format(idsfile))
Exemplo n.º 25
0
def closest(args):
    """
    %prog closest candidates.bed gaps.bed fastafile

    Identify the nearest gaps flanking suggested regions.
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser,
    # so it is kept verbatim.
    parser = OptionParser(closest.__doc__)
    parser.add_option(
        "--om",
        default=False,
        action="store_true",
        help="The bedfile is OM blocks",
    )
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    candidates, gapsbed, fastafile = args
    sizes = Sizes(fastafile).mapping

    def region_of(record):
        # With --om the region is the range encoded in accn, otherwise the
        # record's own coordinates.
        r = range_parse(record.accn) if opts.om else record
        return [r.seqid, r.start, r.end]

    regions = [region_of(b) for b in Bed(candidates)]
    granges = [(g.seqid, g.start, g.end) for g in Bed(gapsbed)]

    for region in range_merge(regions):
        left_gap = range_closest(granges, region)
        right_gap = range_closest(granges, region, left=False)
        seqid = region[0]

        # A neighbour found on a different sequence does not count.
        if left_gap is not None and left_gap[0] != seqid:
            left_gap = None
        if right_gap is not None and right_gap[0] != seqid:
            right_gap = None

        # Without a flanking gap, extend to the sequence boundary.
        mmin = 1 if left_gap is None else left_gap[1]
        mmax = sizes[seqid] if right_gap is None else right_gap[2]

        print("\t".join(str(x) for x in (seqid, mmin - 1, mmax)))
Exemplo n.º 26
0
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser,
    # so it is kept verbatim.
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed, BedLine
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option("--switch",
                 default=False,
                 action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale",
                 type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale
    ac = AnchorFile(anchorsfile)
    # query accn => list of all subject accns it is anchored to
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        if q.accn not in pairs:
            continue
        # Emit one line per anchor pair. Previously the emit code sat outside
        # this loop, so only the last partner of each query was written.
        for partner in pairs[q.accn]:
            _, s = sorder[partner]
            # Fresh per-pair copies, so that --switch cannot leak swapped
            # values into the next pair of the same query.
            qseqid, qstart, qend = q.seqid, q.start, q.end
            sseqid, sstart = s.seqid, s.start
            if switch:
                qseqid, sseqid = sseqid, qseqid
                qstart, sstart = sstart, qstart
                qend = s.end
            if scale:
                sstart /= scale
            bedline = "\t".join(
                str(x) for x in (qseqid, qstart - 1, qend,
                                 "{0}:{1}".format(get_number(sseqid), sstart)))
            bd.append(BedLine(bedline))

    bd.print_to_file(filename=opts.outfile, sorted=True)
Exemplo n.º 27
0
def bed_store(bedfile):
    # Merge the BED file via mergeBed(), then index the merged features both
    # ways:
    #   reads   -- accession -> "seqid:start" of the merged feature holding it
    #   reads_r -- "seqid:start" -> list of accessions in that merged feature
    # Each merged feature's `accn` is treated as a comma-separated list.
    merged = mergeBed(bedfile, s=True, nms=True, sorted=True)

    reads = {}
    reads_r = defaultdict(list)
    for feature in Bed(merged):
        target = "{0}:{1}".format(feature.seqid, feature.start)
        accns = feature.accn.split(",")
        for accn in accns:
            reads[accn] = target
        reads_r[target].extend(accns)

    return reads, reads_r
Exemplo n.º 28
0
def make_gff(bed, prefix, fw):
    # Write one tab-separated line per feature of the BED file `bed` to the
    # open handle `fw`: prefixed seqid, accn, start, end. The number of
    # features written is reported through logging.debug.
    nfeats = 0
    for nfeats, feature in enumerate(Bed(bed), 1):
        columns = (prefix + feature.seqid, feature.accn, feature.start, feature.end)
        print("\t".join(map(str, columns)), file=fw)

    message = "A total of {0} features converted to `{1}`".format(nfeats, fw.name)
    logging.debug(message)
Exemplo n.º 29
0
def split(args):
    """
    %prog split split.bed evidences.bed predictor1.gff predictor2.gff fastafile

    Split MAKER models by checking against predictors (such as AUGUSTUS and
    FGENESH). For each region covered by a working model. Find out the
    combination of predictors that gives the best accuracy against evidences
    (such as PASA).

    `split.bed` can be generated by pulling out subset from a list of ids
    $ python -m jcvi.formats.base join split.ids working.bed
        --column=0,3 --noheader | cut -f2-7 > split.bed
    """
    # NOTE: the docstring above is also the command-line usage text (passed to
    # OptionParser), so it is left untouched; notes go in comments instead.
    from jcvi.formats.bed import Bed

    p = OptionParser(split.__doc__)
    p.add_option("--key", default="Name",
            help="Key in the attributes to extract predictor.gff [default: %default]")
    p.add_option("--parents", default="match",
            help="list of features to extract, use comma to separate (e.g."
            "'gene,mRNA') [default: %default]")
    p.add_option("--children", default="match_part",
            help="list of features to extract, use comma to separate (e.g."
            "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 5:
        sys.exit(not p.print_help())

    split_bed, evidences_bed, p1_gff, p2_gff, fastafile = args
    parents = opts.parents
    children = opts.children
    key = opts.key

    bed = Bed(split_bed)

    # Per-accession pieces reported by each predictor over the split regions.
    s1 = get_splits(split_bed, p1_gff, parents, key)
    s2 = get_splits(split_bed, p2_gff, parents, key)

    for b in bed:
        query = "{0}:{1}-{2}".format(b.seqid, b.start, b.end)
        b1 = get_accuracy(query, p1_gff, evidences_bed, fastafile, children, key)
        b2 = get_accuracy(query, p2_gff, evidences_bed, fastafile, children, key)
        accn = b.accn
        c1 = "|".join(s1[accn])
        c2 = "|".join(s2[accn])
        ac1 = b1.accuracy
        ac2 = b2.accuracy
        # Ties go to the first predictor; the tag is the GFF file name up to
        # its first dot.
        tag = p1_gff if ac1 >= ac2 else p2_gff
        tag = tag.split(".")[0]

        ac1 = "{0:.3f}".format(ac1)
        ac2 = "{0:.3f}".format(ac2)

        # Single-argument print(...) emits the same line on Python 2 and 3,
        # whereas the bare `print` statement is a SyntaxError on Python 3.
        print("\t".join((accn, tag, ac1, ac2, c1, c2)))
Exemplo n.º 30
0
Arquivo: gaps.py Projeto: zjwang6/jcvi
def estimate(args):
    """
    %prog estimate gaps.bed all.spans.bed all.mates

    Estimate gap sizes based on mate positions and library insert sizes.
    """
    # NOTE: the docstring above is also the command-line usage text (passed to
    # OptionParser), so it is left untouched; notes go in comments instead.
    from collections import defaultdict
    from jcvi.formats.bed import intersectBed_wao
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(estimate.__doc__)
    p.add_option("--minlinks",
                 default=3,
                 type="int",
                 help="Minimum number of links to place [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    gapsbed, spansbed, matesfile = args
    # The parsed mates file is not consulted in the code shown here; it is
    # still loaded so that any parsing it performs keeps happening.
    mf = MatesFile(matesfile)
    bed = Bed(gapsbed)
    order = bed.order

    gap2mate = defaultdict(set)
    # Reverse index; populated below but not read in the code shown here.
    mate2gap = defaultdict(set)

    for a, b in intersectBed_wao(gapsbed, spansbed):
        gapsize = a.span
        # Only gaps whose span is exactly 100 are considered.
        # NOTE(review): presumably 100 marks gaps of unknown size -- confirm.
        if gapsize != 100:
            continue

        gapname = a.accn

        # No overlapping span: record the gap with an empty mate set.
        if b is None:
            gap2mate[gapname] = set()
            continue

        matename = b.accn
        gap2mate[gapname].add(matename)
        mate2gap[matename].add(gapname)

    omgapsbed = "gaps.linkage.bed"
    # The context manager closes the output file even if a lookup or write
    # below raises, instead of leaking the handle.
    with open(omgapsbed, "w") as fw:
        for gapname, mates in sorted(gap2mate.items()):
            i, b = order[gapname]
            nmates = len(mates)
            # Gaps with too few links go to the output file with their count;
            # the remaining gaps are printed to stdout with their mates.
            if nmates < opts.minlinks:
                print("{0}\t{1}".format(b, nmates), file=fw)
                continue

            print(gapname, mates)