Пример #1
0
def main(args):
    """Build named L1 retroelement GTF annotations from L1Base BED tracks.

    Pipeline: (1) convert per-group L1Base BED tracks to GTF records,
    (2) remove annotations in lower-priority groups that overlap a
    higher-priority group, then name the surviving loci and write one
    sorted GTF per group into args.outdir.

    Python 2 code: uses ``print >>`` statements and ``dict.iteritems``.
    """
    # Load chromosome sizes (used for sorting output); fall back to an
    # empty ChromosomeDict when the sizes file is missing.
    if os.path.exists(args.chrom_sizes):
        cdict = ChromosomeDict(args.chrom_sizes)
    else:
        print >> sys.stderr, '[WARNING] Chromosome sizes were not found'
        cdict = ChromosomeDict()

    # Load cytoband annotations (used by namelocs for locus naming);
    # namelocs tolerates cytoband=None via intersect_gtf.
    if os.path.exists(args.cytoband):
        cytoband = list(read_gtf_file(args.cytoband))
    else:
        cytoband = None
        print >> sys.stderr, '[WARNING] Cytoband was not found'
    ''' Stage 1: Convert L1Base tracks to GTF.'''
    print >> sys.stderr, '*** Stage 1: Converting L1Base tracks to GTF.'
    all_locs = defaultdict(list)
    # (group name, filename abbreviation) pairs; order encodes priority:
    # later entries are lower priority in the Stage 2 redundancy removal.
    groups = [
        ('L1FLI', 'flil1'),
        ('L1ORF2', 'orf2l1'),
        ('L1FLnI', 'flnil1'),
    ]
    bedfiles = glob(os.path.join(args.track_dir, '*.bed'))
    for bf in bedfiles:
        # Classify each BED file by which abbreviation appears in its name.
        # NOTE(review): the assert requires every *.bed file in track_dir to
        # match exactly one abbreviation — stray BED files will crash here.
        group = [gn for gn, abbr in groups if re.search(abbr, bf)]
        assert len(group) == 1
        group = group[0]
        for l1line in l1baseutils.read_l1base_file(bf):
            g = l1line.to_gtf()
            g.attr['category'] = group
            all_locs[group].append(g)

    # Report per-group counts (stdout, unlike the stderr progress messages).
    for group, locs in all_locs.iteritems():
        print '%s: %d' % (group, len(locs))
    ''' Stage 2: Remove Redundant annotations'''
    print >> sys.stderr, '*** Stage 2: Removing redundant annotations.'
    # Walk groups from lowest to highest priority; drop any record in a
    # lower-priority group that overlaps any record in an earlier group.
    # Index 0 is skipped because it has no higher-priority groups.
    for idx_a in range(len(groups) - 1, 0, -1):
        groupA = groups[idx_a][0]
        if groupA not in all_locs: continue
        to_remove = []
        gtfA = all_locs[groupA]  #sort_gtf(all_locs[groupA], cdict.reforder)
        groupBs = [groups[b][0] for b in range(idx_a)]
        gtfBs = [all_locs[gB] for gB in groupBs if gB in all_locs]
        for g, overlaps in intersect_gtf(gtfA, gtfBs, stranded=False):
            if overlaps:
                to_remove.append(g)
        # Collect first, then remove — avoids mutating the list while
        # intersect_gtf is iterating over it.
        for g in to_remove:
            all_locs[groupA].remove(g)

    for group, locs in all_locs.iteritems():
        print '%s: %d' % (group, len(locs))

    # NOTE(review): labeled 'Stage 8' although only two stages precede it —
    # possibly copied from the RMSK pipeline's stage numbering; confirm.
    print >> sys.stderr, '*** Stage 8: Naming loci'
    final_locs = {}
    for group, locs in all_locs.iteritems():
        final_locs[group] = namelocs(locs, cytoband, cdict)

    # Write one sorted GTF per group: <outdir>/<group>.gtf
    for group, locs in final_locs.iteritems():
        with open(os.path.join(args.outdir, '%s.gtf' % group), 'w') as outh:
            write_gtf_file(sort_gtf(locs, cdict.reforder), outh)
Пример #2
0
def namelocs(locs, cytogtf, cdict):
    """Assign a unique locus name to every GTF record in *locs*.

    Records are grouped by (chromosome, cytoband); within a group each
    record gets a disambiguating suffix from SUFFIXES (parenthesized when
    no band was found). The name is stored in the 'locus',
    'transcript_id', and 'gene_id' attributes. Returns the records sorted
    by cdict.reforder.
    """
    ordered = sort_gtf(locs, cdict.reforder)

    # Bucket records by (chromosome sans 'chr' prefix, overlapping band).
    buckets = defaultdict(list)
    for rec, hits in intersect_gtf(ordered, [
            cytogtf,
    ], stranded=False):
        chrom = rec.chrom[3:] if rec.chrom[:3] == 'chr' else rec.chrom
        band = hits[0][1].attr['gene_id'] if hits else ''
        buckets[(chrom, band)].append(rec)

    for (chrom, band), members in buckets.iteritems():
        if len(members) == 1:
            tags = ['']
        else:
            # NOTE: zip silently truncates if a bucket has more members
            # than SUFFIXES provides — assumed not to happen in practice.
            tags = SUFFIXES[:len(members)]
            # Parenthesize the suffix when no cytoband was found
            if band == '':
                tags = ['(%s)' % t for t in tags]

        # Stamp the locus name onto the standard identifier attributes
        for rec, tag in zip(members, tags):
            name = '%s_%s%s%s' % (rec.attr['category'], chrom, band, tag)
            rec.attr['locus'] = name
            rec.attr['transcript_id'] = name
            rec.attr['gene_id'] = name
    return ordered
Пример #3
0
def stage4(iclusters, ltr_gtfs, flanksize, cdict, logh=sys.stderr):
    ''' Stage 4: Find LTRs flanking internal clusters '''
    print >> logh, '*** Stage 4: Finding LTRs that flank internal regions'
    print >> logh, '\tUsing flanksize = %d' % flanksize
    print >> logh, '\tFound %d LTR annotations' % sum(map(len, ltr_gtfs))

    # Map locus ID -> cluster so merged clusters can be popped out,
    # leaving only the unmerged internal clusters behind.
    remaining = {c.attr['locus']: c for c in iclusters}

    # Expand each internal cluster by flanksize on both sides ("slop"),
    # then intersect the expanded regions with the LTR annotations.
    expanded = slop_gtf(iclusters, flanksize, cdict.reflen)
    merged = []
    for region, hits in intersect_gtf(expanded, ltr_gtfs):
        # Pull the original (unexpanded) cluster matching this region
        cluster = remaining.pop(region.attr['locus'])
        for _, ltr in hits:
            cluster.add(ltr.copy())
        merged.append(cluster)

    print >> logh, '\t%d merged clusters' % len(merged)
    print >> logh, '\t%d unmerged internal clusters' % len(remaining)

    # Unmerged internal clusters are intentionally dropped from the output.
    return sort_gtf(merged, cdict.reforder)
Пример #4
0
def gtftools_sortclust(args):
    """Sort clustered GTF records from args.infile and write to args.outfile.

    Sort order follows the chromosome-sizes file when given and present;
    otherwise an empty ChromosomeDict supplies the default ordering.
    """
    sizes = args.chrom_sizes
    cdict = ChromosomeDict(sizes) if sizes and os.path.exists(sizes) \
        else ChromosomeDict()
    ordered = sort_gtf(read_gtf_clusters(args.infile), cdict.reforder)
    write_gtf_file(ordered, args.outfile)
Пример #5
0
def stage3(igtfs, cdict, shortdist, longdist, logh=sys.stderr):
    ''' Stage 3: Merge internal annotations '''
    print >> logh, '*** Stage 3: Merging internal annotations'
    print >> logh, '\tFound %d internal annotations' % len(igtfs)

    # First pass: merge annotations that are nearly adjacent (< shortdist bp)
    print >> logh, '\tMerging annotations < %d bp apart' % shortdist
    merged = cluster_gtf(igtfs, dist=shortdist)
    print >> logh, '\t%d merged annotations after first merge' % len(merged)

    def _is_continuation(a, b):
        """True if the downstream cluster continues the upstream RMSK model.

        NOTE(review): repLeft/repStart come from GTF attributes — if they
        are strings, '<' compares lexicographically; confirm they are ints.
        """
        left, right = (a, b) if a.start < b.start else (b, a)
        if left.strand != right.strand:
            return False
        if left.strand == '+':
            # Plus strand: right cluster continues where the left leaves off
            return (left.members[-1].attr['repLeft']
                    < right.members[0].attr['repLeft'])
        # Minus strand: left cluster continues where the right leaves off
        return (right.members[0].attr['repStart']
                < left.members[-1].attr['repStart'])

    # Second pass: merge clusters that are fairly close (< longdist bp)
    # and whose repeat models are consecutive.
    print >> logh, '\tMerging annotations < %d bp apart that have consecutive models' % (
        longdist)
    merged = cluster_gtf(merged, dist=longdist, criteria=_is_continuation)
    print >> logh, '\t%d merged annotations after second merge' % len(
        merged)

    # Sort, then label each cluster with its internal model and a 1-based
    # zero-padded locus ID, e.g. 'HERVK-int_0001'.
    merged = sort_gtf(merged, cdict.reforder)
    for pos, cluster in enumerate(merged):
        cluster.set_attr_from_members('repName', 'intModel')
        cluster.set_attr('locus', '%s_%04d' % (cluster.attr['intModel'], pos + 1))

    return merged
Пример #6
0
def main(args):
    ''' Setup '''
    logh = sys.stderr if args.noisy else open(os.devnull, 'w')

    if args.genome_build is None:
        sys.exit("ERROR: --genome_build is required.")
    print >> logh, "[VERBOSE] Genome build: %s" % args.genome_build

    # Create track directory, if necessary
    if not os.path.isdir(args.track_dir):
        print >> logh, "[VERBOSE] Creating track directory: %s" % args.track_dir
        os.makedirs(args.track_dir)
    else:
        print >> logh, "[VERBOSE] Using track directory: %s" % args.track_dir

    # Create output directory, if necessary
    dest = os.path.join(args.outdir, args.fam)
    if not os.path.isdir(dest):
        print >> logh, "[VERBOSE] Creating output directory: %s" % dest
        os.makedirs(dest)
    else:
        print >> logh, "[VERBOSE] Using output directory: %s" % dest

    # Setup IGV and snapshot directory
    igv = None if args.no_igv else igv_init(args.genome_build)
    if igv is None:
        if not args.no_igv:
            print >> sys.stderr, "[WARNING] Could not connect to IGV."
        else:
            print >> logh, "[VERBOSE] Not using IGV."
        snapshot_dir = None
        snapshot_final = False
    else:
        print >> logh, "[VERBOSE] Using IGV."
        if args.compare_gtfs and os.path.isdir(args.compare_gtfs):
            other_gtfs = glob(os.path.join(args.compare_gtfs, '*.gtf'))
            other_gtfs += glob(os.path.join(args.compare_gtfs, '*.gtf.gz'))
            for og in other_gtfs:
                igv.load(os.path.abspath(og))

        if args.no_snapshot is False:
            snapshot_dir = os.path.join(dest, 'snapshots')
            if not os.path.isdir(snapshot_dir):
                print >> logh, "[VERBOSE] Creating snapshot directory: %s" % snapshot_dir
                os.makedirs(snapshot_dir)
            igv.snapshotDirectory(snapshot_dir)
            print >> logh, "[VERBOSE] Snapshots will be saved to %s" % snapshot_dir
            snapshot_final = args.snapshot_final
        else:
            snapshot_dir = None
            snapshot_final = False

    if snapshot_final:
        print >> logh, "[VERBOSE] Taking snapshots of final loci."

    # Keep intermediate files?
    save_intermediate = args.save_intermediate
    if save_intermediate:
        print >> logh, "[VERBOSE] Saving intermediate GTFs"

    # Load chromosome sizes
    if os.path.exists(args.chrom_sizes):
        cdict = ChromosomeDict(args.chrom_sizes)
    else:
        print >> sys.stderr, '[WARNING] Chromosome sizes were not found'
        cdict = ChromosomeDict()

    int_model = args.intmodel.split(',')
    ltr_model = args.ltrmodel.split(',')
    ''' Stage 1: Download RMSK tracks from UCSC '''
    track_files = stage1(int_model + ltr_model, args.track_dir,
                         args.genome_build)
    ''' Stage 2: Convert RMSK tracks to GTF '''
    gtfs, mlens = stage2(track_files, int_model)
    ltr_model = [m for m in ltr_model if m in gtfs]  # Revise LTR model

    if save_intermediate:
        for m, gtf in gtfs.iteritems():
            with open(os.path.join(dest, '%s.gtf' % m), 'w') as outh:
                write_gtf_file(sort_gtf(gtf, cdict.reforder), outh)
    ''' Stage 3: Merge internal annotations '''
    igtfs = list(chain.from_iterable(gtfs[im] for im in int_model))
    iclusters = stage3(igtfs,
                       cdict,
                       shortdist=args.short_dist,
                       longdist=args.long_dist)

    # Remove records that are not in the chromosome list
    if cdict.reforder is not None:
        iclusters = region_gtf(iclusters, cdict.reforder)

    if save_intermediate:
        with open(os.path.join(dest, 'internal.gtf'), 'w') as outh:
            write_gtf_file(sort_gtf(iclusters, cdict.reforder), outh)
    ''' Stage 4: Find flanking LTRs '''
    if args.flank_size:
        flanksize = args.flank_size
    else:
        flanksize = max([mlens[lm] for lm in ltr_model])
        flanksize = int(round(flanksize / 100.) * 100)  # Make it a round 100

    mclusters = stage4(iclusters, [gtfs[lm] for lm in ltr_model], flanksize,
                       cdict)
    ''' Stage 5: Add cluster attributes'''
    mclusters = stage5(mclusters, mlens)
    ''' Stage 6: Filter short '''
    mclusters, rejected = stage6(mclusters, args.min_model_pct)
    if save_intermediate:
        with open(os.path.join(dest, 'rejected.gtf'), 'w') as outh:
            write_gtf_file(sort_gtf(rejected, cdict.reforder), outh)
        with open(os.path.join(dest, 'merged.gtf'), 'w') as outh:
            write_gtf_file(sort_gtf(mclusters, cdict.reforder), outh)
    ''' Stage 7: Resolve conflicts'''
    mclusters, lcons = stage7(mclusters, args.auto, igv, dest)
    ''' Stage 8: Name loci '''
    # Load cytoband
    if os.path.exists(args.cytoband):
        cytoband = read_gtf_file(args.cytoband)
    else:
        cytoband = None
        print >> sys.stderr, '[WARNING] Cytoband was not found'

    mclusters = stage8(mclusters, cytoband, args.fam)
    ''' Stage 9: Output '''
    print >> sys.stderr, '*** Stage 9. Final Output'
    final_gtf_file = os.path.join(dest, '%s.gtf' % args.fam)
    final_gtf = sort_gtf(mclusters, cdict.reforder)
    with open(final_gtf_file, 'w') as outh:
        write_gtf_file(final_gtf, outh)

    if igv:
        igv.load(os.path.abspath(final_gtf_file)).expand()

    # Review conflicts and display or snapshot
    if len(lcons) == 0:
        print >> logh, "[VERBOSE] No conflicts."
    else:
        for i, lcon in enumerate(lcons):
            print >> sys.stderr, '[REVIEW] Conflict %02d.' % (i + 1)
            print >> sys.stderr, '\t%s' % lcon.display_review_text()
            if snapshot_dir is not None or args.review:
                igv.goto(lcon.region_str())
            if snapshot_dir is not None:
                igv.snapshot('conflict.%02d.%s.png' % ((i + 1), lcon.action))
            if args.review:
                z = raw_input_stderr('\tPress [ENTER] to continue. ')

    # Create final snapshots
    if snapshot_final and snapshot_dir is not None:
        for g in final_gtf:
            igv.goto(g.attr['transcript_id'])
            igv.snapshot('%s.png' % g.attr['transcript_id'])
    ''' Summary '''
    print >> sys.stderr, '*** Stage 10. Summary'
    categories = Counter()
    ltr_usage = defaultdict(Counter)
    for g in final_gtf:
        categories[g.attr['category']] += 1
        if g.attr['category'] == 'prototype':
            rn = collapse_list([h.attr['repName'] for h in g.members])
            if rn[0] == rn[-1]:
                ltr_usage['prototype'][rn[0]] += 1
            else:
                lr = '%s/%s' % tuple(sorted([rn[0], rn[-1]]))
                ltr_usage['prototype'][lr] += 1
        if g.attr['category'] == 'oneside':
            rn = collapse_list([h.attr['repName'] for h in g.members])
            if rn[-1] in ltr_model:
                ltr_usage['oneside'][rn[-1]] += 1
            else:
                ltr_usage['oneside'][rn[0]] += 1

    print >> sys.stderr, '\n\n'
    print >> sys.stderr, '%s %s summary %s' % ('*' * 20, args.fam, '*' * 20)
    print >> sys.stderr, 'Locus types:'
    for cat in [
            'prototype',
            'oneside',
            'internal',
    ]:
        print >> sys.stderr, '\t%s%d' % (cat.ljust(20), categories[cat])
    for cat, v in categories.most_common():
        if cat not in [
                'prototype',
                'oneside',
                'internal',
        ]:
            print >> sys.stderr, '\t%s%d' % (cat.ljust(20), categories[cat])
    print >> sys.stderr, 'LTR usage (prototype):'
    for k, v in ltr_usage['prototype'].most_common():
        print >> sys.stderr, '\t%s%d' % (k.ljust(20), v)

    print >> sys.stderr, 'LTR usage (oneside):'
    for k, v in ltr_usage['oneside'].most_common():
        print >> sys.stderr, '\t%s%d' % (k.ljust(20), v)