def main(args):
    ''' Convert L1Base BED tracks to per-category GTF files.

    Reads ``args.chrom_sizes``, ``args.cytoband``, ``args.track_dir`` and
    ``args.outdir``. Every ``*.bed`` file in the track directory is assigned
    to exactly one category by matching its path against a category
    abbreviation; loci in a lower-priority category that overlap a locus in
    any higher-priority category are dropped; the survivors are named with
    ``namelocs`` and written to ``<outdir>/<category>.gtf``.

    Progress and warnings go to stderr, per-category counts go to stdout.
    '''
    def warn(text):
        # Same bytes as ``print >> sys.stderr, text``
        sys.stderr.write(text + '\n')

    def report_counts():
        # One "<category>: <count>" line per category, on stdout
        for category, members in all_locs.items():
            sys.stdout.write('%s: %d\n' % (category, len(members)))

    # Chromosome sizes (an empty ChromosomeDict is used when the file is missing)
    if not os.path.exists(args.chrom_sizes):
        warn('[WARNING] Chromosome sizes were not found')
        cdict = ChromosomeDict()
    else:
        cdict = ChromosomeDict(args.chrom_sizes)

    # Cytoband annotation (None when the file is missing)
    if not os.path.exists(args.cytoband):
        warn('[WARNING] Cytoband was not found')
        cytoband = None
    else:
        cytoband = list(read_gtf_file(args.cytoband))

    # ---- Stage 1: Convert L1Base tracks to GTF ----
    warn('*** Stage 1: Converting L1Base tracks to GTF.')
    # (category label, abbreviation searched for in the BED path), listed
    # from highest to lowest priority for the redundancy filter below.
    groups = [
        ('L1FLI', 'flil1'),
        ('L1ORF2', 'orf2l1'),
        ('L1FLnI', 'flnil1'),
    ]
    all_locs = defaultdict(list)
    for bedpath in glob(os.path.join(args.track_dir, '*.bed')):
        matched = [label for label, abbr in groups if re.search(abbr, bedpath)]
        # Each track must belong to exactly one category
        assert len(matched) == 1
        category = matched[0]
        for record in l1baseutils.read_l1base_file(bedpath):
            locus = record.to_gtf()
            locus.attr['category'] = category
            all_locs[category].append(locus)
    report_counts()

    # ---- Stage 2: Remove redundant annotations ----
    warn('*** Stage 2: Removing redundant annotations.')
    # Walk from the lowest-priority category up to (not including) the first
    for idx_a in reversed(range(1, len(groups))):
        lower = groups[idx_a][0]
        if lower not in all_locs:
            continue
        # Annotation lists of every category ranked above this one
        higher = [all_locs[label] for label, _abbr in groups[:idx_a]
                  if label in all_locs]
        redundant = [
            locus
            for locus, hits in intersect_gtf(all_locs[lower], higher,
                                             stranded=False)
            if hits
        ]
        for locus in redundant:
            all_locs[lower].remove(locus)
    report_counts()

    # ---- Name loci and write one GTF per category ----
    # NOTE(review): the label below says "Stage 8" although this is the third
    # step of this function; the message is left as-is.
    warn('*** Stage 8: Naming loci')
    final_locs = {
        category: namelocs(members, cytoband, cdict)
        for category, members in all_locs.items()
    }
    for category, members in final_locs.items():
        outpath = os.path.join(args.outdir, '%s.gtf' % category)
        with open(outpath, 'w') as outh:
            write_gtf_file(sort_gtf(members, cdict.reforder), outh)
def namelocs(locs, cytogtf, cdict):
    ''' Assign locus names based on category, chromosome and cytoband.

    Loci are sorted with ``cdict.reforder`` and intersected (unstranded)
    with ``cytogtf``. Each locus is keyed by its chromosome (a leading
    ``chr`` is stripped) and the ``gene_id`` of its first overlapping band,
    or an empty band when nothing overlaps. Within each key the loci receive
    suffixes taken from ``SUFFIXES`` (no suffix when the key holds a single
    locus); suffixes are wrapped in parentheses when the band is empty.

    The resulting name is stored in the ``locus``, ``transcript_id`` and
    ``gene_id`` attributes of each locus. Returns the sorted loci.
    '''
    locs = sort_gtf(locs, cdict.reforder)

    # Bucket loci by (chromosome, band)
    # NOTE(review): one caller passes cytogtf=None when the cytoband file is
    # missing - confirm intersect_gtf accepts [None].
    grouped = defaultdict(list)
    for locus, hits in intersect_gtf(locs, [cytogtf], stranded=False):
        chrom = locus.chrom
        if chrom[:3] == 'chr':
            chrom = chrom[3:]
        band = hits[0][1].attr['gene_id'] if hits else ''
        grouped[(chrom, band)].append(locus)

    for (chrom, band), members in grouped.items():
        count = len(members)
        # NOTE(review): zip() below stops at the shorter sequence, so if a
        # bucket holds more loci than SUFFIXES has entries the extra loci
        # are left unnamed - verify SUFFIXES is long enough.
        suffixes = [''] if count == 1 else SUFFIXES[:count]
        if band == '':
            # No band available: parenthesise the suffix
            suffixes = ['(%s)' % suf for suf in suffixes]

        for locus, suf in zip(members, suffixes):
            name = '%s_%s%s%s' % (locus.attr['category'], chrom, band, suf)
            for key in ('locus', 'transcript_id', 'gene_id'):
                locus.attr[key] = name

    return locs
def stage4(iclusters, ltr_gtfs, flanksize, cdict, logh=sys.stderr):
    ''' Stage 4: Find LTRs flanking internal clusters

    Each internal cluster is extended by ``flanksize`` (via ``slop_gtf`` and
    ``cdict.reflen``) and intersected with the LTR annotation lists. For
    every extended cluster yielded by the intersection, the original cluster
    with the same ``locus`` attribute receives a copy of each overlapping
    LTR annotation and is collected.

    Returns the collected clusters sorted with ``cdict.reforder``. Clusters
    never yielded by the intersection are counted in the log but are not
    part of the return value. Progress messages are written to ``logh``.
    '''
    def log(text):
        logh.write(text + '\n')

    log('*** Stage 4: Finding LTRs that flank internal regions')
    log('\tUsing flanksize = %d' % flanksize)
    log('\tFound %d LTR annotations' % sum(len(lg) for lg in ltr_gtfs))

    # Extended ("slop") copies of the internal clusters, and the LTR
    # annotations that overlap them
    extended = slop_gtf(iclusters, flanksize, cdict.reflen)
    overlaps = intersect_gtf(extended, ltr_gtfs)

    # Locus ID -> original internal cluster, for quick retrieval
    by_locus = {clust.attr['locus']: clust for clust in iclusters}

    merged = []
    for ext_clust, flanks in overlaps:
        # pop() so that whatever remains afterwards is the unmerged set
        cluster = by_locus.pop(ext_clust.attr['locus'])
        for _idx, ltr in flanks:
            cluster.add(ltr.copy())
        merged.append(cluster)

    log('\t%d merged clusters' % len(merged))
    log('\t%d unmerged internal clusters' % len(by_locus))
    return sort_gtf(merged, cdict.reforder)
def gtftools_sortclust(args):
    ''' Sort the GTF clusters from ``args.infile`` and write them to ``args.outfile``.

    The sort order comes from ``ChromosomeDict.reforder``. The dictionary is
    built from ``args.chrom_sizes`` when that option is set and the path
    exists; otherwise an empty ``ChromosomeDict`` is used.
    '''
    have_sizes = bool(args.chrom_sizes) and os.path.exists(args.chrom_sizes)
    cdict = ChromosomeDict(args.chrom_sizes) if have_sizes else ChromosomeDict()

    clusters = read_gtf_clusters(args.infile)
    write_gtf_file(sort_gtf(clusters, cdict.reforder), args.outfile)
def stage3(igtfs, cdict, shortdist, longdist, logh=sys.stderr):
    ''' Stage 3: Merge internal annotations

    Two clustering passes are run with ``cluster_gtf``:

    1. annotations closer than ``shortdist`` are merged unconditionally;
    2. the resulting clusters closer than ``longdist`` are merged only when
       they are on the same strand and their boundary members have
       consecutive repeat-model coordinates (see ``consecutive_rmsk_model``).

    The clusters are then sorted with ``cdict.reforder``; each one gets an
    ``intModel`` attribute derived from its members' ``repName`` and a
    ``locus`` attribute of the form ``<intModel>_<NNNN>`` numbered from 1 in
    sorted order. Returns the sorted clusters. Progress goes to ``logh``.
    '''
    def log(text):
        logh.write(text + '\n')

    def consecutive_rmsk_model(a, b):
        # Order the pair by start coordinate
        if a.start < b.start:
            left, right = a, b
        else:
            left, right = b, a
        if left.strand != right.strand:
            return False
        tail = left.members[-1]   # last member of the left cluster
        head = right.members[0]   # first member of the right cluster
        if left.strand == '+':
            # Is the right cluster a continuation of the left cluster?
            return tail.attr['repLeft'] < head.attr['repLeft']
        # Is the left cluster a continuation of the right cluster?
        return head.attr['repStart'] < tail.attr['repStart']

    log('*** Stage 3: Merging internal annotations')
    log('\tFound %d internal annotations' % len(igtfs))

    # First pass: merge annotations that are very close together
    log('\tMerging annotations < %d bp apart' % shortdist)
    iclusters = cluster_gtf(igtfs, dist=shortdist)
    log('\t%d merged annotations after first merge' % len(iclusters))

    # Second pass: merge fairly close clusters that have consecutive models
    log('\tMerging annotations < %d bp apart that have consecutive models'
        % longdist)
    iclusters = cluster_gtf(iclusters, dist=longdist,
                            criteria=consecutive_rmsk_model)
    log('\t%d merged annotations after second merge' % len(iclusters))

    # Sort, then label each cluster with its internal model and locus ID
    iclusters = sort_gtf(iclusters, cdict.reforder)
    for rank, clust in enumerate(iclusters, 1):
        clust.set_attr_from_members('repName', 'intModel')
        clust.set_attr('locus', '%s_%04d' % (clust.attr['intModel'], rank))
    return iclusters
def main(args):
    ''' Build the locus annotation GTF for one repeat family from RMSK tracks.

    Pipeline, as driven by ``args``:

    * Setup: create ``args.track_dir`` and ``<args.outdir>/<args.fam>`` if
      needed; optionally connect to IGV (``args.no_igv``), load comparison
      GTFs (``args.compare_gtfs``) and configure a ``snapshots`` directory
      (``args.no_snapshot``, ``args.snapshot_final``); load chromosome sizes.
    * Stages 1-8: delegate to ``stage1`` .. ``stage8``, optionally writing
      intermediate GTFs (``args.save_intermediate``).
    * Stage 9: write ``<fam>.gtf``, show it in IGV, and walk through the
      conflicts returned by ``stage7`` (snapshot and/or interactive review).
    * Stage 10: print a summary of locus categories and LTR usage to stderr.

    Verbose messages go to stderr when ``args.noisy`` is set and to
    ``os.devnull`` otherwise; warnings, review text and the summary always
    go to stderr. Exits via ``sys.exit`` when ``args.genome_build`` is None.

    Conflict review no longer fails when ``args.review`` is set but IGV is
    unavailable: IGV navigation is skipped and only the prompt is shown.
    '''
    def say(stream, text):
        # Same bytes as the Python 2 statement ``print >> stream, text``
        stream.write(text + '\n')

    def save_gtf(records, filename):
        # Write `records`, sorted by reference order, to <dest>/<filename>
        with open(os.path.join(dest, filename), 'w') as outh:
            write_gtf_file(sort_gtf(records, cdict.reforder), outh)

    # ---- Setup ----
    logh = sys.stderr if args.noisy else open(os.devnull, 'w')
    if args.genome_build is None:
        sys.exit("ERROR: --genome_build is required.")
    say(logh, "[VERBOSE] Genome build: %s" % args.genome_build)

    # Create track directory, if necessary
    if not os.path.isdir(args.track_dir):
        say(logh, "[VERBOSE] Creating track directory: %s" % args.track_dir)
        os.makedirs(args.track_dir)
    else:
        say(logh, "[VERBOSE] Using track directory: %s" % args.track_dir)

    # Create output directory, if necessary
    dest = os.path.join(args.outdir, args.fam)
    if not os.path.isdir(dest):
        say(logh, "[VERBOSE] Creating output directory: %s" % dest)
        os.makedirs(dest)
    else:
        say(logh, "[VERBOSE] Using output directory: %s" % dest)

    # Setup IGV and snapshot directory. Snapshots stay disabled unless IGV
    # is connected and snapshots were not switched off.
    snapshot_dir = None
    snapshot_final = False
    igv = None if args.no_igv else igv_init(args.genome_build)
    if igv is None:
        if not args.no_igv:
            say(sys.stderr, "[WARNING] Could not connect to IGV.")
        else:
            say(logh, "[VERBOSE] Not using IGV.")
    else:
        say(logh, "[VERBOSE] Using IGV.")
        if args.compare_gtfs and os.path.isdir(args.compare_gtfs):
            other_gtfs = glob(os.path.join(args.compare_gtfs, '*.gtf'))
            other_gtfs += glob(os.path.join(args.compare_gtfs, '*.gtf.gz'))
            for og in other_gtfs:
                igv.load(os.path.abspath(og))
        if args.no_snapshot is False:
            snapshot_dir = os.path.join(dest, 'snapshots')
            if not os.path.isdir(snapshot_dir):
                say(logh, "[VERBOSE] Creating snapshot directory: %s"
                    % snapshot_dir)
                os.makedirs(snapshot_dir)
            igv.snapshotDirectory(snapshot_dir)
            say(logh, "[VERBOSE] Snapshots will be saved to %s" % snapshot_dir)
            snapshot_final = args.snapshot_final
    if snapshot_final:
        say(logh, "[VERBOSE] Taking snapshots of final loci.")

    # Keep intermediate files?
    save_intermediate = args.save_intermediate
    if save_intermediate:
        say(logh, "[VERBOSE] Saving intermediate GTFs")

    # Load chromosome sizes
    if os.path.exists(args.chrom_sizes):
        cdict = ChromosomeDict(args.chrom_sizes)
    else:
        say(sys.stderr, '[WARNING] Chromosome sizes were not found')
        cdict = ChromosomeDict()

    int_model = args.intmodel.split(',')
    ltr_model = args.ltrmodel.split(',')

    # ---- Stage 1: Download RMSK tracks from UCSC ----
    track_files = stage1(int_model + ltr_model, args.track_dir,
                         args.genome_build)

    # ---- Stage 2: Convert RMSK tracks to GTF ----
    gtfs, mlens = stage2(track_files, int_model)
    # Revise LTR model: keep only the models that stage2 returned
    ltr_model = [m for m in ltr_model if m in gtfs]
    if save_intermediate:
        for model, gtf in gtfs.items():
            save_gtf(gtf, '%s.gtf' % model)

    # ---- Stage 3: Merge internal annotations ----
    igtfs = list(chain.from_iterable(gtfs[im] for im in int_model))
    iclusters = stage3(igtfs, cdict, shortdist=args.short_dist,
                       longdist=args.long_dist)
    # Remove records that are not in the chromosome list
    if cdict.reforder is not None:
        iclusters = region_gtf(iclusters, cdict.reforder)
    if save_intermediate:
        save_gtf(iclusters, 'internal.gtf')

    # ---- Stage 4: Find flanking LTRs ----
    if args.flank_size:
        flanksize = args.flank_size
    else:
        # Largest value in mlens among the LTR models (presumably model
        # lengths - confirm against stage2), rounded to the nearest 100
        flanksize = max([mlens[lm] for lm in ltr_model])
        flanksize = int(round(flanksize / 100.) * 100)
    mclusters = stage4(iclusters, [gtfs[lm] for lm in ltr_model], flanksize,
                       cdict)

    # ---- Stage 5: Add cluster attributes ----
    mclusters = stage5(mclusters, mlens)

    # ---- Stage 6: Filter short ----
    mclusters, rejected = stage6(mclusters, args.min_model_pct)
    if save_intermediate:
        save_gtf(rejected, 'rejected.gtf')
        save_gtf(mclusters, 'merged.gtf')

    # ---- Stage 7: Resolve conflicts ----
    mclusters, lcons = stage7(mclusters, args.auto, igv, dest)

    # ---- Stage 8: Name loci ----
    if os.path.exists(args.cytoband):
        cytoband = read_gtf_file(args.cytoband)
    else:
        cytoband = None
        say(sys.stderr, '[WARNING] Cytoband was not found')
    mclusters = stage8(mclusters, cytoband, args.fam)

    # ---- Stage 9: Output ----
    say(sys.stderr, '*** Stage 9. Final Output')
    final_gtf_file = os.path.join(dest, '%s.gtf' % args.fam)
    final_gtf = sort_gtf(mclusters, cdict.reforder)
    with open(final_gtf_file, 'w') as outh:
        write_gtf_file(final_gtf, outh)

    if igv:
        igv.load(os.path.abspath(final_gtf_file)).expand()

    # Review conflicts and display or snapshot
    if not lcons:
        say(logh, "[VERBOSE] No conflicts.")
    else:
        for i, lcon in enumerate(lcons):
            say(sys.stderr, '[REVIEW] Conflict %02d.' % (i + 1))
            say(sys.stderr, '\t%s' % lcon.display_review_text())
            # Only drive IGV when it is connected: args.review may be set
            # while igv is None (IGV disabled or unreachable).
            if igv is not None and (snapshot_dir is not None or args.review):
                igv.goto(lcon.region_str())
            # snapshot_dir is only ever set when igv is connected
            if snapshot_dir is not None:
                igv.snapshot('conflict.%02d.%s.png' % ((i + 1), lcon.action))
            if args.review:
                raw_input_stderr('\tPress [ENTER] to continue. ')

    # Create final snapshots
    if snapshot_final and snapshot_dir is not None:
        for g in final_gtf:
            igv.goto(g.attr['transcript_id'])
            igv.snapshot('%s.png' % g.attr['transcript_id'])

    # ---- Stage 10: Summary ----
    say(sys.stderr, '*** Stage 10. Summary')
    main_cats = ['prototype', 'oneside', 'internal']
    categories = Counter()
    ltr_usage = defaultdict(Counter)
    for g in final_gtf:
        cat = g.attr['category']
        categories[cat] += 1
        if cat == 'prototype':
            rn = collapse_list([h.attr['repName'] for h in g.members])
            if rn[0] == rn[-1]:
                ltr_usage['prototype'][rn[0]] += 1
            else:
                # Different models at the two ends: order-independent key
                lr = '%s/%s' % tuple(sorted([rn[0], rn[-1]]))
                ltr_usage['prototype'][lr] += 1
        if cat == 'oneside':
            rn = collapse_list([h.attr['repName'] for h in g.members])
            # Count whichever end is an LTR model; fall back to the first
            if rn[-1] in ltr_model:
                ltr_usage['oneside'][rn[-1]] += 1
            else:
                ltr_usage['oneside'][rn[0]] += 1

    say(sys.stderr, '\n\n')
    say(sys.stderr, '%s %s summary %s' % ('*' * 20, args.fam, '*' * 20))
    say(sys.stderr, 'Locus types:')
    # The three main categories first, always shown (zero if absent) ...
    for cat in main_cats:
        say(sys.stderr, '\t%s%d' % (cat.ljust(20), categories[cat]))
    # ... then any other category, most common first
    for cat, count in categories.most_common():
        if cat not in main_cats:
            say(sys.stderr, '\t%s%d' % (cat.ljust(20), count))
    say(sys.stderr, 'LTR usage (prototype):')
    for k, v in ltr_usage['prototype'].most_common():
        say(sys.stderr, '\t%s%d' % (k.ljust(20), v))
    say(sys.stderr, 'LTR usage (oneside):')
    for k, v in ltr_usage['oneside'].most_common():
        say(sys.stderr, '\t%s%d' % (k.ljust(20), v))