def main(args): ### Read the GTF file ################################################################ print >> sys.stderr, 'Loading GTF: %s' % args.internal_file gtf = [ GTFLine(l) for l in utils.tab_line_gen(open(args.internal_file, 'rU')) ] ### Get model lengths # mlen = calculate_model_lengths(gtf) # print mlen mlen = calculate_model_lengths2(gtf) print >> sys.stderr, 'Model lengths: %s' % mlen ### Correct the model coordinates #################################################### correct_model_coordinates(gtf, mlen) for g in gtf: if g.strand == '+': trueend = mlen[g.attr['repName']] + g.attr['repLeft'] else: trueend = mlen[g.attr['repName']] + g.attr['repStart'] assert trueend == g.attr['repEnd'] ### Organize hits by chromosome ###################################################### bychrom = defaultdict(list) for g in gtf: bychrom[g.chrom].append(g) ### List of HERV loci ################################################################ print >> sys.stderr, 'Assembling HERV loci' all_locs = [] ### Create HERV loci for plus strand ################################################# for chrom in utils.CHROMNAMES: if chrom in bychrom: plus = [h for h in bychrom[chrom] if h.strand == '+'] if not plus: continue plus.sort(key=lambda x: x.start) cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs) + 1)) cur.internal.append(plus[0]) for p1 in plus[1:]: p0 = cur.internal[-1] # Genomic distance between hits gdist = p1.start - p0.end # Determine whether p1 is in sequence with locus if gdist <= 10: ## Overlapping (or nearly) in genome insequence = True else: ## Hits are in sequence and genomic distance is not extreme insequence = p0.attr['repLeft'] < p1.attr['repLeft'] insequence &= gdist < args.longdist if insequence: cur.internal.append(p1) else: all_locs.append(cur) cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs) + 1)) cur.internal.append(p1) all_locs.append(cur) ### Create HERV loci for minus strand ################################################ for chrom in utils.CHROMNAMES: if chrom in bychrom: minus = [h for h in bychrom[chrom] if h.strand == '-'] if not minus: continue minus.sort(key=lambda x: x.end, reverse=True) # Sort in reverse order cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs) + 1)) cur.internal.append(minus[0]) for p1 in minus[1:]: p0 = cur.internal[-1] # Genomic distance between hits gdist = p0.start - p1.end # Determine whether p1 is in sequence with locus if gdist <= 10: ## Overlapping (or nearly) in genome insequence = True else: ## Hits are in sequence and genomic distance is not extreme insequence = p0.attr['repStart'] < p1.attr['repStart'] insequence &= gdist < args.longdist if insequence: cur.internal.append(p1) else: all_locs.append(cur) cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs) + 1)) cur.internal.append(p1) all_locs.append(cur) ### Add LTRs to HERV loci ############################################################ print >> sys.stderr, 'Finding flanking LTRs' for loc in all_locs: loc.find_ltr(args.ltr_files, args.flank) loc.adjust_overlaps() print >> sys.stderr, "Initial counts:" print >> sys.stderr, '\n'.join('%s%d' % (cat.ljust(20, ' '), count) for cat, count in Counter( c.category() for c in all_locs).most_common()) ### Filtering ######################################################################## reject = set() if args.minpct > 0 or args.mincov > 0: print >> sys.stderr, "Removing loci with less than %d percent or %dbp model coverage" % ( int(args.minpct * 100), args.mincov) for loc in all_locs: if loc.model_cov() < (mlen[loc.internal_name()] * args.minpct ) or loc.model_cov() < args.mincov: print >> sys.stderr, '%s\t%d\t%s' % (loc.id, loc.model_cov(), loc.category()) reject.add(loc) for rloc in reject: all_locs.remove(rloc) print >> sys.stderr, "After filtering:" print >> sys.stderr, '\n'.join('%s%d' % (cat.ljust(20, ' '), count) for cat, count in Counter( c.category() for c in all_locs).most_common()) print >> sys.stderr, '%s%d' % ('Rejected'.ljust(20, ' '), len(reject)) ### Deal with overlapping loci ####################################################### # Create GTF with all_locs with open('tmp.gtf', 'w') as outh: for g in utils.sort_gtf(loc.span_gtf() for loc in all_locs): print >> outh, '\t'.join(g) # Cluster overlapping and bookended using bedtools p1 = Popen('bedtools cluster -i tmp.gtf', shell=True, stdout=PIPE, stderr=PIPE) out, err = p1.communicate() os.remove('tmp.gtf') # Parse bedtools output overlap_groups = defaultdict(list) for ll in out.strip('\n').split('\n'): f = ll.split('\t') overlap_groups[f[-1]].append(GTFLine(f[:9])) # Remove clusters with one for k in overlap_groups.keys(): if len(overlap_groups[k]) == 1: del overlap_groups[k] print >> sys.stderr, "%d overlap groups" % len(overlap_groups) if args.igv_preview and len(overlap_groups) > 0: print >> sys.stderr, "Loading IGV" # Create file for IGV viewing with open('tmp.gtf', 'w') as outh: liter = utils.sort_gtf( chain.from_iterable(loc.each_gtf() for loc in all_locs)) print >> outh, '\n'.join('\t'.join(_) for _ in liter) igv = IGV() igv.new() igv.genome('hg19') igv.load( os.path.join(os.getcwd(), '../other_sources/rmsk_LTR.hg19.gtf')) igv.load(os.path.join(os.getcwd(), 'tmp.gtf')) tandem = [] for k in sorted(overlap_groups.keys(), key=lambda x: int(x)): ogroup = overlap_groups[k] if args.igv_preview: locus_str = '%s:%s-%s' % (ogroup[0].chrom, min(gl.start for gl in ogroup) - 5000, max(gl.end for gl in ogroup) + 5000) igv.goto(locus_str) igv.expand() # Get locus for each member of overlap group og_locus = {} for o in ogroup: tmp = [c for c in all_locs if c.id == o.attr['name']] assert len(tmp) == 1 og_locus[o.attr['name']] = tmp[0] # Print out the model coverage for n, loc in og_locus.iteritems(): print >> sys.stderr, '%s\t%d\t%s' % (n, loc.model_cov(), loc.category()) # Parse user input z = raw_input('Action to take: ').strip() if z == '': continue inputcmd = z.strip().split(' ') if inputcmd[0] == 'REJECT': if len(inputcmd) == 1: # Only max will be kept st = sorted([loc for n, loc in og_locus.iteritems()], key=lambda x: x.model_cov(), reverse=True)[1:] loc_ids = [_.id for _ in st] elif len(inputcmd) == 2: loc_ids = inputcmd[1].split(',') else: assert False for loc_id in loc_ids: reject.add(og_locus[loc_id]) elif inputcmd[0] == 'TANDEM': if len(inputcmd) == 1: assert len(og_locus) == 2, 'More than 2 loci are present' tandem.append([loc for n, loc in og_locus.iteritems()]) elif len(inputcmd) == 2: loc_ids = inputcmd[1].split('+') tandem.append([og_locus[loc_id] for loc_id in loc_ids]) else: assert False elif inputcmd[0] == 'DIFF': n1, n2 = inputcmd[1].split('-') g1 = og_locus[n1] g2 = og_locus[n2] if g1.span()[0] < g2.span()[1]: g1.shorten(g2.span()[1] + 20, g1.span()[1]) elif g1.span()[1] < g2.span()[0]: g1.shorten(g1.span()[0], g2.span()[0] - 20) else: print "no overlap!" print g1 elif inputcmd[0] == 'IGNORE': continue else: assert False, 'Unknown command: "%s"' % inputcmd[0] # Remove rejected annotations for rloc in reject: if rloc in all_locs: all_locs.remove(rloc) # Create the tandem annotations for tgroup in tandem: tandem_loc = HERVLocus(id=tgroup[0].id) tandem_loc.internal = list( chain.from_iterable(loc.internal for loc in tgroup)) if tandem_loc.strand() == '+': tandem_loc.internal.sort(key=lambda x: x.start) else: tandem_loc.internal.sort(key=lambda x: x.end, reverse=True) tandem_loc.find_ltr(args.ltr_files, 1000) tandem_loc.adjust_overlaps() tandem_loc.is_tandem = True all_locs.append(tandem_loc) # Remove from original for rloc in tgroup: all_locs.remove(rloc) print >> sys.stderr, "After overlap removal:" print >> sys.stderr, '\n'.join('%s%d' % (cat.ljust(20, ' '), count) for cat, count in Counter( c.category() for c in all_locs).most_common()) print >> sys.stderr, '%s%d' % ('Rejected'.ljust(20, ' '), len(reject)) if args.igv_preview and len(overlap_groups) > 0: os.remove('tmp.gtf') ### Sort loci ######################################################################## bychrom = defaultdict(list) for loc in all_locs: bychrom[loc.chrom()].append(loc) final_locs = [] for chrom in utils.CHROMNAMES: if chrom in bychrom: for loc in sorted(bychrom[chrom], key=lambda x: x.span()[0]): final_locs.append(loc) for i, loc in enumerate(final_locs): loc.id = '%s_%04d' % (args.prefix, i + 1) ### Rename loci according to cytoband ################################################# # Create GTF with all_locs with open('tmp.gtf', 'w') as outh: for g in utils.sort_gtf(loc.span_gtf() for loc in final_locs): print >> outh, '\t'.join(g) p1 = Popen( 'bedtools intersect -wo -a tmp.gtf -b ../other_sources/cytoband.gtf', shell=True, stdout=PIPE, stderr=PIPE) out, err = p1.communicate() os.remove('tmp.gtf') byband = defaultdict(list) for ll in out.strip('\n').split('\n'): f = ll.split('\t') g1 = GTFLine(f[:9]) g2 = GTFLine(f[9:-1]) band = '%s%s' % (g2.chrom.strip('chr'), g2.attr['gene_id']) byband[band].append(g1) namemap = {} for band, glist in byband.iteritems(): if len(glist) == 1: namemap[glist[0].attr['name']] = '%s_%s' % (args.prefix, band) else: glist.sort(key=lambda x: x.start) for i, gl in enumerate(glist): namemap[gl.attr['name']] = '%s_%s%s' % (args.prefix, band, someletters[i]) for loc in final_locs: loc.locus_name = namemap[loc.id] ### Create annotation files ########################################################## print >> sys.stderr, "Writing annotation files" with open('%s.gtf' % args.prefix, 'w') as outh: liter = utils.sort_gtf( chain.from_iterable(loc.each_gtf() for loc in final_locs)) print >> outh, '\n'.join('\t'.join(_) for _ in liter) # for loc in final_locs: # print >>outh, '\n'.join('\t'.join(g) for g in loc.each_gtf()) with open('%s_reject.gtf' % args.prefix, 'w') as outh: liter = utils.sort_gtf( chain.from_iterable(loc.each_gtf() for loc in reject)) print >> outh, '\n'.join('\t'.join(_) for _ in liter) # for loc in reject: # print >>outh, '\n'.join('\t'.join(g) for g in loc.each_gtf()) with open('%s_span.gtf' % args.prefix, 'w') as outh: for g in utils.sort_gtf(loc.span_gtf() for loc in final_locs): print >> outh, '\t'.join(g) with open('%s_table.txt' % args.prefix, 'w') as outh: print >> outh, '\t'.join([ 'locus_name', 'id', 'strand', 'chrom', 'start', 'end', 'strand', 'nfeats', 'width', 'model_cov', 'ltr5_model', 'int_model', 'ltr3_model' ]) for loc in final_locs: mgtf = GTFLine(loc.span_gtf()) row = [ loc.locus_name, loc.id, loc.category(), mgtf.chrom, mgtf.start, mgtf.end, mgtf.strand, mgtf.attr['nfeats'], loc.width(), loc.model_cov(), loc.ltr_up_name(), loc.internal_name(), loc.ltr_down_name(), ] print >> outh, '\t'.join(str(_) for _ in row) ### Extract sequences ################################################################ if args.get_sequences: print >> sys.stderr, "Extracting sequences" genome_fasta = args.genome_fasta # '/Users/bendall/Projects/References/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa' genome = dict((s.id, s) for s in SeqIO.parse(genome_fasta, 'fasta')) with open('%s.full.fasta' % args.prefix, 'w') as outh: for loc in final_locs: gcoord = '%s:%d-%d(%s)' % (loc.chrom(), loc.span()[0], loc.span()[1], loc.strand()) print >> outh, '>%s|%s|%s' % (loc.locus_name, loc.category(), gcoord) print >> outh, str(loc.entire_sequence(genome).seq) with open('%s.internal.fasta' % args.prefix, 'w') as outh: for loc in final_locs: gcoord = '%s:%d-%d(%s)' % ( loc.chrom(), min(p.start for p in loc.internal), max(p.end for p in loc.internal), loc.strand()) print >> outh, '>%s_int|%s|%s|%s' % (loc.locus_name, loc.category(), gcoord, loc.format_print_clust()) print >> outh, str(loc.internal_sequence(genome).seq) with open('%s.5ltr.fasta' % args.prefix, 'w') as outh: for loc in final_locs: ltrseq = loc.ltr_up_sequence(genome) if ltrseq: gcoord = '%s:%d-%d(%s)' % ( loc.chrom(), min(p.start for p in loc.ltr_up), max(p.end for p in loc.ltr_up), loc.strand()) print >> outh, '>%s_5LTR|%s|%s' % ( loc.locus_name, loc.ltr_up_name(), gcoord) print >> outh, str(ltrseq.seq) with open('%s.3ltr.fasta' % args.prefix, 'w') as outh: for loc in final_locs: ltrseq = loc.ltr_down_sequence(genome) if ltrseq: gcoord = '%s:%d-%d(%s)' % ( loc.chrom(), min(p.start for p in loc.ltr_down), max(p.end for p in loc.ltr_down), loc.strand()) print >> outh, '>%s_3LTR|%s|%s' % ( loc.locus_name, loc.ltr_down_name(), gcoord) print >> outh, str(ltrseq.seq) ### IGV snapshots #################################################################### if args.igv_snapshot: print >> sys.stderr, "Taking IGV snapshots" igv = IGV() igv.new() igv.genome('hg19') igv.load( os.path.join(os.getcwd(), '../other_sources/rmsk_LTR.hg19.gtf')) if os.path.isdir('tmp'): for compare_gtf in glob('tmp/*.gtf'): igv.load(os.path.join(os.getcwd(), compare_gtf)) igv.load(os.path.join(os.getcwd(), '%s.gtf' % args.prefix)) igv.load(os.path.join(os.getcwd(), '%s_reject.gtf' % args.prefix)) do_snapshots = True if do_snapshots: if not os.path.exists(os.path.join(os.getcwd(), 'snapshots')): os.mkdir(os.path.join(os.getcwd(), 'snapshots')) if not os.path.exists(os.path.join(os.getcwd(), 'reject')): os.mkdir(os.path.join(os.getcwd(), 'reject')) categories = ['prototype', 'oneside', 'internal'] for cat in categories: if not os.path.exists( os.path.join(os.getcwd(), 'snapshots/%s' % cat)): os.mkdir(os.path.join(os.getcwd(), 'snapshots/%s' % cat)) if not os.path.exists( os.path.join(os.getcwd(), 'reject/%s' % cat)): os.mkdir(os.path.join(os.getcwd(), 'reject/%s' % cat)) for loc in final_locs: rc, lc = loc.span() locus_str = '%s:%d-%d' % (loc.chrom(), rc - 5000, lc + 5000) print >> sys.stderr, '%s\t%s\t%s' % (loc.locus_name, loc.category(), locus_str) igv.goto(locus_str) igv.expand() if do_snapshots: igv.snapshotDirectory( os.path.join(os.getcwd(), 'snapshots/%s' % loc.category().strip('*'))) igv.snapshot(filename='%s.png' % loc.locus_name) for loc in reject: rc, lc = loc.span() locus_str = '%s:%d-%d' % (loc.chrom(), rc - 5000, lc + 5000) print >> sys.stderr, '%s\t%s\t%s' % (loc.id, loc.category(), locus_str) igv.goto(locus_str) igv.expand() if do_snapshots: igv.snapshotDirectory( os.path.join(os.getcwd(), 'reject/%s' % loc.category().strip('*'))) igv.snapshot(filename='%s.png' % loc.id)
#! /usr/bin/env python import os import sys from IGV import * import re import time igv = IGV() igv.new() igv.genome('hg19') igv.load(os.path.join(os.getcwd(), '../other_sources/rmsk_LTR.hg19.gtf')) igv.load(os.path.join(os.getcwd(), 'tmp/concat.gtf')) igv.load(os.path.join(os.getcwd(), 'tmp/prototype.gtf')) igv.load(os.path.join(os.getcwd(), 'tmp/oneside.gtf')) igv.load(os.path.join(os.getcwd(), 'tmp/soloint.gtf')) igv.load(os.path.join(os.getcwd(), 'tmp/sololtr.gtf')) igv.load(os.path.join(os.getcwd(), 'tmp/unusual.gtf')) igv.load(os.path.join(os.getcwd(), 'edited.hg19.gtf')) igv.expand() def get_coordinates_for_loci(loci, lines): for l in lines: if l[1] == 'merged': attr = dict(re.findall('(\S+)\s+"([\s\S]+?)";', l[8])) if attr['name'] in loci: locus = '%s:%d-%d' % (l[0], int(l[3]) - 5000, int(l[4]) + 5000) yield (attr['name'], locus)
#! /usr/bin/env python import os from IGV import * import re import time igv = IGV() igv.new() igv.genome('hg19') igv.load(os.path.join(os.getcwd(),'../rmsk_LTR.hg19.gtf')) igv.load(os.path.join(os.getcwd(),'concat.gtf')) igv.load(os.path.join(os.getcwd(),'canonical.gtf')) igv.load(os.path.join(os.getcwd(),'oneside.gtf')) igv.load(os.path.join(os.getcwd(),'soloint.gtf')) igv.load(os.path.join(os.getcwd(),'sololtr.gtf')) igv.load(os.path.join(os.getcwd(),'unusual.gtf')) igv.load(os.path.join(os.getcwd(),'subtables.gtf')) # categories = ['oneside','soloint','unusual'] # for cat in categories: # if not os.path.exists(os.path.join(os.getcwd(),'%s_snapshots' % cat)): # os.mkdir(os.path.join(os.getcwd(),'%s_snapshots' % cat)) # # igv.snapshotDirectory(os.path.join(os.getcwd(),'%s_snapshots' % cat)) # lines = [l.strip('\n').split('\t') for l in open('transcript.gtf','rU')] # for l in lines: # if l[1] == 'merged': # attr = dict(re.findall('(\S+)\s+"([\s\S]+?)";',l[8])) # if attr['category'] == cat: # print attr['name']
#! /usr/bin/env python import os import sys from IGV import * import re import time igv = IGV() igv.new() igv.genome('hg19') igv.load(os.path.join(os.getcwd(),'../other_sources/rmsk_LTR.hg19.gtf')) igv.load(os.path.join(os.getcwd(),'tmp/concat.gtf')) igv.load(os.path.join(os.getcwd(),'tmp/prototype.gtf')) igv.load(os.path.join(os.getcwd(),'tmp/oneside.gtf')) igv.load(os.path.join(os.getcwd(),'tmp/soloint.gtf')) igv.load(os.path.join(os.getcwd(),'tmp/sololtr.gtf')) igv.load(os.path.join(os.getcwd(),'tmp/unusual.gtf')) igv.load(os.path.join(os.getcwd(),'tmp/subtables.gtf')) if not os.path.exists(os.path.join(os.getcwd(),'snapshots')): os.mkdir(os.path.join(os.getcwd(),'snapshots')) categories = ['prototype', 'oneside', 'soloint', 'unusual'] lines = [l.strip('\n').split('\t') for l in open('initial_merge.hg19.gtf','rU')] for cat in categories: if not os.path.exists(os.path.join(os.getcwd(),'snapshots/%s' % cat)): os.mkdir(os.path.join(os.getcwd(),'snapshots/%s' % cat)) igv.snapshotDirectory(os.path.join(os.getcwd(),'snapshots/%s' % cat)) for l in lines:
print('locating {} on CeNDR'.format(args.bams[i])) strain = args.bams[i] args.bams[i] = '{bam} index={index} name={name}'.format( bam=strain_database[strain_database[2] == strain + '.bam'][11].values[0], index=strain_database[strain_database[2] == strain + '.bam.bai'][7].values[0], name=strain) print('OK') # load in the tracks from the .bams and name them try: # load in the bams, and name the tracks based on the tracks argument: for url, name in zip(args.bams, args.tracks): print('loading bam {}'.format(url)) print(igv.load('{url} name={name}'.format(url=url, name=name))) except TypeError: # if there is no --tracks argument, load .bams into tracks without setting track names: for url in args.bams: print('loading bam {}'.format(url)) print(igv.load(url)) # set the directory: if args.directory: print('setting snapshot directory to {}'.format(args.directory)) print(igv.set_path(args.directory)) # for each region that needs to be imaged go to it and save it. for region in args.regions: print('going to {}'.format(region)) print(
def main(args): ### Read the GTF file ################################################################ print >>sys.stderr, 'Loading GTF: %s' % args.internal_file gtf = [GTFLine(l) for l in utils.tab_line_gen(open(args.internal_file,'rU'))] ### Get model lengths # mlen = calculate_model_lengths(gtf) # print mlen mlen = calculate_model_lengths2(gtf) print >>sys.stderr, 'Model lengths: %s' % mlen ### Correct the model coordinates #################################################### correct_model_coordinates(gtf,mlen) for g in gtf: if g.strand == '+': trueend = mlen[g.attr['repName']] + g.attr['repLeft'] else: trueend = mlen[g.attr['repName']] + g.attr['repStart'] assert trueend == g.attr['repEnd'] ### Organize hits by chromosome ###################################################### bychrom = defaultdict(list) for g in gtf: bychrom[g.chrom].append(g) ### List of HERV loci ################################################################ print >>sys.stderr, 'Assembling HERV loci' all_locs = [] ### Create HERV loci for plus strand ################################################# for chrom in utils.CHROMNAMES: if chrom in bychrom: plus = [h for h in bychrom[chrom] if h.strand == '+'] if not plus: continue plus.sort(key=lambda x: x.start) cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs)+1)) cur.internal.append(plus[0]) for p1 in plus[1:]: p0 = cur.internal[-1] # Genomic distance between hits gdist = p1.start - p0.end # Determine whether p1 is in sequence with locus if gdist <= 10: ## Overlapping (or nearly) in genome insequence = True else: ## Hits are in sequence and genomic distance is not extreme insequence = p0.attr['repLeft'] < p1.attr['repLeft'] insequence &= gdist < args.longdist if insequence: cur.internal.append(p1) else: all_locs.append(cur) cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs)+1)) cur.internal.append(p1) all_locs.append(cur) ### Create HERV loci for minus strand ################################################ for chrom in utils.CHROMNAMES: if chrom in bychrom: minus = [h for h in bychrom[chrom] if h.strand == '-'] if not minus: continue minus.sort(key=lambda x: x.end, reverse=True) # Sort in reverse order cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs)+1)) cur.internal.append(minus[0]) for p1 in minus[1:]: p0 = cur.internal[-1] # Genomic distance between hits gdist = p0.start - p1.end # Determine whether p1 is in sequence with locus if gdist <= 10: ## Overlapping (or nearly) in genome insequence = True else: ## Hits are in sequence and genomic distance is not extreme insequence = p0.attr['repStart'] < p1.attr['repStart'] insequence &= gdist < args.longdist if insequence: cur.internal.append(p1) else: all_locs.append(cur) cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs)+1)) cur.internal.append(p1) all_locs.append(cur) ### Add LTRs to HERV loci ############################################################ print >>sys.stderr, 'Finding flanking LTRs' for loc in all_locs: loc.find_ltr(args.ltr_files, args.flank) loc.adjust_overlaps() print >>sys.stderr, "Initial counts:" print >>sys.stderr, '\n'.join('%s%d' % (cat.ljust(20,' '),count) for cat,count in Counter(c.category() for c in all_locs).most_common()) ### Filtering ######################################################################## reject = set() if args.minpct > 0 or args.mincov > 0: print >>sys.stderr, "Removing loci with less than %d percent or %dbp model coverage" % (int(args.minpct*100), args.mincov) for loc in all_locs: if loc.model_cov() < (mlen[loc.internal_name()] * args.minpct) or loc.model_cov() < args.mincov: print >>sys.stderr, '%s\t%d\t%s' % (loc.id, loc.model_cov(), loc.category()) reject.add(loc) for rloc in reject: all_locs.remove(rloc) print >>sys.stderr, "After filtering:" print >>sys.stderr, '\n'.join('%s%d' % (cat.ljust(20,' '),count) for cat,count in Counter(c.category() for c in all_locs).most_common()) print >>sys.stderr, '%s%d' % ('Rejected'.ljust(20,' '), len(reject)) ### Deal with overlapping loci ####################################################### # Create GTF with all_locs with open('tmp.gtf','w') as outh: for g in utils.sort_gtf(loc.span_gtf() for loc in all_locs): print >>outh, '\t'.join(g) # Cluster overlapping and bookended using bedtools p1 = Popen('bedtools cluster -i tmp.gtf', shell=True, stdout=PIPE, stderr=PIPE) out,err = p1.communicate() os.remove('tmp.gtf') # Parse bedtools output overlap_groups = defaultdict(list) for ll in out.strip('\n').split('\n'): f = ll.split('\t') overlap_groups[f[-1]].append(GTFLine(f[:9])) # Remove clusters with one for k in overlap_groups.keys(): if len(overlap_groups[k]) == 1: del overlap_groups[k] print >>sys.stderr, "%d overlap groups" % len(overlap_groups) if args.igv_preview and len(overlap_groups)>0: print >>sys.stderr, "Loading IGV" # Create file for IGV viewing with open('tmp.gtf','w') as outh: liter = utils.sort_gtf(chain.from_iterable(loc.each_gtf() for loc in all_locs)) print >>outh, '\n'.join('\t'.join(_) for _ in liter) igv = IGV() igv.new() igv.genome('hg19') igv.load(os.path.join(os.getcwd(),'../other_sources/rmsk_LTR.hg19.gtf')) igv.load(os.path.join(os.getcwd(),'tmp.gtf')) tandem = [] for k in sorted(overlap_groups.keys(), key=lambda x:int(x)): ogroup = overlap_groups[k] if args.igv_preview: locus_str = '%s:%s-%s' % (ogroup[0].chrom, min(gl.start for gl in ogroup)-5000, max(gl.end for gl in ogroup)+5000) igv.goto(locus_str) igv.expand() # Get locus for each member of overlap group og_locus = {} for o in ogroup: tmp = [c for c in all_locs if c.id == o.attr['name']] assert len(tmp)==1 og_locus[o.attr['name']] = tmp[0] # Print out the model coverage for n,loc in og_locus.iteritems(): print >>sys.stderr, '%s\t%d\t%s' % (n, loc.model_cov(), loc.category()) # Parse user input z = raw_input('Action to take: ').strip() if z == '': continue inputcmd = z.strip().split(' ') if inputcmd[0] == 'REJECT': if len(inputcmd) == 1: # Only max will be kept st = sorted([loc for n,loc in og_locus.iteritems()], key=lambda x:x.model_cov(), reverse=True)[1:] loc_ids = [_.id for _ in st] elif len(inputcmd) == 2: loc_ids = inputcmd[1].split(',') else: assert False for loc_id in loc_ids: reject.add(og_locus[loc_id]) elif inputcmd[0] == 'TANDEM': if len(inputcmd) == 1: assert len(og_locus)==2, 'More than 2 loci are present' tandem.append([loc for n,loc in og_locus.iteritems()]) elif len(inputcmd) == 2: loc_ids = inputcmd[1].split('+') tandem.append([og_locus[loc_id] for loc_id in loc_ids]) else: assert False elif inputcmd[0] == 'DIFF': n1,n2 = inputcmd[1].split('-') g1 = og_locus[n1] g2 = og_locus[n2] if g1.span()[0] < g2.span()[1]: g1.shorten(g2.span()[1]+20, g1.span()[1]) elif g1.span()[1] < g2.span()[0]: g1.shorten(g1.span()[0], g2.span()[0]-20) else: print "no overlap!" print g1 elif inputcmd[0] == 'IGNORE': continue else: assert False, 'Unknown command: "%s"' % inputcmd[0] # Remove rejected annotations for rloc in reject: if rloc in all_locs: all_locs.remove(rloc) # Create the tandem annotations for tgroup in tandem: tandem_loc = HERVLocus(id=tgroup[0].id) tandem_loc.internal = list(chain.from_iterable(loc.internal for loc in tgroup)) if tandem_loc.strand() == '+': tandem_loc.internal.sort(key=lambda x:x.start) else: tandem_loc.internal.sort(key=lambda x:x.end, reverse=True) tandem_loc.find_ltr(args.ltr_files, 1000) tandem_loc.adjust_overlaps() tandem_loc.is_tandem = True all_locs.append(tandem_loc) # Remove from original for rloc in tgroup: all_locs.remove(rloc) print >>sys.stderr, "After overlap removal:" print >>sys.stderr, '\n'.join('%s%d' % (cat.ljust(20,' '),count) for cat,count in Counter(c.category() for c in all_locs).most_common()) print >>sys.stderr, '%s%d' % ('Rejected'.ljust(20,' '), len(reject)) if args.igv_preview and len(overlap_groups)>0: os.remove('tmp.gtf') ### Sort loci ######################################################################## bychrom = defaultdict(list) for loc in all_locs: bychrom[loc.chrom()].append(loc) final_locs = [] for chrom in utils.CHROMNAMES: if chrom in bychrom: for loc in sorted(bychrom[chrom], key=lambda x:x.span()[0]): final_locs.append(loc) for i,loc in enumerate(final_locs): loc.id = '%s_%04d' % (args.prefix, i+1) ### Rename loci according to cytoband ################################################# # Create GTF with all_locs with open('tmp.gtf','w') as outh: for g in utils.sort_gtf(loc.span_gtf() for loc in final_locs): print >>outh, '\t'.join(g) p1 = Popen('bedtools intersect -wo -a tmp.gtf -b ../other_sources/cytoband.gtf', shell=True, stdout=PIPE, stderr=PIPE) out,err = p1.communicate() os.remove('tmp.gtf') byband = defaultdict(list) for ll in out.strip('\n').split('\n'): f = ll.split('\t') g1 = GTFLine(f[:9]) g2 = GTFLine(f[9:-1]) band = '%s%s' % (g2.chrom.strip('chr'),g2.attr['gene_id']) byband[band].append(g1) namemap = {} for band,glist in byband.iteritems(): if len(glist) == 1: namemap[glist[0].attr['name']] = '%s_%s' % (args.prefix, band) else: glist.sort(key=lambda x:x.start) for i,gl in enumerate(glist): namemap[gl.attr['name']] = '%s_%s%s' % (args.prefix, band, someletters[i]) for loc in final_locs: loc.locus_name = namemap[loc.id] ### Create annotation files ########################################################## print >>sys.stderr, "Writing annotation files" with open('%s.gtf' % args.prefix,'w') as outh: liter = utils.sort_gtf(chain.from_iterable(loc.each_gtf() for loc in final_locs)) print >>outh, '\n'.join('\t'.join(_) for _ in liter) # for loc in final_locs: # print >>outh, '\n'.join('\t'.join(g) for g in loc.each_gtf()) with open('%s_reject.gtf' % args.prefix,'w') as outh: liter = utils.sort_gtf(chain.from_iterable(loc.each_gtf() for loc in reject)) print >>outh, '\n'.join('\t'.join(_) for _ in liter) # for loc in reject: # print >>outh, '\n'.join('\t'.join(g) for g in loc.each_gtf()) with open('%s_span.gtf' % args.prefix,'w') as outh: for g in utils.sort_gtf(loc.span_gtf() for loc in final_locs): print >>outh, '\t'.join(g) with open('%s_table.txt' % args.prefix,'w') as outh: print >>outh, '\t'.join(['locus_name','id','strand','chrom','start','end','strand','nfeats','width','model_cov','ltr5_model','int_model','ltr3_model']) for loc in final_locs: mgtf = GTFLine(loc.span_gtf()) row = [loc.locus_name, loc.id, loc.category(), mgtf.chrom, mgtf.start, mgtf.end, mgtf.strand, mgtf.attr['nfeats'], loc.width(), loc.model_cov(), loc.ltr_up_name(), loc.internal_name(), loc.ltr_down_name(), ] print >>outh, '\t'.join(str(_) for _ in row) ### Extract sequences ################################################################ if args.get_sequences: print >>sys.stderr, "Extracting sequences" genome_fasta = args.genome_fasta # '/Users/bendall/Projects/References/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa' genome = dict((s.id,s) for s in SeqIO.parse(genome_fasta,'fasta')) with open('%s.full.fasta' % args.prefix,'w') as outh: for loc in final_locs: gcoord = '%s:%d-%d(%s)' % (loc.chrom(), loc.span()[0], loc.span()[1], loc.strand()) print >>outh, '>%s|%s|%s' % (loc.locus_name, loc.category(), gcoord) print >>outh, str(loc.entire_sequence(genome).seq) with open('%s.internal.fasta' % args.prefix,'w') as outh: for loc in final_locs: gcoord = '%s:%d-%d(%s)' % (loc.chrom(), min(p.start for p in loc.internal), max(p.end for p in loc.internal), loc.strand()) print >>outh, '>%s_int|%s|%s|%s' % (loc.locus_name, loc.category(), gcoord, loc.format_print_clust()) print >>outh, str(loc.internal_sequence(genome).seq) with open('%s.5ltr.fasta' % args.prefix,'w') as outh: for loc in final_locs: ltrseq = loc.ltr_up_sequence(genome) if ltrseq: gcoord = '%s:%d-%d(%s)' % (loc.chrom(), min(p.start for p in loc.ltr_up), max(p.end for p in loc.ltr_up), loc.strand()) print >>outh, '>%s_5LTR|%s|%s' % (loc.locus_name, loc.ltr_up_name(), gcoord) print >>outh, str(ltrseq.seq) with open('%s.3ltr.fasta' % args.prefix,'w') as outh: for loc in final_locs: ltrseq = loc.ltr_down_sequence(genome) if ltrseq: gcoord = '%s:%d-%d(%s)' % (loc.chrom(), min(p.start for p in loc.ltr_down), max(p.end for p in loc.ltr_down), loc.strand()) print >>outh, '>%s_3LTR|%s|%s' % (loc.locus_name, loc.ltr_down_name(), gcoord) print >>outh, str(ltrseq.seq) ### IGV snapshots #################################################################### if args.igv_snapshot: print >>sys.stderr, "Taking IGV snapshots" igv = IGV() igv.new() igv.genome('hg19') igv.load(os.path.join(os.getcwd(),'../other_sources/rmsk_LTR.hg19.gtf')) if os.path.isdir('tmp'): for compare_gtf in glob('tmp/*.gtf'): igv.load(os.path.join(os.getcwd(), compare_gtf)) igv.load(os.path.join(os.getcwd(),'%s.gtf' % args.prefix)) igv.load(os.path.join(os.getcwd(),'%s_reject.gtf' % args.prefix)) do_snapshots = True if do_snapshots: if not os.path.exists(os.path.join(os.getcwd(),'snapshots')): os.mkdir(os.path.join(os.getcwd(),'snapshots')) if not os.path.exists(os.path.join(os.getcwd(),'reject')): os.mkdir(os.path.join(os.getcwd(),'reject')) categories = ['prototype', 'oneside', 'internal'] for cat in categories: if not os.path.exists(os.path.join(os.getcwd(),'snapshots/%s' % cat)): os.mkdir(os.path.join(os.getcwd(),'snapshots/%s' % cat)) if not os.path.exists(os.path.join(os.getcwd(),'reject/%s' % cat)): os.mkdir(os.path.join(os.getcwd(),'reject/%s' % cat)) for loc in final_locs: rc,lc = loc.span() locus_str = '%s:%d-%d' % (loc.chrom(), rc-5000, lc+5000) print >>sys.stderr, '%s\t%s\t%s' % (loc.locus_name, loc.category(), locus_str) igv.goto(locus_str) igv.expand() if do_snapshots: igv.snapshotDirectory(os.path.join(os.getcwd(),'snapshots/%s' % loc.category().strip('*') )) igv.snapshot(filename='%s.png' % loc.locus_name) for loc in reject: rc,lc = loc.span() locus_str = '%s:%d-%d' % (loc.chrom(), rc-5000, lc+5000) print >>sys.stderr, '%s\t%s\t%s' % (loc.id, loc.category(), locus_str) igv.goto(locus_str) igv.expand() if do_snapshots: igv.snapshotDirectory(os.path.join(os.getcwd(),'reject/%s' % loc.category().strip('*') )) igv.snapshot(filename='%s.png' % loc.id)