def find_first_orfs(gtf_file, genome_fasta_file):
    # extract transcript DNA sequences, translate to protein, and
    # search for the first ORF
    logging.debug('Finding ORFs in transcript sequences')
    # open genome fasta file
    ref_fa = pysam.Fastafile(genome_fasta_file)
    num_finished = 1
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t in locus_transcripts:
            if t.strand == NO_STRAND:
                # unstranded transcript: try both strands and keep the
                # longer ORF
                t.strand = POS_STRAND
                orfpos = find_first_orf(t, ref_fa)
                t.strand = NEG_STRAND
                orfneg = find_first_orf(t, ref_fa)
                if len(orfpos.seq) >= len(orfneg.seq):
                    yield orfpos
                else:
                    yield orfneg
            else:
                yield find_first_orf(t, ref_fa)
            if (num_finished % 10000) == 0:
                logging.debug('Processed %d transcripts' % (num_finished))
            num_finished += 1
    # cleanup
    ref_fa.close()
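# Usage sketch (not in the original source; file names are hypothetical):
# find_first_orfs() is a generator, so the first ORF of every transcript can
# be streamed straight to a tab-delimited table. to_table() is assumed from
# its use in orf_analysis() below.
def example_dump_first_orfs(gtf_file='assembly.gtf',
                            genome_fasta_file='genome.fa'):
    for orf in find_first_orfs(gtf_file, genome_fasta_file):
        print '\t'.join(orf.to_table())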
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    gtf_file = args.gtf_file
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))
    # parse transcripts
    num_transcripts = 0
    # keep track of redundant gene/transcript counts
    gene_map = collections.defaultdict(lambda: {})
    transcript_map = collections.defaultdict(lambda: {})
    for transcripts in parse_gtf(open(gtf_file)):
        for t in transcripts:
            ttype, tcat, tname = impute_transcript(t, gene_map,
                                                   transcript_map)
            # write new attributes
            for f in t.to_gtf_features(source='assemblyline', score=1000):
                f.attrs['transcript_type'] = ttype
                f.attrs['transcript_category'] = tcat
                f.attrs['transcript_name'] = tname
                print str(f)
            num_transcripts += 1
    return 0
def main():
    # setup logging
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument('-g', dest='gene_id', action='store_true',
                        default=False)
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    # check command line parameters
    if not os.path.exists(args.gtf_file):
        parser.error("gtf file %s not found" % (args.gtf_file))
    for transcripts in parse_gtf(open(args.gtf_file)):
        for t in transcripts:
            if args.gene_id:
                name = '%s|%s' % (t.attrs['gene_id'],
                                  t.attrs['transcript_id'])
            else:
                name = t.attrs['transcript_id']
            fields = write_bed(t.chrom, name, t.strand, 1000, t.exons)
            print '\t'.join(fields)
    return 0
def write_transcript_table(gtf_file, table_file):
    fileh = open(table_file, 'w')
    print >>fileh, '\t'.join(get_classify_header_fields())
    for transcripts in parse_gtf(open(gtf_file)):
        for t in transcripts:
            fields = get_classify_fields(t)
            print >>fileh, '\t'.join(map(str, fields))
    fileh.close()
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', type=int, dest='upstream', default=1000)
    parser.add_argument('-d', type=int, dest='downstream', default=0)
    parser.add_argument('gtf_file')
    parser.add_argument('chrom_sizes')
    args = parser.parse_args()
    upstream = args.upstream
    downstream = args.downstream
    chrom_sizes_file = args.chrom_sizes
    # check command line parameters
    if not os.path.exists(args.gtf_file):
        parser.error("GTF file %s not found" % (args.gtf_file))
    chrom_sizes = {}
    with open(chrom_sizes_file) as fileh:
        for line in fileh:
            fields = line.strip().split('\t')
            chrom_sizes[fields[0]] = int(fields[1])
    # parse
    for locus_transcripts in parse_gtf(open(args.gtf_file)):
        locus_chrom = locus_transcripts[0].chrom
        locus_start = locus_transcripts[0].start
        locus_end = max(t.end for t in locus_transcripts)
        logging.debug("[LOCUS] %s:%d-%d %d transcripts" %
                      (locus_chrom, locus_start, locus_end,
                       len(locus_transcripts)))
        tss_ids = set()
        for t in locus_transcripts:
            if t.strand == NO_STRAND:
                continue
            tss_id = t.attrs['tss_id']
            if tss_id in tss_ids:
                continue
            tss_ids.add(tss_id)
            if t.strand == POS_STRAND:
                start = max(0, t.exons[0].start - upstream)
                end = min(t.end, t.exons[0].start + downstream)
            else:
                start = max(t.start, t.exons[-1].end - downstream)
                end = min(t.exons[-1].end + upstream,
                          chrom_sizes[locus_chrom])
            print '\t'.join(map(str, [locus_chrom, start, end, tss_id, 0,
                                      strand_int_to_str(t.strand)]))
    return 0
def parse_gtf_tss(gtf_file):
    # map each tss_id to the interval of the first stranded transcript
    # carrying it
    tss_dict = {}
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t in locus_transcripts:
            if t.strand == NO_STRAND:
                continue
            tss_id = t.attrs['tss_id']
            if tss_id in tss_dict:
                continue
            tss_dict[tss_id] = (t.chrom, t.strand, t.start, t.end)
    return tss_dict
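# Usage sketch (not in the original source; the GTF path and tss_id are
# hypothetical): look up the genomic interval of a single TSS by the 'tss_id'
# attribute collected by parse_gtf_tss().
def example_lookup_tss(gtf_file='assembly.gtf', tss_id='TSS1'):
    tss_dict = parse_gtf_tss(gtf_file)
    if tss_id in tss_dict:
        chrom, strand, start, end = tss_dict[tss_id]
        print '%s\t%d\t%d\t%s' % (chrom, start, end,
                                  strand_int_to_str(strand))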
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    parser.add_argument('chrom_sizes')
    parser.add_argument("output_prefix")
    args = parser.parse_args()
    # read one locus at a time
    locus_file = args.output_prefix + '.locus.bed'
    intergenic_file = args.output_prefix + '.intergenic.bed'
    intron_file = args.output_prefix + '.intron.bed'
    locus_fileh = open(locus_file, 'w')
    introns = set()
    logging.info('Parsing transcripts by locus')
    for locus_transcripts in parse_gtf(open(args.gtf_file)):
        # find borders of locus
        locus_chrom = locus_transcripts[0].chrom
        locus_start = min(t.start for t in locus_transcripts)
        locus_end = max(t.end for t in locus_transcripts)
        print >>locus_fileh, '\t'.join([locus_chrom, str(locus_start),
                                        str(locus_end)])
        logging.debug("[LOCUS] %s:%d-%d %d transcripts" %
                      (locus_chrom, locus_start, locus_end,
                       len(locus_transcripts)))
        # cluster locus exons
        cluster_tree = ClusterTree(0, 1)
        for t in locus_transcripts:
            for e in t.exons:
                cluster_tree.insert(e.start, e.end, 1)
        exon_clusters = []
        for start, end, indexes in cluster_tree.getregions():
            exon_clusters.append((start, end))
        # the gaps between adjacent exon clusters are the intronic regions
        e1 = exon_clusters[0]
        for j in xrange(1, len(exon_clusters)):
            e2 = exon_clusters[j]
            introns.add((locus_chrom, e1[1], e2[0]))
            e1 = e2
    locus_fileh.close()
    # write introns
    logging.info('Writing introns')
    intron_fileh = open(intron_file, 'w')
    for chrom, start, end in sorted(introns):
        print >>intron_fileh, '\t'.join([chrom, str(start), str(end)])
    intron_fileh.close()
    # take complement of locus file to find intergenic regions
    logging.info('Complementing locus intervals to find intergenic regions')
    cmd = ['bedtools', 'complement', '-i', locus_file, '-g',
           args.chrom_sizes]
    with open(intergenic_file, 'w') as f:
        subprocess.call(cmd, stdout=f)
def main():
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", action="store_true",
                        dest="verbose", default=False)
    parser.add_argument("ref_gtf_file")
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    # set logging level
    if args.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # check command line parameters
    if not os.path.exists(args.ref_gtf_file):
        parser.error("GTF file %s not found" % (args.ref_gtf_file))
    if not os.path.exists(args.gtf_file):
        parser.error("GTF file %s not found" % (args.gtf_file))
    logging.info("AssemblyLine %s" % (assemblyline.__version__))
    logging.info("----------------------------------")
    # show parameters
    logging.info("Parameters:")
    logging.info("verbose logging: %s" % (args.verbose))
    logging.info("ref gtf file: %s" % (args.ref_gtf_file))
    logging.info("assembly gtf file: %s" % (args.gtf_file))
    # merge reference CDS regions and assembly transcripts into a sorted GTF
    if not os.path.exists('tmp.srt.gtf'):
        with open('tmp.gtf', 'w') as outfileh:
            logging.info("Reading CDS regions from reference GTF")
            for f in get_cds_features(args.ref_gtf_file):
                print >>outfileh, str(f)
            logging.info("Reading transcripts from assembly GTF")
            i = 0
            for f in GTFFeature.parse(open(args.gtf_file)):
                print >>outfileh, str(f)
                i += 1
                if i % 100000 == 0:
                    logging.debug("Parsed %d features" % (i))
        logging.info("Sorting GTF file")
        sort_gtf('tmp.gtf', 'tmp.srt.gtf')
    for locus_transcripts in parse_gtf(open('tmp.srt.gtf')):
        locus_chrom = locus_transcripts[0].chrom
        locus_start = locus_transcripts[0].start
        locus_end = max(t.end for t in locus_transcripts)
        logging.debug("[LOCUS] %s:%d-%d %d transcripts" %
                      (locus_chrom, locus_start, locus_end,
                       len(locus_transcripts)))
        for start, end, strand, m, t, c in categorize(locus_transcripts):
            fields = [locus_chrom, str(start), str(end),
                      '%s|%s|%s' % (m, t, c), '0',
                      strand_int_to_str(strand)]
            print '\t'.join(fields)
    return 0
def compare_assembly(ref_gtf_file, test_gtf_file, output_dir,
                     gtf_score_attr, tmp_dir):
    # output files
    compare_file = os.path.join(output_dir, "compare_transcripts.txt")
    global_stats_file = os.path.join(output_dir, "global_stats.txt")
    tmp_gtf_file = os.path.join(output_dir, "tmp.gtf")
    tmp_sorted_gtf_file = os.path.splitext(tmp_gtf_file)[0] + ".srt.gtf"
    # merge and sort ref/test gtf files
    logging.info("Merging reference and test GTF files")
    # make temporary file to store merged ref/test gtf files
    outfh = open(tmp_gtf_file, "w")
    logging.info("Adding reference GTF file")
    add_gtf_file(ref_gtf_file, outfh, is_ref=True, sample_id=None)
    logging.info("Adding test GTF file")
    add_gtf_file(test_gtf_file, outfh, is_ref=False, sample_id='assembly')
    outfh.close()
    logging.info("Sorting merged GTF file")
    sort_gtf(tmp_gtf_file, tmp_sorted_gtf_file, tmp_dir=tmp_dir)
    os.remove(tmp_gtf_file)
    # compare assemblies
    logging.info("Comparing assemblies")
    cmp_fh = open(compare_file, "w")
    print >>cmp_fh, '\t'.join(map(str, MatchStats.header_fields()))
    stats_obj = GlobalStats()
    for locus_transcripts in parse_gtf(open(tmp_sorted_gtf_file)):
        locus_chrom = locus_transcripts[0].chrom
        locus_start = locus_transcripts[0].start
        locus_end = max(t.end for t in locus_transcripts)
        logging.debug("[LOCUS] %s:%d-%d %d transcripts" %
                      (locus_chrom, locus_start, locus_end,
                       len(locus_transcripts)))
        # score transcripts
        for t in locus_transcripts:
            if gtf_score_attr is None:
                t.score = 0.0
            else:
                t.score = float(t.attrs.get(gtf_score_attr, 0.0))
        # run comparison
        for mobj in compare_locus(locus_transcripts):
            print >>cmp_fh, str(mobj)
        # accumulate global stats
        locus_stats_obj = gather_global_stats(locus_transcripts)
        stats_obj = stats_obj + locus_stats_obj
    # cleanup
    cmp_fh.close()
    logging.info("Printing report")
    f = open(global_stats_file, "w")
    print >>f, stats_obj.report()
    f.close()
    os.remove(tmp_sorted_gtf_file)
    logging.info("Done")
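# Minimal driver sketch (not in the original source): run the comparison into
# a fresh output directory. The file paths and the 'FPKM' score attribute are
# hypothetical; pass gtf_score_attr=None to leave all scores at zero.
def example_compare(ref_gtf='ref.gtf', test_gtf='assembly.gtf',
                    output_dir='compare_out'):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    compare_assembly(ref_gtf, test_gtf, output_dir,
                     gtf_score_attr='FPKM', tmp_dir=output_dir)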
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("ref_gtf_file")
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    logging.info("Reading reference GTF file")
    locus_trees = read_reference_gtf(args.ref_gtf_file)
    logging.info("Categorizing test GTF file")
    for locus_transcripts in parse_gtf(open(args.gtf_file)):
        for t in locus_transcripts:
            categorize_transcript(t, locus_trees)
            for f in t.to_gtf_features():
                print str(f)
def full_transcript_analysis(gtf_file, genome_fasta_file, pfam_dir,
                             output_dir, num_processes):
    # output files
    pfam_file = os.path.join(output_dir, 'full_length_pfam.txt')
    # open genome fasta file
    ref_fa = pysam.Fastafile(genome_fasta_file)
    # convert transcripts to amino acid sequences and write to fasta file
    logging.debug('Writing transcript amino acid sequences to FASTA file(s)')
    tmp_dir = os.path.join(output_dir, 'tmp')
    os.makedirs(tmp_dir)
    fasta_prefix = os.path.join(tmp_dir, 'full')
    fasta_files = []
    fasta_sizes = []
    for i in xrange(num_processes):
        fasta_files.append(open('%s%d.fasta' % (fasta_prefix, i), 'w'))
        fasta_sizes.append(0)
    num_finished = 1
    fasta_file_index = 0
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t in locus_transcripts:
            # get amino acid sequences in all reading frames
            aa_seqs = translate_transcript(t, ref_fa)
            for frame, aa_seq in enumerate(aa_seqs):
                lines = to_fasta('%s|frame=%d' %
                                 (t.attrs['transcript_id'], frame), aa_seq)
                print >>fasta_files[fasta_file_index], lines
                fasta_sizes[fasta_file_index] += 1
                fasta_file_index = (fasta_file_index + 1) % num_processes
            if (num_finished % 10000) == 0:
                logging.debug('Processed %d transcripts' % (num_finished))
            num_finished += 1
    # keep only the fasta files that had sequences written to them
    fasta_file_names = []
    for i in xrange(len(fasta_files)):
        fasta_files[i].close()
        if fasta_sizes[i] > 0:
            fasta_file_names.append(fasta_files[i].name)
    # cleanup
    ref_fa.close()
    # search FASTA file against Pfam
    logging.debug('Scanning for Pfam domains')
    retcode = run_pfam(fasta_file_names, pfam_dir, pfam_file, tmp_dir)
    if retcode != 0:
        logging.error('Error running pfam_scan.pl')
        return retcode
    return 0
def classify_transcripts(classify_dir, num_processors, gtf_score_attr,
                         tmp_dir):
    # setup input and output files
    lib_counts_file = os.path.join(classify_dir, LIB_COUNTS_FILE)
    lib_counts_list = list(LibCounts.from_file(lib_counts_file))
    library_ids = [x.library_id for x in lib_counts_list]
    category_info_dict = {}
    for category_key in CATEGORIES:
        category_str = category_int_to_str[category_key]
        cinfo = CategoryInfo.create(library_ids, category_key,
                                    category_str, classify_dir)
        category_info_dict[category_key] = cinfo
        # write input files for classifier
        logging.info("Writing classification input files category='%s'" %
                     (cinfo.category_str))
        for transcripts in parse_gtf(open(cinfo.output_gtf_file)):
            for t in transcripts:
                # set transcript score
                t.score = float(t.attrs.get(gtf_score_attr, 0.0))
                library_id = t.attrs[GTFAttr.LIBRARY_ID]
                fields = get_classification_fields(t)
                # lookup file handle and open new file if necessary,
                # writing the header on first open
                if library_id not in cinfo.result_fh_dict:
                    cinfo.result_fh_dict[library_id] = \
                        open(cinfo.result_file_dict[library_id], "w")
                    print >>cinfo.result_fh_dict[library_id], \
                        '\t'.join(get_classification_header())
                # write to file
                print >>cinfo.result_fh_dict[library_id], \
                    '\t'.join(map(str, fields))
        # close open file handles
        for fh in cinfo.result_fh_dict.itervalues():
            fh.close()
    for category_key, cinfo in category_info_dict.iteritems():
        classify_tasks = []
        for lib_counts in lib_counts_list:
            # see if we can run the classifier on this file
            if lib_counts.category_counts[category_key] > 0:
                filename = cinfo.result_file_dict[lib_counts.library_id]
                classify_tasks.append((lib_counts.library_id, filename))
        # run classification
        logging.info("Classifying transcripts category='%s'" %
                     (cinfo.category_str))
        classify_category(cinfo, classify_tasks, num_processors, tmp_dir)
        # sort results
        sort_classification_results(cinfo.ctree_file,
                                    cinfo.sorted_ctree_file, tmp_dir)
        os.remove(cinfo.ctree_file)
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    for locus_transcripts in parse_gtf(open(args.gtf_file)):
        # group transcripts by gene id
        gene_transcript_map = collections.defaultdict(lambda: [])
        for t in locus_transcripts:
            gene_transcript_map[t.attrs['gene_id']].append(t)
        # compute the merged exonic length of each gene
        for gene_id, gene_transcripts in gene_transcript_map.iteritems():
            gene_exons = cluster_isoforms(gene_transcripts)
            length = sum((e[1] - e[0]) for e in gene_exons)
            print '\t'.join([gene_id, str(length)])
def build_locus_trees(gtf_file):
    # cluster reference transcripts into loci
    transcripts = []
    locus_cluster_trees = collections.defaultdict(lambda: ClusterTree(0, 1))
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t in locus_transcripts:
            is_ref = bool(int(t.attrs[GTFAttr.REF]))
            if not is_ref:
                continue
            i = len(transcripts)
            transcripts.append(t)
            locus_cluster_trees[t.chrom].insert(t.start, t.end, i)
    # build interval trees of loci, one interval per locus
    locus_trees = collections.defaultdict(lambda: IntervalTree())
    for chrom, cluster_tree in locus_cluster_trees.iteritems():
        for locus_start, locus_end, indexes in cluster_tree.getregions():
            locus_transcripts = [transcripts[i] for i in indexes]
            locus_trees[chrom].insert_interval(
                Interval(locus_start, locus_end, value=locus_transcripts))
    return locus_trees
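# Usage sketch (not in the original source): collect the reference
# transcripts whose locus overlaps a query region. IntervalTree.find(start,
# end) returning the inserted Interval objects is assumed from the bx-python
# interface used by build_locus_trees().
def example_overlapping_ref_transcripts(locus_trees, chrom, start, end):
    hits = []
    for interval in locus_trees[chrom].find(start, end):
        hits.extend(interval.value)
    return hits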
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("ref_gtf_file")
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    logging.info("Reading reference GTF file")
    locus_trees = read_reference_gtf(args.ref_gtf_file)
    logging.info("Categorizing test GTF file")
    for locus_transcripts in parse_gtf(open(args.gtf_file)):
        # group transcripts by gene id
        gene_transcript_map = collections.defaultdict(lambda: [])
        for t in locus_transcripts:
            gene_transcript_map[t.attrs[GTFAttr.GENE_ID]].append(t)
        # categorize genes
        for gene_transcripts in gene_transcript_map.itervalues():
            categorize_gene_transcripts(gene_transcripts, locus_trees)
            # output transcripts
            for t in gene_transcripts:
                for f in t.to_gtf_features():
                    print str(f)
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("gtf_file")
    parser.add_argument("region")
    args = parser.parse_args()
    if not os.path.exists(args.gtf_file):
        parser.error("GTF file %s not found" % (args.gtf_file))
    # parse region string of the form 'chrom:start-end'
    region_chrom, startend = args.region.split(":")
    start, end = startend.split("-")
    region_start = int(start)
    region_end = int(end)
    for transcripts in parse_gtf(open(args.gtf_file)):
        for t in transcripts:
            if ((t.chrom == region_chrom) and
                (t.start < region_end) and
                (t.end > region_start)):
                for f in t.to_gtf_features():
                    print str(f)
    logging.debug("Done")
    return 0
def read_gtf(filename):
    return list(parse_gtf(open(get_gtf_path(filename))))
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    parser.add_argument('excl_file')
    parser.add_argument('chrom_sizes')
    parser.add_argument("output_prefix")
    args = parser.parse_args()
    prefix = args.output_prefix
    excl_file = args.excl_file
    chrom_sizes_file = args.chrom_sizes
    gtf_file = args.gtf_file
    # check command line parameters
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    if not os.path.exists(chrom_sizes_file):
        parser.error('chrom sizes file %s not found' % (chrom_sizes_file))
    gene_intervals_file = prefix + '.gene_intervals.bed'
    gene_intervals_shuffled_file = prefix + '.gene_intervals.shuffle.bed'
    shuffled_gtf_file = prefix + '.shuffle.gtf'
    sorted_shuffled_gtf_file = prefix + '.shuffle.srt.gtf'
    logging.info('Parsing GTF file')
    with open(gene_intervals_file, 'w') as f:
        for locus_transcripts in parse_gtf(open(gtf_file)):
            # find borders of locus
            locus_chrom = locus_transcripts[0].chrom
            locus_start = min(t.start for t in locus_transcripts)
            locus_end = max(t.end for t in locus_transcripts)
            logging.debug("[LOCUS] %s:%d-%d %d transcripts" %
                          (locus_chrom, locus_start, locus_end,
                           len(locus_transcripts)))
            for g in get_gene_intervals(locus_transcripts):
                print >>f, '\t'.join(map(str, [g.chrom, g.start, g.end,
                                               g.gene_id]))
    # randomly shuffle genes
    logging.info("Shuffling genes")
    cmd = ['bedtools', 'shuffle', '-excl', excl_file,
           '-i', gene_intervals_file, '-g', chrom_sizes_file]
    with open(gene_intervals_shuffled_file, 'w') as fileh:
        subprocess.call(cmd, stdout=fileh)
    # read new gene positions
    logging.info("Reading shuffled gene intervals")
    shuffle_gene_map = {}
    with open(gene_intervals_shuffled_file) as fileh:
        for line in fileh:
            fields = line.strip().split('\t')
            chrom = fields[0]
            start = int(fields[1])
            end = int(fields[2])
            gene_id = fields[3]
            shuffle_gene_map[gene_id] = (chrom, start, end)
    # reposition transcripts
    logging.info("Repositioning transcripts")
    with open(shuffled_gtf_file, 'w') as fileh:
        for locus_transcripts in parse_gtf(open(gtf_file)):
            # get original positions
            orig_gene_map = {}
            for g in get_gene_intervals(locus_transcripts):
                orig_gene_map[g.gene_id] = (g.chrom, g.start, g.end)
            for t in locus_transcripts:
                gene_id = t.attrs['gene_id']
                orig_chrom, orig_start, orig_end = orig_gene_map[gene_id]
                if gene_id not in shuffle_gene_map:
                    logging.warning('Gene %s [%s:%d-%d] could not be '
                                    'shuffled' % (gene_id, orig_chrom,
                                                  orig_start, orig_end))
                    continue
                new_chrom, new_start, new_end = shuffle_gene_map[gene_id]
                # reposition transcript
                t.chrom = new_chrom
                t.start = new_start + (t.start - orig_start)
                t.end = new_start + (t.end - orig_start)
                for e in t.exons:
                    e.start = new_start + (e.start - orig_start)
                    e.end = new_start + (e.end - orig_start)
                # print repositioned transcript as BED to stdout
                fields = write_bed(t.chrom, t.attrs['transcript_id'],
                                   t.strand, 1000, t.exons)
                print '\t'.join(fields)
                # also write GTF so the sort step below has input
                for f in t.to_gtf_features(source='shuffle'):
                    print >>fileh, str(f)
    logging.info("Sorting GTF file")
    sort_gtf(shuffled_gtf_file, sorted_shuffled_gtf_file)
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--rename', dest='rename', action='store_true')
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    gtf_file = args.gtf_file
    rename = args.rename
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))
    # parse transcripts
    num_transcripts = 0
    # keep track of redundant gene/transcript counts
    gene_map = collections.defaultdict(lambda: {})
    transcript_map = collections.defaultdict(lambda: {})
    for transcripts in parse_gtf(open(gtf_file)):
        for t in transcripts:
            catstr = t.attrs['category']
            catint = Category.to_int(catstr)
            gene_type = t.attrs.get('gene_type', None)
            ref_gene_type = t.attrs['ref_gene_type']
            if catint == Category.SAME_STRAND:
                # impute gene type from the reference
                new_gene_type = ref_gene_type
            else:
                if gene_type == 'protein_coding':
                    # don't change protein coding genes
                    new_gene_type = gene_type
                elif t.length < 250:
                    # categorize small RNA separately
                    new_gene_type = 'misc_RNA'
                else:
                    if ref_gene_type == 'protein_coding':
                        # categorize based on overlap with reference
                        new_gene_type = PROTEIN_CATEGORY_MAP[catint]
                    else:
                        # reference is also non-coding
                        new_gene_type = 'lincRNA'
            # get gene category
            gene_category = GENCODE_CATEGORY_MAP[new_gene_type]
            new_gene_name = None
            if rename:
                # resolve upper/lower case issue with gene names from
                # different databases
                ref_gene_name = t.attrs['ref_gene_name'].upper()
                # build new gene name
                if ref_gene_name == 'NONE':
                    new_gene_name = str(t.attrs['source'])
                elif catint == Category.SAME_STRAND:
                    new_gene_name = str(ref_gene_name)
                else:
                    new_gene_name = '%s.%s' % (ref_gene_name, catstr)
                # gene name string is key to a dictionary that
                # associates each gene id with an integer number
                gene_id = t.attrs['gene_id']
                gene_dict = gene_map[new_gene_name]
                if gene_id not in gene_dict:
                    gene_num = len(gene_dict) + 1
                    gene_dict[gene_id] = gene_num
                else:
                    gene_num = gene_dict[gene_id]
                # gene id is also key to a dict that associates each
                # isoform of the gene with an integer number
                t_id = t.attrs['transcript_id']
                t_dict = transcript_map[gene_id]
                if t_id not in t_dict:
                    t_num = len(t_dict) + 1
                    t_dict[t_id] = t_num
                else:
                    t_num = t_dict[t_id]
                # append gene/transcript integers to gene name
                new_gene_name = '%s.%d.%d' % (new_gene_name, gene_num,
                                              t_num)
            # write new attributes
            for f in t.to_gtf_features(source='assemblyline', score=1000):
                f.attrs['gene_type'] = new_gene_type
                f.attrs['gene_category'] = gene_category
                if rename:
                    if 'gene_name' in f.attrs:
                        f.attrs['orig_gene_name'] = f.attrs['gene_name']
                    f.attrs['gene_name'] = new_gene_name
                print str(f)
            num_transcripts += 1
    return 0
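# Worked sketch (not in the original source) of the gene/transcript numbering
# scheme above: each new gene_id seen under a gene name gets the next gene
# number, and each new transcript_id under a gene_id gets the next isoform
# number, yielding names like FOO.1.1, FOO.1.2, FOO.2.1.
def example_numbering():
    gene_map = collections.defaultdict(lambda: {})
    transcript_map = collections.defaultdict(lambda: {})
    for gene_id, t_id in [('G1', 'T1'), ('G1', 'T2'), ('G2', 'T3')]:
        gene_dict = gene_map['FOO']
        gene_num = gene_dict.setdefault(gene_id, len(gene_dict) + 1)
        t_dict = transcript_map[gene_id]
        t_num = t_dict.setdefault(t_id, len(t_dict) + 1)
        print 'FOO.%d.%d' % (gene_num, t_num)  # FOO.1.1, FOO.1.2, FOO.2.1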
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--length", type=int, dest="length", default=250)
    parser.add_argument("--ncrna-exons", type=int, dest="ncrna_exons",
                        default=1)
    parser.add_argument("--pseudogene-exons", type=int,
                        dest="pseudogene_exons", default=1)
    parser.add_argument("--antisense-exons", type=int,
                        dest="antisense_exons", default=2)
    parser.add_argument("--intronic-exons", type=int, dest="intronic_exons",
                        default=2)
    parser.add_argument("--intergenic-exons", type=int,
                        dest="intergenic_exons", default=2)
    parser.add_argument("--intergenic-dist", type=int,
                        dest="intergenic_dist", default=1000)
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    lncrna_categories = set(("intronic", "intergenic", "antisense",
                             "ncrna"))
    logging.debug("Retrieving lncrna features from GTF")
    total_transcripts = 0
    lncrnas = 0
    intergenic = 0
    intronic = 0
    antisense = 0
    pseudogene = 0
    ncrna = 0
    for transcripts in parse_gtf(open(args.gtf_file)):
        total_transcripts += len(transcripts)
        for t in transcripts:
            # throw out protein coding genes
            category = t.attrs["category"]
            if category not in lncrna_categories:
                continue
            # throw out transcripts that overlap certain classes of
            # transcripts
            annotation_sources = \
                set(t.attrs["annotation_sources"].split(','))
            if not annotation_sources.isdisjoint(IGNORE_SOURCES):
                continue
            # check length requirement
            if t.length < args.length:
                continue
            if category == "intergenic":
                # exclude "intergenic" lncrnas that are close to known genes
                dist = int(t.attrs["nearest_dist"])
                if dist < args.intergenic_dist:
                    continue
                if len(t.exons) < args.intergenic_exons:
                    continue
                intergenic += 1
            elif category == "intronic":
                if len(t.exons) < args.intronic_exons:
                    continue
                intronic += 1
            elif category == "antisense":
                if len(t.exons) < args.antisense_exons:
                    continue
                antisense += 1
            else:
                # keep multi-exonic pseudogenes
                if not annotation_sources.isdisjoint(PSEUDOGENE_SOURCES):
                    if len(t.exons) < args.pseudogene_exons:
                        continue
                    pseudogene += 1
                else:
                    if len(t.exons) < args.ncrna_exons:
                        continue
                    ncrna += 1
            # output
            for f in t.to_gtf_features():
                print str(f)
            lncrnas += 1
    logging.debug("Read %d lncrna transcripts out of %d total transcripts" %
                  (lncrnas, total_transcripts))
    logging.debug("intergenic: %d" % (intergenic))
    logging.debug("intronic: %d" % (intronic))
    logging.debug("antisense: %d" % (antisense))
    logging.debug("pseudogene: %d" % (pseudogene))
    logging.debug("ncrna: %d" % (ncrna))
def orf_analysis(gtf_file, genome_fasta_file, pfam_dir, output_dir,
                 min_orf_length, first_orf_only, num_processes):
    # extract transcript DNA sequences, translate to protein, and
    # search for ORFs
    logging.debug('Finding ORFs in transcript sequences')
    # output files
    tmp_dir = os.path.join(output_dir, 'tmp')
    if not os.path.exists(tmp_dir):
        logging.info("Creating tmp directory '%s'" % (tmp_dir))
        os.makedirs(tmp_dir)
    orf_bed_file = os.path.join(output_dir, 'transcript_orfs.bed')
    unique_orf_file = os.path.join(output_dir, 'unique_orfs.txt')
    unique_orf_bed_file = os.path.join(output_dir, 'unique_orfs.bed')
    orf_file = os.path.join(tmp_dir, 'transcript_orfs.no_ids.txt')
    sorted_orf_file = os.path.join(tmp_dir,
                                   'transcript_orfs.no_ids.sortbyorf.txt')
    sorted_orf_id_file = os.path.join(tmp_dir,
                                      'transcript_orfs.sortbyorf.txt')
    signalp_file = os.path.join(output_dir, 'signalp.txt')
    pfam_file = os.path.join(output_dir, 'pfam.txt')
    merged_orf_id_file = \
        os.path.join(output_dir, 'transcript_orfs.sortbyorf.merged.txt')
    sorted_merged_orf_id_file = \
        os.path.join(output_dir,
                     'transcript_orfs.sortbytranscript.merged.txt')
    # open output files
    orf_fileh = open(orf_file, 'w')
    orf_bed_fileh = open(orf_bed_file, 'w')
    # open genome fasta file
    ref_fa = pysam.Fastafile(genome_fasta_file)
    num_finished = 1
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t in locus_transcripts:
            if first_orf_only:
                # get first ORF
                orf = get_first_transcript_orf(t, ref_fa)
                if len(orf.seq) >= min_orf_length:
                    print >>orf_fileh, '\t'.join(orf.to_table())
                    print >>orf_bed_fileh, '\t'.join(orf.to_bed())
            else:
                # get all ORFs
                for orf in get_all_transcript_orfs(t, ref_fa,
                                                   min_orf_length):
                    print >>orf_fileh, '\t'.join(orf.to_table())
                    print >>orf_bed_fileh, '\t'.join(orf.to_bed())
            if (num_finished % 10000) == 0:
                logging.debug('Processed %d transcripts' % (num_finished))
            num_finished += 1
    # cleanup
    orf_fileh.close()
    orf_bed_fileh.close()
    # sort ORF table by ORF amino acid sequence to group identical ORFs
    # together
    logging.debug('Sorting ORFs by amino acid sequence')

    def sort_by_seq(line):
        '''key function for batch_sort'''
        fields = line.strip().split('\t')
        return fields[ORFInfo.SEQ_COL_NUM]

    sort_tmp_dir = os.path.join(tmp_dir, 'sort_tmp')
    os.makedirs(sort_tmp_dir)
    batch_sort(input=orf_file, output=sorted_orf_file, key=sort_by_seq,
               buffer_size=SORT_BUFFER_SIZE, tempdirs=[sort_tmp_dir])
    shutil.rmtree(sort_tmp_dir)
    # assign each ORF a unique id and write to FASTA file
    logging.debug('Determining unique ORFs')
    orf_fasta_prefix = os.path.join(tmp_dir, 'orf')
    orf_fasta_files = []
    orf_fasta_sizes = []
    for i in xrange(num_processes):
        orf_fasta_files.append(open('%s%d.fasta' % (orf_fasta_prefix, i),
                                    'w'))
        orf_fasta_sizes.append(0)
    orf_file_index = 0
    outfileh = open(sorted_orf_id_file, 'w')
    unique_orf_fileh = open(unique_orf_file, 'w')
    print >>unique_orf_fileh, '\t'.join(['orf_id', 'orf_length',
                                         'total_occurrences',
                                         'unique_genomic_occurrences'])
    unique_orf_bed_fileh = open(unique_orf_bed_file, 'w')
    with open(sorted_orf_file) as infileh:
        for orfs in group_unique_orfs(infileh):
            # write to master transcript/ORF table
            for orf in orfs:
                print >>outfileh, '\t'.join(orf.to_table())
            # write ORF to fasta file
            lines = to_fasta(orfs[0].orf_id, orfs[0].seq.strip('*'))
            print >>orf_fasta_files[orf_file_index], lines
            orf_fasta_sizes[orf_file_index] += 1
            # advance to next fasta file
            orf_file_index = (orf_file_index + 1) % (num_processes)
            # group by genomic position and write ORFs to BED file
            unique_genome_orfs = {}
            for orf in orfs:
                k = (orf.chrom, orf.strand, tuple(orf.exons))
                if k in unique_genome_orfs:
                    continue
                unique_genome_orfs[k] = orf
            for orf in unique_genome_orfs.itervalues():
                print >>unique_orf_bed_fileh, \
                    '\t'.join(orf.to_bed(orf.orf_id))
            # write unique ORF to tab-delimited text file
            fields = [orfs[0].orf_id, len(orfs[0].seq), len(orfs),
                      len(unique_genome_orfs)]
            print >>unique_orf_fileh, '\t'.join(map(str, fields))
    # cleanup
    unique_orf_bed_fileh.close()
    unique_orf_fileh.close()
    outfileh.close()
    # get fasta files with lines written
    orf_fasta_file_names = []
    for i in xrange(len(orf_fasta_files)):
        orf_fasta_files[i].close()
        if orf_fasta_sizes[i] > 0:
            orf_fasta_file_names.append(orf_fasta_files[i].name)
    # search FASTA file against signalp
    logging.debug('Searching for signal peptides')
    retcode = run_signalp(orf_fasta_file_names, signalp_file, tmp_dir)
    if retcode != 0:
        logging.error('Error searching for signal peptides')
        return 1
    # search FASTA file against Pfam
    logging.debug('Scanning for Pfam domains')
    retcode = run_pfam(orf_fasta_file_names, pfam_dir, pfam_file, tmp_dir)
    if retcode != 0:
        logging.error('Error running pfam_scan.pl')
    # merge results from Pfam and signalp
    logging.debug('Merging SignalP and Pfam results')
    merge_results(sorted_orf_id_file, signalp_file, pfam_file,
                  merged_orf_id_file)
    # sort by transcript id
    logging.debug('Sorting ORFs by transcript ID')

    def sort_by_transcript_id(line):
        return line.split('\t', 1)[0]

    sort_tmp_dir = os.path.join(tmp_dir, 'sort_tmp')
    os.makedirs(sort_tmp_dir)
    batch_sort(input=merged_orf_id_file,
               output=sorted_merged_orf_id_file,
               key=sort_by_transcript_id,
               buffer_size=SORT_BUFFER_SIZE,
               tempdirs=[sort_tmp_dir])
    shutil.rmtree(sort_tmp_dir)
    # cleanup
    ref_fa.close()
    return 0
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--assembly", dest='assembly_bed',
                        default=intergenic_assembly_bed,
                        help='Assembly file used for shuffling and snp '
                             'overlap intersection')
    parser.add_argument("--snps", dest='snps', default=snp_bed,
                        help='SNP universe bed file')
    parser.add_argument("--excl", dest='excl', default=excl_file,
                        help='Exclusion file used for shuffling')
    parser.add_argument("--chrom", dest='chrom', default=chrom_sizes_file,
                        help='Chrom size file used for shuffling')
    parser.add_argument("--gtf", dest='gtf', default=gtf_file,
                        help='GTF file used to generate shuffle '
                             '(should match assembly_bed)')
    parser.add_argument("--gwas", dest='gwas', default=gwas_bed,
                        help='GWAS bed file used for intersection')
    parser.add_argument("--flank", dest='flank', default=0,
                        help='number of flanking bases to add to bed files')
    args = parser.parse_args()
    args.flank = int(args.flank)
    logging.info("Output is printed to stdout; redirect with '>' to save")
    # check command line parameters
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    if not os.path.exists(chrom_sizes_file):
        parser.error('chrom sizes file %s not found' % (chrom_sizes_file))
    if not os.path.isdir('GWAS_TMPS'):
        os.mkdir('GWAS_TMPS')
    prefix = 'GWAS_TMPS'
    gene_intervals_file = os.path.join(prefix, 'gene_intervals.bed')
    intersect_file = os.path.join(prefix, 'intersect.txt')
    assembly_flank = os.path.join(prefix, 'flank.bed')
    output_file = 'gwas_intergenic_null.txt'
    logging.info('Parsing GTF file')
    with open(gene_intervals_file, 'w') as f:
        for locus_transcripts in parse_gtf(open(args.gtf)):
            # find borders of locus
            locus_chrom = locus_transcripts[0].chrom
            locus_start = min(t.start for t in locus_transcripts)
            locus_end = max(t.end for t in locus_transcripts)
            for g in get_gene_intervals(locus_transcripts):
                print >>f, '\t'.join(map(str, [g.chrom, g.start, g.end,
                                               g.gene_id]))
    # apply flank to the bed file
    # read chrom file to make sure flanks added do not extend past chrom ends
    chrom_length = {}
    for line in open(chrom_sizes_file):
        line = line.strip().split('\t')
        chrom_length[line[0]] = int(line[1])
    with open(assembly_flank, 'w') as f:
        for line in open(args.assembly_bed):
            line = line.strip().split('\t')
            chrom = line[0]
            start = int(line[1])
            end = int(line[2])
            chrom_len = chrom_length[chrom]
            line[1] = max(0, start - args.flank)
            line[2] = min(chrom_len, end + args.flank)
            print >>f, '\t'.join(map(str, line))
    # GWAS snps: intersect real data and count overlapping GWAS snps
    logging.info('Intersecting assembly with GWAS snps')
    args_int = ['bedtools', 'intersect', '-a', args.gwas,
                '-b', assembly_flank, '-wa', '-wb']
    with open(intersect_file, 'w') as fileh:
        subprocess.call(args_int, stdout=fileh)
    # count number of SNPs caught
    snps = set()
    for line in open(intersect_file):
        line = line.strip().split('\t')
        snps.add(line[RSIDCOL])
    gwas_overlap = len(snps)
    # snp universe: intersect real data and count overlapping universe snps
    logging.info('Intersecting assembly with snp universe')
    args_int = ['bedtools', 'intersect', '-a', args.snps,
                '-b', assembly_flank, '-wa', '-wb', '-sorted']
    with open(intersect_file, 'w') as fileh:
        subprocess.call(args_int, stdout=fileh)
    # count number of SNPs caught
    snps = set()
    for line in open(intersect_file):
        line = line.strip().split('\t')
        snps.add(line[RSIDCOL])
    snp_overlap = len(snps)
    frac_real = float(gwas_overlap) / snp_overlap
    logging.info('%d GWAS snps overlap compendia genes' % gwas_overlap)
    logging.info('%d snps (from "snp universe") overlap compendia genes' %
                 snp_overlap)
    logging.info('Frac: %f' % frac_real)
    print '\t'.join(map(str, [args.flank, gwas_overlap, snp_overlap,
                              frac_real]))
    shutil.rmtree(prefix)
    return 0
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--rand_snp", dest='rand_snp',
                        default=rand_snps_file,
                        help='Bed file of random snps to use as negative '
                             'control for analyses')
    parser.add_argument("--snps", dest='snps', default=snp_bed,
                        help='SNP universe bed file')
    parser.add_argument("--excl", dest='excl', default=excl_file,
                        help='Exclusion file used for shuffling')
    parser.add_argument("--chrom", dest='chrom', default=chrom_sizes_file,
                        help='Chrom size file used for shuffling')
    parser.add_argument("--gtf", dest='gtf',
                        help='GTF file used to generate shuffle')
    parser.add_argument("--gwas", dest='gwas', default=gwas_bed_file,
                        help='GWAS bed file used for intersection')
    parser.add_argument("--shuffs", dest='shuffs', default=100,
                        help='number of shuffles to perform')
    parser.add_argument("-p", dest='proc', default=4,
                        help='number of processors to use')
    parser.add_argument("--flank", dest='flank', default=0,
                        help='number of flanking bases to add to bed files')
    parser.add_argument("--exon", dest="exon", action="store_true",
                        default=False,
                        help="Perform analysis looking only at exonic "
                             "overlap")
    args = parser.parse_args()
    args.proc = int(args.proc)
    args.flank = int(args.flank)
    logging.info('Output is printed to stdout')
    if args.exon:
        logging.info('Looking at exonic overlap only')
    # check command line parameters
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    if not os.path.exists(chrom_sizes_file):
        parser.error('chrom sizes file %s not found' % (chrom_sizes_file))
    if not os.path.isdir('GWAS_TMPS'):
        os.mkdir('GWAS_TMPS')
    prefix = 'GWAS_TMPS'
    locus_intervals_file = os.path.join(prefix, 'locus_intervals.bed')
    intersect_file = os.path.join(prefix, 'intersect.txt')
    assembly_flank = os.path.join(prefix, 'flank.bed')
    assembly_flank_sorted = os.path.join(prefix, 'flank.sorted.bed')
    assembly_bed = os.path.join(prefix, 'assembly.bed')
    # read chrom file to make sure flanks added do not extend past chrom ends
    chrom_length = {}
    for line in open(chrom_sizes_file):
        line = line.strip().split('\t')
        chrom_length[line[0]] = int(line[1])
    # convert GTF file to BED for initial intersections and get locus
    # intervals
    logging.info('Parsing GTF: converting to BED and obtaining locus '
                 'intervals')
    with open(assembly_bed, 'w') as f2:
        with open(locus_intervals_file, 'w') as f:
            j = 0
            for locus_transcripts in parse_gtf(open(args.gtf)):
                if (j % 2500) == 0:
                    logging.debug('Finished %d loci' % (j))
                for t in locus_transcripts:
                    name = t.attrs['transcript_id']
                    fields = write_bed(t.chrom, name, t.strand, 1000,
                                       t.exons, args.flank, chrom_length)
                    print >>f2, '\t'.join(fields)
                # find borders of locus
                locus_chrom = locus_transcripts[0].chrom
                locus_start = min(t.start for t in locus_transcripts)
                locus_end = max(t.end for t in locus_transcripts)
                locus_id = j
                j += 1
                print >>f, '\t'.join(map(str, [locus_chrom, locus_start,
                                               locus_end, locus_id]))
    # apply flank to the bed file
    with open(assembly_flank, 'w') as f:
        for line in open(assembly_bed):
            line = line.strip().split('\t')
            chrom = line[0]
            start = int(line[1])
            end = int(line[2])
            chrom_len = chrom_length[chrom]
            line[1] = max(0, start - args.flank)
            line[2] = min(chrom_len, end + args.flank)
            print >>f, '\t'.join(map(str, line))
    args_sort = ['sort', '-k1,1', '-k2,2n', assembly_flank]
    with open(assembly_flank_sorted, 'w') as fileh:
        subprocess.call(args_sort, stdout=fileh)
    # GWAS snps: intersect real data and count overlapping GWAS snps
    logging.info('Intersecting assembly with GWAS snps')
    args_int = ['bedtools', 'intersect', '-a', args.gwas,
                '-b', assembly_flank_sorted, '-sorted', '-wa', '-wb']
    if args.exon:
        args_int.append('-split')
    with open(intersect_file, 'w') as fileh:
        subprocess.call(args_int, stdout=fileh)
    # count number of SNPs caught
    snps = set()
    for line in open(intersect_file):
        line = line.strip().split('\t')
        snps.add(line[RSIDCOL])
    gwas_overlap = len(snps)
    # random snps: intersect real data and count overlapping random snps
    logging.info('Intersecting assembly with random snps')
    args_int = ['bedtools', 'intersect', '-a', args.rand_snp,
                '-b', assembly_flank_sorted, '-sorted', '-wa', '-wb']
    if args.exon:
        args_int.append('-split')
    with open(intersect_file, 'w') as fileh:
        subprocess.call(args_int, stdout=fileh)
    # count number of SNPs caught
    snps = set()
    for line in open(intersect_file):
        line = line.strip().split('\t')
        snps.add(line[RSIDCOL])
    rand_overlap = len(snps)
    # snp universe: intersect real data and count overlapping universe snps
    logging.info('Intersecting assembly with snp universe')
    args_int = ['bedtools', 'intersect', '-a', args.snps,
                '-b', assembly_flank_sorted, '-sorted', '-wa', '-wb']
    if args.exon:
        args_int.append('-split')
    with open(intersect_file, 'w') as fileh:
        subprocess.call(args_int, stdout=fileh)
    # count number of SNPs caught
    snps = set()
    for line in open(intersect_file):
        line = line.strip().split('\t')
        snps.add(line[RSIDCOL])
    snp_overlap = len(snps)
    frac_real = float(gwas_overlap) / snp_overlap
    frac_rand = float(rand_overlap) / snp_overlap
    if args.exon:
        logging.info('%d GWAS snps overlap compendia exons' % gwas_overlap)
        logging.info('%d random snps overlap compendia exons' %
                     rand_overlap)
        logging.info('%d total snps overlap compendia exons' % snp_overlap)
    else:
        logging.info('%d GWAS snps overlap compendia genes' % gwas_overlap)
        logging.info('%d random snps overlap compendia genes' %
                     rand_overlap)
        logging.info('%d total snps overlap compendia genes' % snp_overlap)
    logging.info('Frac_gwas: %f' % frac_real)
    logging.info('Frac_rand: %f' % frac_rand)
    # loop the shuffle to generate a null distribution of the number of
    # snps hit by randomly placed intergenic genes
    pool = multiprocessing.Pool(args.proc)
    NUM_SHUFFS = int(args.shuffs)
    shuff_args = (args.snps, args.gwas, args.rand_snp, args.excl,
                  locus_intervals_file, args.chrom, args.gtf, frac_real,
                  frac_rand, gwas_overlap, rand_overlap, snp_overlap,
                  NUM_SHUFFS, prefix, args.flank, args.exon)
    tasks = []
    header = ['gwas_shuff_overlap', 'rand_snp_shuff_overlap',
              'all_snp_shuff_overlap', 'frac_gwas_shuff',
              'frac_rand_shuff', 'gwas_overlap', 'rand_snp_overlap',
              'all_snp_overlap', 'frac_gwas', 'frac_rand', 'OR_gwas',
              'OR_rand']
    print '\t'.join(header)
    logging.info("Beginning shuffles")
    for i in xrange(NUM_SHUFFS):
        tasks.append((i,) + shuff_args)
    result_iter = pool.imap_unordered(shuffle_imap, tasks)
    for line in result_iter:
        print line
    pool.close()
    pool.join()
    shutil.rmtree(prefix)
    return 0
def shuffle(process, snps_file, gwas_file, excl_file, locus_intervals_file,
            chrom_sizes_file, gtf_file, frac_real, gwas_real, snps_real,
            NUM_SHUFFS, output_dir, flank):
    x = process
    prefix = 'process' + str(x)
    locus_intervals_shuffled_file = \
        os.path.join(output_dir, prefix + '.locus_intervals.shuffle.bed')
    shuffled_bed_file = os.path.join(output_dir, prefix + '.shuffle.bed')
    intersect_file = os.path.join(output_dir, prefix + '.intersect.txt')
    # randomly reposition the locus intervals
    args_shuff = ['bedtools', 'shuffle', '-excl', excl_file,
                  '-i', locus_intervals_file, '-g', chrom_sizes_file]
    with open(locus_intervals_shuffled_file, 'w') as fileh:
        subprocess.call(args_shuff, stdout=fileh)
    # read new locus positions
    shuffle_locus_map = {}
    with open(locus_intervals_shuffled_file) as fileh:
        for line in fileh:
            fields = line.strip().split('\t')
            chrom = fields[0]
            start = int(fields[1])
            end = int(fields[2])
            locus_id = int(fields[3])
            shuffle_locus_map[locus_id] = (chrom, start, end)
    # read chrom file to make sure flanks added do not extend past chrom ends
    chrom_length = {}
    for line in open(chrom_sizes_file):
        line = line.strip().split('\t')
        chrom_length[line[0]] = int(line[1])
    # reposition transcripts
    with open(shuffled_bed_file, 'w') as fileh:
        i = 0
        for locus_transcripts in parse_gtf(open(gtf_file)):
            orig_chrom = locus_transcripts[0].chrom
            orig_start = min(t.start for t in locus_transcripts)
            orig_end = max(t.end for t in locus_transcripts)
            locus_id = i
            i += 1
            if locus_id not in shuffle_locus_map:
                logging.warning('Locus %s [%s:%d-%d] could not be '
                                'shuffled' % (locus_id, orig_chrom,
                                              orig_start, orig_end))
                continue
            new_chrom, new_start, new_end = shuffle_locus_map[locus_id]
            for t in locus_transcripts:
                # reposition transcript
                t.chrom = new_chrom
                t.start = new_start + (t.start - orig_start)
                t.end = new_start + (t.end - orig_start)
                for e in t.exons:
                    e.start = new_start + (e.start - orig_start)
                    e.end = new_start + (e.end - orig_start)
                fields = write_bed(t.chrom, t.attrs['transcript_id'],
                                   t.strand, 1000, t.exons, flank,
                                   chrom_length)
                print >>fileh, '\t'.join(map(str, fields))
    # gwas snps: intersect the shuffled loci with GWAS snps
    args_int = ['bedtools', 'intersect', '-a', gwas_file,
                '-b', shuffled_bed_file, '-wa', '-wb']
    with open(intersect_file, 'w') as fileh:
        subprocess.call(args_int, stdout=fileh)
    # count number of GWAS SNPs caught
    snps = set()
    for line in open(intersect_file):
        line = line.strip().split('\t')
        snps.add(line[RSIDCOL])
    val = len(snps)
    # snp universe: intersect the shuffled loci with the snp universe
    args_int = ['bedtools', 'intersect', '-a', snps_file,
                '-b', shuffled_bed_file, '-wa', '-wb']
    with open(intersect_file, 'w') as fileh:
        subprocess.call(args_int, stdout=fileh)
    # count number of SNPs caught
    snps = set()
    for line in open(intersect_file):
        line = line.strip().split('\t')
        snps.add(line[RSIDCOL])
    snp_overlap = len(snps)
    frac = float(val) / snp_overlap
    OR = frac_real / frac
    logging.info('Shuffle %d/%d. GWAS: %d, Universe: %d, Fraction: %f, '
                 'OR: %f' % (x, NUM_SHUFFS, val, snp_overlap, frac, OR))
    return '\t'.join(map(str, [val, snp_overlap, frac, gwas_real,
                               snps_real, frac_real, OR]))
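# Sketch (not shown in the original section) of the shuffle_imap adapter
# referenced by the multiprocessing pools above and below: Pool.imap_unordered
# passes a single argument per task, so the task tuple is unpacked into the
# worker's positional parameters. The two mains build different-length task
# tuples, so each would pair with a matching worker; this variant matches the
# shuffle() defined above.
def shuffle_imap(task_args):
    return shuffle(*task_args)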
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--assembly", dest='assembly_bed',
                        default=intergenic_assembly_bed,
                        help='Assembly file used for shuffling and snp '
                        'overlap intersection')
    parser.add_argument("--snps", dest='snps', default=snp_bed,
                        help='SNP universe bed file')
    parser.add_argument("--excl", dest='excl', default=excl_file,
                        help='Exclusion file used for shuffling')
    parser.add_argument("--chrom", dest='chrom', default=chrom_sizes_file,
                        help='Chrom size file used for shuffling')
    parser.add_argument("--gtf", dest='gtf', default=gtf_file,
                        help='GTF file used to generate shuffle '
                        '(should match assembly_bed)')
    parser.add_argument("--gwas", dest='gwas', default=gwas_bed,
                        help='GWAS bed file used for intersection')
    parser.add_argument("--shuffs", dest='shuffs', default=100,
                        help='number of shuffles to perform')
    parser.add_argument("-p", dest='proc', default=4,
                        help='number of processors to use')
    parser.add_argument("--flank", dest='flank', default=0,
                        help='number of flanking bases to add to bed files')
    args = parser.parse_args()
    args.proc = int(args.proc)
    args.flank = int(args.flank)
    logging.info("Results are printed to stdout; redirect with "
                 "'> <filename>' to save them")
    # check command line parameters
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    if not os.path.exists(args.chrom):
        parser.error('chrom sizes file %s not found' % (args.chrom))
    if not os.path.isdir('GWAS_TMPS'):
        os.mkdir('GWAS_TMPS')
    prefix = 'GWAS_TMPS'
    locus_intervals_file = os.path.join(prefix, 'locus_intervals.bed')
    intersect_file = os.path.join(prefix, 'intersect.txt')
    assembly_flank = os.path.join(prefix, 'flank.bed')
    logging.info('Parsing GTF file')
    with open(locus_intervals_file, 'w') as f:
        j = 0
        for locus_transcripts in parse_gtf(open(args.gtf)):
            # find borders of locus
            locus_chrom = locus_transcripts[0].chrom
            locus_start = min(t.start for t in locus_transcripts)
            locus_end = max(t.end for t in locus_transcripts)
            locus_id = j
            j += 1
            print >>f, '\t'.join(map(str, [locus_chrom, locus_start,
                                           locus_end, locus_id]))
    # apply flank to the bed file; read chrom sizes first so that added
    # flanks do not run past chrom ends (sizes must be ints)
    chrom_length = {}
    for line in open(args.chrom):
        fields = line.strip().split('\t')
        chrom_length[fields[0]] = int(fields[1])
    with open(assembly_flank, 'w') as f:
        for line in open(args.assembly_bed):
            fields = line.strip().split('\t')
            chrom = fields[0]
            start = int(fields[1])
            end = int(fields[2])
            chrom_len = chrom_length[chrom]
            fields[1] = max(0, start - args.flank)
            fields[2] = min(chrom_len, end + args.flank)
            print >>f, '\t'.join(map(str, fields))
    # GWAS snps: intersect the real assembly and count overlapping GWAS snps
    logging.info('Intersecting assembly with GWAS snps')
    args_int = ['bedtools', 'intersect', '-a', args.gwas,
                '-b', assembly_flank, '-wa', '-wb']
    with open(intersect_file, 'w') as fileh:
        subprocess.call(args_int, stdout=fileh)
    # count number of SNPs caught
    snps = set()
    for line in open(intersect_file):
        fields = line.strip().split('\t')
        snps.add(fields[RSIDCOL])
    gwas_overlap = len(snps)
    # snp universe: intersect the real assembly and count overlapping snps
    # from the snp universe (assembly_flank is not coordinate-sorted, so
    # bedtools is run without '-sorted')
    logging.info('Intersecting assembly with snp universe')
    args_int = ['bedtools', 'intersect', '-a', args.snps,
                '-b', assembly_flank, '-wa', '-wb']
    with open(intersect_file, 'w') as fileh:
        subprocess.call(args_int, stdout=fileh)
    # count number of SNPs caught
    snps = set()
    for line in open(intersect_file):
        fields = line.strip().split('\t')
        snps.add(fields[RSIDCOL])
    snp_overlap = len(snps)
    frac_real = float(gwas_overlap) / snp_overlap
    logging.info('%d GWAS snps overlap compendia genes' % gwas_overlap)
    logging.info('%d snps (from "snp universe") overlap compendia genes' %
                 snp_overlap)
    logging.info('Frac: %f' % frac_real)
    # repeat the shuffle to generate a null distribution for the number of
    # snps hit by randomly placed intergenic genes
    pool = multiprocessing.Pool(args.proc)
    NUM_SHUFFS = int(args.shuffs)
    shuff_args = (args.snps, args.gwas, args.excl, locus_intervals_file,
                  args.chrom, args.gtf, frac_real, gwas_overlap, snp_overlap,
                  NUM_SHUFFS, prefix, args.flank)
    tasks = []
    header = ['gwas_shuff', 'snp_shuff', 'frac_shuff',
              'gwas_real', 'snp_real', 'frac_real', 'OR']
    print '\t'.join(header)
    for i in xrange(NUM_SHUFFS):
        tasks.append((i,) + shuff_args)
    result_iter = pool.imap_unordered(shuffle_imap, tasks)
    for line in result_iter:
        print line
    pool.close()
    pool.join()
    shutil.rmtree(prefix)
    return 0
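The table printed by this driver is the permutation null distribution for the fraction of universe SNPs that are GWAS hits. A downstream consumer would typically turn it into an empirical p-value; the sketch below is a hypothetical helper (not part of this code), assuming the stdout table above was redirected to a file, and uses the standard add-one correction so a finite number of shuffles never yields p = 0:

def empirical_pvalue(table_file):
    # Read the shuffle table written by main() and compute the fraction of
    # shuffles whose 'frac_shuff' meets or exceeds the observed 'frac_real'.
    fracs_shuff = []
    frac_real = None
    with open(table_file) as fh:
        header = fh.readline().strip().split('\t')
        i_shuff = header.index('frac_shuff')
        i_real = header.index('frac_real')
        for line in fh:
            fields = line.strip().split('\t')
            fracs_shuff.append(float(fields[i_shuff]))
            frac_real = float(fields[i_real])
    k = sum(1 for f in fracs_shuff if f >= frac_real)
    return float(k + 1) / (len(fracs_shuff) + 1)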
def compare_assemblies(ref_gtf_file, test_gtf_file, output_dir):
    # output files
    if not os.path.exists(output_dir):
        logging.info('Creating output dir: %s' % (output_dir))
        os.makedirs(output_dir)
    # merge step
    merged_gtf_file = os.path.join(output_dir, "merged.gtf")
    merged_sorted_gtf_file = os.path.splitext(merged_gtf_file)[0] + ".srt.gtf"
    merge_done_file = os.path.join(output_dir, 'merged.done')
    sort_done_file = os.path.join(output_dir, 'sort.done')
    if not os.path.exists(merge_done_file):
        # merge and sort ref/test gtf files
        logging.info("Merging reference and test GTF files")
        # make temporary file to store merged ref/test gtf files
        with open(merged_gtf_file, "w") as fileh:
            logging.info("Adding reference GTF file")
            add_gtf_file(ref_gtf_file, fileh, is_ref=True)
            logging.info("Adding test GTF file")
            add_gtf_file(test_gtf_file, fileh, is_ref=False)
        open(merge_done_file, 'w').close()
    if not os.path.exists(sort_done_file):
        logging.info("Sorting merged GTF file")
        # create temp directory
        tmp_dir = os.path.join(output_dir, 'tmp')
        if not os.path.exists(tmp_dir):
            logging.debug("Creating tmp directory '%s'" % (tmp_dir))
            os.makedirs(tmp_dir)
        sort_gtf(merged_gtf_file, merged_sorted_gtf_file, tmp_dir=tmp_dir)
        # cleanup
        shutil.rmtree(tmp_dir)
        open(sort_done_file, 'w').close()
    # compare assemblies
    overlapping_gtf_file = os.path.join(output_dir, 'overlapping.gtf')
    intergenic_tmp_gtf_file = os.path.join(output_dir, 'intergenic.tmp.gtf')
    overlapping_file = os.path.join(output_dir, 'overlapping.tsv')
    overlapping_consensus_file = os.path.join(output_dir,
                                              'overlapping.consensus.tsv')
    overlapping_done_file = os.path.join(output_dir, 'overlapping.done')
    stats_file = os.path.join(output_dir, 'stats.txt')
    stats_obj = GlobalStats()
    num_intergenic = 0
    if not os.path.exists(overlapping_done_file):
        logging.info("Comparing assemblies")
        gtf_fileh = open(overlapping_gtf_file, 'w')
        tmp_gtf_fileh = open(intergenic_tmp_gtf_file, 'w')
        overlapping_fileh = open(overlapping_file, 'w')
        overlapping_consensus_fileh = open(overlapping_consensus_file, 'w')
        for locus_transcripts in parse_gtf(open(merged_sorted_gtf_file)):
            locus_chrom = locus_transcripts[0].chrom
            locus_start = locus_transcripts[0].start
            locus_end = max(t.end for t in locus_transcripts)
            logging.debug("[LOCUS] %s:%d-%d %d transcripts" %
                          (locus_chrom, locus_start, locus_end,
                           len(locus_transcripts)))
            for t, match_stats in compare_locus(locus_transcripts):
                if len(match_stats) == 0:
                    # write intergenic transcripts to analyze separately
                    t.attrs['category'] = Category.to_str(Category.INTERGENIC)
                    for f in t.to_gtf_features(source='assembly'):
                        print >>tmp_gtf_fileh, str(f)
                    num_intergenic += 1
                else:
                    # get consensus match information
                    consensus_match = MatchStats.consensus(match_stats)
                    assert consensus_match is not None
                    t.attrs['category'] = consensus_match.category
                    # add gtf attributes and write
                    for f in t.to_gtf_features(source='assembly'):
                        consensus_match.add_gtf_attributes(f)
                        print >>gtf_fileh, str(f)
                    # tab-delimited text output
                    print >>overlapping_consensus_fileh, str(consensus_match)
                    for ms in match_stats:
                        print >>overlapping_fileh, str(ms)
            # compute global statistics
            stats_obj.compute(locus_transcripts)
        logging.info("Reporting global statistics")
        with open(stats_file, 'w') as f:
            print >>f, stats_obj.report()
        gtf_fileh.close()
        tmp_gtf_fileh.close()
        overlapping_fileh.close()
        overlapping_consensus_fileh.close()
        open(overlapping_done_file, 'w').close()
    # resolve intergenic transcripts
    intergenic_gtf_file = os.path.join(output_dir, 'intergenic.gtf')
    intergenic_file = os.path.join(output_dir, 'intergenic.tsv')
    intergenic_best_file = os.path.join(output_dir, 'intergenic.best.tsv')
    intergenic_done_file = os.path.join(output_dir, 'intergenic.done')
    if not os.path.exists(intergenic_done_file):
        logging.info("Building interval index")
        locus_trees = build_locus_trees(merged_sorted_gtf_file)
        logging.info('Finding nearest matches to intergenic transcripts')
        gtf_fileh = open(intergenic_gtf_file, 'w')
        intergenic_fileh = open(intergenic_file, 'w')
        intergenic_best_fileh = open(intergenic_best_file, 'w')
        for locus_transcripts in parse_gtf(open(intergenic_tmp_gtf_file)):
            for t in locus_transcripts:
                # find nearest transcripts
                nearest_transcripts = find_nearest_transcripts(
                    t.chrom, t.start, t.end, t.strand, locus_trees)
                match_stats = []
                best_match = None
                if len(nearest_transcripts) == 0:
                    best_match = MatchStats.from_transcript(t)
                    best_match.category = Category.to_str(Category.INTERGENIC)
                    match_stats.append(best_match)
                else:
                    for ref, category, dist in nearest_transcripts:
                        # create a match object
                        ms = MatchStats.from_transcript(t, ref)
                        ms.shared_same_strand_bp = 0
                        ms.shared_opp_strand_bp = 0
                        ms.shared_introns = 0
                        ms.shared_splicing = False
                        ms.category = Category.to_str(category)
                        ms.distance = dist
                        match_stats.append(ms)
                    # choose the best match
                    best_match = MatchStats.choose_best(match_stats)
                # add gtf attributes and write
                for f in t.to_gtf_features(source='assembly'):
                    best_match.add_gtf_attributes(f)
                    print >>gtf_fileh, str(f)
                # write tab-delimited data
                print >>intergenic_best_fileh, str(best_match)
                for ms in match_stats:
                    print >>intergenic_fileh, str(ms)
        gtf_fileh.close()
        intergenic_fileh.close()
        intergenic_best_fileh.close()
        open(intergenic_done_file, 'w').close()
    # merge overlapping and intergenic results
    logging.info('Merging results')
    metadata_file = os.path.join(output_dir, 'metadata.txt')
    metadata_consensus_file = os.path.join(output_dir,
                                           'metadata.consensus.txt')
    assembly_gtf_file = os.path.join(output_dir, 'assembly.cmp.gtf')
    combine_done_file = os.path.join(output_dir, 'done')
    if not os.path.exists(combine_done_file):
        filenames = [overlapping_file, intergenic_file]
        with open(metadata_file, 'w') as outfile:
            print >>outfile, '\t'.join(MatchStats.header_fields())
            for fname in filenames:
                with open(fname) as infile:
                    for line in infile:
                        outfile.write(line)
        filenames = [overlapping_consensus_file, intergenic_best_file]
        with open(metadata_consensus_file, 'w') as outfile:
            print >>outfile, '\t'.join(MatchStats.header_fields())
            for fname in filenames:
                with open(fname) as infile:
                    for line in infile:
                        outfile.write(line)
        filenames = [intergenic_gtf_file, overlapping_gtf_file]
        with open(assembly_gtf_file, 'w') as outfile:
            for fname in filenames:
                with open(fname) as infile:
                    for line in infile:
                        outfile.write(line)
        open(combine_done_file, 'w').close()
    # cleanup
    logging.info("Done")
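build_locus_trees and find_nearest_transcripts come from the supporting library and are not shown in this section; conceptually they index reference transcripts per chromosome and query for the closest interval to an intergenic transcript. A rough illustration of that idea using sorted lists and bisect (an assumption for exposition only; the real helpers likely use proper interval trees and also account for strand and match category):

import bisect
import collections

def build_simple_index(transcripts):
    # Illustration: map chrom -> sorted list of (start, end, transcript).
    index = collections.defaultdict(list)
    for t in transcripts:
        index[t.chrom].append((t.start, t.end, t))
    for chrom in index:
        index[chrom].sort()
    return index

def nearest_by_start(index, chrom, pos):
    # Illustration: return the interval whose start is closest to 'pos'.
    ivals = index.get(chrom, [])
    if not ivals:
        return None
    starts = [iv[0] for iv in ivals]
    i = bisect.bisect_left(starts, pos)
    candidates = ivals[max(0, i - 1):i + 1]
    return min(candidates, key=lambda iv: abs(iv[0] - pos))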
def orf_analysis(gtf_file, genome_fasta_file, pfam_dir, output_dir,
                 min_orf_length, first_orf_only, num_processes):
    #
    # extract transcript DNA sequences, translate to protein, and
    # search for ORFs
    #
    logging.debug('Finding ORFs in transcript sequences')
    # output files
    tmp_dir = os.path.join(output_dir, 'tmp')
    if not os.path.exists(tmp_dir):
        logging.info("Creating tmp directory '%s'" % (tmp_dir))
        os.makedirs(tmp_dir)
    orf_bed_file = os.path.join(output_dir, 'transcript_orfs.bed')
    unique_orf_file = os.path.join(output_dir, 'unique_orfs.txt')
    unique_orf_bed_file = os.path.join(output_dir, 'unique_orfs.bed')
    orf_file = os.path.join(tmp_dir, 'transcript_orfs.no_ids.txt')
    sorted_orf_file = os.path.join(tmp_dir,
                                   'transcript_orfs.no_ids.sortbyorf.txt')
    sorted_orf_id_file = os.path.join(tmp_dir, 'transcript_orfs.sortbyorf.txt')
    signalp_file = os.path.join(output_dir, 'signalp.txt')
    pfam_file = os.path.join(output_dir, 'pfam.txt')
    merged_orf_id_file = os.path.join(output_dir,
                                      'transcript_orfs.sortbyorf.merged.txt')
    sorted_merged_orf_id_file = os.path.join(
        output_dir, 'transcript_orfs.sortbytranscript.merged.txt')
    # open output files
    orf_fileh = open(orf_file, 'w')
    orf_bed_fileh = open(orf_bed_file, 'w')
    # open genome fasta file
    ref_fa = pysam.Fastafile(genome_fasta_file)
    num_finished = 1
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t in locus_transcripts:
            if first_orf_only:
                # get first ORF
                orf = get_first_transcript_orf(t, ref_fa)
                if len(orf.seq) >= min_orf_length:
                    print >>orf_fileh, '\t'.join(orf.to_table())
                    print >>orf_bed_fileh, '\t'.join(orf.to_bed())
            else:
                # get all ORFs
                for orf in get_all_transcript_orfs(t, ref_fa, min_orf_length):
                    print >>orf_fileh, '\t'.join(orf.to_table())
                    print >>orf_bed_fileh, '\t'.join(orf.to_bed())
            if (num_finished % 10000) == 0:
                logging.debug('Processed %d transcripts' % (num_finished))
            num_finished += 1
    # cleanup
    orf_fileh.close()
    orf_bed_fileh.close()
    #
    # sort ORF table by ORF amino acid sequence to group identical ORFs
    # together
    #
    logging.debug('Sorting ORFs by amino acid sequence')

    def sort_by_seq(line):
        '''key function for batch_sort'''
        fields = line.strip().split('\t')
        return fields[ORFInfo.SEQ_COL_NUM]

    sort_tmp_dir = os.path.join(tmp_dir, 'sort_tmp')
    os.makedirs(sort_tmp_dir)
    batch_sort(input=orf_file, output=sorted_orf_file, key=sort_by_seq,
               buffer_size=SORT_BUFFER_SIZE, tempdirs=[sort_tmp_dir])
    shutil.rmtree(sort_tmp_dir)
    #
    # assign each ORF a unique id and write to FASTA file
    #
    logging.debug('Determining unique ORFs')
    orf_fasta_prefix = os.path.join(tmp_dir, 'orf')
    orf_fasta_files = []
    orf_fasta_sizes = []
    for i in xrange(num_processes):
        orf_fasta_files.append(open('%s%d.fasta' % (orf_fasta_prefix, i), 'w'))
        orf_fasta_sizes.append(0)
    orf_file_index = 0
    outfileh = open(sorted_orf_id_file, 'w')
    unique_orf_fileh = open(unique_orf_file, 'w')
    print >>unique_orf_fileh, '\t'.join(['orf_id', 'orf_length',
                                         'total_occurrences',
                                         'unique_genomic_occurrences'])
    unique_orf_bed_fileh = open(unique_orf_bed_file, 'w')
    with open(sorted_orf_file) as infileh:
        for orfs in group_unique_orfs(infileh):
            # write to master transcript/ORF table
            for orf in orfs:
                print >>outfileh, '\t'.join(orf.to_table())
            # write ORF to fasta file
            lines = to_fasta(orfs[0].orf_id, orfs[0].seq.strip('*'))
            print >>orf_fasta_files[orf_file_index], lines
            orf_fasta_sizes[orf_file_index] += 1
            # advance to next fasta file
            orf_file_index = (orf_file_index + 1) % (num_processes)
            # group by genomic position and write ORFs to BED file
            unique_genome_orfs = {}
            for orf in orfs:
                k = (orf.chrom, orf.strand, tuple(orf.exons))
                if k in unique_genome_orfs:
                    continue
                unique_genome_orfs[k] = orf
            for orf in unique_genome_orfs.itervalues():
                print >>unique_orf_bed_fileh, '\t'.join(orf.to_bed(orf.orf_id))
            # write unique ORF to tab-delimited text file
            fields = [orfs[0].orf_id, len(orfs[0].seq), len(orfs),
                      len(unique_genome_orfs)]
            print >>unique_orf_fileh, '\t'.join(map(str, fields))
    # cleanup
    unique_orf_bed_fileh.close()
    unique_orf_fileh.close()
    outfileh.close()
    # get fasta files with lines written
    orf_fasta_file_names = []
    for i in xrange(len(orf_fasta_files)):
        orf_fasta_files[i].close()
        if orf_fasta_sizes[i] > 0:
            orf_fasta_file_names.append(orf_fasta_files[i].name)
    #
    # search FASTA file against signalp
    #
    logging.debug('Searching for signal peptides')
    retcode = run_signalp(orf_fasta_file_names, signalp_file, tmp_dir)
    if retcode != 0:
        logging.error('Error searching for signal peptides')
        return 1
    #
    # search FASTA file against Pfam
    #
    logging.debug('Scanning for Pfam domains')
    retcode = run_pfam(orf_fasta_file_names, pfam_dir, pfam_file, tmp_dir)
    if retcode != 0:
        logging.error('Error running pfam_scan.pl')
        return 1
    #
    # merge results from Pfam and signalp
    #
    logging.debug('Merging SignalP and Pfam results')
    merge_results(sorted_orf_id_file, signalp_file, pfam_file,
                  merged_orf_id_file)
    #
    # sort by transcript id
    #
    logging.debug('Sorting ORFs by transcript ID')

    def sort_by_transcript_id(line):
        return line.split('\t', 1)[0]

    sort_tmp_dir = os.path.join(tmp_dir, 'sort_tmp')
    os.makedirs(sort_tmp_dir)
    batch_sort(input=merged_orf_id_file, output=sorted_merged_orf_id_file,
               key=sort_by_transcript_id, buffer_size=SORT_BUFFER_SIZE,
               tempdirs=[sort_tmp_dir])
    shutil.rmtree(sort_tmp_dir)
    # cleanup
    ref_fa.close()
    return 0
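to_fasta is a small formatting helper not shown in this section. A minimal sketch consistent with its use above, where the caller has already stripped the trailing stop codon (the helper name matches the call site; the line width is an assumption):

def to_fasta(name, seq, line_width=80):
    # Format a single FASTA record: a '>name' header line followed by the
    # sequence wrapped at line_width characters.
    lines = ['>%s' % name]
    for i in xrange(0, len(seq), line_width):
        lines.append(seq[i:i + line_width])
    return '\n'.join(lines)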
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--rename', dest='rename', action='store_true')
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    gtf_file = args.gtf_file
    rename = args.rename
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))
    # parse transcripts
    num_transcripts = 0
    # keep track of redundant gene/transcript counts
    gene_map = collections.defaultdict(lambda: {})
    transcript_map = collections.defaultdict(lambda: {})
    for transcripts in parse_gtf(open(gtf_file)):
        for t in transcripts:
            catstr = t.attrs['category']
            catint = Category.to_int(catstr)
            gene_type = t.attrs.get('gene_type', None)
            ref_gene_type = t.attrs['ref_gene_type']
            if catint == Category.SAME_STRAND:
                # impute gene type
                new_gene_type = ref_gene_type
            else:
                if gene_type == 'protein_coding':
                    # don't change protein coding genes
                    new_gene_type = gene_type
                elif t.length < 250:
                    # categorize small RNA separately
                    new_gene_type = 'misc_RNA'
                else:
                    if ref_gene_type == 'protein_coding':
                        # categorize based on overlap with reference
                        new_gene_type = PROTEIN_CATEGORY_MAP[catint]
                    else:
                        # reference is also non-coding
                        new_gene_type = 'lincRNA'
            # get gene category
            gene_category = GENCODE_CATEGORY_MAP[new_gene_type]
            new_gene_name = None
            if rename:
                # resolve upper/lower case issues with gene names from
                # different databases
                ref_gene_name = t.attrs['ref_gene_name'].upper()
                # build new gene name
                if ref_gene_name == 'NONE':
                    new_gene_name = str(t.attrs['source'])
                elif catint == Category.SAME_STRAND:
                    new_gene_name = str(ref_gene_name)
                else:
                    new_gene_name = '%s.%s' % (ref_gene_name, catstr)
                # the gene name string is the key to a dictionary that
                # associates each gene id with an integer number
                gene_id = t.attrs['gene_id']
                gene_dict = gene_map[new_gene_name]
                if gene_id not in gene_dict:
                    gene_num = len(gene_dict) + 1
                    gene_dict[gene_id] = gene_num
                else:
                    gene_num = gene_dict[gene_id]
                # the gene id is also the key to a dict that associates each
                # isoform of the gene with an integer number
                t_id = t.attrs['transcript_id']
                t_dict = transcript_map[gene_id]
                if t_id not in t_dict:
                    t_num = len(t_dict) + 1
                    t_dict[t_id] = t_num
                else:
                    t_num = t_dict[t_id]
                # append gene/transcript integers to gene name
                new_gene_name = '%s.%d.%d' % (new_gene_name, gene_num, t_num)
            # write new attributes
            for f in t.to_gtf_features(source='assemblyline', score=1000):
                f.attrs['gene_type'] = new_gene_type
                f.attrs['gene_category'] = gene_category
                if rename:
                    if 'gene_name' in f.attrs:
                        f.attrs['orig_gene_name'] = f.attrs['gene_name']
                    f.attrs['gene_name'] = new_gene_name
                print str(f)
            num_transcripts += 1
    return 0
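The gene_map/transcript_map bookkeeping above yields names of the form NAME.<gene_num>.<isoform_num>: the first gene_id seen under a given name gets 1, its first transcript_id gets 1, and so on. A small self-contained demonstration of the same numbering scheme (the gene/transcript ids are hypothetical inputs):

import collections

gene_map = collections.defaultdict(dict)
transcript_map = collections.defaultdict(dict)

def numbered_name(name, gene_id, t_id):
    # setdefault evaluates len() before insertion, so numbers are assigned
    # in first-seen order and reused on repeat lookups
    gene_dict = gene_map[name]
    gene_num = gene_dict.setdefault(gene_id, len(gene_dict) + 1)
    t_dict = transcript_map[gene_id]
    t_num = t_dict.setdefault(t_id, len(t_dict) + 1)
    return '%s.%d.%d' % (name, gene_num, t_num)

print numbered_name('FOXP1.same_strand', 'G1', 'T1')  # FOXP1.same_strand.1.1
print numbered_name('FOXP1.same_strand', 'G1', 'T2')  # FOXP1.same_strand.1.2
print numbered_name('FOXP1.same_strand', 'G2', 'T3')  # FOXP1.same_strand.2.1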
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    parser.add_argument('excl_file')
    parser.add_argument('chrom_sizes')
    parser.add_argument("output_prefix")
    args = parser.parse_args()
    prefix = args.output_prefix
    excl_file = args.excl_file
    chrom_sizes_file = args.chrom_sizes
    gtf_file = args.gtf_file
    # check command line parameters
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    if not os.path.exists(chrom_sizes_file):
        parser.error('chrom sizes file %s not found' % (chrom_sizes_file))
    gene_intervals_file = prefix + '.gene_intervals.bed'
    gene_intervals_shuffled_file = prefix + '.gene_intervals.shuffle.bed'
    shuffled_gtf_file = prefix + '.shuffle.gtf'
    sorted_shuffled_gtf_file = prefix + '.shuffle.srt.gtf'
    logging.info('Parsing GTF file')
    with open(gene_intervals_file, 'w') as f:
        for locus_transcripts in parse_gtf(open(gtf_file)):
            # find borders of locus
            locus_chrom = locus_transcripts[0].chrom
            locus_start = min(t.start for t in locus_transcripts)
            locus_end = max(t.end for t in locus_transcripts)
            logging.debug("[LOCUS] %s:%d-%d %d transcripts" %
                          (locus_chrom, locus_start, locus_end,
                           len(locus_transcripts)))
            for g in get_gene_intervals(locus_transcripts):
                print >>f, '\t'.join(map(str, [g.chrom, g.start, g.end,
                                               g.gene_id]))
    # randomly shuffle genes (use a distinct name for the command so the
    # argparse namespace 'args' is not shadowed)
    logging.info("Shuffling genes")
    args_shuff = ['bedtools', 'shuffle', '-excl', excl_file,
                  '-i', gene_intervals_file, '-g', chrom_sizes_file]
    with open(gene_intervals_shuffled_file, 'w') as fileh:
        subprocess.call(args_shuff, stdout=fileh)
    # read new gene positions
    logging.info("Reading shuffled gene intervals")
    shuffle_gene_map = {}
    with open(gene_intervals_shuffled_file) as fileh:
        for line in fileh:
            fields = line.strip().split('\t')
            chrom = fields[0]
            start = int(fields[1])
            end = int(fields[2])
            gene_id = fields[3]
            shuffle_gene_map[gene_id] = (chrom, start, end)
    # reposition transcripts
    logging.info("Repositioning transcripts")
    with open(shuffled_gtf_file, 'w') as fileh:
        for locus_transcripts in parse_gtf(open(gtf_file)):
            # get original positions
            orig_gene_map = {}
            for g in get_gene_intervals(locus_transcripts):
                orig_gene_map[g.gene_id] = (g.chrom, g.start, g.end)
            for t in locus_transcripts:
                gene_id = t.attrs['gene_id']
                orig_chrom, orig_start, orig_end = orig_gene_map[gene_id]
                if gene_id not in shuffle_gene_map:
                    logging.warning('Gene %s [%s:%d-%d] could not be shuffled' %
                                    (gene_id, orig_chrom, orig_start,
                                     orig_end))
                    continue
                new_chrom, new_start, new_end = shuffle_gene_map[gene_id]
                # reposition transcript
                t.chrom = new_chrom
                t.start = new_start + (t.start - orig_start)
                t.end = new_start + (t.end - orig_start)
                for e in t.exons:
                    e.start = new_start + (e.start - orig_start)
                    e.end = new_start + (e.end - orig_start)
                # write repositioned transcript as BED to stdout
                fields = write_bed(t.chrom, t.attrs['transcript_id'],
                                   t.strand, 1000, t.exons)
                print '\t'.join(fields)
                # also write GTF features so the sort step below has input
                for f in t.to_gtf_features(source='shuffle'):
                    print >>fileh, str(f)
    logging.info("Sorting GTF file")
    sort_gtf(shuffled_gtf_file, sorted_shuffled_gtf_file)
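get_gene_intervals is a library helper not defined in this section; from its use here it collapses a locus into one interval per gene_id. A minimal sketch under that assumption (the namedtuple and grouping logic are illustrative):

import collections

# Minimal sketch (assumption): one interval per gene_id spanning the
# min start / max end of that gene's transcripts within a locus.
GeneInterval = collections.namedtuple(
    'GeneInterval', ['chrom', 'start', 'end', 'gene_id'])

def get_gene_intervals(locus_transcripts):
    by_gene = collections.defaultdict(list)
    for t in locus_transcripts:
        by_gene[t.attrs['gene_id']].append(t)
    for gene_id, ts in by_gene.iteritems():
        yield GeneInterval(chrom=ts[0].chrom,
                           start=min(t.start for t in ts),
                           end=max(t.end for t in ts),
                           gene_id=gene_id)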