def getGeneTable(reffile):
    """Load a GTF reference into a per-gene lookup table.

    Parameters
    ----------
    reffile : str
        Name of the GTF file; it is opened with ``IOTools.open_file``.

    Returns
    -------
    collections.defaultdict
        ``table[gene_id]["models"]`` maps ``transcript_id`` to the
        transcript as yielded by ``GTF.gene_iterator``.
        ``table[gene_id]["start_codons"]`` maps a start-codon coordinate
        to the list of ``transcript_id`` values sharing it. Transcripts
        without ``start_codon`` features appear under ``"models"`` only.
    """
    E.info("Loading reference")
    table = defaultdict(dict)

    # Read inside ``with`` so the handle is released once the reference
    # has been consumed (it used to stay open for the process lifetime).
    with IOTools.open_file(reffile) as inf:
        for ens_gene in GTF.gene_iterator(GTF.iterator(inf)):
            geneid = ens_gene[0][0].gene_id
            table[geneid]["models"] = dict()
            table[geneid]["start_codons"] = defaultdict(list)

            for transcript in ens_gene:
                transcript_id = transcript[0].transcript_id
                table[geneid]["models"][transcript_id] = transcript

                codon_ranges = GTF.asRanges(transcript, "start_codon")
                if len(codon_ranges) == 0:
                    continue

                # On the minus strand the codon is keyed by its largest
                # end coordinate, otherwise by its smallest start.
                if transcript[0].strand == "-":
                    start_codon = max(e[1] for e in codon_ranges)
                else:
                    start_codon = min(e[0] for e in codon_ranges)

                table[geneid]["start_codons"][start_codon].append(
                    transcript_id)

    E.info("Reference Loaded")
    return table
def main(argv=sys.argv):
    """Print one row of summary counters per input file.

    Input is read from ``args.stdin`` when ``E.start`` returns no
    unrecognised arguments; otherwise each unrecognised argument is
    treated as a file name. Each row starts with the track name
    (``stdin`` or the file name) followed by the string form of every
    counter. With ``--is-gtf`` the ``counter_exons`` columns are added.
    """
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("--is-gtf", dest="is_gtf", action="store_true",
                        help="input is gtf.")

    parser.set_defaults(
        is_gtf=False,
    )

    (args, unknown) = E.start(parser,
                              add_output_options=True,
                              unknowns=True)

    if len(unknown) == 0:
        files = [args.stdin]
    else:
        # The file names are the arguments the parser did not consume.
        # Iterating ``args`` (the parsed namespace) could never work.
        files = unknown

    args.stdout.write("track\t%s" % ("\t".join(counter_gff.fields)))

    if args.is_gtf:
        args.stdout.write("\t%s" % ("\t".join(counter_exons.fields)))
    args.stdout.write("\n")

    for f in files:
        if f == args.stdin:
            infile = f
            args.stdout.write("stdin")
        else:
            infile = iotools.open_file(f)
            args.stdout.write(f)

        counters = []
        if args.is_gtf:
            iterator = GTF.iterator(infile)
            counters.append(counter_gff(iterator))
            # the exon counter consumes the output of the gff counter
            counters.append(counter_exons(counters[0]))
        else:
            iterator = GTF.iterator(infile)
            counters.append(counter_gff(iterator))

        # Exhaust the last counter in the chain so that every counter
        # sees every record.
        c = counters[-1]
        for x in c:
            pass

        for c in counters:
            args.stdout.write("\t%s" % str(c))
        args.stdout.write("\n")

        if infile != sys.stdin:
            infile.close()

    E.stop()
def annotateExons(iterator, fasta, options):
    """Write one annotated GTF line per distinct exon interval of each gene.

    For every gene, exons of all transcripts are grouped by their
    ``(start, end)`` interval. Each interval is written to
    ``options.stdout`` with the attributes ``ntranscripts`` (transcripts
    in the gene), ``nused`` (how many transcripts contain the interval)
    and ``pos`` (comma-separated ``rank:total`` pairs, ranks counted in
    transcription order). *fasta* is not used.

    If ``options.loglevel >= 1`` a summary line is written to
    ``options.stdlog``.
    """
    ninput = noutput = noverlapping = 0

    for gene in GTF.gene_iterator(iterator):
        ninput += 1
        usage = collections.defaultdict(list)
        ntranscripts = len(gene)
        on_reverse = Genomics.IsNegativeStrand(gene[0][0].strand)

        for transcript in gene:
            # order exons in the direction of transcription
            transcript.sort(key=lambda x: x.start)
            if on_reverse:
                transcript.reverse()
            total = len(transcript)
            for rank, exon in enumerate(transcript, 1):
                usage[(exon.start, exon.end)].append((rank, total))

        # template carrying the gene-level fields shared by all records
        template = GTF.Entry()
        template.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        template.addAttribute("ntranscripts", ntranscripts)

        records = []
        for span, positions in usage.items():
            record = GTF.Entry().copy(template)
            record.start, record.end = span
            record.addAttribute("nused", len(positions))
            record.addAttribute(
                "pos", ",".join(["%i:%i" % p for p in positions]))
            records.append(record)

        records.sort(key=lambda x: x.start)

        for record in records:
            options.stdout.write("%s\n" % str(record))

        # a gene has overlapping exons if merging changes the count
        spans = [(record.start, record.end) for record in records]
        if len(Intervals.combine(spans)) != len(spans):
            noverlapping += 1

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, noverlapping=%i\n" %
            (ninput, noutput, noverlapping))
def annotateTTS(iterator, fasta, options):
    """Write a termination-site window for every transcript of each gene.

    The window is ``options.promotor`` bases wide. On the negative strand
    it ends at the smallest exon start, on the positive strand it starts
    at the largest exon end; it is clipped to ``[0, contig length]``
    using ``fasta.getLength``. With ``options.merge_promotors`` the
    windows of a gene are merged by ``Intervals.combine`` and renumbered
    from 1. Records are written to ``options.stdout`` with source
    ``tts``; a summary goes to ``options.stdlog`` if
    ``options.loglevel >= 1``.
    """
    gene_total = transcript_total = site_total = 0

    for gene in GTF.gene_iterator(iterator):
        gene_total += 1
        first = gene[0][0]
        on_reverse = Genomics.IsNegativeStrand(first.strand)
        contig_size = fasta.getLength(first.contig)

        windows = []
        names = []
        for transcript in gene:
            transcript_total += 1
            lowest = min(x.start for x in transcript)
            highest = max(x.end for x in transcript)
            names.append(transcript[0].transcript_id)

            # At the very start/end of a contig the window is pushed
            # into the transcript; otherwise it lies outside of it.
            flank = options.promotor
            if on_reverse:
                window = (max(0, lowest - flank), max(flank, lowest))
            else:
                window = (min(highest, contig_size - flank),
                          min(contig_size, highest + flank))
            windows.append(window)

        if options.merge_promotors:
            # merging may change the order, so the windows are renamed
            windows = Intervals.combine(windows)
            names = ["%i" % n for n in range(1, len(windows) + 1)]

        record = GTF.Entry()
        record.fromGTF(first, first.gene_id, first.gene_id)
        record.source = "tts"

        for name, (start, end) in zip(names, windows):
            record.start, record.end = start, end
            record.transcript_id = name
            options.stdout.write("%s\n" % str(record))
            site_total += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ngenes=%i, ntranscripts=%i, ntss=%i\n" %
            (gene_total, transcript_total, site_total))
def getTranscript2GeneMap(outfile):
    '''Extract a 1:1 map of transcript_id to gene_id from the geneset.

    The geneset is the GTF file named by ``PARAMS['geneset']``. Ids are
    read from the attributes named by ``PARAMS["gene_id_field"]`` and
    ``PARAMS["transcript_id_field"]``; on ``KeyError`` the entry's
    ``gene_id`` / ``transcript_id`` is used instead.

    Parameters
    ----------
    outfile : str
        Output file; receives a header line followed by one
        ``transcript_id<TAB>gene_id`` line per transcript, sorted.

    Raises
    ------
    ValueError
        If one transcript_id is associated with more than one gene_id.
    '''
    transcript2gene_dict = {}

    # ``with`` closes the geneset handle, which used to be left open
    with IOTools.open_file(PARAMS['geneset']) as inf:
        for entry in GTF.iterator(inf):
            try:
                gene_id = entry[PARAMS["gene_id_field"]]
            except KeyError:
                gene_id = entry.gene_id

            try:
                transcript_id = entry[PARAMS["transcript_id_field"]]
            except KeyError:
                transcript_id = entry.transcript_id

            # The same transcript_id must not map to several gene_ids.
            known = transcript2gene_dict.setdefault(transcript_id, gene_id)
            if known != gene_id:
                raise ValueError(
                    "multiple gene_ids associated with the same "
                    "transcript_id %s: %s %s" %
                    (transcript_id, gene_id, known))

    with IOTools.open_file(outfile, "w") as outf:
        outf.write("transcript_id\tgene_id\n")
        for key, value in sorted(transcript2gene_dict.items()):
            outf.write("%s\t%s\n" % (key, value))
def filterGTF(gtf, filterstring, tempout):
    """Copy the entries of *gtf* that pass *filterstring* to *tempout*.

    Supported filter expressions (checked in this order):

    * ``column!=v1+v2``            keep if the value is none of v1, v2
    * ``column=v1+v2``             keep if the value is one of v1, v2
    * ``column-in_file-path``      keep if the value is a line of *path*
    * ``column-notin_file-path``   keep if the value is no line of *path*
    * ``column-morethan-x``        keep if ``float(value) > x``
    * ``column-lessthan-x``        keep if ``float(value) < x``

    *column* is an attribute of the entry or one of ``contig``,
    ``source``, ``feature``, ``start``, ``end``, ``strand``, ``frame``.

    Raises
    ------
    ValueError
        If *filterstring* matches none of the forms above.
    """

    def _read_values(path):
        # one stripped value per line; the handle is closed afterwards
        with iotools.open_file(path) as inf:
            return {line.strip() for line in inf}

    if "!=" in filterstring:
        column, _, value = filterstring.partition("!=")
        value = set(value.split("+"))
        filtertype = "notin"
    elif "=" in filterstring:
        column, _, value = filterstring.partition("=")
        value = set(value.split("+"))
        filtertype = "in"
    elif "-in_file-" in filterstring:
        column, _, value = filterstring.partition("-in_file-")
        value = _read_values(value)
        filtertype = "in_file"
    elif "-notin_file-" in filterstring:
        column, _, value = filterstring.partition("-notin_file-")
        value = _read_values(value)
        filtertype = "notin_file"
    elif "-morethan-" in filterstring:
        column, _, value = filterstring.partition("-morethan-")
        value = float(value)
        filtertype = "morethan"
    elif "-lessthan-" in filterstring:
        column, _, value = filterstring.partition("-lessthan-")
        value = float(value)
        filtertype = "lessthan"
    else:
        # Fail here instead of with a NameError on the first record.
        raise ValueError(
            "unrecognised filter expression: %s" % filterstring)

    with iotools.open_file(gtf) as gfile, \
            iotools.open_file(tempout, "w") as out:
        for item in GTF.iterator(gfile):
            D = item.asDict()
            # make the fixed GTF columns filterable like attributes
            D['contig'] = item.contig
            D['source'] = item.source
            D['feature'] = item.feature
            D['start'] = item.start
            D['end'] = item.end
            D['strand'] = item.strand
            D['frame'] = item.frame

            if filtertype in ("in", "in_file"):
                keep = D[column] in value
            elif filtertype in ("notin", "notin_file"):
                keep = D[column] not in value
            elif filtertype == "morethan":
                keep = float(D[column]) > value
            else:
                keep = float(D[column]) < value

            if keep:
                out.write("%s\n" % str(item))
def renameTranscriptsInPreviousSets(infile, outfile):
    '''Sort a previous transcript set by gene, renumbering ids if needed.

    Transcripts need to be renamed because they may use the same
    cufflinks identifiers as we use in the analysis.

    - no renaming is done if the gene_id contains ``ENSG``
    - the output is sorted by gene (``--sort-order=gene``)

    *infile* is read with ``zcat``; *outfile* is written through
    ``gzip`` and a log goes to ``<outfile>.log``.

    NOTE(review): ``statement`` is reassigned for every entry of
    *infile*, so the pipeline that runs is the one chosen by the LAST
    entry; for an empty file ``statement`` is never set. ``inf`` is not
    closed.
    '''

    inf = iotools.openFile(infile)
    for gtf in GTF.iterator(inf):
        if gtf.gene_id.find("ENSG") != -1:
            # identifiers already look like ensembl ids: sort only
            statement = '''zcat %(infile)s | grep -v "#" | cgat gtf2gtf --method=sort --sort-order=gene --log=%(outfile)s.log | gzip > %(outfile)s'''
        else:
            # derive id prefixes from the output name without ".gtf.gz"
            gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
            transcript_pattern = gene_pattern.replace("GEN", "TRAN")
            statement = ''' zcat %(infile)s | cgat gtf2gtf --method=renumber-genes --pattern-identifier=%(gene_pattern)s%%i | cgat gtf2gtf --method=renumber-transcripts --pattern-identifier=%(transcript_pattern)s%%i | cgat gtf2gtf --method=sort --sort-order=gene --log=%(outfile)s.log | gzip > %(outfile)s'''

    # NOTE(review): called without arguments - presumably P.run() picks
    # up ``statement`` and the %(name)s values from this function's
    # local variables, so the local names must not change; confirm.
    P.run()
def _iterator(iterator):
    """Yield each group of overlapping entries with its neighbourhood.

    Yields tuples ``(prev_end, last, next_start)`` where ``last`` is a
    group returned by ``GTF.iterator_overlaps``, ``prev_end`` is the end
    recorded for the group before it (0 at the start of a contig) and
    ``next_start`` is the start of the following group, or the contig
    length from ``fasta.getLength`` for the last group on a contig.

    ``method`` and ``fasta`` are taken from the enclosing scope.
    """

    # last_end: end of the group waiting to be yielded (``last``)
    # prev_end: end of the group before ``last``
    last_end, prev_end = 0, 0
    last_contig = None
    last = None
    for matches in GTF.iterator_overlaps(iterator):

        this_start = min([x.start for x in matches])
        this_end = max([x.end for x in matches])

        if method == "tss":
            # restrict to tss
            if matches[0].strand == "+":
                this_end = this_start + 1
            else:
                this_start = this_end - 1

        this_contig = matches[0].contig

        if last_contig != this_contig:
            # contig changed: the pending group extends to the contig end
            if last:
                yield prev_end, last, fasta.getLength(last_contig)
            last_end, prev_end = 0, 0
        else:
            # same contig: the pending group is bounded by this group
            yield prev_end, last, this_start

        # shift the window: the current group becomes the pending one
        prev_end = last_end
        last_end = this_end
        last = matches
        last_contig = this_contig

    # flush the final pending group
    if last:
        yield prev_end, last, fasta.getLength(last_contig)
def update(self, bed):
    """Feed one BED interval to the classifier as a single exon."""
    # the base classifier works on lists of gtf entries, so wrap the
    # interval into a one-element list
    exon = GTF.Entry()
    exon.fromBed(bed)
    exon.feature = 'exon'
    as_transcript = [exon]
    GeneModelAnalysis.Classifier.update(self, as_transcript)
def _add(interval, anno):
    """Append a new entry of feature *anno* spanning *interval*.

    Contig, gene_id, transcript_id and strand are taken from the first
    entry of ``transcript``; the entry is appended to ``results``. Both
    names come from the enclosing scope.
    """
    template = transcript[0]
    record = GTF.Entry()
    record.contig = template.contig
    record.gene_id = template.gene_id
    record.transcript_id = template.transcript_id
    record.strand = template.strand
    record.feature = anno
    begin, stop = interval
    record.start = begin
    record.end = stop
    results.append(record)
def test_entry(frame, strand, xfrom, xto, start, end, ref):
    """Check ``transform_third_codon`` on one entry against *ref*.

    An entry with the given *frame* and *strand* spanning
    ``[xfrom, xto)`` is passed to ``transform_third_codon(start, end,
    ...)``. If the result differs from *ref* a message is printed.
    Nothing is returned or raised.
    """
    entry = GTF.Entry()
    entry.frame = frame
    entry.strand = strand
    entry.start = xfrom
    entry.end = xto

    intervals = transform_third_codon(start, end, [(xfrom, xto, entry)])
    if ref != intervals:
        # Report both values: printing ``ref != intervals`` always
        # showed ``True`` and gave no hint about the mismatch.
        print("failed: expected %s, got %s" % (str(ref), str(intervals)))
def buildIndex(self, filename):
    """Read *filename* and index its intervals by contig.

    Returns a dict mapping each contig to an ``NCL.NCLSimple`` object to
    which the ``(start, end)`` of every entry on that contig was added.
    """
    index = {}
    handle = iotools.open_file(filename, "r")
    for entry in GTF.readFromFile(handle):
        contig = entry.contig
        # create the per-contig container on first sight
        if contig in index:
            container = index[contig]
        else:
            container = index[contig] = NCL.NCLSimple()
        container.add(entry.start, entry.end)
    handle.close()
    return index
def buildRepeatTrack(infile, outfile):
    '''Build a repeat track as negative control.

    A random sample of ``PARAMS["ancestral_repeats_samplesize"]`` entries
    is drawn from the gzipped file *infile* and written gzipped to
    *outfile*. Each sampled entry gets its zero-padded position in
    *infile* as gene_id and transcript_id. If *infile* holds fewer
    entries than the sample size, all of them are written.
    '''
    # first pass: count the entries
    with gzip.open(infile, "rt") as inf:
        nrepeats = sum(1 for _ in GTF.iterator(inf))

    # random.sample() raises if asked for more items than exist
    samplesize = min(nrepeats, PARAMS["ancestral_repeats_samplesize"])
    sample = set(random.sample(range(nrepeats), samplesize))

    gtf = GTF.Entry()
    # Text mode ("rt"/"wt"): the records are written as ``str``, which a
    # binary-mode gzip handle rejects on Python 3.
    with gzip.open(infile, "rt") as inf, \
            gzip.open(outfile, "wt") as outf:
        # second pass: write the sampled entries
        for x, gff in enumerate(GTF.iterator(inf)):
            if x not in sample:
                continue
            gtf.fromGTF(gff, "%08i" % x, "%08i" % x)
            outf.write("%s\n" % str(gtf))

    E.debug("created sample of %i repeats out of %i in %s" %
            (len(sample), nrepeats, outfile))
def loadLncRNAClass(infile, outfile):
    '''Load the lncRNA classifications.

    For every transcript in the GTF file *infile* one row of
    transcript_id, gene_id and source (taken from the transcript's first
    entry and loaded as column ``class``) is written to a temporary file
    in the current directory. The file is passed to ``P.load`` together
    with *outfile* and then deleted.
    '''
    # just load each transcript with its classification
    temp = P.getTempFile(".")
    inf = iotools.openFile(infile)
    try:
        for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
            temp.write("%s\t%s\t%s\n" % (
                transcript[0].transcript_id,
                transcript[0].gene_id,
                transcript[0].source))
    finally:
        # release both handles even if parsing fails; the input handle
        # was previously never closed
        inf.close()
        temp.close()

    P.load(temp.name,
           outfile,
           options="--header-names=transcript_id,gene_id,class "
           "--add-index=transcript_id "
           "--add-index=gene_id")

    os.unlink(temp.name)
def convert_set(gffs, gene_pattern, transcript_pattern, options):
    '''Create gene_id and transcript_id from format patterns and write GTF.

    Parameters
    ----------
    gffs : iterable
        GFF entries; their ``gene_id`` and ``transcript_id`` are set in
        place.
    gene_pattern : str
        ``%``-format pattern for the gene_id, filled from the entry's
        ``asDict()`` fields.
    transcript_pattern : str
        ``%``-format pattern for the transcript_id, filled the same way.
    options : object
        Each converted entry is written to ``options.stdout``.
    '''
    for gff in gffs:
        gff.gene_id = str(gene_pattern) % gff.asDict()
        # Use the transcript pattern here. Formatting with gene_pattern
        # gave every transcript the gene's identifier and left the
        # transcript_pattern argument unused.
        gff.transcript_id = str(transcript_pattern) % gff.asDict()

        gtf_entry = GTF.Entry()
        gtf_entry.copy(gff)

        # "Parent" is joined into a single comma-separated value
        if "Parent" in gtf_entry:
            gtf_entry['Parent'] = ",".join(gtf_entry['Parent'])

        options.stdout.write(str(gtf_entry) + "\n")
def extractEnsemblLincRNA(infile, outfile):
    '''Extract entries with source ``lincRNA`` and write them gene-sorted.

    Matching entries of the GTF file *infile* are collected in a
    temporary file under ``/ifs/scratch``. That file is piped through
    ``cgat gtf2gtf --method=sort --sort-order=gene`` and ``gzip`` into
    *outfile*, with a log in ``<outfile>.log``, and then removed.

    NOTE(review): the handle opened on *infile* is never closed.
    '''
    tmpf = P.getTempFile("/ifs/scratch")
    for gtf in GTF.iterator(iotools.openFile(infile)):
        if gtf.source == "lincRNA":
            tmpf.write(str(gtf) + "\n")
        else:
            continue
    tmpf.close()
    # from here on ``tmpf`` is the NAME of the temporary file, which is
    # what the %(tmpf)s placeholder below refers to
    tmpf = tmpf.name

    statement = ("cat %(tmpf)s |"
                 " cgat gtf2gtf"
                 " --method=sort --sort-order=gene"
                 " --log=%(outfile)s.log |"
                 " gzip > %(outfile)s")
    # NOTE(review): called without arguments - presumably P.run() reads
    # ``statement``, ``tmpf`` and ``outfile`` from this function's local
    # variables; confirm before renaming any of them.
    P.run()

    os.unlink(tmpf)
def transcript2bed12(transcript):
    """Convert a transcript (list of GTF entries) into one ``Bed.Bed``.

    The interval spans all entries. ``thickStart``/``thickEnd`` span the
    CDS entries; without any CDS both are set to the transcript end on
    the ``-`` strand and to the transcript start otherwise. Blocks are
    built from the ``exon`` ranges returned by ``GTF.asRanges``, with
    block starts relative to the transcript start.
    """
    bed = Bed.Bed()

    tx_start = min(entry.start for entry in transcript)
    tx_end = max(entry.end for entry in transcript)

    coding = [entry for entry in transcript if entry.feature == "CDS"]
    if coding:
        thick_start = min(entry.start for entry in coding)
        thick_end = max(entry.end for entry in coding)
    elif transcript[0].strand == "-":
        # no CDS: collapse the thick region onto the first base
        thick_start = thick_end = tx_end
    else:
        thick_start = thick_end = tx_start

    exons = GTF.asRanges(transcript, "exon")
    block_starts = []
    block_sizes = []
    for exon_start, exon_end in exons:
        block_starts.append(exon_start - tx_start)
        block_sizes.append(exon_end - exon_start)

    bed.contig = transcript[0].contig
    bed.start = tx_start
    bed.end = tx_end
    bed["strand"] = transcript[0].strand
    bed["name"] = transcript[0].transcript_id
    bed["thickStart"] = thick_start
    bed["thickEnd"] = thick_end
    bed["blockCount"] = len(exons)
    bed["blockStarts"] = ",".join(str(offset) for offset in block_starts)
    bed["blockSizes"] = ",".join(str(size) for size in block_sizes)

    return bed
def get_contigs(infile, outfile):
    '''Generate a pseudo-contigs file from the geneset.

    The length of each contig is taken to be the highest end coordinate
    of any GTF entry on it. This will not stop things going off the end
    of contigs, but that does not matter for our purposes.

    Parameters
    ----------
    infile : str
        GTF file; entries need not be sorted by contig.
    outfile : str
        Receives one ``contig<TAB>length`` line per contig, in order of
        first appearance, written with ``iotools.write_lines``.
    '''
    # Track the maximum per contig name. The earlier running-maximum
    # version wrote the name of the NEXT contig next to the maximum of
    # the previous one when the contig changed.
    max_ends = {}
    with iotools.open_file(infile) as inf:
        for entry in GTF.iterator(inf):
            contig = entry.contig
            max_ends[contig] = max(max_ends.get(contig, 0), entry.end)

    # an empty input yields no lines rather than a ``None`` contig
    outlines = [[contig, str(end)] for contig, end in max_ends.items()]

    iotools.write_lines(outfile, outlines, header=None)
def _count(self, filename, idx):
    """Count genes, exons and bases of *filename* that overlap *idx*.

    Parameters
    ----------
    filename : str
        GTF file whose entries are treated as exons.
    idx : dict
        Maps contig to an object whose ``find(start, end)`` yields
        ``(start, end, value)`` tuples of overlapping intervals.

    Returns
    -------
    tuple
        ``(ngenes, ngenes_overlapping, nexons, nexons_overlapping,
        nbases, nbases_overlapping)``. A base counts once no matter how
        many indexed intervals cover it.
    """
    overlapping_genes = set()
    genes = set()

    nexons, nexons_overlapping = 0, 0
    nbases, nbases_overlapping = 0, 0

    # iterate over exons
    with iotools.open_file(filename, "r") as infile:
        for this in GTF.iterator(infile):
            nexons += 1
            nbases += this.end - this.start
            genes.add(this.gene_id)

            try:
                intervals = list(
                    idx[this.contig].find(this.start, this.end))
            except KeyError:
                # contig is not part of the index
                continue

            if len(intervals) == 0:
                continue

            overlapping_genes.add(this.gene_id)
            nexons_overlapping += 1
            start, end = this.start, this.end

            # builtin ``int``: the ``numpy.int`` alias was removed from
            # NumPy
            counts = numpy.zeros(end - start, int)
            for other_start, other_end, other_value in intervals:
                lo = max(start, other_start) - start
                hi = min(end, other_end) - start
                # guard against empty ranges: a negative ``hi`` would
                # otherwise be read as an index from the end
                if lo < hi:
                    counts[lo:hi] += 1
            nbases_overlapping += int(numpy.count_nonzero(counts))

    return (len(genes), len(overlapping_genes),
            nexons, nexons_overlapping,
            nbases, nbases_overlapping)
def addSegment(feature, start, end, template, options):
    """Write a generic segment of type *feature* to ``options.stdout``.

    *template* is either a GTF entry or a tuple of two entries. The
    entry (or the first tuple element) supplies the fields that are
    copied; all attributes are cleared. For a tuple, the second
    element's gene_id is stored as attribute ``downstream_gene_id``.
    The score is set to ``.`` unless *feature* is one of exon, CDS, UTR,
    UTR3 or UTR5.

    Returns 1 if a segment was written and 0 if ``start >= end``.
    """
    if start >= end:
        return 0

    paired = isinstance(template, tuple)
    source = template[0] if paired else template

    segment = GTF.Entry()
    segment.copy(source)
    segment.clearAttributes()
    if paired:
        segment.addAttribute("downstream_gene_id", template[1].gene_id)

    segment.start = start
    segment.end = end
    segment.feature = feature

    keeps_score = feature in ("exon", "CDS", "UTR", "UTR3", "UTR5")
    if not keeps_score:
        segment.score = "."

    options.stdout.write(str(segment) + "\n")
    return 1
def annotate(infile, annotation_file, outfile):
    '''Annotate *infile* with annotations from a GTF annotation file.

    Parameters
    ----------
    infile : str
        Tab-separated table with one header line. The gene_id is taken
        from the ninth column, with surrounding double quotes removed.
    annotation_file : str
        GTF file whose entries provide ``gene_name``, ``species`` and
        ``description``.
    outfile : str
        Copy of *infile* with the columns ``gene_name``,
        ``species_centroid`` and ``description`` appended; genes without
        an annotation get ``NA`` in all three.
    '''
    E.info("reading genes to keep")
    # Read the table once and close it. It used to be opened twice and
    # never closed.
    with open(infile) as inf:
        header = inf.readline()
        # rstrip keeps the last character of a final line that has no
        # newline, which ``line[:-1]`` would cut off
        rows = [line.rstrip("\n").split("\t") for line in inf]

    include = {data[8].strip('"') for data in rows}

    E.info("reading annotations file")
    annotations = {}
    annotation_inf = iotools.openFile(annotation_file)
    try:
        for gtf in GTF.iterator(annotation_inf):
            if gtf.gene_id in include:
                annotations[gtf.gene_id] = \
                    [gtf.gene_name, gtf.species, gtf.description]
    finally:
        annotation_inf.close()

    E.info("writing results with annotations")
    missing = ["NA", "NA", "NA"]
    with open(outfile, "w") as outf:
        outf.write(
            header.strip("\n") +
            "\tgene_name\tspecies_centroid\tdescription\n")
        for data in rows:
            gene_id = data[8].strip('"')
            extra = annotations.get(gene_id, missing)
            outf.write("\t".join(data + extra) + "\n")
def _count(self, filename, idx):
    """Collect the genes of *filename* and those overlapping *idx*.

    Parameters
    ----------
    filename : str
        GTF file whose entries are treated as exons.
    idx : dict
        Maps contig to an object with a ``find(start, end)`` method
        returning the overlapping intervals.

    Returns
    -------
    tuple of set
        ``(genes, overlapping_genes)``: all gene_ids seen, and those
        with at least one entry overlapping an indexed interval.
    """
    overlapping_genes = set()
    genes = set()

    # iterate over exons
    with iotools.open_file(filename, "r") as infile:
        for this in GTF.iterator(infile):
            genes.add(this.gene_id)

            try:
                # Materialise the result before taking its length, as
                # the counting variant of this method does; ``len()``
                # fails if ``find`` returns an iterator.
                intervals = list(
                    idx[this.contig].find(this.start, this.end))
            except KeyError:
                # contig is not part of the index
                continue

            if len(intervals) == 0:
                continue

            overlapping_genes.add(this.gene_id)

    return genes, overlapping_genes
def getRefSeqFromUCSC(dbhandle, outfile, remove_duplicates=False):
    '''get refseq gene set from UCSC database and save as :term:`gtf`
    formatted file.

    Matches to ``chr_random`` are ignored (as does ENSEMBL).

    Note that this approach does not work as a gene set, as refseq
    maps are not real gene builds and unalignable parts cause
    differences that are not reconcilable.

    For every transcript one ``exon`` line is written per exon, followed
    by a ``CDS`` line if the exon overlaps the coding region
    ``[cdsStart, cdsEnd)``.

    Arguments
    ---------
    dbhandle : object
       Database handle to UCSC mysql database
    outfile : string
       Filename of output file in :term:`gtf` format. The filename
       aims to be close to the ENSEMBL gtf format.
    remove_duplicates : bool
       If True, transcripts that occur more than once in ``refGene``
       are removed.
    '''

    duplicates = set()

    if remove_duplicates:
        # names that occur more than once in refGene
        cc = dbhandle.execute("""SELECT name, COUNT(*) AS c FROM refGene WHERE chrom NOT LIKE '%_random' GROUP BY name HAVING c > 1""")
        duplicates = set([x[0] for x in cc.fetchall()])
        E.info("removing %i duplicates" % len(duplicates))

    # these are forward strand coordinates
    statement = ''' SELECT gene.name, link.geneName, link.name, gene.name2, product, protAcc, chrom, strand, cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, exonFrames FROM refGene as gene, refLink as link WHERE gene.name = link.mrnaAcc AND chrom NOT LIKE '%_random' ORDER by chrom, cdsStart '''

    outf = iotools.open_file(outfile, "w")

    cc = dbhandle.execute(statement)

    # field names are positional aliases for the SELECT columns above;
    # note that ``start``/``end`` hold cdsStart/cdsEnd
    SQLResult = collections.namedtuple(
        'Result',
        '''transcript_id, gene_id, gene_name, gene_id2, description, protein_id, contig, strand, start, end, nexons, starts, ends, frames''')

    counts = E.Counter()
    counts.duplicates = len(duplicates)

    for r in map(SQLResult._make, cc.fetchall()):

        if r.transcript_id in duplicates:
            continue

        # the comma-separated lists end with a comma, so the last
        # (empty) element is dropped
        starts = list(map(int, r.starts.split(",")[:-1]))
        ends = list(map(int, r.ends.split(",")[:-1]))
        frames = list(map(int, r.frames.split(",")[:-1]))

        gtf = GTF.Entry()
        gtf.contig = r.contig
        gtf.source = "protein_coding"
        gtf.strand = r.strand
        gtf.gene_id = r.gene_id
        gtf.transcript_id = r.transcript_id
        gtf.addAttribute("protein_id", r.protein_id)
        gtf.addAttribute("transcript_name", r.transcript_id)
        gtf.addAttribute("gene_name", r.gene_name)

        assert len(starts) == len(ends) == len(frames)

        # number exons in the direction of transcription
        if gtf.strand == "-":
            starts.reverse()
            ends.reverse()
            frames.reverse()

        counts.transcripts += 1
        i = 0
        # the same entry object is reused for every exon and CDS line
        for start, end, frame in zip(starts, ends, frames):
            gtf.feature = "exon"
            counts.exons += 1
            i += 1
            gtf.addAttribute("exon_number", i)
            # frame of utr exons is set to -1 in UCSC
            gtf.start, gtf.end, gtf.frame = start, end, "."
            outf.write("%s\n" % str(gtf))

            # clip the exon to the coding region
            cds_start, cds_end = max(r.start, start), min(r.end, end)
            if cds_start >= cds_end:
                # UTR exons have no CDS
                # do not expect any in UCSC
                continue
            gtf.feature = "CDS"
            # invert the frame
            frame = (3 - frame % 3) % 3
            gtf.start, gtf.end, gtf.frame = cds_start, cds_end, frame
            outf.write("%s\n" % str(gtf))

    outf.close()

    E.info("%s" % str(counts))
def main(argv=None):
    """Report gene-level overlap between the exons of two GTF files.

    Exactly two file names are expected on the command line. The entries
    of the second file are indexed per contig; every entry of the first
    file is looked up in that index. Four tables are written through
    ``getFile``:

    * ``genes_ovl``   - pairs of overlapping gene ids (only when at
      least one pair exists)
    * ``genes_total`` - per file: number of genes, overlapping genes and
      unique genes, with percentages of that file's gene count
    * ``genes_uniq1`` / ``genes_uniq2`` - gene ids without any overlap

    Raises
    ------
    ValueError
        If the number of positional arguments is not two.
    """
    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-e", "--output-equivalent",
                      dest="write_equivalent",
                      action="store_true",
                      help="write equivalent entries [default=%default].")

    parser.add_option("-f", "--output-full",
                      dest="write_full",
                      action="store_true",
                      help="write full gff entries [default=%default].")

    parser.add_option("-p", "--add-percent",
                      dest="add_percent",
                      action="store_true",
                      help="add percentage columns [default=%default].")

    parser.add_option("-s", "--ignore-strand",
                      dest="ignore_strand",
                      action="store_true",
                      help="ignore strand information [default=%default].")

    parser.set_defaults(
        write_equivalent=False,
        write_full=False,
        add_percent=False,
        ignore_strand=False,
        as_gtf=False,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) != 2:
        raise ValueError("two arguments required")

    input_filename1, input_filename2 = args

    def _percent(part, whole):
        # percentage of *part* in *whole*; 0 for an empty gene set
        return 100.0 * part / whole if whole else 0.0

    # duplicated features cause a problem. Make sure
    # features are non-overlapping by running
    # gff_combine.py on GFF files first.

    E.info("reading data started")

    idx, genes2 = {}, set()
    with iotools.open_file(input_filename2, "r") as infile2:
        for e in GTF.readFromFile(infile2):
            genes2.add(e.gene_id)
            if e.contig not in idx:
                idx[e.contig] = quicksect.IntervalTree()
            idx[e.contig].add(e.start, e.end, e)

    E.info("reading data finished: %i contigs" % len(idx))

    overlapping_genes = set()
    genes1 = set()

    # iterate over exons
    with iotools.open_file(input_filename1, "r") as infile:
        for this in GTF.iterator(infile):
            genes1.add(this.gene_id)

            try:
                intervals = idx[this.contig].find(
                    quicksect.Interval(this.start, this.end))
            except KeyError:
                # contig is absent from the second file
                continue

            # Only the gene pairs are recorded. The identical/partial
            # match classification that used to follow was never output.
            for hit in intervals:
                overlapping_genes.add((this.gene_id, hit.data.gene_id))

    ##################################################################
    # print gene based information
    ##################################################################
    if overlapping_genes:
        outfile = getFile(options, "genes_ovl")
        outfile.write("gene_id1\tgene_id2\n")
        for a, b in sorted(overlapping_genes):
            outfile.write("%s\t%s\n" % (a, b))
        # closed inside the branch: without overlaps there is no file
        if outfile != options.stdout:
            outfile.close()

    outfile_total = getFile(options, "genes_total")
    outfile_total.write(
        "set\tngenes\tnoverlapping\tpoverlapping\tnunique\tpunique\n")

    outfile = getFile(options, "genes_uniq1")
    shared = set([x[0] for x in overlapping_genes])
    unique = genes1.difference(shared)
    outfile.write("gene_id1\n")
    outfile.write("\n".join(sorted(unique)) + "\n")
    if outfile != options.stdout:
        outfile.close()
    # Percentages refer to the gene count of the respective file. The
    # denominator used to be a stale loop variable.
    outfile_total.write(
        "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" %
        (os.path.basename(input_filename1),
         len(genes1),
         len(shared),
         _percent(len(shared), len(genes1)),
         len(unique),
         _percent(len(unique), len(genes1))))

    outfile = getFile(options, "genes_uniq2")
    shared = set([x[1] for x in overlapping_genes])
    unique = genes2.difference(shared)
    outfile.write("gene_id2\n")
    outfile.write("\n".join(sorted(unique)) + "\n")
    if outfile != options.stdout:
        outfile.close()

    outfile_total.write(
        "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" %
        (os.path.basename(input_filename2),
         len(genes2),
         len(shared),
         _percent(len(shared), len(genes2)),
         len(unique),
         _percent(len(unique), len(genes2))))
    if outfile_total != options.stdout:
        outfile_total.close()

    E.stop()
def main(argv=None):
    """Entry point of the script.

    Command line options are taken from *argv*, or from ``sys.argv`` if
    *argv* is empty or not given. Transcripts are read as GTF from
    ``options.stdin`` and passed, with the indexed genome, to
    ``annotateGenome``.

    Raises ``ValueError`` if no genome file was given.
    """
    argv = argv or sys.argv

    # command line interface
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2fasta.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--genome-file", dest="genome_file", type="string",
        help="filename with genome [default=%default].")

    parser.add_option(
        "-i", "--ignore-missing", dest="ignore_missing",
        action="store_true",
        help="Ignore transcripts on contigs that are not in the genome-file [default=%default].")

    parser.add_option(
        "--min-intron-length", dest="min_intron_length", type="int",
        help="minimum intron length. If the distance between two consecutive exons is smaller, the region will be marked 'unknown' [default=%default].")

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        choices=("full", ),
        help="method to apply [default=%default].")

    defaults = dict(
        genome_file=None,
        flank=1000,
        max_frameshift_length=4,
        min_intron_length=30,
        ignore_missing=False,
        restrict_source=None,
        method="full",
        report_step=1000,
    )
    parser.set_defaults(**defaults)

    # adds the common options (-h/--help, ...) and parses the command line
    options, args = E.start(parser, argv=argv, add_output_options=True)

    genome_file = options.genome_file
    if not genome_file:
        raise ValueError("an indexed genome is required.")

    fasta = IndexedFasta.IndexedFasta(genome_file)
    transcripts = GTF.transcript_iterator(GTF.iterator(options.stdin))

    annotateGenome(transcripts, fasta, options)

    # footer and benchmark information
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    One positional argument is required: the BAM file of genomic
    alignments. It is passed to ``bams2bam_filter`` with the optional
    transcriptome and junction BAM files, the indexed gene models and
    the regions to remove. Filtered reads go to stdout (BAM, or SAM with
    ``--output-sam``), mismapped reads to ``--filename-mismapped``, and
    the counters returned by the filter to ``--filename-stats``.

    Raises
    ------
    ValueError
        If the number of positional arguments is not one.
    IOError
        If the mismapped output exists and ``--force-output`` is unset.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with gene models in gtf format [%default]")

    parser.add_option(
        "-m", "--filename-mismapped", dest="filename_mismapped",
        type="string",
        help="output bam file for mismapped reads [%default]")

    parser.add_option(
        "-j", "--junctions-bed-file", dest="filename_junctions",
        type="string",
        help="bam file with reads mapped across junctions [%default]")

    parser.add_option(
        "-r", "--filename-regions", dest="filename_regions",
        type="string",
        help="filename with regions to remove in bed format [%default]")

    parser.add_option(
        "-t", "--transcripts-gtf-file", dest="filename_transcriptome",
        type="string",
        help="bam file with reads mapped against transcripts [%default]")

    parser.add_option(
        "-p", "--map-tsv-file", dest="filename_map", type="string",
        help="filename mapping transcript numbers (used by "
        "--filename-transciptome) to transcript names "
        "(used by --filename-gtf) [%default]")

    parser.add_option(
        "-s", "--filename-stats", dest="filename_stats", type="string",
        help="filename to output stats to [%default]")

    parser.add_option(
        "-o", "--colour", dest="colour_mismatches", action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option(
        "-i", "--ignore-mismatches", dest="ignore_mismatches",
        action="store_true",
        help="ignore mismatches [%default]")

    parser.add_option(
        "-c", "--remove-contigs", dest="remove_contigs", type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option(
        "-f", "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files [%default]")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.add_option("--output-sam", dest="output_sam",
                      action="store_true",
                      help="output in sam format [%default]")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        with iotools.open_file(options.filename_map) as inf:
            id_map = iotools.read_map(inf, has_header=True)
        # invert the map: look up by value, return the key
        id_map = dict([(y, x) for x, y in id_map.items()])
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        with iotools.open_file(options.filename_gtf) as inf:
            for gtf in GTF.transcript_iterator(GTF.iterator(inf)):
                gtf.sort(key=lambda x: x.start)
                transcript_id = gtf[0].transcript_id
                if id_map:
                    try:
                        transcript_id = id_map[transcript_id]
                        mapped += 1
                    except KeyError:
                        # transcripts absent from the map are skipped
                        missed += 1
                        continue
                transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        with iotools.open_file(options.filename_regions) as inf:
            for bed in Bed.iterator(inf):
                regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.AlignmentFile(
            options.filename_transcriptome, "rb")
    else:
        transcripts_samfile = None

    # "-" writes to stdout; "wh" is SAM with header, "wb" is BAM
    if options.output_sam:
        output_samfile = pysam.AlignmentFile("-", "wh",
                                             template=genome_samfile)
    else:
        output_samfile = pysam.AlignmentFile("-", "wb",
                                             template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.AlignmentFile(options.filename_mismapped,
                                               "wb",
                                               template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.AlignmentFile(options.filename_junctions,
                                                "rb")
    else:
        junctions_samfile = None

    c = bams2bam_filter(genome_samfile,
                        output_samfile,
                        output_mismapped,
                        transcripts_samfile,
                        junctions_samfile,
                        transcripts,
                        regions=regions_to_remove,
                        unique=options.unique,
                        remove_contigs=options.remove_contigs,
                        colour_mismatches=options.colour_mismatches,
                        ignore_mismatches=options.ignore_mismatches,
                        ignore_transcripts=transcripts_samfile is None,
                        ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        with iotools.open_file(options.filename_stats, "w") as outf:
            outf.write("category\tcounts\n%s\n" % c.asTable())

    if options.filename_transcriptome:
        transcripts_samfile.close()

    # the junctions file was the only alignment file left open
    if junctions_samfile is not None:
        junctions_samfile.close()

    genome_samfile.close()
    output_samfile.close()
    if output_mismapped:
        output_mismapped.close()

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Two methods are available:

    ``histogram``
        All entries are read from ``options.stdin``, sorted by contig
        and start, and passed contig by contig to ``processChunk``.
    ``genomic``
        Intervals and bases are counted per (contig, source, feature)
        and written as a table to ``options.stdout``. The two coverage
        columns are added only if a genome file was given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: "
                            "$Id: gff2coverage.py 2781 2009-09-10 11:33:14Z "
                            "andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default]")

    parser.add_option("-f", "--features", dest="features", type="string",
                      action="append", help="features to collect "
                      "[default=%default]")

    parser.add_option("-w", "--window-size", dest="window_size", type="int",
                      help="window size in bp for histogram computation. "
                      "Determines the bin size. "
                      "[default=%default]")

    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="number of bins for histogram computation "
                      "if window size is not given. "
                      "[default=%default]")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("genomic", "histogram", ),
                      help="methods to apply. "
                      "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        window_size=None,
        num_bins=1000,
        value_format="%6.4f",
        features=[],
        method="genomic",
    )

    # hand *argv* on so that an explicitly given argument list is the
    # one that gets parsed
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.method == "histogram":

        gff = GTF.readFromFile(options.stdin)
        gff.sort(key=lambda x: (x.contig, x.start))

        # hand over one contig at a time; processChunk ignores the
        # empty chunk passed before the first contig
        chunk = []
        last_contig = None

        for entry in gff:
            if last_contig != entry.contig:
                processChunk(last_contig, chunk, options, fasta)
                last_contig = entry.contig
                chunk = []
            chunk.append(entry)

        processChunk(last_contig, chunk, options, fasta)

    elif options.method == "genomic":
        intervals = collections.defaultdict(int)
        bases = collections.defaultdict(int)

        for entry in GTF.iterator(options.stdin):
            key = (entry.contig, entry.source, entry.feature)
            intervals[key] += 1
            bases[key] += entry.end - entry.start

        options.stdout.write("contig\tsource\tfeature\tintervals\tbases")
        if fasta:
            options.stdout.write(
                "\tpercent_coverage\ttotal_percent_coverage\n")
            # The genome size needs the genome. Computing it without
            # one failed on ``None``.
            total_genome_size = sum(
                fasta.getContigSizes(with_synonyms=False).values())
        else:
            options.stdout.write("\n")

        for key in sorted(intervals.keys()):
            nbases = bases[key]
            nintervals = intervals[key]
            contig, source, feature = key
            options.stdout.write("\t".join(
                ("\t".join(key), str(nintervals), str(nbases))))
            if fasta:
                options.stdout.write(
                    "\t%f" %
                    (100.0 * float(nbases) / fasta.getLength(contig)))
                options.stdout.write(
                    "\t%f\n" %
                    (100.0 * float(nbases) / total_genome_size))
            else:
                options.stdout.write("\n")

    E.stop()
def processChunk(contig, chunk, options, fasta=None):
    """Build cumulative per-window coverage values for one contig.

    This function requires segments to be non-overlapping.

    The bin layout comes either from ``options.window_size`` or, when a
    genome is supplied through *fasta*, from ``options.num_bins``.  For
    every feature type in ``options.features`` one running total of
    covered bases is appended to each bin; the result is handed to
    ``printValues``.

    Arguments
    ---------
    contig :
        Name of the contig the entries in *chunk* belong to.
    chunk :
        Entries exposing ``start``, ``end`` and ``feature``.  An empty
        chunk is ignored.
    options :
        Option object providing ``window_size``, ``num_bins`` and
        ``features``.
    fasta :
        Optional genome object providing ``getLength(contig)``.

    Raises
    ------
    ValueError
        If two entries overlap, or if neither a window size nor a genome
        together with a number of bins is available.
    """
    if len(chunk) == 0:
        return

    # Compare every entry against all entries that are not equal to one
    # already visited; any overlap makes the histogram meaningless.
    seen = []
    for current in chunk:
        seen.append(current)
        for candidate in chunk:
            if candidate in seen:
                continue
            if GTF.Overlap(current, candidate):
                raise ValueError(" Histogram could not be created"
                                 " since the file contains overlapping "
                                 "features! \n%s\n%s " %
                                 (current, candidate))
    # clear auxiliary list
    del seen[:]

    # largest end coordinate determines the extent of the histogram
    max_coordinate = max(x.end for x in chunk)

    # derive window size and number of bins
    if options.window_size:
        window_size = options.window_size
        num_bins = int(math.ceil(float(max_coordinate) / window_size))
    elif options.num_bins and fasta:
        contig_length = fasta.getLength(contig)
        assert max_coordinate <= contig_length, (
            "maximum coordinate (%i) "
            "larger than contig size (%i)"
            " for contig %s" % (max_coordinate, contig_length, contig))
        max_coordinate = contig_length
        window_size = int(math.floor(float(contig_length) /
                                     options.num_bins))
        num_bins = options.num_bins
    else:
        raise ValueError("please specify a window size of provide "
                         "genomic sequence with number of bins.")

    values = [[] for _ in range(num_bins)]

    # One pass over the chunk per requested feature type: slow, but simple.
    for wanted in options.features:
        covered = 0
        idx = 0
        window_end = window_size
        for entry in chunk:
            if entry.feature != wanted:
                continue

            # close all windows that end before this entry starts
            while window_end < entry.start:
                values[idx].append(covered)
                idx += 1
                window_end += window_size

            # the entry reaches beyond the current window: add the part
            # inside the window, close it and move on
            while entry.end > window_end:
                covered += (min(entry.end, window_end) -
                            max(entry.start, window_end - window_size))
                values[idx].append(covered)
                window_end += window_size
                idx += 1

            # remaining part of the entry inside the still-open window
            covered += (min(entry.end, window_end) -
                        max(entry.start, window_end - window_size))

        # pad the remaining bins with the final total
        for remaining in range(idx, num_bins):
            values[remaining].append(covered)

    printValues(contig, max_coordinate, window_size, values, options)
def main(argv=None):
    """Convert GTF/GFF3 records read from stdin into a tab-separated table.

    parses command line options in sys.argv, unless *argv* is given.

    Depending on the options one of four modes is run:

    * ``--attributes-as-columns``: one column per attribute found in the
      input.  With ``--output-only-attributes`` the fixed columns (contig,
      source, ...) are omitted from both header and rows.
    * ``--invert``: read a table with a header line and write GTF entries.
    * ``--output-map``: write a two-column map between identifiers.
    * default: write a header line followed by ``str(entry)`` for each
      entry.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o", "--output-only-attributes", dest="only_attributes",
        action="store_true",
        help="output only attributes as separate columns "
        "[default=%default].")

    parser.add_option(
        "-f", "--attributes-as-columns", dest="output_full",
        action="store_true",
        help="output attributes as separate columns "
        "[default=%default].")

    # The flag switches ``is_gtf`` off, so the help text has to describe
    # gff3 input (it previously claimed the opposite).
    parser.add_option("--is-gff3", dest="is_gtf", action="store_false",
                      help="input file is in gff3 format instead of gtf.")

    parser.add_option(
        "-i", "--invert", dest="invert", action="store_true",
        help="convert tab-separated table back to gtf "
        "[default=%default].")

    parser.add_option(
        "-m", "--output-map", dest="output_map", type="choice",
        choices=(
            "transcript2gene",
            "peptide2gene",
            "peptide2transcript"),
        help="output a map mapping transcripts to genes "
        "[default=%default].")

    parser.set_defaults(
        only_attributes=False,
        output_full=False,
        invert=False,
        output_map=None,
        is_gtf=True
    )

    (options, args) = E.start(parser, argv=argv)

    if options.output_full:
        # output full table with column for each attribute

        # All records are kept in memory because the set of columns is
        # only known after the whole input has been seen.
        attributes = set()
        data = []
        if options.is_gtf:
            for gtf in GTF.iterator(options.stdin):
                data.append(gtf)
                attributes.update(gtf.keys())
        else:
            for gff in GFF3.iterator_from_gff(options.stdin):
                data.append(gff)
                attributes.update(gff.attributes)

        # remove gene_id and transcript_id, as they are used
        # explicitly later
        attributes.difference_update(["gene_id", "transcript_id"])
        attributes = sorted(attributes)

        fixed_columns = ["contig", "source", "feature",
                         "start", "end", "score", "strand", "frame"]
        # gene_id / transcript_id are dedicated columns for gtf input only
        id_columns = ["gene_id", "transcript_id"] if options.is_gtf else []

        if options.only_attributes:
            header = id_columns + attributes
        else:
            header = fixed_columns + id_columns + attributes

        options.stdout.write("\t".join(header) + "\n")

        if options.is_gtf:
            for gtf in data:
                row = []
                for column in header:
                    try:
                        row.append(str(getattr(gtf, column)))
                    except (AttributeError, KeyError):
                        # attribute not present in this record
                        row.append("")
                options.stdout.write("\t".join(row) + "\n")
        else:
            for gff in data:
                # Keep rows aligned with the header: the fixed columns are
                # only written when they are part of the header.
                if options.only_attributes:
                    row = []
                else:
                    row = [str(getattr(gff, column))
                           for column in fixed_columns]
                for name in attributes:
                    try:
                        row.append(str(gff.attributes[name]))
                    except (AttributeError, KeyError):
                        row.append("")
                options.stdout.write("\t".join(row) + "\n")

    elif options.invert:

        gtf = GTF.Entry()
        header = None
        for line in options.stdin:
            if line.startswith("#"):
                continue
            # Strip only the line terminator; slicing off the last
            # character would truncate a final line lacking a newline.
            data = line.rstrip("\n").split("\t")
            if not header:
                header = data
                map_header2column = {
                    name: index for index, name in enumerate(header)}
                continue

            # fill gtf entry with data
            try:
                gtf.contig = data[map_header2column["contig"]]
                gtf.source = data[map_header2column["source"]]
                gtf.feature = data[map_header2column["feature"]]
                # NOTE(review): the original comment announced a -1
                # adjustment for 0-based coordinates, but the value is
                # used unchanged - confirm which is intended.
                gtf.start = int(data[map_header2column["start"]])
                gtf.end = int(data[map_header2column["end"]])
                gtf.score = data[map_header2column["score"]]
                gtf.strand = data[map_header2column["strand"]]
                gtf.frame = data[map_header2column["frame"]]
                gtf.gene_id = data[map_header2column["gene_id"]]
                gtf.transcript_id = data[map_header2column["transcript_id"]]
                gtf.parseInfo(data[map_header2column["attributes"]], line)
            except KeyError as msg:
                raise KeyError(
                    "incomplete entry %s: %s: %s" %
                    (str(data), str(map_header2column), msg))
            if gtf.frame is None:
                gtf.frame = "."
            # output gtf entry in gtf format
            options.stdout.write("%s\n" % str(gtf))

    elif options.output_map:

        # map name -> (source attribute, target attribute, header line)
        map_definitions = {
            "transcript2gene": ("transcript_id", "gene_id",
                                "transcript_id\tgene_id\n"),
            "peptide2gene": ("protein_id", "gene_id",
                             "peptide_id\tgene_id\n"),
            "peptide2transcript": ("protein_id", "transcript_id",
                                   "peptide_id\ttranscript_id\n"),
        }
        from_field, to_field, map_header = map_definitions[options.output_map]
        options.stdout.write(map_header)

        map_fr2to = {}
        for gtf in GTF.iterator(options.stdin):
            try:
                map_fr2to[getattr(gtf, from_field)] = getattr(gtf, to_field)
            except (AttributeError, KeyError):
                # entries lacking one of the identifiers are skipped
                pass

        for x, y in sorted(map_fr2to.items()):
            options.stdout.write("%s\t%s\n" % (x, y))
    else:
        header = ("contig", "source", "feature", "start", "end", "score",
                  "strand", "frame", "gene_id",
                  "transcript_id", "attributes")
        options.stdout.write("\t".join(header) + "\n")

        for gtf in GTF.iterator(options.stdin):
            # NOTE(review): this attribute string is assembled but never
            # written; the row below comes from str(gtf).  Confirm that
            # str(gtf) matches the header columns above.
            attributes = []
            for a in list(gtf.keys()):
                if a in ("gene_id", "transcript_id"):
                    continue
                attributes.append('%s %s' % (a, GTF.quote(gtf[a])))

            attributes = "; ".join(attributes)

            # Capture if None and set to . format
            if gtf.frame is None:
                gtf.frame = "."

            options.stdout.write(str(gtf) + "\n")

    E.stop()
def convert_hierarchy(first_gffs, second_gffs, options):
    '''Convert GFF to GTF by resolving the parent/child hierarchy.

    The first pass over :param:first_gffs records, for every entry ID,
    its feature type, its parents and its candidate gene and transcript
    ids.  The second pass over :param:second_gffs calls the recursive
    function search_hierarchy for each entry to obtain its gene_ids and
    transcript_ids and writes one GTF record per (gene_id, transcript_id)
    pair to options.stdout.

    If no definitive transcript_id is found and options.missing_gene is
    True, the possible_transcript_id (the ID one level below the entry
    used as gene_id) is used instead.  If that is None as well (there was
    only one level), the gene_id is used as transcript_id.

    Might raise ValueError if options.missing_gene is false and either no
    gene or no transcript_id was found for an entry.

    Might raise RuntimeError if the recursion limit was reached because
    the input contains circular references.
    '''

    def _first_known(*candidates):
        # first candidate that is not None, or None if there is none
        for candidate in candidates:
            if candidate is not None:
                return candidate
        return None

    # ---- pass 1: collect the hierarchy -------------------------------
    hierarchy = {}
    for gff in first_gffs:

        # When a custom parent field is configured, normalise it into a
        # list stored under 'Parent'.
        if options.parent != "Parent":
            if options.parent in gff.asDict():
                gff['Parent'] = gff[options.parent].split(",")
            else:
                gff['Parent'] = []

        node = {
            "type": gff.feature,
            "Parent": gff.asDict().get("Parent", []),
            "gene_id": gff.attributes.get(
                options.gene_field_or_pattern, gff['ID']),
            "transcript_id": gff.attributes.get(
                options.transcript_field_or_pattern, gff['ID']),
        }
        hierarchy[gff['ID']] = node

    # ---- pass 2: resolve ids and write records -----------------------
    for gff in second_gffs:

        if options.discard:
            lacks_parent = (options.missing_gene and
                            options.parent not in gff)
            if lacks_parent or gff.feature in (options.gene_type,
                                               options.transcript_type):
                continue

        gene_ids, transcript_ids, poss_transcript_ids = search_hierarchy(
            gff['ID'], hierarchy, options)

        assert len(gene_ids) > 0 and len(transcript_ids) > 0

        if options.missing_gene:
            # fall back: found id -> possible transcript id -> gene id
            transcript_ids = [
                _first_known(found, possible, gene)
                for found, possible, gene in zip(
                    transcript_ids, poss_transcript_ids, gene_ids)]
        elif None in transcript_ids:
            raise ValueError("failed to find transcript id for %s" %
                             gff['ID'])

        for gene_id, transcript_id in zip(gene_ids, transcript_ids):
            gff.gene_id = gene_id
            gff.transcript_id = transcript_id

            gtf_entry = GTF.Entry()
            gtf_entry.copy(gff)
            # the parent list is flattened back into a comma-separated
            # string for output
            if "Parent" in gtf_entry:
                gtf_entry['Parent'] = ",".join(gtf_entry['Parent'])

            options.stdout.write(str(gtf_entry) + "\n")