def main(argv=sys.argv):
    """Count GFF/GTF features and write one summary row per input track.

    Reads the files given as positional arguments (or stdin when none
    are given), runs them through the ``counter_gff`` (and, for GTF
    input, ``counter_exons``) counters and writes a tab-separated
    summary table to stdout.
    """
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("--is-gtf", dest="is_gtf", action="store_true",
                        help="input is gtf.")

    parser.set_defaults(
        is_gtf=False,
    )

    (args, unknown) = E.start(parser,
                              add_output_options=True,
                              unknowns=True)

    # positional arguments are input filenames; with none, read stdin
    if len(unknown) == 0:
        files = [args.stdin]
    else:
        # BUG FIX: original assigned ``files = args`` (the argparse
        # Namespace); the positional filenames are in ``unknown``.
        files = unknown

    args.stdout.write("track\t%s" % ("\t".join(counter_gff.fields)))

    if args.is_gtf:
        args.stdout.write("\t%s" % ("\t".join(counter_exons.fields)))
    args.stdout.write("\n")

    for f in files:
        if f == args.stdin:
            infile = f
            args.stdout.write("stdin")
        else:
            infile = iotools.open_file(f)
            args.stdout.write(f)

        counters = []
        if args.is_gtf:
            iterator = GTF.iterator(infile)
            counters.append(counter_gff(iterator))
            counters.append(counter_exons(counters[0]))
        else:
            iterator = GTF.iterator(infile)
            counters.append(counter_gff(iterator))

        # drain the (chained) counters so they accumulate statistics
        c = counters[-1]
        for x in c:
            pass

        for c in counters:
            args.stdout.write("\t%s" % str(c))
        args.stdout.write("\n")

        # BUG FIX: compare against args.stdin (the object tested when
        # opening above), not sys.stdin, before closing.
        if infile != args.stdin:
            infile.close()

    E.stop()
def renameTranscriptsInPreviousSets(infile, outfile):
    '''Sort a GTF geneset, renumbering gene/transcript identifiers for
    non-Ensembl genesets.

    Transcripts need to be renamed because they may use the same
    cufflinks identifiers as we use in the analysis; renaming is
    skipped when the geneset carries Ensembl ids ("ENSG").  The
    resulting geneset is sorted by gene and gzipped to ``outfile``.
    '''
    inf = iotools.openFile(infile)
    # NOTE(review): ``statement`` is reassigned for every GTF record
    # and P.run() executes only once after the loop, so the LAST record
    # decides which command runs.  Presumably all records in one
    # geneset share the same id style, so checking the first record
    # (and breaking) would suffice — confirm before changing.
    for gtf in GTF.iterator(inf):
        if gtf.gene_id.find("ENSG") != -1:
            # Ensembl-style ids: sorting only, no renumbering needed
            statement = '''zcat %(infile)s | grep -v "#"
            | cgat gtf2gtf --method=sort --sort-order=gene
            --log=%(outfile)s.log
            | gzip > %(outfile)s'''
        else:
            # derive gene/transcript id prefixes from the output name,
            # e.g. GEN<set>%i and TRAN<set>%i
            gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
            transcript_pattern = gene_pattern.replace("GEN", "TRAN")
            statement = '''zcat %(infile)s
            | cgat gtf2gtf --method=renumber-genes
            --pattern-identifier=%(gene_pattern)s%%i
            | cgat gtf2gtf --method=renumber-transcripts
            --pattern-identifier=%(transcript_pattern)s%%i
            | cgat gtf2gtf --method=sort --sort-order=gene
            --log=%(outfile)s.log
            | gzip > %(outfile)s'''
    # P.run() interpolates %(...)s placeholders from the caller's locals
    P.run()
def getTranscript2GeneMap(outfile):
    '''Extract a 1:1 map of transcript_id to gene_id from the geneset.

    Reads the geneset named in ``PARAMS['geneset']`` and writes a
    two-column tab-separated table (transcript_id, gene_id) to
    ``outfile``.

    Raises ValueError if the same transcript_id maps to more than one
    gene_id.
    '''
    iterator = GTF.iterator(IOTools.open_file(PARAMS['geneset']))
    transcript2gene_dict = {}

    for entry in iterator:
        # ids come from configurable attribute fields, falling back to
        # the standard GTF gene_id/transcript_id attributes
        try:
            gene_id = entry[PARAMS["gene_id_field"]]
        except KeyError:
            gene_id = entry.gene_id

        try:
            transcript_id = entry[PARAMS["transcript_id_field"]]
        except KeyError:
            transcript_id = entry.transcript_id

        # Check the same transcript_id is not mapped to multiple gene_ids!
        if transcript_id in transcript2gene_dict:
            if not gene_id == transcript2gene_dict[transcript_id]:
                # BUG FIX: fixed typo ("multipe") and include the
                # offending transcript_id in the message — previously
                # only the two gene_ids were reported.
                raise ValueError(
                    "multiple gene_ids (%s, %s) associated with the "
                    "same transcript_id %s" %
                    (gene_id, transcript2gene_dict[transcript_id],
                     transcript_id))
        else:
            transcript2gene_dict[transcript_id] = gene_id

    with IOTools.open_file(outfile, "w") as outf:
        outf.write("transcript_id\tgene_id\n")
        for key, value in sorted(transcript2gene_dict.items()):
            outf.write("%s\t%s\n" % (key, value))
def getGeneTable(reffile):
    """Load a reference geneset into a per-gene lookup table.

    For every gene_id the table holds:
      * "models": dict of transcript_id -> transcript (list of GTF entries)
      * "start_codons": dict mapping a start-codon coordinate to the
        list of transcript_ids sharing that start codon.
    """
    E.info("Loading reference")
    table = defaultdict(dict)

    gene_stream = GTF.gene_iterator(
        GTF.iterator(IOTools.open_file(reffile)))

    for ens_gene in gene_stream:
        geneid = ens_gene[0][0].gene_id
        gene_entry = table[geneid]
        gene_entry["models"] = dict()
        gene_entry["start_codons"] = defaultdict(list)

        for transcript in ens_gene:
            transcript_id = transcript[0].transcript_id
            gene_entry["models"][transcript_id] = transcript

            codon_ranges = GTF.asRanges(transcript, "start_codon")
            if len(codon_ranges) == 0:
                # transcript without an annotated start codon
                continue

            # reverse-strand transcripts start at the right-most end
            if transcript[0].strand == "-":
                start_codon = max(interval[1] for interval in codon_ranges)
            else:
                start_codon = min(interval[0] for interval in codon_ranges)

            gene_entry["start_codons"][start_codon].append(transcript_id)

    E.info("Reference Loaded")
    return table
def filterGTF(gtf, filterstring, tempout):
    """Filter records of a GTF file according to a filter expression.

    ``filterstring`` takes one of the following forms:
        col!=v1+v2         keep records whose col is NOT one of v1, v2
        col=v1+v2          keep records whose col IS one of v1, v2
        col-in_file-path   keep records whose col is listed in file
        col-notin_file-path keep records whose col is not listed in file
        col-morethan-x     keep records with float(col) > x
        col-lessthan-x     keep records with float(col) < x

    Matching records are written to ``tempout``.

    Raises ValueError for an unrecognised filter expression.
    """
    # NOTE: "!=" must be tested before "=" (it is a superstring of "=")
    if "!=" in filterstring:
        column, value = filterstring.split("!=")
        value = value.split("+")
        filtertype = "notin"
    elif "=" in filterstring:
        column, value = filterstring.split("=")
        value = value.split("+")
        filtertype = "in"
    elif "-in_file-" in filterstring:
        column, value = filterstring.split("-in_file-")
        value = [line.strip() for line in iotools.open_file(value)]
        filtertype = "in_file"
    elif "-notin_file-" in filterstring:
        column, value = filterstring.split("-notin_file-")
        value = [line.strip() for line in iotools.open_file(value)]
        filtertype = "notin_file"
    elif "-morethan-" in filterstring:
        column, value = filterstring.split("-morethan-")
        value = float(value)
        filtertype = "morethan"
    elif "-lessthan-" in filterstring:
        column, value = filterstring.split("-lessthan-")
        value = float(value)
        filtertype = "lessthan"
    else:
        # ROBUSTNESS FIX: previously an unrecognised expression fell
        # through and triggered an obscure NameError on the first
        # record; fail fast with a clear message instead.
        raise ValueError(
            "unrecognised filter expression: %s" % filterstring)

    gfile = iotools.open_file(gtf)
    G = GTF.iterator(gfile)

    out = iotools.open_file(tempout, "w")
    for item in G:
        # expose the fixed GTF columns alongside the attribute dict so
        # ``column`` may name either kind of field
        D = item.asDict()
        D['contig'] = item.contig
        D['source'] = item.source
        D['feature'] = item.feature
        D['start'] = item.start
        D['end'] = item.end
        D['strand'] = item.strand
        D['frame'] = item.frame

        if filtertype == "in" or filtertype == 'in_file':
            if D[column] in value:
                out.write("%s\n" % str(item))
        elif filtertype == "notin" or filtertype == 'notin_file':
            if D[column] not in value:
                out.write("%s\n" % str(item))
        elif filtertype == "morethan":
            if float(D[column]) > value:
                out.write("%s\n" % str(item))
        elif filtertype == "lessthan":
            if float(D[column]) < value:
                out.write("%s\n" % str(item))

    out.close()
    gfile.close()
def buildRepeatTrack(infile, outfile):
    '''Build a repeat track as negative control.

    Draws a random sample of PARAMS["ancestral_repeats_samplesize"]
    repeats from the gzipped GFF ``infile`` and writes them, renumbered,
    as gzipped GTF to ``outfile``.
    '''
    # first pass: count repeats so a uniform random sample can be drawn
    nrepeats = 0
    # BUG FIX: gzip files must be opened in text mode ("rt"/"wt");
    # under Python 3 mode "r" is binary and yields bytes, which the
    # line-based GTF parser cannot handle.
    for gff in GTF.iterator(gzip.open(infile, "rt")):
        nrepeats += 1

    sample = set(
        random.sample(range(nrepeats),
                      PARAMS["ancestral_repeats_samplesize"]))

    outf = gzip.open(outfile, "wt")
    gtf = GTF.Entry()
    # second pass: emit only the sampled repeats, with sequential ids
    for x, gff in enumerate(GTF.iterator(gzip.open(infile, "rt"))):
        if x not in sample:
            continue
        gtf.fromGTF(gff, "%08i" % x, "%08i" % x)
        outf.write("%s\n" % str(gtf))
    outf.close()

    E.debug("created sample of %i repeats out of %i in %s" %
            (len(sample), nrepeats, outfile))
def extractEnsemblLincRNA(infile, outfile):
    """Extract entries with source "lincRNA" from *infile* and write
    them, sorted by gene and gzipped, to *outfile*."""
    # collect lincRNA records into a temporary file
    tmpf = P.getTempFile("/ifs/scratch")
    for gtf in GTF.iterator(iotools.openFile(infile)):
        if gtf.source == "lincRNA":
            tmpf.write(str(gtf) + "\n")
    tmpf.close()
    tmpf = tmpf.name

    # sort by gene and compress; P.run() interpolates tmpf/outfile
    # from the local namespace
    statement = ("cat %(tmpf)s |"
                 " cgat gtf2gtf"
                 " --method=sort --sort-order=gene"
                 " --log=%(outfile)s.log |"
                 " gzip > %(outfile)s")
    P.run()

    os.unlink(tmpf)
def get_contigs(infile, outfile):
    '''Generate a pseudo-contigs file from the geneset, where the
    length of each contig is determined by the GTF entry with the
    highest end coordinate.

    Will not stop things going off the end of contigs, but that
    doesn't really matter for our purposes.  Assumes the input is
    grouped by contig.
    '''
    last_contig = None
    max_end = 0
    outlines = []

    for entry in GTF.iterator(iotools.open_file(infile)):
        if last_contig and entry.contig != last_contig:
            # BUG FIX: record the contig just finished (last_contig);
            # the original wrote entry.contig — the *next* contig's
            # name paired with the previous contig's length.
            outlines.append([last_contig, str(max_end)])
            max_end = 0
        max_end = max(max_end, entry.end)
        last_contig = entry.contig

    # flush the final contig; guard against a completely empty input
    # (previously this emitted a bogus "None" row)
    if last_contig is not None:
        outlines.append([last_contig, str(max_end)])

    iotools.write_lines(outfile, outlines, header=None)
def loadLncRNAClass(infile, outfile):
    '''Load the lncRNA classifications into the database: one row per
    transcript with its gene_id and classification (GTF source).'''
    temp = P.getTempFile(".")
    inf = iotools.openFile(infile)

    # one row per transcript: transcript_id, gene_id, class
    for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
        first_exon = transcript[0]
        temp.write("%s\t%s\t%s\n" % (first_exon.transcript_id,
                                     first_exon.gene_id,
                                     first_exon.source))
    temp.close()

    P.load(temp.name, outfile,
           options="--header-names=transcript_id,gene_id,class "
           "--add-index=transcript_id "
           "--add-index=gene_id")

    os.unlink(temp.name)
def _count(self, filename, idx):
    """Count exon and base overlap of *filename* with intervals in *idx*.

    ``idx`` is assumed to map contig -> interval index supporting
    ``find(start, end)`` (TODO confirm against caller).

    Returns (ngenes, noverlapping_genes, nexons, nexons_overlapping,
    nbases, nbases_overlapping).
    """
    overlapping_genes = set()
    genes = set()

    # iterate over exons
    infile = iotools.open_file(filename, "r")
    it = GTF.iterator(infile)

    nexons, nexons_overlapping = 0, 0
    nbases, nbases_overlapping = 0, 0
    for this in it:
        nexons += 1
        nbases += this.end - this.start
        genes.add(this.gene_id)

        try:
            intervals = list(idx[this.contig].find(this.start, this.end))
        except KeyError:
            # contig absent from the index
            continue

        if len(intervals) == 0:
            continue

        overlapping_genes.add(this.gene_id)
        nexons_overlapping += 1
        start, end = this.start, this.end
        # per-base coverage of this exon by the overlapping intervals
        # BUG FIX: numpy.int was removed in NumPy >= 1.24; use the
        # builtin int as dtype.
        counts = numpy.zeros(end - start, dtype=int)
        for other_start, other_end, other_value in intervals:
            for x in range(max(start, other_start) - start,
                           min(end, other_end) - start):
                counts[x] += 1
        # number of bases covered at least once (vectorized; the
        # original per-element generator was quadratic in feel)
        nbases_overlapping += int((counts > 0).sum())

    infile.close()

    return len(genes), len(overlapping_genes), \
        nexons, nexons_overlapping, nbases, nbases_overlapping
def annotate(infile, annotation_file, outfile):
    '''Annotate infile with annotations from an annotation gtf file.

    Reads gene ids from column 9 of ``infile`` (a tab-separated table
    with a header line), looks up gene_name/species/description in
    ``annotation_file`` and appends them as three extra columns in
    ``outfile`` ("NA" for genes without an annotation).
    '''
    # first pass: collect the gene ids to annotate
    # FIX: use context managers — previously both handles on infile
    # were left open, and readlines() materialised the whole file.
    include = set()
    E.info("reading genes to keep")
    with open(infile) as inf:
        header = inf.readline()
        for line in inf:
            data = line[:-1].split("\t")
            gene_id = data[8].strip('"')
            include.add(gene_id)

    E.info("reading annotations file")
    annotations = {}
    for gtf in GTF.iterator(iotools.openFile(annotation_file)):
        if gtf.gene_id in include:
            annotations[gtf.gene_id] = \
                [gtf.gene_name, gtf.species, gtf.description]

    E.info("writing results with annotations")
    with open(infile) as inf, open(outfile, "w") as outf:
        header = inf.readline()
        outf.write(
            header.strip("\n") +
            "\tgene_name\tspecies_centroid\tdescription\n")
        for line in inf:
            data = line[:-1].split("\t")
            gene_id = data[8].strip('"')
            try:
                outf.write("\t".join(data + annotations[gene_id]) + "\n")
            except KeyError:
                # gene without annotation: pad with NA
                outf.write("\t".join(data + ["NA", "NA", "NA"]) + "\n")
def _count(self, filename, idx):
    """Collect gene ids from *filename* and the subset whose exons
    overlap intervals in *idx*.

    Returns the pair (genes, overlapping_genes) as sets of gene_ids.
    """
    genes = set()
    overlapping_genes = set()

    # iterate over exons
    infile = iotools.open_file(filename, "r")
    for exon in GTF.iterator(infile):
        genes.add(exon.gene_id)
        try:
            hits = idx[exon.contig].find(exon.start, exon.end)
        except KeyError:
            # contig not present in the index
            continue
        if len(hits) == 0:
            continue
        overlapping_genes.add(exon.gene_id)
    infile.close()

    return genes, overlapping_genes
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Computes coverage statistics of GFF features, either per-genome
    ("genomic") or as per-contig histograms ("histogram").
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: "
                            "$Id: gff2coverage.py 2781 2009-09-10 11:33:14Z "
                            "andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default]")

    parser.add_option("-f", "--features", dest="features", type="string",
                      action="append",
                      help="features to collect "
                      "[default=%default]")

    parser.add_option("-w", "--window-size", dest="window_size", type="int",
                      help="window size in bp for histogram computation. "
                      "Determines the bin size. "
                      "[default=%default]")

    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="number of bins for histogram computation "
                      "if window size is not given. "
                      "[default=%default]")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("genomic", "histogram", ),
                      help="methods to apply. "
                      "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        window_size=None,
        num_bins=1000,
        value_format="%6.4f",
        features=[],
        method="genomic",
    )

    (options, args) = E.start(parser, add_output_options=True)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.method == "histogram":

        gff = GTF.readFromFile(options.stdin)

        gff.sort(key=lambda x: (x.contig, x.start))

        # process entries contig by contig
        chunk = []
        last_contig = None

        for entry in gff:
            if last_contig != entry.contig:
                processChunk(last_contig, chunk, options, fasta)
                last_contig = entry.contig
                chunk = []
            chunk.append(entry)

        # flush the final contig
        processChunk(last_contig, chunk, options, fasta)

    elif options.method == "genomic":
        intervals = collections.defaultdict(int)
        bases = collections.defaultdict(int)
        total = 0
        for entry in GTF.iterator(options.stdin):
            intervals[(entry.contig, entry.source, entry.feature)] += 1
            bases[(entry.contig, entry.source, entry.feature)] += \
                entry.end - entry.start
            total += entry.end - entry.start

        options.stdout.write("contig\tsource\tfeature\tintervals\tbases")
        if fasta:
            options.stdout.write(
                "\tpercent_coverage\ttotal_percent_coverage\n")
        else:
            options.stdout.write("\n")

        # BUG FIX: only query contig sizes when a genome was supplied;
        # previously this ran unconditionally and raised an
        # AttributeError on None when --genome-file was not given.
        if fasta:
            total_genome_size = sum(
                fasta.getContigSizes(with_synonyms=False).values())

        for key in sorted(intervals.keys()):
            nbases = bases[key]
            nintervals = intervals[key]
            contig, source, feature = key
            options.stdout.write("\t".join(
                ("\t".join(key), str(nintervals), str(nbases))))
            if fasta:
                options.stdout.write(
                    "\t%f" % (100.0 * float(nbases) /
                              fasta.getLength(contig)))
                options.stdout.write(
                    "\t%f\n" % (100.0 * float(nbases) / total_genome_size))
            else:
                options.stdout.write("\n")

    E.stop()
def main(argv=None):
    '''main function

    Convert GTF/GFF3 to a tab-separated table (optionally with one
    column per attribute), invert such a table back to GTF, or emit
    transcript/peptide/gene id maps, depending on the options given.
    '''

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o", "--output-only-attributes", dest="only_attributes",
        action="store_true",
        help="output only attributes as separate columns "
        "[default=%default].")

    parser.add_option(
        "-f", "--attributes-as-columns", dest="output_full",
        action="store_true",
        help="output attributes as separate columns "
        "[default=%default].")

    parser.add_option("--is-gff3", dest="is_gtf", action="store_false",
                      help="input file is in gtf format [default=%default] ")

    parser.add_option(
        "-i", "--invert", dest="invert", action="store_true",
        help="convert tab-separated table back to gtf "
        "[default=%default].")

    parser.add_option(
        "-m", "--output-map", dest="output_map", type="choice",
        choices=(
            "transcript2gene",
            "peptide2gene",
            "peptide2transcript"),
        help="output a map mapping transcripts to genes "
        "[default=%default].")

    parser.set_defaults(
        only_attributes=False,
        output_full=False,
        invert=False,
        output_map=None,
        is_gtf=True
    )

    (options, args) = E.start(parser, argv=argv)

    if options.output_full:
        # output full table with a column for each attribute

        # first pass: collect all attribute keys seen in the input,
        # buffering the records for the second (output) pass
        attributes = set()
        data = []
        if options.is_gtf:
            for gtf in GTF.iterator(options.stdin):
                data.append(gtf)
                attributes = attributes.union(set(gtf.keys()))

        else:
            for gff in GFF3.iterator_from_gff(options.stdin):
                data.append(gff)
                attributes = attributes.union(set(gff.attributes))

        # remove gene_id and transcript_id, as they are used
        # explicitly later
        attributes.difference_update(["gene_id", "transcript_id"])

        attributes = sorted(list(attributes))

        # select whether gtf or gff determines the output columns
        if options.is_gtf:
            if options.only_attributes:
                header = ["gene_id", "transcript_id"] + attributes
            else:
                header = ["contig", "source", "feature",
                          "start", "end", "score", "strand",
                          "frame", "gene_id",
                          "transcript_id", ] + attributes
        else:
            if options.only_attributes:
                header = attributes
            else:
                header = ["contig", "source", "feature",
                          "start", "end", "score", "strand",
                          "frame"] + attributes

        attributes_new = header

        options.stdout.write("\t".join(header) + "\n")

        if options.is_gtf:
            for gtf in data:
                # missing attributes are emitted as empty fields
                first = True
                for a in attributes_new:
                    try:
                        val = getattr(gtf, a)
                    except (AttributeError, KeyError):
                        val = ""
                    if first:
                        options.stdout.write("%s" % val)
                        first = False
                    else:
                        options.stdout.write("\t%s" % val)
                options.stdout.write("\n")
        else:
            for gff in data:
                # fixed GFF3 columns first, then the attribute columns
                options.stdout.write(("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t") % (
                    gff.contig, gff.source, gff.feature, gff.start,
                    gff.end, gff.score, gff.strand, gff.frame))

                first = True
                for a in attributes:
                    try:
                        val = (gff.attributes[a])
                    except (AttributeError, KeyError):
                        val = ''
                    if first:
                        options.stdout.write("%s" % val)
                        first = False
                    else:
                        options.stdout.write("\t%s" % val)
                options.stdout.write("\n")

    elif options.invert:
        # convert a tab-separated table (with a header line naming the
        # columns) back into GTF records
        gtf = GTF.Entry()
        header = None
        for line in options.stdin:
            if line.startswith("#"):
                continue
            data = line[:-1].split("\t")
            if not header:
                header = data
                map_header2column = dict(
                    [(y, x) for x, y in enumerate(header)])
                continue

            # fill gtf entry with data
            try:
                gtf.contig = data[map_header2column["contig"]]
                gtf.source = data[map_header2column["source"]]
                gtf.feature = data[map_header2column["feature"]]
                # subtract -1 to start for 0-based coordinates
                # NOTE(review): the comment above is inherited, but no
                # subtraction actually happens here — confirm whether
                # the table stores 0-based starts already.
                gtf.start = int(data[map_header2column["start"]])
                gtf.end = int(data[map_header2column["end"]])
                gtf.score = data[map_header2column["score"]]
                gtf.strand = data[map_header2column["strand"]]
                gtf.frame = data[map_header2column["frame"]]
                gtf.gene_id = data[map_header2column["gene_id"]]
                gtf.transcript_id = data[map_header2column["transcript_id"]]
                gtf.parseInfo(data[map_header2column["attributes"]], line)
            except KeyError as msg:
                raise KeyError("incomplete entry %s: %s: %s" %
                               (str(data), str(map_header2column), msg))
            if gtf.frame is None:
                gtf.frame = "."
            # output gtf entry in gtf format
            options.stdout.write("%s\n" % str(gtf))

    elif options.output_map:
        # emit a two-column id map; pick source/target accessors
        if options.output_map == "transcript2gene":
            fr = lambda x: x.transcript_id
            to = lambda x: x.gene_id
            options.stdout.write("transcript_id\tgene_id\n")
        elif options.output_map == "peptide2gene":
            fr = lambda x: x.protein_id
            to = lambda x: x.gene_id
            options.stdout.write("peptide_id\tgene_id\n")
        elif options.output_map == "peptide2transcript":
            fr = lambda x: x.protein_id
            to = lambda x: x.transcript_id
            options.stdout.write("peptide_id\ttranscript_id\n")

        map_fr2to = {}
        for gtf in GTF.iterator(options.stdin):
            try:
                map_fr2to[fr(gtf)] = to(gtf)
            except (AttributeError, KeyError):
                # records without the relevant ids are skipped
                pass

        for x, y in sorted(map_fr2to.items()):
            options.stdout.write("%s\t%s\n" % (x, y))

    else:
        # default: one row per record with a single "attributes" column
        header = ("contig", "source", "feature", "start", "end", "score",
                  "strand", "frame", "gene_id", "transcript_id", "attributes")
        options.stdout.write("\t".join(header) + "\n")

        for gtf in GTF.iterator(options.stdin):
            attributes = []
            for a in list(gtf.keys()):
                if a in ("gene_id", "transcript_id"):
                    continue
                attributes.append('%s %s' % (a, GTF.quote(gtf[a])))

            # NOTE(review): the joined attribute string built above is
            # never used — str(gtf) below renders its own attributes.
            # Looks vestigial; confirm before removing.
            attributes = "; ".join(attributes)

            # capture a None frame and emit it as "."
            if gtf.frame is None:
                gtf.frame = "."

            options.stdout.write(str(gtf) + "\n")

    E.stop()
def main(argv=None):
    """Apply a configurable set of counters to gene models read from
    stdin (GTF) and write one table row per gene or transcript.

    The counters are instantiated from GeneModelAnalysis according to
    the --counter options; auxiliary data (genome, qualities, BAM,
    bigwig, extra gff files) is opened up front and shared between
    them.
    """
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome.")

    parser.add_argument("-q", "--quality-file", dest="quality_file", type=str,
                        help="filename with genomic base quality "
                        "information.")

    parser.add_argument("-b", "--bam-file", dest="bam_files", type=str,
                        metavar="bam",
                        help="filename with read mapping information. "
                        "Multiple files can be submitted in a "
                        "comma-separated list.")

    parser.add_argument("-i", "--bigwig-file", dest="bigwig_file", type=str,
                        metavar="bigwig",
                        help="filename with bigwig information ")

    parser.add_argument("-f", "--gff-file", dest="filename_gff", type=str,
                        action="append", metavar='bed',
                        help="filename with extra gff files. The order "
                        "is important.")

    parser.add_argument("--filename-format", dest="filename_format", type=str,
                        choices=("bed", "gff", "gtf"),
                        help="format of secondary stream.")

    parser.add_argument("--restrict-source", dest="gff_sources", type=str,
                        action="append",
                        help="restrict input to this 'source' in extra "
                        "gff file (for counter: overlap).")

    parser.add_argument("--restrict-feature", dest="gff_features", type=str,
                        action="append",
                        help="restrict input to this 'feature' in extra gff "
                        "file (for counter: overlap).")

    parser.add_argument("-r", "--reporter", dest="reporter", type=str,
                        choices=("genes", "transcripts"),
                        help="report results for 'genes' or 'transcripts' ")

    parser.add_argument("-s", "--section", dest="sections", type=str,
                        action="append",
                        choices=("exons", "introns"),
                        help="select range on which counters will operate ")

    parser.add_argument(
        "-c", "--counter", dest="counters", type=str, action="append",
        choices=("bigwig-counts", "binding-pattern", "classifier",
                 "classifier-rnaseq", "classifier-rnaseq-splicing",
                 "classifier-polii", "composition-na", "composition-cpg",
                 "coverage", "distance", "distance-genes", "distance-tss",
                 "length", 'neighbours', "overlap", "overlap-stranded",
                 "overlap-transcripts", "overrun", "position", "proximity",
                 "proximity-exclusive", "proximity-lengthmatched", "quality",
                 "read-coverage", "read-extension", "read-overlap",
                 "read-counts", "read-fullcounts", "readpair-counts",
                 "readpair-fullcounts", "splice", "splice-comparison",
                 "territories"),
        help="select counters to apply to input ")

    parser.add_argument("--add-gtf-source", dest="add_gtf_source",
                        action="store_true",
                        help="add gtf field of source to output ")

    parser.add_argument("--proximal-distance", dest="proximal_distance",
                        type=int,
                        help="distance to be considered proximal to "
                        "an interval.")

    parser.add_argument("--multi-mapping-method", dest="multi_mapping",
                        type=str,
                        choices=('all', 'ignore', 'weight'),
                        help="how to treat multi-mapping reads in "
                        "bam-files. Requires "
                        "the NH flag to be set by the mapper ")

    parser.add_argument("--use-barcodes", dest="use_barcodes",
                        action="store_true",
                        help="Use barcodes to count unique umi's. "
                        "UMI's are specified in the read identifier "
                        "as the last field, where fields are separated "
                        "by underscores, e.g. "
                        "@READ:ILLUMINA:STUFF_NAMINGSTUFF_UMI. "
                        "When true, unique counts are returned. "
                        "Currently only compatible with count-reads")

    parser.add_argument("--sample-probability", dest="sample_probability",
                        type=float,
                        help="Specify the probability of whether any"
                        "given read or read pair in a file bam is counted"
                        "Currently only compatible with count-reads")

    parser.add_argument("--column-prefix", dest="prefixes", type=str,
                        action="append",
                        help="add prefix to column headers - prefixes "
                        "are used in the same order as the counters ")

    parser.add_argument("--library-type", dest="library_type", type=str,
                        choices=("unstranded", "firststrand", "secondstrand",
                                 "fr-unstranded", "fr-firststrand",
                                 "fr-secondstrand"),
                        help="library type of reads in bam file. ")

    parser.add_argument("--min-mapping-quality",
                        dest="minimum_mapping_quality", type=float,
                        help="minimum mapping quality. Reads with a quality "
                        "score of less will be ignored. ")

    parser.set_defaults(genome_file=None,
                        reporter="genes",
                        with_values=True,
                        sections=[],
                        counters=[],
                        filename_gff=[],
                        filename_format=None,
                        gff_features=[],
                        gff_sources=[],
                        add_gtf_source=False,
                        proximal_distance=10000,
                        bam_files=None,
                        multi_mapping='all',
                        library_type='fr-unstranded',
                        prefixes=[],
                        minimum_mapping_quality=0,
                        use_barcodes=False,
                        sample_probability=1.0)

    if not argv:
        argv = sys.argv

    (args) = E.start(parser, add_output_options=True, argv=argv)

    # prefixes, when given, are matched positionally to the counters
    if args.prefixes:
        if len(args.prefixes) != len(args.counters):
            raise ValueError("if any prefix is given, the number of prefixes "
                             "must be the same as the number of counters")

    # get files
    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        fasta = None

    if args.quality_file:
        quality = IndexedFasta.IndexedFasta(args.quality_file)
        quality.setTranslator(IndexedFasta.TranslatorBytes())
    else:
        quality = None

    if args.bam_files:
        bam_files = []
        for bamfile in args.bam_files.split(","):
            bam_files.append(pysam.AlignmentFile(bamfile, "rb"))
    else:
        bam_files = None

    if args.bigwig_file:
        bigwig_file = pyBigWig.open(args.bigwig_file)
    else:
        bigwig_file = None

    counters = []

    # a single None entry makes the per-section/source/feature loops
    # below run exactly once with the counter's default behaviour
    if not args.sections:
        E.info("counters will use the default section (exons)")
        args.sections.append(None)

    if not args.gff_sources:
        args.gff_sources.append(None)

    if not args.gff_features:
        args.gff_features.append(None)

    cc = E.Counter()

    # instantiate one (or, per section/source/feature, several)
    # counter objects for every requested counter name
    for n, c in enumerate(args.counters):
        if args.prefixes:
            prefix = args.prefixes[n]
        else:
            prefix = None

        if c == "position":
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterPosition(
                        section=section, options=args, prefix=prefix))
        elif c == "length":
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterLengths(
                        section=section, options=args, prefix=prefix))
        elif c == "splice":
            if fasta is None:
                raise ValueError('splice requires a genomic sequence')
            counters.append(
                GeneModelAnalysis.CounterSpliceSites(fasta=fasta,
                                                     prefix=prefix))
        elif c == "quality":
            # NOTE(review): this guards on ``fasta`` but the counter is
            # built from ``quality`` — should this check
            # ``quality is None`` instead? Confirm.
            if fasta is None:
                raise ValueError('quality requires a quality score sequence')
            counters.append(
                GeneModelAnalysis.CounterQuality(fasta=quality,
                                                 prefix=prefix))
        elif c == "overrun":
            counters.append(
                GeneModelAnalysis.CounterOverrun(
                    filename_gff=args.filename_gff,
                    options=args,
                    prefix=prefix))
        elif c == "read-coverage":
            counters.append(
                GeneModelAnalysis.CounterReadCoverage(
                    bam_files,
                    options=args,
                    prefix=prefix))
        elif c == "read-extension":
            counters.append(
                GeneModelAnalysis.CounterReadExtension(
                    bam_files,
                    filename_gff=args.filename_gff,
                    options=args,
                    prefix=prefix))
        elif c == "read-overlap":
            counters.append(
                GeneModelAnalysis.CounterReadOverlap(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "read-counts":
            counters.append(
                GeneModelAnalysis.CounterReadCounts(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    use_barcodes=args.use_barcodes,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "read-fullcounts":
            counters.append(
                GeneModelAnalysis.CounterReadCountsFull(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "readpair-counts":
            counters.append(
                GeneModelAnalysis.CounterReadPairCounts(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    library_type=args.library_type,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "readpair-fullcounts":
            counters.append(
                GeneModelAnalysis.CounterReadPairCountsFull(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "bigwig-counts":
            counters.append(
                GeneModelAnalysis.CounterBigwigCounts(
                    bigwig_file,
                    options=args,
                    prefix=prefix))
        elif c == "splice-comparison":
            if fasta is None:
                raise ValueError('splice-comparison requires a genomic '
                                 'sequence')
            counters.append(
                GeneModelAnalysis.CounterSpliceSiteComparison(
                    fasta=fasta,
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    options=args,
                    prefix=prefix))
        elif c == "composition-na":
            if fasta is None:
                raise ValueError('composition-na requires a genomic sequence')
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterCompositionNucleotides(
                        fasta=fasta,
                        section=section,
                        options=args,
                        prefix=prefix))
        elif c == "composition-cpg":
            if fasta is None:
                raise ValueError('composition-cpg requires a genomic sequence')
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterCompositionCpG(
                        fasta=fasta,
                        section=section,
                        options=args,
                        prefix=prefix))

        elif c in ("overlap",
                   "overlap-stranded",
                   "overlap-transcripts",
                   "proximity",
                   "proximity-exclusive",
                   "proximity-lengthmatched",
                   "neighbours",
                   "territories",
                   "distance",
                   "distance-genes",
                   "distance-tss",
                   "binding-pattern",
                   "coverage"):
            # pick the counter class ("template"); one instance is
            # built per section x gff-source x gff-feature combination.
            # NOTE(review): the second test below is an ``if`` rather
            # than ``elif`` — harmless since the branches are mutually
            # exclusive, but it restarts the chain.
            if c == "overlap":
                template = GeneModelAnalysis.CounterOverlap
            if c == "overlap-stranded":
                template = GeneModelAnalysis.CounterOverlapStranded
            elif c == "overlap-transcripts":
                template = GeneModelAnalysis.CounterOverlapTranscripts
            elif c == "proximity":
                template = GeneModelAnalysis.CounterProximity
            elif c == "neighbours":
                template = GeneModelAnalysis.CounterNeighbours
            elif c == "proximity-exclusive":
                template = GeneModelAnalysis.CounterProximityExclusive
            elif c == "proximity-lengthmatched":
                template = GeneModelAnalysis.CounterProximityLengthMatched
            elif c == "territories":
                template = GeneModelAnalysis.CounterTerritories
            elif c == "distance":
                template = GeneModelAnalysis.CounterDistance
            elif c == "distance-genes":
                template = GeneModelAnalysis.CounterDistanceGenes
            elif c == "distance-tss":
                template = \
                    GeneModelAnalysis.CounterDistanceTranscriptionStartSites
            elif c == "coverage":
                template = GeneModelAnalysis.CounterCoverage
            elif c == "binding-pattern":
                template = GeneModelAnalysis.CounterBindingPattern

            for section in args.sections:
                for source in args.gff_sources:
                    for feature in args.gff_features:
                        counters.append(
                            template(filename_gff=args.filename_gff,
                                     feature=feature,
                                     source=source,
                                     fasta=fasta,
                                     section=section,
                                     options=args,
                                     prefix=prefix))

        elif c == "classifier":
            counters.append(
                GeneModelAnalysis.Classifier(filename_gff=args.filename_gff,
                                             fasta=fasta,
                                             options=args,
                                             prefix=prefix))

        elif c == "classifier-rnaseq":
            counters.append(
                GeneModelAnalysis.ClassifierRNASeq(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "classifier-rnaseq-splicing":
            counters.append(
                GeneModelAnalysis.ClassifierRNASeqSplicing(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "classifier-polii":
            counters.append(
                GeneModelAnalysis.ClassifierPolII(
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "binding-pattern":
            counters.append(
                GeneModelAnalysis.CounterBindingPattern(
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))

    # choose the gene-model iterator and the row-label accessors
    if args.reporter == "genes":
        iterator = GTF.flat_gene_iterator
        header = ["gene_id"]
        fheader = lambda x: [x[0].gene_id]

    elif args.reporter == "transcripts":
        iterator = GTF.transcript_iterator
        header = ["transcript_id"]
        fheader = lambda x: [x[0].transcript_id]

    if args.add_gtf_source:
        header.append("source")
        ffields = lambda x: [x[0].source]
    else:
        ffields = lambda x: []

    args.stdout.write("\t".join(
        header + [x.getHeader() for x in counters]) + "\n")

    # main loop: feed each gene/transcript chunk to every counter and
    # emit one row, unless every counter flagged the chunk to be skipped
    for gffs in iterator(GTF.iterator(args.stdin)):
        cc.input += 1

        for counter in counters:
            counter.update(gffs)

        skip = len([x for x in counters if x.skip]) == len(counters)
        if skip:
            cc.skipped += 1
            continue

        args.stdout.write("\t".join(
            fheader(gffs) +
            ffields(gffs) +
            [str(counter) for counter in counters]) + "\n")

        cc.output += 1

    E.info("%s" % str(cc))
    for counter in counters:
        E.info("%s\t%s" % (repr(counter), str(counter.counter)))
    E.stop()
def main(argv=None):
    """script main: extract FASTA sequences for intervals in a GFF/GTF file.

    Reads GFF/GTF from stdin, fetches the corresponding sequence from an
    indexed genome (``--genome-file``) and writes FASTA records to stdout.
    Intervals may be grouped (per transcript for GTF, merged or chunked for
    GFF), optionally masked, extended at either end and line-folded.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-m", "--merge-adjacent", dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")

    parser.add_option("-e", "--feature", dest="feature", type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")

    parser.add_option("-f", "--maskregions-bed-file", dest="filename_masks",
                      type="string", metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")

    parser.add_option("--remove-masked-regions", dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option("--min-interval-length", dest="min_length",
                      type="int",
                      help="set minimum length for sequences output "
                      "[%default]")

    parser.add_option("--max-length", dest="max_length", type="int",
                      help="set maximum length for sequences output "
                      "[%default]")

    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--header-attributes", dest="header_attr",
                      action="store_true",
                      help="add GFF entry attributes to the FASTA record"
                      " header section")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--extend-with", dest="extend_with", type="string",
                      help="extend using base [default=%default]")

    parser.add_option("--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("--fold-at", dest="fold_at", type="int",
                      help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute", dest="naming_attribute", type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(
        is_gtf=False,
        genome_file=None,
        merge=False,
        feature=None,
        filename_masks=None,
        remove_masked_regions=False,
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        extend_with=None,
        masker=None,
        fold_at=None,
        naming_attribute=False,
        header_attr=False,
    )

    (options, args) = E.start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    # Choose how entries are grouped into chunks: per transcript for GTF,
    # merged or singly-chunked for plain GFF.
    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    # Optionally build per-contig interval trees of regions to mask/remove.
    masks = None
    if options.filename_masks:
        masks = {}
        with iotools.open_file(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in list(e.keys()):
            intersector = quicksect.IntervalTree()
            for start, end in e[contig]:
                intersector.add(start, end)
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:
        ninput += 1
        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig,
                                       ichunk[0].start,
                                       ichunk[0].end,
                                       str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand

        # Pick a name for the FASTA record.
        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                attr_dict = {x.split("=")[0]: x.split("=")[1]
                             for x in chunk[0].attributes.split(";")}
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        # Apply masking. Only removal is implemented; plain masking through
        # this code path raises NotImplementedError.
        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(
                                           quicksect.Interval(start, end))]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name,
                             str([(x.start, x.end) for x in chunk]),
                             masked_regions))
                    continue

        # `out` holds the coordinates written into the FASTA header. It
        # aliases `intervals`, so the in-place "5"/"both" extension below is
        # reflected in the header, while the "5only"/"3only" branches rebind
        # `intervals` and leave `out` at the original coordinates.
        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        # Flip to forward-strand coordinates for sequence retrieval.
        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [fasta.getSequence(contig, strand, start, end)
             for start, end in intervals]
        # IMS: allow for masking of sequences
        # NOTE(review): called unconditionally — presumably maskSequences
        # is a no-op when options.masker is None; confirm.
        s = Masker.maskSequences(s, options.masker)

        nbases = sum([len(x) for x in s])
        if (nbases < options.min_length or
                (options.max_length and nbases > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), nbases))
            continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with,) * options.extend_by)

            if options.extend_at in ("5", "both"):
                # BUGFIX: was ``s[1] = extension + s[1]`` which prepended to
                # the second segment and raised IndexError for
                # single-interval features; the 5' extension belongs on the
                # first segment.
                s[0] = extension + s[0]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        # Fold sequence into fixed-width lines if requested.
        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        if options.header_attr:
            attributes = " ".join(
                [":".join([ax, ay]) for ax, ay in chunk[0].asDict().items()])
            options.stdout.write(
                ">%s %s:%s:%s feature:%s %s\n%s\n" %
                (name, contig, strand,
                 ";".join(["%i-%i" % x for x in out]),
                 chunk[0].feature, attributes, seq))
        else:
            options.stdout.write(
                ">%s %s:%s:%s\n%s\n" %
                (name, contig, strand,
                 ";".join(["%i-%i" % x for x in out]), seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons,
            nskipped_masked, nskipped_length))

    E.stop()
def main(argv=sys.argv):
    """script main: convert GFF/GTF on stdin to BED intervals on stdout.

    Optionally groups output into BED tracks by feature or source
    (``--track``) and can emit Bed12 records built from whole transcripts
    (``--bed12-from-transcripts``).
    """
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--is-gtf", dest="is_gtf", action="store_true",
                        help="input file is in gtf format")

    parser.add_argument("--set-name", dest="name", type=str,
                        help="field from the GFF/GTF file to use as the "
                        "name field in the BED file ",
                        choices=("gene_id", "transcript_id", "class",
                                 "family", "feature", "source", "repName",
                                 "gene_biotype"))

    parser.add_argument("--track", dest="track", type=str,
                        choices=("feature", "source", None),
                        help="use feature/source field to define BED tracks ")

    parser.add_argument(
        "--bed12-from-transcripts", dest="bed12", action="store_true",
        default=False,
        help="Process GTF file into Bed12 entries, with blocks as exons"
        "and thick/thin as coding/non-coding")

    parser.set_defaults(track=None, name="gene_id", is_gtf=False)

    (args) = E.start(parser, add_pipe_options=True)

    ninput, noutput = 0, 0

    iterator = GTF.iterator(args.stdin)

    if args.bed12:
        # Bed12 records are built from whole transcripts, i.e. the iterator
        # yields lists of exon entries rather than single entries.
        # NOTE(review): combining --bed12-from-transcripts with --track would
        # hand those lists to the feature/source grouper below, which only
        # handles single entries — confirm the two options are mutually
        # exclusive in practice.
        iterator = GTF.transcript_iterator(iterator)

    def _write_bed(gff, bed):
        # Convert one item from the iterator (a GTF entry, or a transcript
        # chunk when --bed12-from-transcripts is set) and write it as BED.
        if args.bed12:
            bed = transcript2bed12(gff)
        else:
            bed.fromGTF(gff, name=args.name)
        args.stdout.write(str(bed) + "\n")

    if args.track:
        # Tracks require grouping, so the whole input is materialised and
        # sorted by the grouping key first.
        all_input = list(iterator)

        if args.track == "feature":
            grouper = lambda x: x.feature
        elif args.track == "source":
            grouper = lambda x: x.source

        all_input.sort(key=grouper)

        bed = Bed.Bed()
        for key, vals in itertools.groupby(all_input, grouper):
            args.stdout.write("track name=%s\n" % key)
            for gff in vals:
                ninput += 1
                _write_bed(gff, bed)
                noutput += 1
    else:
        bed = Bed.Bed()
        for gff in iterator:
            ninput += 1
            _write_bed(gff, bed)
            noutput += 1

    E.info("ninput=%i, noutput=%i" % (ninput, noutput))
    E.stop()
def main(argv=None):
    """script main: compute summary statistics for a BAM file.

    Reads alignments (from a file argument or stdin), counts them with
    ``bam2stats_count`` and writes a category/counts/percent table to
    stdout, plus optional per-read detail, read-map, NM/NH/MAPQ and
    histogram output files.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r", "--mask-bed-file", "--mask-gff-file", dest="filename_bed",
        type="string", metavar='GFF',
        help="gff formatted file with masking locations. The number of "
        "reads overlapping the intervals in the given file will be "
        "computed. Note that the computation currently does not take "
        "into account indels, so it is an approximate count only. "
        "[%default]")

    parser.add_option(
        "-f", "--ignore-masked-reads", dest="ignore_masked_reads",
        action="store_true",
        help="as well as counting reads in the file given by --mask-bed-file, "
        "also remove these reads for duplicate and match statistics. "
        "[%default]")

    parser.add_option(
        "-i", "--num-reads", dest="input_reads", type="int",
        help="the number of reads - if given, used to provide percentages "
        "[%default]")

    parser.add_option(
        "-d", "--output-details", dest="output_details", action="store_true",
        help="output per-read details into a separate file. Read names are "
        "md5/base64 encoded [%default]")

    parser.add_option(
        "--output-readmap", dest="output_readmap", action="store_true",
        help="output map between read name and "
        "md5/base64 encoded short name[%default]")

    parser.add_option(
        "--add-alignment-details", dest="add_alignment_details",
        action="store_true",
        help=
        "add alignment details to per-read details. Implies --output-details "
        "[%default]")

    parser.add_option(
        "-q", "--fastq-file", dest="filename_fastq",
        help="filename with sequences and quality scores. This file is only "
        "used to collect sequence identifiers. Thus, for paired end data a "
        "single file is sufficient [%default]")

    parser.add_option(
        "--basic-counts", dest="detailed_count", action="store_false",
        help="perform basic counting and do not compute per read stats. "
        "This is more memory efficient and faster stats computation, "
        "but only a summary counts table is output [%default]")

    parser.set_defaults(
        filename_bed=None,
        ignore_masked_reads=False,
        input_reads=0,
        force_output=False,
        filename_fastq=None,
        detailed_count=True,
        output_details=False,
        output_readmap=False,
        add_alignment_details=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # Index of masking intervals, used to count reads overlapping them.
    if options.filename_bed:
        bed_mask = GTF.readAndIndex(
            GTF.iterator(iotools.open_file(options.filename_bed)))
    else:
        bed_mask = None

    # --add-alignment-details implies --output-details (see help text).
    if options.add_alignment_details:
        options.output_details = True

    # Open the BAM: positional argument, stdin, or a redirected stdin.
    # is_stdin controls whether read-level stats can be derived directly
    # (they require a seekable file or a fastq).
    is_stdin = True
    if len(args) > 0:
        pysam_in = pysam.AlignmentFile(args[0], "rb")
        if args[0] != "-":
            is_stdin = False
    elif options.stdin == sys.stdin:
        pysam_in = pysam.AlignmentFile("-", "rb")
    else:
        pysam_in = pysam.AlignmentFile(options.stdin, "rb")
        if options.stdin != "-":
            is_stdin = False

    if options.output_details:
        outfile_details = E.open_output_file("details", "w")
    else:
        outfile_details = None

    if options.output_readmap:
        outfile_readmap = E.open_output_file("readmap", "w")
    else:
        outfile_readmap = None

    if options.filename_fastq and not os.path.exists(options.filename_fastq):
        raise IOError("file %s does not exist" % options.filename_fastq)

    # Single pass over the BAM file; all counting happens here.
    (counter, flags_counts, nh_filtered, nh_all,
     nm_filtered, nm_all,
     mapq, mapq_all, max_hi, details_df) = \
        bam2stats_count(pysam_in,
                        bed_mask=bed_mask,
                        ignore_masked_reads=options.ignore_masked_reads,
                        is_stdin=is_stdin,
                        filename_fastq=options.filename_fastq,
                        outfile_details=outfile_details,
                        add_alignment_details=options.add_alignment_details,
                        outfile_readmap=outfile_readmap,
                        detailed_count=options.detailed_count)

    if max_hi > 0 and max_hi != max(nh_all.keys()):
        E.warn("max_hi(%i) is inconsistent with max_nh (%i) "
               "- counts will be corrected"
               % (max_hi, max(nh_all.keys())))

    outs = options.stdout
    outs.write("category\tcounts\tpercent\tof\n")

    def _write(outs, text, numerator, denominator, base):
        """Write one table row: category, count, percentage, denominator name."""
        percent = iotools.pretty_percent(numerator, denominator)
        outs.write('%s\t%i\t%s\t%s\n' % (text, numerator, percent, base))

    ###############################
    ###############################
    ###############################
    # Output alignment information
    ###############################
    nalignments_unmapped = flags_counts["unmapped"]
    nalignments_mapped = counter.alignments_input - nalignments_unmapped

    _write(outs, "alignments_total", counter.alignments_input,
           counter.alignments_input, "alignments_total")

    if counter.alignments_input == 0:
        E.warn("no alignments in BAM file - no further output")
        E.stop()
        return

    _write(outs, "alignments_mapped", nalignments_mapped,
           counter.alignments_input, 'alignments_total')
    _write(outs, "alignments_unmapped", nalignments_unmapped,
           counter.alignments_input, 'alignments_total')

    if nalignments_mapped == 0:
        E.warn("no mapped alignments - no further output")
        E.stop()
        return

    # Per-flag breakdown, normalized by mapped alignments.
    for flag, counts in sorted(flags_counts.items()):
        if flag == "unmapped":
            continue
        _write(outs, 'alignments_' + flag, counts,
               nalignments_mapped, 'alignments_mapped')

    if options.filename_bed:
        _write(outs, "alignments_masked", counter.alignments_masked,
               nalignments_mapped, 'alignments_mapped')
        _write(outs, "alignments_notmasked", counter.alignments_notmasked,
               nalignments_mapped, 'alignments_mapped')

    _write(outs, "alignments_filtered", counter.alignments_filtered,
           nalignments_mapped, "alignments_mapped")

    if counter.filtered == nalignments_mapped:
        normby = "alignments_mapped"
    else:
        normby = "alignments_filtered"

    if counter.filtered > 0:
        _write(outs, "alignments_duplicates", counter.alignments_duplicates,
               counter.alignments_filtered, normby)
        # NOTE(review): 'aligmnments_filtered' looks like a typo of
        # 'alignments_filtered' — confirm the counter object really exposes
        # an attribute under this spelling before renaming.
        _write(outs, "alignments_unique",
               counter.aligmnments_filtered - counter.alignments_duplicates,
               counter.alignments_filtered, normby)

    ###############################
    ###############################
    ###############################
    # Output read based information
    ###############################

    # derive the number of mapped reads in file from alignment counts
    if options.filename_fastq or not is_stdin:
        # Exact read-level counts are available from the counting pass.
        nreads_total = counter.total_read
        _write(outs, "reads_total", counter.total_read,
               nreads_total, 'reads_total')
        _write(outs, "reads_unmapped", counter.total_read_is_unmapped,
               nreads_total, 'reads_total')
        _write(outs, "reads_mapped", counter.total_read_is_mapped,
               nreads_total, 'reads_total')
        _write(outs, "reads_missing", counter.total_read_is_missing,
               nreads_total, 'reads_total')
        _write(outs, "reads_mapped_unique", counter.total_read_is_mapped_uniq,
               counter.total_read_is_mapped, 'reads_mapped')
        _write(outs, "reads_multimapping", counter.total_read_is_mmap,
               counter.total_read_is_mapped, 'reads_mapped')
        _write(outs, "reads_mapped_supplementary",
               counter.total_read_has_supplementary,
               counter.total_read_is_mapped, 'reads_mapped')
    else:
        # Fall back to estimating read counts from alignments and NH tags.
        E.warn('inferring read counts from alignments and NH tags')
        nreads_unmapped = flags_counts["unmapped"]
        nreads_mapped = computeMappedReadsFromAlignments(
            nalignments_mapped, nh_all, max_hi)

        nreads_missing = 0
        if options.input_reads:
            nreads_total = options.input_reads
            # unmapped reads in bam file?
            if nreads_unmapped:
                nreads_missing = nreads_total - nreads_unmapped - nreads_mapped
            else:
                nreads_unmapped = nreads_total - nreads_mapped
        elif nreads_unmapped:
            # if unmapped reads are in bam file, take those
            nreads_total = nreads_mapped + nreads_unmapped
        else:
            # otherwise normalize by mapped reads
            nreads_unmapped = 0
            nreads_total = nreads_mapped

        outs.write("reads_total\t%i\t%5.2f\treads_total\n" %
                   (nreads_total, 100.0))
        outs.write("reads_mapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_mapped, 100.0 * nreads_mapped / nreads_total))
        outs.write("reads_unmapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_unmapped, 100.0 * nreads_unmapped / nreads_total))
        outs.write("reads_missing\t%i\t%5.2f\treads_total\n" %
                   (nreads_missing, 100.0 * nreads_missing / nreads_total))

        if len(nh_all) > 1:
            outs.write("reads_unique\t%i\t%5.2f\treads_mapped\n" %
                       (nh_all[1], 100.0 * nh_all[1] / nreads_mapped))

    pysam_in.close()

    ###############################
    ###############################
    ###############################
    # Output pair information
    ###############################
    if flags_counts["read2"] > 0:
        if options.filename_fastq:
            pairs_mapped = counter.total_pair_is_mapped

            # sanity check
            assert counter.total_pair_is_mapped == \
                (counter.total_pair_is_proper_uniq +
                 counter.total_pair_is_incomplete_uniq +
                 counter.total_pair_is_incomplete_mmap +
                 counter.total_pair_is_proper_duplicate +
                 counter.total_pair_is_proper_mmap +
                 counter.total_pair_not_proper_uniq +
                 counter.total_pair_is_other)

            outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pairs,
                        100.0 * counter.total_pairs / counter.total_pairs))
            outs.write(
                "pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                (pairs_mapped,
                 100.0 * pairs_mapped / counter.total_pairs))
            outs.write(
                "pairs_unmapped\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_unmapped,
                 100.0 * counter.total_pair_is_unmapped / counter.total_pairs))
            outs.write(
                "pairs_proper_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_uniq,
                 100.0 * counter.total_pair_is_proper_uniq /
                 counter.total_pairs))
            outs.write(
                "pairs_incomplete_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_incomplete_uniq,
                 100.0 * counter.total_pair_is_incomplete_uniq /
                 counter.total_pairs))
            outs.write(
                "pairs_incomplete_multimapping\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_incomplete_mmap,
                 100.0 * counter.total_pair_is_incomplete_mmap /
                 counter.total_pairs))
            outs.write(
                "pairs_proper_duplicate\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_duplicate,
                 100.0 * counter.total_pair_is_proper_duplicate /
                 counter.total_pairs))
            outs.write(
                "pairs_proper_multimapping\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_mmap,
                 100.0 * counter.total_pair_is_proper_mmap /
                 counter.total_pairs))
            outs.write(
                "pairs_not_proper_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_not_proper_uniq,
                 100.0 * counter.total_pair_not_proper_uniq /
                 counter.total_pairs))
            outs.write(
                "pairs_other\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_other,
                 100.0 * counter.total_pair_is_other /
                 counter.total_pairs))

            nread1_total = counter.total_read1
            _write(outs, "read1_total", counter.total_read1,
                   nread1_total, 'read1_total')
            _write(outs, "read1_unmapped", counter.total_read1_is_unmapped,
                   nread1_total, 'read1_total')
            _write(outs, "read1_mapped", counter.total_read1_is_mapped,
                   nread1_total, 'read1_total')
            _write(outs, "read1_mapped_unique",
                   counter.total_read1_is_mapped_uniq,
                   counter.total_read1_is_mapped, 'read1_mapped')
            _write(outs, "reads_multimapping", counter.total_read1_is_mmap,
                   counter.total_read1_is_mapped, 'read1_mapped')
            _write(outs, "read1_missing", counter.total_read1_is_missing,
                   counter.total_read1_is_mapped, 'read1_total')

            nread2_total = counter.total_read2
            _write(outs, "read2_total", counter.total_read2,
                   nread2_total, 'read2_total')
            _write(outs, "read2_unmapped", counter.total_read2_is_unmapped,
                   nread2_total, 'read2_total')
            _write(outs, "read2_mapped", counter.total_read2_is_mapped,
                   nread2_total, 'read2_total')
            _write(outs, "read2_mapped_unique",
                   counter.total_read2_is_mapped_uniq,
                   counter.total_read2_is_mapped, 'read2_mapped')
            _write(outs, "reads_multimapping", counter.total_read2_is_mmap,
                   counter.total_read2_is_mapped, 'read2_mapped')
            _write(outs, "read2_missing", counter.total_read2_is_missing,
                   counter.total_read2_is_mapped, 'read2_total')
        else:
            # approximate counts
            pairs_total = nreads_total // 2
            pairs_mapped = flags_counts["proper_pair"] // 2
            _write(outs, "pairs_total", pairs_total,
                   pairs_total, "pairs_total")
            _write(outs, "pairs_mapped", pairs_mapped,
                   pairs_total, "pairs_total")
    else:
        # no paired end data
        pairs_total = pairs_mapped = 0
        outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_total, 0.0))
        outs.write("pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_mapped, 0.0))

    # Alignment quality rates computed by the counting pass.
    outs.write("error_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.error_counts, counter.error_rate * 100.0))
    outs.write("insertion_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.insertion_counts, counter.insertion_rate * 100.0))
    outs.write("deletion_rate\t%i\t%5.2f\tmatches+deletions\n" %
               (counter.deletion_counts, counter.deletion_rate * 100.0))
    outs.write("mismatch_rate\t%i\t%5.2f\tmatches\n" %
               (counter.mismatch_counts, counter.mismatch_rate * 100.0))
    outs.write("match_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.match_counts, counter.match_rate * 100.0))

    # NM (edit distance) histogram over filtered alignments.
    if options.force_output or len(nm_filtered) > 0:
        outfile = E.open_output_file("nm", "w")
        outfile.write("NM\talignments\n")
        if len(nm_filtered) > 0:
            for x in range(0, max(nm_filtered.keys()) + 1):
                outfile.write("%i\t%i\n" % (x, nm_filtered[x]))
        else:
            outfile.write("0\t%i\n" % (counter.filtered))
        outfile.close()

    # NH (number of hits) histograms, all and filtered.
    if options.force_output or len(nh_all) > 1:
        outfile = E.open_output_file("nh_all", "w")
        outfile.write("NH\treads\n")
        if len(nh_all) > 0:
            writeNH(outfile, nh_all, max_hi)
        else:
            # assume all are unique if NH flag not set
            outfile.write("1\t%i\n" % (counter.mapped_reads))
        outfile.close()

    if options.force_output or len(nh_filtered) > 1:
        outfile = E.open_output_file("nh", "w")
        outfile.write("NH\treads\n")
        if len(nh_filtered) > 0:
            writeNH(outfile, nh_filtered, max_hi)
        else:
            # assume all are unique if NH flag not set
            outfile.write("1\t%i\n" % (counter.filtered))
        outfile.close()

    # MAPQ histogram, all vs filtered reads.
    if options.force_output or len(mapq_all) > 1:
        outfile = E.open_output_file("mapq", "w")
        outfile.write("mapq\tall_reads\tfiltered_reads\n")
        for x in range(0, max(mapq_all.keys()) + 1):
            outfile.write("%i\t%i\t%i\n" % (x, mapq_all[x], mapq[x]))
        outfile.close()

    # Per-read detail summaries and percentile histograms.
    if details_df is not None:
        with E.open_output_file("summaries", "w") as outf:
            details_df.describe().transpose().to_csv(
                outf, sep="\t", index_label="metric")
        bins = numpy.arange(0, 1.01, 0.01)
        # NOTE(review): pandas.DataFrame.from_items was deprecated in 0.23
        # and removed in 1.0 — confirm the pinned pandas version supports it.
        histogram_df = pandas.DataFrame.from_items(
            [(x, numpy.histogram(details_df[x].dropna(),
                                 bins=bins)[0]) for x in details_df.columns])
        histogram_df.index = numpy.arange(0, 1.0, 0.01)
        row_sums = histogram_df.sum(axis=1)
        histogram_df = histogram_df[row_sums != 0]
        with E.open_output_file("histogram", "w") as outf:
            histogram_df.to_csv(outf, sep="\t", index_label="bin")

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    argv = sys.argv if argv is None else argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])
    parser.add_option("-i", "--min-chunk-size", dest="min_chunk_size",
                      type="int",
                      help="minimum chunk size [default=%default].")
    parser.add_option("-n", "--dry-run", dest="dry_run", action="store_true",
                      help="do not create any files [default=%default].")
    parser.set_defaults(
        method="overlap",
        dry_run=False,
        min_chunk_size=2,
        output_filename_pattern="%06i.chunk",
    )

    (options, args) = E.start(parser, add_output_options=True)

    gff_stream = GTF.iterator(options.stdin)
    ninput, noutput, nchunks = 0, 0, 0

    write_chunk = OutputChunk(options.output_filename_pattern,
                              dry_run=options.dry_run)

    if options.method == "overlap":
        # Accumulate features into a chunk; flush it once it has reached the
        # minimum size and the incoming feature starts a new region (other
        # contig, or beyond the furthest end seen so far).
        prev_contig, furthest_end = None, 0
        pending = []
        for gff in gff_stream:
            ninput += 1
            chunk_is_full = len(pending) >= options.min_chunk_size
            starts_new_region = (gff.contig != prev_contig or
                                 gff.start > furthest_end)
            if chunk_is_full and starts_new_region:
                noutput += write_chunk(pending)
                nchunks += 1
                pending = []
                prev_contig, furthest_end = gff.contig, gff.end
            pending.append(gff)
            furthest_end = max(gff.end, furthest_end)
        # flush whatever remains after the input is exhausted
        noutput += write_chunk(pending)
        nchunks += 1

    E.info("ninput=%i, noutput=%i, nchunks=%i" % (ninput, noutput, nchunks))
    E.stop()
def cropGFF(gffs, filename_gff):
    """crop intervals in gff file."""
    # Build per-contig interval trees of the regions to crop against.
    E.info("reading gff for cropping: started.")

    crop_entries = GTF.iterator(iotools.open_file(filename_gff, "r"))
    cropper = GTF.readAsIntervals(crop_entries)

    ntotal = 0
    for contig in list(cropper.keys()):
        tree = quicksect.IntervalTree()
        for iv_start, iv_end in cropper[contig]:
            tree.add(iv_start, iv_end)
            ntotal += 1
        cropper[contig] = tree

    E.info("reading gff for cropping: finished.")
    E.info("reading gff for cropping: %i contigs with %i intervals." %
           (len(cropper), ntotal))

    ninput, noutput, ncropped, ndeleted = 0, 0, 0, 0

    # Stream the input, cutting each feature against the crop regions.
    for gff in gffs:
        ninput += 1

        tree = cropper.get(gff.contig)
        if tree is not None:
            start, end = gff.start, gff.end
            hits = tree.find(quicksect.Interval(start, end))
            if hits:
                # Per-base mask over the feature: 1 = keep, 0 = cropped out.
                width = end - start
                keep = numpy.ones(width)
                for hit in hits:
                    lo = max(0, hit.start - start)
                    hi = min(width, hit.end - start)
                    keep[lo:hi] = 0

                remaining = Intervals.fromArray(keep)
                if len(remaining) == 0:
                    ndeleted += 1
                else:
                    ncropped += 1

                # Re-emit the (mutated) entry once per surviving segment.
                for seg_start, seg_end in remaining:
                    gff.start, gff.end = seg_start + start, seg_end + start
                    noutput += 1
                    yield (gff)
                continue

        noutput += 1
        yield (gff)

    E.info("ninput=%i, noutput=%i, ncropped=%i, ndeleted=%i" %
           (ninput, noutput, ncropped, ndeleted))
def main(argv=None): if argv is None: argv = sys.argv parser = E.ArgumentParser(description=__doc__) parser.add_argument("--version", action='version', version="1.0") parser.add_argument( "-m", "--method", dest="method", type=str, choices=("add-flank", "add-upstream-flank", "add-downstream-flank", "crop", "crop-unique", "complement-groups", "combine-groups", "filter-range", "join-features", "merge-features", "sanitize", "to-forward-coordinates", "to-forward-strand", "rename-chr"), help="method to apply ") parser.add_argument("--ignore-strand", dest="ignore_strand", help="ignore strand information.", action="store_true") parser.add_argument("--is-gtf", dest="is_gtf", action="store_true", help="input will be treated as gtf.") parser.add_argument("-c", "--contigs-tsv-file", dest="input_filename_contigs", type=str, help="filename with contig lengths.") parser.add_argument( "--agp-file", dest="input_filename_agp", type=str, help="agp file to map coordinates from contigs to scaffolds.") parser.add_argument("-g", "--genome-file", dest="genome_file", type=str, help="filename with genome.") parser.add_argument("--crop-gff-file", dest="filename_crop_gff", type=str, help="GFF/GTF file to crop against.") parser.add_argument( "--group-field", dest="group_field", type=str, help="""gff field/attribute to group by such as gene_id, " "transcript_id, ... .""") parser.add_argument( "--filter-range", dest="filter_range", type=str, help="extract all elements overlapping a range. A range is " "specified by eithor 'contig:from..to', 'contig:+:from..to', " "or 'from,to' .") parser.add_argument("--sanitize-method", dest="sanitize_method", type=str, choices=("ucsc", "ensembl", "genome"), help="method to use for sanitizing chromosome names. " ".") parser.add_argument( "--flank-method", dest="flank_method", type=str, choices=("add", "extend"), help="method to use for adding flanks. ``extend`` will " "extend existing features, while ``add`` will add new features. 
" ".") parser.add_argument("--skip-missing", dest="skip_missing", action="store_true", help="skip entries on missing contigs. Otherwise an " "exception is raised .") parser.add_argument( "--contig-pattern", dest="contig_pattern", type=str, help="a comma separated list of regular expressions specifying " "contigs to be removed when running method sanitize .") parser.add_argument( "--assembly-report", dest="assembly_report", type=str, help="path to assembly report file which allows mapping of " "ensembl to ucsc contigs when running method sanitize .") parser.add_argument( "--assembly-report-hasids", dest="assembly_report_hasIDs", type=int, help="path to assembly report file which allows mapping of " "ensembl to ucsc contigs when running method sanitize .") parser.add_argument( "--assembly-report-ucsccol", dest="assembly_report_ucsccol", type=int, help="column in the assembly report containing ucsc contig ids" ".") parser.add_argument( "--assembly-report-ensemblcol", dest="assembly_report_ensemblcol", type=int, help="column in the assembly report containing ensembl contig ids") parser.add_argument( "--assembly-extras", dest="assembly_extras", type=str, help="additional mismatches between gtf and fasta to fix when" "sanitizing the genome .") parser.add_argument("--extension-upstream", dest="extension_upstream", type=float, help="extension for upstream end .") parser.add_argument("--extension-downstream", dest="extension_downstream", type=float, help="extension for downstream end .") parser.add_argument("--min-distance", dest="min_distance", type=int, help="minimum distance of features to merge/join .") parser.add_argument("--max-distance", dest="max_distance", type=int, help="maximum distance of features to merge/join .") parser.add_argument("--min-features", dest="min_features", type=int, help="minimum number of features to merge/join .") parser.add_argument("--max-features", dest="max_features", type=int, help="maximum number of features to merge/join .") 
parser.add_argument( "--rename-chr-file", dest="rename_chr_file", type=str, help="mapping table between old and new chromosome names." "TAB separated 2-column file.") parser.set_defaults(input_filename_contigs=False, filename_crop_gff=None, input_filename_agp=False, genome_file=None, rename_chr_file=None, add_up_flank=None, add_down_flank=None, complement_groups=False, crop=None, crop_unique=False, ignore_strand=False, filter_range=None, min_distance=0, max_distance=0, min_features=1, max_features=0, extension_upstream=1000, extension_downstream=1000, sanitize_method="ucsc", flank_method="add", output_format="%06i", skip_missing=False, is_gtf=False, group_field=None, contig_pattern=None, assembly_report=None, assembly_report_hasIDs=1, assembly_report_ensemblcol=4, assembly_report_ucsccol=9, assembly_extras=None) (args) = E.start(parser, argv=argv) contigs = None genome_fasta = None chr_map = None if args.input_filename_contigs: contigs = Genomics.readContigSizes( iotools.open_file(args.input_filename_contigs, "r")) if args.genome_file: genome_fasta = IndexedFasta.IndexedFasta(args.genome_file) contigs = genome_fasta.getContigSizes() if args.rename_chr_file: chr_map = {} with open(args.rename_chr_file, 'r') as filein: reader = csv.reader(filein, delimiter='\t') for row in reader: if len(row) != 2: raise ValueError( "Mapping table must have exactly two columns") chr_map[row[0]] = row[1] if not len(chr_map.keys()) > 0: raise ValueError("Empty mapping dictionnary") if args.assembly_report: df = pd.read_csv(args.assembly_report, comment="#", header=None, sep="\t") # fixes naming inconsistency in assembly report: ensembl chromosome # contigs found in columnn 0, ensembl unassigned contigs found in # column 4. 
if args.assembly_report_hasIDs == 1: ucsccol = args.assembly_report_ucsccol ensemblcol = args.assembly_report_ensemblcol df.loc[df[1] == "assembled-molecule", ensemblcol] = df.loc[df[1] == "assembled-molecule", 0] if args.sanitize_method == "ucsc": assembly_dict = df.set_index(ensemblcol)[ucsccol].to_dict() elif args.sanitize_method == "ensembl": assembly_dict = df.set_index(ucsccol)[ensemblcol].to_dict() else: raise ValueError(''' When using assembly report, please specify sanitize method as either "ucsc" or "ensembl" to specify direction of conversion ''') else: assembly_dict = {} if args.assembly_extras is not None: assembly_extras = args.assembly_extras.split(",") for item in assembly_extras: item = item.split("-") assembly_dict[item[0]] = item[1] if args.method in ("forward_coordinates", "forward_strand", "add-flank", "add-upstream-flank", "add-downstream-flank") \ and not contigs: raise ValueError("inverting coordinates requires genome file") if args.input_filename_agp: agp = AGP.AGP() agp.readFromFile(iotools.open_file(args.input_filename_agp, "r")) else: agp = None gffs = GTF.iterator(args.stdin) if args.method in ("add-upstream-flank", "add-downstream-flank", "add-flank"): add_upstream_flank = "add-upstream-flank" == args.method add_downstream_flank = "add-downstream-flank" == args.method if args.method == "add-flank": add_upstream_flank = add_downstream_flank = True upstream_flank = int(args.extension_upstream) downstream_flank = int(args.extension_downstream) extend_flank = args.flank_method == "extend" if args.is_gtf: iterator = GTF.flat_gene_iterator(gffs) else: iterator = GTF.joined_iterator(gffs, args.group_field) for chunk in iterator: is_positive = Genomics.IsPositiveStrand(chunk[0].strand) chunk.sort(key=lambda x: (x.contig, x.start)) lcontig = contigs[chunk[0].contig] if extend_flank: if add_upstream_flank: if is_positive: chunk[0].start = max(0, chunk[0].start - upstream_flank) else: chunk[-1].end = min(lcontig, chunk[-1].end + upstream_flank) 
if add_downstream_flank: if is_positive: chunk[-1].end = min(lcontig, chunk[-1].end + downstream_flank) else: chunk[0].start = max(0, chunk[0].start - downstream_flank) else: if add_upstream_flank: gff = GTF.Entry() if is_positive: gff.copy(chunk[0]) gff.end = gff.start gff.start = max(0, gff.start - upstream_flank) chunk.insert(0, gff) else: gff.copy(chunk[-1]) gff.start = gff.end gff.end = min(lcontig, gff.end + upstream_flank) chunk.append(gff) gff.feature = "5-Flank" gff.mMethod = "gff2gff" if add_downstream_flank: gff = GTF.Entry() if is_positive: gff.copy(chunk[-1]) gff.start = gff.end gff.end = min(lcontig, gff.end + downstream_flank) chunk.append(gff) else: gff.copy(chunk[0]) gff.end = gff.start gff.start = max(0, gff.start - downstream_flank) chunk.insert(0, gff) gff.feature = "3-Flank" gff.mMethod = "gff2gff" if not is_positive: chunk.reverse() for gff in chunk: args.stdout.write(str(gff) + "\n") elif args.method == "complement-groups": iterator = GTF.joined_iterator(gffs, group_field=args.group_field) for chunk in iterator: if args.is_gtf: chunk = [x for x in chunk if x.feature == "exon"] if len(chunk) == 0: continue chunk.sort(key=lambda x: (x.contig, x.start)) x = GTF.Entry() x.copy(chunk[0]) x.start = x.end x.feature = "intron" for c in chunk[1:]: x.end = c.start args.stdout.write(str(x) + "\n") x.start = c.end elif args.method == "combine-groups": iterator = GTF.joined_iterator(gffs, group_field=args.group_field) for chunk in iterator: chunk.sort(key=lambda x: (x.contig, x.start)) x = GTF.Entry() x.copy(chunk[0]) x.end = chunk[-1].end x.feature = "segment" args.stdout.write(str(x) + "\n") elif args.method == "join-features": for gff in combineGFF(gffs, min_distance=args.min_distance, max_distance=args.max_distance, min_features=args.min_features, max_features=args.max_features, merge=False, output_format=args.output_format): args.stdout.write(str(gff) + "\n") elif args.method == "merge-features": for gff in combineGFF(gffs, 
min_distance=args.min_distance, max_distance=args.max_distance, min_features=args.min_features, max_features=args.max_features, merge=True, output_format=args.output_format): args.stdout.write(str(gff) + "\n") elif args.method == "crop": for gff in cropGFF(gffs, args.filename_crop_gff): args.stdout.write(str(gff) + "\n") elif args.method == "crop-unique": for gff in cropGFFUnique(gffs): args.stdout.write(str(gff) + "\n") elif args.method == "filter-range": contig, strand, interval = None, None, None try: contig, strand, start, sep, end = re.match( "(\S+):(\S+):(\d+)(\.\.|-)(\d+)", args.filter_range).groups() except AttributeError: pass if not contig: try: contig, start, sep, end = re.match("(\S+):(\d+)(\.\.|-)(\d+)", args.filter_range).groups() strand = None except AttributeError: pass if not contig: try: start, end = re.match("(\d+)(\.\.|\,|\-)(\d+)", args.filter_range).groups() except AttributeError: raise "can not parse range %s" % args.filter_range contig = None strand = None if start: interval = (int(start), int(end)) else: interval = None E.debug("filter: contig=%s, strand=%s, interval=%s" % (str(contig), str(strand), str(interval))) for gff in GTF.iterator_filtered(gffs, contig=contig, strand=strand, interval=interval): args.stdout.write(str(gff) + "\n") elif args.method == "sanitize": def assemblyReport(id): if id in assembly_dict.keys(): id = assembly_dict[id] # if not in dict, the contig name is forced # into the desired convention, this is helpful user # modified gff files that contain additional contigs elif args.sanitize_method == "ucsc": if not id.startswith("contig") and not id.startswith("chr"): id = "chr%s" % id elif args.sanitize_method == "ensembl": if id.startswith("contig"): return id[len("contig"):] elif id.startswith("chr"): return id[len("chr"):] return id if args.sanitize_method == "genome": if genome_fasta is None: raise ValueError("please specify --genome-file= when using " "--sanitize-method=genome") f = genome_fasta.getToken else: if 
args.assembly_report is None: raise ValueError( "please specify --assembly-report= when using " "--sanitize-method=ucsc or ensembl") f = assemblyReport skipped_contigs = collections.defaultdict(int) outofrange_contigs = collections.defaultdict(int) filtered_contigs = collections.defaultdict(int) for gff in gffs: try: gff.contig = f(gff.contig) except KeyError: if args.skip_missing: skipped_contigs[gff.contig] += 1 continue else: raise if genome_fasta: lcontig = genome_fasta.getLength(gff.contig) if lcontig < gff.end: outofrange_contigs[gff.contig] += 1 continue if args.contig_pattern: to_remove = [ re.compile(x) for x in args.contig_pattern.split(",") ] if any([x.search(gff.contig) for x in to_remove]): filtered_contigs[gff.contig] += 1 continue args.stdout.write(str(gff) + "\n") if skipped_contigs: E.info("skipped %i entries on %i contigs: %s" % (sum(skipped_contigs.values()), len(list(skipped_contigs.keys())), str(skipped_contigs))) if outofrange_contigs: E.warn( "skipped %i entries on %i contigs because they are out of range: %s" % (sum(outofrange_contigs.values()), len(list( outofrange_contigs.keys())), str(outofrange_contigs))) if filtered_contigs: E.info("filtered out %i entries on %i contigs: %s" % (sum(filtered_contigs.values()), len(list(filtered_contigs.keys())), str(filtered_contigs))) elif args.method == "rename-chr": if not chr_map: raise ValueError("please supply mapping file") for gff in renameChromosomes(gffs, chr_map): args.stdout.write(str(gff) + "\n") else: for gff in gffs: if args.method == "forward_coordinates": gff.invert(contigs[gff.contig]) if args.method == "forward_strand": gff.invert(contigs[gff.contig]) gff.strand = "+" if agp: # note: this works only with forward coordinates gff.contig, gff.start, gff.end = agp.mapLocation( gff.contig, gff.start, gff.end) args.stdout.write(str(gff) + "\n") E.stop()
def main(argv=None):
    """Entry point: annotate genomic segments derived from a gene set.

    Reads a GTF stream from stdin, dispatches on ``--method`` to one of the
    module-level segment builders (annotateGenome, buildTerritories, ...)
    and finishes with the E.stop() bookkeeping.  ``argv`` defaults to
    ``sys.argv`` when not given.
    """
    if not argv:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome.")

    parser.add_argument("-i", "--ignore-missing", dest="ignore_missing",
                        action="store_true",
                        help="Ignore transcripts on contigs that are not "
                        "in the genome-file.")

    parser.add_argument("-s", "--restrict-source", dest="restrict_source",
                        type=str,
                        choices=("protein_coding", "pseudogene", "lncRNA"),
                        help="restrict input by source.")

    parser.add_argument("-m", "--method", dest="method", type=str,
                        choices=(
                            "full",
                            "genome",
                            "exons",
                            "promotors",
                            "tts",
                            "regulons",
                            "tts-regulons",
                            "genes",
                            "territories",
                            "tss-territories",
                            "great-domains",
                        ),
                        help="method for defining segments.")

    parser.add_argument("-r", "--territory-extension", dest="radius", type=int,
                        help="radius of a territory.")

    parser.add_argument("-f", "--flank-size", dest="flank", type=int,
                        help="size of the flanking region next to a gene.")

    parser.add_argument(
        "--flank-increment-size", dest="increment", type=int,
        help="size of increment in flank in genestructure annotation ")

    parser.add_argument("-p", "--promotor-size", dest="promotor", type=int,
                        help="size of a promotor region.")

    parser.add_argument("-u", "--upstream-extension", dest="upstream",
                        type=int,
                        help="size of region upstream of tss.")

    parser.add_argument("-d", "--downstream-extension", dest="downstream",
                        type=int,
                        help="size of region downstream of tss.")

    parser.add_argument("--gene-detail", dest="detail", type=str,
                        choices=("introns+exons", "exons", "introns"),
                        help="level of detail for gene structure annotation ")

    parser.add_argument("--merge-overlapping-promotors",
                        dest="merge_promotors", action="store_true",
                        help="merge overlapping promotors.")

    parser.add_argument(
        "--min-intron-length", dest="min_intron_length", type=int,
        help="minimum intron length. If the distance between two "
        "consecutive exons is smaller, the region will be marked "
        "'unknown'.")

    # note the inverted sense: --is-unsorted *clears* is_sorted
    parser.add_argument(
        "--is-unsorted", dest="is_sorted", action="store_false",
        help="sort input before processing. Otherwise, the input is assumed "
        "to be sorted.")

    parser.set_defaults(
        genome_file=None,
        flank=1000,
        increment=1000,
        max_frameshift_length=4,
        min_intron_length=30,
        ignore_missing=False,
        restrict_source=None,
        method="genome",
        radius=50000,
        promotor=5000,
        merge_promotors=False,
        upstream=5000,
        downstream=5000,
        detail="exons",
        is_sorted=True,
    )

    (args) = E.start(parser)

    # a genome is mandatory: contig sizes are needed by the annotators
    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        raise ValueError("please specify a --genome-file")

    # optionally restrict the GTF stream to a single source
    if not args.restrict_source:
        iterator = GTF.iterator(args.stdin)
    elif args.restrict_source:
        iterator = GTF.iterator_filtered(GTF.iterator(args.stdin),
                                         source=args.restrict_source)

    # elif options.method in ("promotors", "tts", "regulons"):
    #     iterator = GTF.iterator_filtered( GTF.iterator(options.stdin), source = "protein_coding")
    # else:
    #     iterator = GTF.iterator(options.stdin)

    if not args.is_sorted:
        iterator = GTF.iterator_sorted(iterator, sort_order="position")

    # dispatch on method; the annotate*/build* helpers appear to emit
    # output themselves -- the `segmentor` return value is never used here.
    # NOTE(review): confirm the helpers write to args.stdout internally.
    if args.method == "full" or args.method == "genome":
        segmentor = annotateGenome(iterator, fasta, args)
    elif args.method == "territories":
        segmentor = buildTerritories(iterator, fasta, 'gene', args)
    elif args.method == "tss-territories":
        segmentor = buildTerritories(iterator, fasta, 'tss', args)
    elif args.method == "exons":
        segmentor = annotateExons(iterator, fasta, args)
    elif args.method == "promotors":
        segmentor = annotatePromoters(iterator, fasta, args)
    elif args.method == "regulons":
        segmentor = annotateRegulons(iterator, fasta, True, args)
    elif args.method == "tts-regulons":
        segmentor = annotateRegulons(iterator, fasta, False, args)
    elif args.method == "tts":
        segmentor = annotateTTS(iterator, fasta, args)
    elif args.method == "genes":
        segmentor = annotateGenes(iterator, fasta, args)
    elif args.method == "great-domains":
        segmentor = annotateGREATDomains(iterator, fasta, args)

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Legacy optparse entry point: reads a GTF from stdin, requires an
    indexed genome, and delegates all work to the module-level
    annotateGenome().
    """

    if not argv:
        argv = sys.argv

    # setup command line parser (old-style E.OptionParser / optparse API)
    parser = E.OptionParser(
        version=
        "%prog version: $Id: gtf2fasta.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option(
        "-i", "--ignore-missing", dest="ignore_missing", action="store_true",
        help=
        "Ignore transcripts on contigs that are not in the genome-file [default=%default]."
    )

    parser.add_option(
        "--min-intron-length", dest="min_intron_length", type="int",
        help=
        "minimum intron length. If the distance between two consecutive exons is smaller, the region will be marked 'unknown' [default=%default]."
    )

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("full", ),
                      help="method to apply [default=%default].")

    parser.set_defaults(
        genome_file=None,
        flank=1000,
        max_frameshift_length=4,
        min_intron_length=30,
        ignore_missing=False,
        restrict_source=None,
        method="full",
        report_step=1000,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # an indexed genome is mandatory - sequence is needed for annotation
    if not options.genome_file:
        raise ValueError("an indexed genome is required.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    # iterate transcript by transcript over the GTF stream on stdin
    iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))

    annotateGenome(iterator, fasta, options)

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    argparse port of the legacy gtf2fasta entry point: reads a GTF from
    stdin, requires an indexed genome, and delegates all work to the
    module-level annotateGenome().

    Raises
    ------
    ValueError
        if no --genome-file is supplied.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome")

    parser.add_argument(
        "-i", "--ignore-missing", dest="ignore_missing", action="store_true",
        help="Ignore transcripts on contigs that are not in the genome-file.")

    # FIX: the closing quote around 'unknown' was lost in the argparse
    # port (the optparse original reads "... marked 'unknown'").
    parser.add_argument(
        "--min-intron-length", dest="min_intron_length", type=int,
        help="minimum intron length. If the distance between two "
        "consecutive exons is smaller, the region will be marked 'unknown'.")

    parser.add_argument("-m", "--method", dest="method", type=str,
                        choices=["full"],
                        help="method to apply")

    parser.set_defaults(
        genome_file=None,
        flank=1000,
        max_frameshift_length=4,
        min_intron_length=30,
        ignore_missing=False,
        restrict_source=None,
        method="full",
        report_step=1000,
    )

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv, add_output_options=True)

    # an indexed genome is mandatory - sequence is needed for annotation
    if not args.genome_file:
        raise ValueError("an indexed genome is required.")

    fasta = IndexedFasta.IndexedFasta(args.genome_file)

    # iterate transcript by transcript over the GTF stream on stdin
    iterator = GTF.transcript_iterator(GTF.iterator(args.stdin))

    annotateGenome(iterator, fasta, args)

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Filters a genome-mapped BAM (single positional argument) against a
    gene set, transcriptome mappings, junction mappings and exclusion
    regions via bams2bam_filter(), writing the filtered alignments to
    stdout (SAM or BAM).

    Raises
    ------
    ValueError
        if not exactly one BAM file is given.
    IOError
        if --filename-mismapped exists and --force-output is not set.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with gene models in gtf format [%default]")

    parser.add_option(
        "-m", "--filename-mismapped", dest="filename_mismapped",
        type="string",
        help="output bam file for mismapped reads [%default]")

    parser.add_option(
        "-j", "--junctions-bed-file", dest="filename_junctions",
        type="string",
        help="bam file with reads mapped across junctions [%default]")

    parser.add_option(
        "-r", "--filename-regions", dest="filename_regions", type="string",
        help="filename with regions to remove in bed format [%default]")

    parser.add_option(
        "-t", "--transcripts-gtf-file", dest="filename_transcriptome",
        type="string",
        help="bam file with reads mapped against transcripts [%default]")

    parser.add_option(
        "-p", "--map-tsv-file", dest="filename_map", type="string",
        help="filename mapping transcript numbers (used by "
        "--filename-transciptome) to transcript names "
        "(used by --filename-gtf) [%default]")

    parser.add_option(
        "-s", "--filename-stats", dest="filename_stats", type="string",
        help="filename to output stats to [%default]")

    parser.add_option(
        "-o", "--colour", dest="colour_mismatches", action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option(
        "-i", "--ignore-mismatches", dest="ignore_mismatches",
        action="store_true",
        help="ignore mismatches [%default]")

    parser.add_option(
        "-c", "--remove-contigs", dest="remove_contigs", type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option(
        "-f", "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files [%default]")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        id_map = iotools.read_map(
            iotools.open_file(options.filename_map), has_header=True)
        # invert: map from gtf transcript names back to transcript numbers
        id_map = dict([(y, x) for x, y in id_map.items()])
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(iotools.open_file(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    # transcript not in map - skip it
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(iotools.open_file(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.AlignmentFile(
            options.filename_transcriptome, "rb")
    else:
        transcripts_samfile = None

    # "wh" = SAM with header, "wb" = BAM; both stream to stdout
    if options.output_sam:
        output_samfile = pysam.AlignmentFile("-", "wh",
                                             template=genome_samfile)
    else:
        output_samfile = pysam.AlignmentFile("-", "wb",
                                             template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.AlignmentFile(options.filename_mismapped,
                                               "wb",
                                               template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.AlignmentFile(options.filename_junctions,
                                                "rb")
    else:
        junctions_samfile = None

    c = bams2bam_filter(genome_samfile,
                        output_samfile,
                        output_mismapped,
                        transcripts_samfile,
                        junctions_samfile,
                        transcripts,
                        regions=regions_to_remove,
                        unique=options.unique,
                        remove_contigs=options.remove_contigs,
                        colour_mismatches=options.colour_mismatches,
                        ignore_mismatches=options.ignore_mismatches,
                        ignore_transcripts=transcripts_samfile is None,
                        ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        outf = iotools.open_file(options.filename_stats, "w")
        outf.write("category\tcounts\n%s\n" % c.asTable())
        outf.close()

    if options.filename_transcriptome:
        transcripts_samfile.close()

    # FIX: the junctions file was opened but never closed (resource leak)
    if junctions_samfile is not None:
        junctions_samfile.close()

    genome_samfile.close()
    output_samfile.close()
    if output_mismapped:
        output_mismapped.close()

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Annotates windows (supplied as a gff file) contig by contig with
    features computed from an optional data gff/gtf file, writing a
    tab-separated table to stdout via annotateWindows().

    Raises
    ------
    ValueError
        if --windows-bed-file is not supplied.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome (indexed).")

    parser.add_argument("-w", "--windows-bed-file", dest="filename_windows",
                        type=str,
                        help="gff file with windows to use.")

    parser.add_argument("-d", "--filename-data", dest="filename_data",
                        type=str,
                        help="gff file with data to use.")

    parser.add_argument("--is-gtf", dest="is_gtf", action="store_true",
                        help="filename-data is gtf file")

    parser.add_argument("-f", "--features", dest="features", type=str,
                        action="append", choices=("GC", ),
                        help="features to compute.")

    parser.add_argument("-c", "--decorator", dest="decorator", type=str,
                        choices=("counts", "gc", "gc3", "mean-length",
                                 "median-length", "percent-coverage",
                                 "median-score", "mean-score", "stddev-score",
                                 "min-score", "max-score"),
                        help="decorators to use.")

    parser.add_argument("-e", "--skip-empty", dest="skip_empty",
                        action="store_true",
                        help="skip empty windows.")

    # FIX: option string was "--transform=" (stray '=' left over from the
    # optparse conversion); "--transform" still accepts --transform=VALUE.
    parser.add_argument(
        "-t", "--transform", dest="transform", type=str,
        choices=("none", "overlap", "complement", "third_codon"),
        help="transform to use when mapping overlapping regions onto window.")

    parser.set_defaults(
        genome_file=None,
        filename_windows=None,
        filename_data=None,
        features=[],
        skip_empty=False,
        decorator="counts",
        transform="none",
        is_gtf=False,
    )

    (args) = E.start(parser)

    #    test_transform_third_codon()

    if not args.filename_windows:
        raise ValueError("please supply a gff file with window information.")

    if args.loglevel >= 1:
        args.stdlog.write("# reading windows...")
        args.stdlog.flush()

    windows = GTF.readAsIntervals(
        GTF.iterator(iotools.open_file(args.filename_windows, "r")))

    if args.loglevel >= 1:
        args.stdlog.write("done\n")
        args.stdlog.flush()

    if args.filename_data:
        if args.loglevel >= 1:
            args.stdlog.write("# reading data...")
            args.stdlog.flush()

        # FIX: the non-gtf branch read "IOTOols.open_file" - a NameError
        # typo for iotools.open_file (the opener used everywhere else in
        # this block).  NOTE(review): after the fix both branches are
        # identical; the non-gtf branch may originally have been meant to
        # use a plain-GFF reader - confirm against the script's docs.
        if args.is_gtf:
            gff_data = GTF.readFromFile(
                iotools.open_file(args.filename_data, "r"))
        else:
            gff_data = GTF.readFromFile(
                iotools.open_file(args.filename_data, "r"))

        if args.loglevel >= 1:
            args.stdlog.write("done\n")
            args.stdlog.flush()

        data_ranges = GTF.SortPerContig(gff_data)
    else:
        # use windows to compute properties
        # by supplying no data and asking for the complement = original window
        gff_data = None
        data_ranges = None
        args.transform = "complement"

    map_contig2size = {}

    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
        map_contig2size = fasta.getContigSizes()
    else:
        for contig, values in list(windows.items()):
            # FIX: was max(lambda x: x[1], values) which raises TypeError
            # in Python 3 (compares a function with a list); intended is
            # the largest window end coordinate on the contig.
            map_contig2size[contig] = max(x[1] for x in values)
        fasta = None

    contigs = list(map_contig2size.keys())
    contigs.sort()

    # proceed contig wise
    noutput_contigs, ncontigs_skipped_windows, ncontigs_skipped_data = 0, 0, 0

    args.stdout.write("\t".join(
        map(str, ("contig", "start", "end", "ngenes", "ntranscripts", "n1",
                  "l1", "n2", "l2", "score", "extra_info"))) + "\n")

    for contig in contigs:
        skip = False
        if contig not in windows:
            ncontigs_skipped_windows += 1
            skip = True
        if data_ranges and contig not in data_ranges:
            ncontigs_skipped_data += 1
            skip = True
        if skip:
            continue

        noutput_contigs += 1
        if data_ranges:
            annotateWindows(
                contig, windows[contig],
                gff_data[data_ranges[contig][0]:data_ranges[contig][1]],
                fasta, args)
        else:
            annotateWindows(contig, windows[contig], [], fasta, args)

    E.info(
        "ninput_windows=%i, noutput_contigs=%i, ninput_contigs=%i, nskipped_windows=%i, nskipped_data=%i"
        % (len(windows), noutput_contigs, len(contigs),
           ncontigs_skipped_windows, ncontigs_skipped_data))

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a BAM stream from stdin and, against an exon index built from
    --exons-file, tallies how far spliced and unspliced reads over- or
    under-run annotated exon boundaries.  Writes a histogram to the
    "overrun" output file and a summary counter table to stdout.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-e", "--exons-file", "--gtf-file",
                        dest="filename_exons", type=str, metavar="gtf",
                        help="gtf formatted file with non-overlapping exon "
                        "locations (required). ")

    parser.set_defaults(
        filename_exons=None,
        read_length=200,
    )

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser,
                              argv=argv,
                              add_output_options=True,
                              unknowns=True)

    # interval index over exon locations, keyed by contig
    exons = GTF.readAndIndex(
        GTF.iterator(iotools.open_file(args.filename_exons)))

    pysam_in = pysam.AlignmentFile("-", "rb")

    nspliced = 0
    nspliced_ignored = 0
    nspliced_nooverlap = 0
    nspliced_halfoverlap = 0
    nspliced_bothoverlap = 0
    # histogram of size 2*(read_length+10): indices below overrun_offset
    # record underruns, indices above record overruns (mirrored later)
    nspliced_overrun = [0] * 2 * (args.read_length + 10)
    nspliced_exact = 0
    nspliced_inexact = 0

    nunspliced = 0
    nunspliced_overlap = 0
    nunspliced_ignored = 0
    nunspliced_nooverlap = 0
    nunspliced_overrun = [0] * (args.read_length + 10)

    # centre index of the spliced histogram (over/underrun of 0)
    overrun_offset = args.read_length + 10
    ninput = 0
    nunmapped = 0

    c = E.Counter()

    def _splice_overrun(start, end, overlap):
        '''return splicesite over/underrun.

        positive values: overrun
        negative values: underrun
        0: no over/underrun
        '''

        # merge all overlapping exons into one span
        exon_start = min([x[0] for x in overlap])
        exon_end = max([x[1] for x in overlap])

        if start <= exon_start and end > exon_start:
            # overrun at start or match
            r = exon_start - start
        elif start < exon_end and end >= exon_end:
            # overrun at end or match
            r = end - exon_end
        else:
            # underrun - distance to closest exon boundary
            r = -min(start - exon_start, exon_end - end)

        return r

    for read in pysam_in:
        ninput += 1
        if read.is_unmapped:
            nunmapped += 1
            continue

        # check for BAM_CREF_SKIP code in cigar string
        cigar = read.cigar
        is_spliced = 3 in [x[0] for x in cigar]

        contig = pysam_in.getrname(read.tid)
        start = read.pos
        end = read.aend
        if is_spliced:
            # count both ends
            nspliced += 1

            # only simple M-N-M alignments are evaluated
            if len(cigar) != 3:
                nspliced_ignored += 1
                continue

            # 5' segment is the first cigar block, 3' segment the last
            start5, end5 = start, start + cigar[0][1]
            start3, end3 = end - cigar[2][1], end
            try:
                overlap3 = list(exons.get(contig, start3, end3))
                overlap5 = list(exons.get(contig, start5, end5))
            except KeyError:
                # contig not in exon index
                overlap3 = overlap5 = []

            ovl3 = len(overlap3)
            ovl5 = len(overlap5)
            o3 = o5 = None
            if not ovl3 and not ovl5:
                nspliced_nooverlap += 1
            elif ovl3 and not ovl5:
                nspliced_halfoverlap += 1
                o3 = _splice_overrun(start3, end3, overlap3)
            elif ovl5 and not ovl3:
                nspliced_halfoverlap += 1
                o5 = _splice_overrun(start5, end5, overlap5)
            else:
                # both overlap
                nspliced_bothoverlap += 1
                o3 = _splice_overrun(start3, end3, overlap3)
                o5 = _splice_overrun(start5, end5, overlap5)

            # record each evaluated end; offset shifts underruns into
            # the lower half of the histogram
            if o3 is not None:
                if o3 == 0:
                    nspliced_exact += 1
                else:
                    nspliced_inexact += 1
                nspliced_overrun[max(0, overrun_offset + o3)] += 1
            if o5 is not None:
                if o5 == 0:
                    nspliced_exact += 1
                else:
                    nspliced_inexact += 1
                nspliced_overrun[max(0, overrun_offset + o5)] += 1
        else:
            nunspliced += 1
            try:
                overlap = list(exons.get(contig, start, end))
            except KeyError:
                overlap = []

            if len(overlap) == 0:
                nunspliced_nooverlap += 1
            elif len(overlap) >= 1:
                nunspliced_overlap += 1
                # multiple overlap - merge exons (usually: small introns)
                exon_start = min([x[0] for x in overlap])
                exon_end = max([x[1] for x in overlap])
                ostart = max(0, exon_start - start)
                oend = max(0, end - exon_end)
                o = min(end, exon_end) - max(start, exon_start)
                overrun = ostart + oend
                nunspliced_overrun[overrun] += 1

    # output histograms
    outfile = E.open_output_file("overrun")
    outfile.write(
        "bases\tunspliced_overrun_counts\tspliced_overrun_counts\tspliced_underrun_counts\n"
    )
    # split the mirrored spliced histogram at the centre; the underrun
    # half is reversed so both halves count outward from zero
    _nspliced_overrun = nspliced_overrun[overrun_offset:]
    _nspliced_underrun = nspliced_overrun[:overrun_offset + 1]
    _nspliced_underrun.reverse()
    for x, v in enumerate(
            zip(nunspliced_overrun, _nspliced_overrun, _nspliced_underrun)):
        outfile.write("%i\t%s\n" % (x, "\t".join(map(str, v))))
    outfile.close()

    # output summary
    # convert to counter
    c.input = ninput
    c.unmapped = nunmapped
    c.mapped = ninput - nunmapped

    c.unspliced = nunspliced
    c.unspliced_nooverlap = nunspliced_nooverlap
    c.unspliced_nooverrun = nunspliced_overrun[0]
    c.unspliced_overlap = nunspliced_overlap
    c.unspliced_overrun = sum(nunspliced_overrun[1:])

    c.spliced = nspliced
    c.spliced_nooverlap = nspliced_nooverlap
    c.spliced_halfoverlap = nspliced_halfoverlap
    c.spliced_bothoverlap = nspliced_bothoverlap
    c.spliced_exact = nspliced_exact
    c.spliced_inexact = nspliced_inexact
    c.spliced_ignored = nspliced_ignored
    c.spliced_underrun = sum(_nspliced_underrun[1:])
    c.spliced_overrun = sum(_nspliced_overrun[1:])

    outfile = args.stdout
    outfile.write("category\tcounts\n")
    for k, v in sorted(c.items()):
        outfile.write("%s\t%i\n" % (k, v))

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Compares novel transcripts (GTF on stdin) against a reference gene
    set to detect retained/extra 3'UTR introns ("utrons").  Transcripts
    are matched to reference models via a class table (--class-file),
    required to share a start codon and an identical CDS intron chain;
    the 3'UTR introns found are written as BED intervals to --outfile and
    the optional per-intron / partnered / novel output files.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $1.0$",
                            usage=globals()["__doc__"])

    parser.add_option("-r", "--reffile", dest="reffile", type="string",
                      help="Supply reference gtf file name")

    parser.add_option("-d", "--class-file", dest="classfile", type="string",
                      help="Supply database name")

    parser.add_option("-o", "--outfile", dest="outfile", type="string",
                      help="Supply output bed file name")

    parser.add_option("-u", "--indivfile", dest="indivfile", type="string",
                      help="Supply output bed file name for individual utrons")

    parser.add_option("-p", "--partfile", dest="partfile", type="string",
                      help="Supply output bed file name for partnered utrons")

    parser.add_option(
        "-q", "--indivpartfile", dest="indivpartfile", type="string",
        help="Supply output bed file name for individual partnered utrons")

    parser.add_option("-n", "--novel-file", dest="novelfile", type="string",
                      help="Supply output bed file name for novel introns")

    parser.add_option(
        "--novel-transcript", dest="novel_id", type="string",
        help="DEBUG: Output info for this transcript from the STDIN")

    parser.add_option(
        "--target-transcript", dest="target_id", type="string",
        help="DEBUG: Output info for this transcript from ref-file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    outlines = []
    individuals = []
    partnered = []
    individualpartnered = []
    novel = []

    db = pandas.read_csv(options.classfile, sep="\t")

    # This keeps just one entry per-transcript - why?
    #db = db.groupby("transcript_id").first()
    db = db.set_index("transcript_id")

    enshashtable = getGeneTable(options.reffile)

    for novel_transcript in GTF.transcript_iterator(GTF.iterator(
            options.stdin)):
        # Why do it on a gene by gene basis rather than transcript by
        # transcript basis?
        transcript_id = novel_transcript[0].transcript_id

        # debug tracing is only emitted for the transcript named by
        # --novel-transcript
        if transcript_id == options.novel_id:
            output_novel = True
        else:
            output_novel = False

        try:
            geneid = db.loc[transcript_id].match_gene_id
        except KeyError:
            if output_novel:
                E.debug("Transcript %s not in class table" % transcript_id)
            continue

        if pandas.isnull(geneid):
            if output_novel:
                E.debug("Transcript %s matches no gene in class table" %
                        transcript_id)
            continue

        ens_gene = enshashtable[geneid]

        # collect every reference intron of the gene for the novelty check
        all_ref_introns = set()
        novel_transcript_exons = GTF.asRanges(novel_transcript, "exon")
        novel_transcript_introns = GTF.toIntronIntervals(novel_transcript)
        for ref_transcript in ens_gene["models"].values():
            ref_introns = GTF.toIntronIntervals(ref_transcript)
            all_ref_introns.update(ref_introns)

        #Identify comparison set
        def _in_exon(position, exons):
            return any(e[0] <= position <= e[1] for e in exons)

        # check if this ever gets the wrong start_codon.
        filtered_starts = [
            s for s in ens_gene["start_codons"]
            if _in_exon(s, novel_transcript_exons)
        ]

        if len(filtered_starts) == 0:
            if output_novel:
                E.debug("No starts found for %s" % transcript_id)
            continue

        #if novel_transcript[0].strand == "-":
        #    selected_start = max(filtered_starts)
        #else:
        #    selected_start = min(filtered_starts)

        # reference models whose start codon lies within the novel exons
        selected_models = list()
        for startc in filtered_starts:
            selected_models.extend(ens_gene["start_codons"][startc])

        if output_novel:
            E.debug("Transcripts with compatible starts are %s" %
                    selected_models)

        for ref_transcript_id in selected_models:
            if output_novel and ref_transcript_id == options.target_id:
                output_ref = True
            else:
                output_ref = False

            second = ens_gene["models"][ref_transcript_id]
            ens_CDS = GTF.asRanges(second, "CDS")

            if len(ens_CDS) == 0:
                if output_ref:
                    # FIX: format argument was missing - the message
                    # printed a literal "%s"
                    E.debug("%s is not coding" % ref_transcript_id)
                # ensure only protein-coding transcripts
                continue

            ens_exons = GTF.asRanges(second, "exon")

            first_introns = set(novel_transcript_introns)
            second_introns = set(GTF.toIntronIntervals(second))

            # introns strictly inside the reference CDS span
            first_CDSintrons = [
                intron for intron in first_introns
                if (intron[0] > ens_CDS[0][0] and intron[1] < ens_CDS[-1][1])
            ]

            second_CDSintrons = [
                intron for intron in second_introns
                if (intron[0] > ens_CDS[0][0] and intron[1] < ens_CDS[-1][1])
            ]

            first_CDSintrons = set(first_CDSintrons)
            second_CDSintrons = set(second_CDSintrons)

            if not first_CDSintrons == second_CDSintrons:
                if output_ref:
                    E.debug("CDS chains do not match. Chains are:")
                    first_CDSintrons = sorted(list(first_CDSintrons))
                    second_CDSintrons = sorted(list(second_CDSintrons))
                    output = "\n".join(
                        map(str, zip(first_CDSintrons, second_CDSintrons)))
                    E.debug(output)
                continue  # match CDS intron chain

            firstUTRintrons = first_introns - first_CDSintrons

            if len(firstUTRintrons) == 0:
                if output_ref:
                    E.debug("No UTR introns")
                continue

            secondUTRintrons = second_introns - second_CDSintrons

            found = False
            for intron in first_introns:
                if (intron[0] < ens_CDS[-1][1] and
                        intron[1] > ens_CDS[-1][1]) or \
                   (intron[0] < ens_CDS[0][0] and
                        intron[1] > ens_CDS[0][0]):
                    found = True
                    break
            # ensure pruned transcript doesn't have
            # introns overlapping start or stop codons in ensembl
            # transcript
            if found:
                if output_ref:
                    E.debug("Start or stop in intron")
                continue

            # select introns downstream of the stop codon but within the
            # reference transcript extent (= 3'UTR introns), strand-aware
            if second[0].strand == "+":
                ens_stop = ens_CDS[-1][1]
                UTR3introns = [
                    intron for intron in firstUTRintrons
                    if intron[0] >= ens_CDS[-1][1] and
                    intron[1] < ens_exons[-1][1]
                ]
                secondUTR3introns = [
                    intron for intron in secondUTRintrons
                    if intron[0] >= ens_CDS[-1][1] and
                    intron[1] < ens_exons[-1][1]
                ]
            else:
                ens_stop = ens_CDS[0][0]
                UTR3introns = [
                    intron for intron in firstUTRintrons
                    if intron[1] <= ens_CDS[0][0] and
                    intron[0] > ens_exons[0][0]
                ]
                secondUTR3introns = [
                    intron for intron in secondUTRintrons
                    if intron[1] <= ens_CDS[0][0] and
                    intron[0] > ens_exons[0][0]
                ]

            if len(UTR3introns) == 0:
                if output_ref:
                    E.debug("No UTR introns")
                continue

            outbed = Bed.Bed()
            outbed.fields = ['.', '.', '.', '.', '.', '.', '.', '.', '.']
            outbed.fromIntervals(UTR3introns)
            outbed.contig = novel_transcript[0].contig
            outbed["name"] = novel_transcript[0].transcript_id
            outbed["strand"] = novel_transcript[0].strand
            outlines.append(outbed)  # get output for each transcript

            for item in UTR3introns:
                outbed2 = Bed.Bed()
                outbed2.fields = ['.', '.', '.', '.']
                outbed2.fromIntervals([item])
                outbed2.contig = novel_transcript[0].contig
                outbed2['name'] = novel_transcript[0].transcript_id
                outbed2["strand"] = novel_transcript[0].strand
                outbed2["thickStart"] = ens_stop
                individuals.append(outbed2)  # get output for each intron

            UTR3introns = set(UTR3introns)
            secondUTR3introns = set(secondUTR3introns)
            extraUTR3introns = list(UTR3introns - secondUTR3introns)

            if output_ref and len(secondUTR3introns - UTR3introns) > 0:
                E.debug("Following introns in UTR of %s but not %s" %
                        (options.target_id, options.novel_id))
                # FIX: was secondUTRintrons - UTR3introns, inconsistent
                # with the guard above and with the message text
                E.debug(secondUTR3introns - UTR3introns)

            # get only introns that are not in matched transcript
            if len(extraUTR3introns) != 0 and \
                    len(secondUTR3introns - UTR3introns) == 0:
                outbed3 = Bed.Bed()
                outbed3.fields = ['.'] * 9
                outbed3.fromIntervals(extraUTR3introns)
                outbed3.contig = novel_transcript[0].contig
                outbed3["name"] = novel_transcript[
                    0].transcript_id + ":" + second[0].transcript_id
                outbed3["strand"] = novel_transcript[0].strand
                partnered.append(outbed3)

                for item in extraUTR3introns:
                    outbed4 = Bed.Bed()
                    outbed4.fields = ['.', '.', '.', '.']
                    outbed4.fromIntervals([item])
                    outbed4.contig = novel_transcript[0].contig
                    outbed4["name"] = novel_transcript[
                        0].transcript_id + ":" + second[0].transcript_id
                    outbed4["strand"] = novel_transcript[0].strand
                    outbed4["thickStart"] = ens_stop
                    individualpartnered.append(outbed4)

            if len(all_ref_introns) == 0:
                ens_starts, ens_ends = [], []
            else:
                ens_starts, ens_ends = zip(*all_ref_introns)

            # introns sharing neither boundary with any reference intron
            novelEvents = [
                i for i in UTR3introns
                if i[0] not in ens_starts and i[1] not in ens_ends
            ]

            for item in novelEvents:
                outbed5 = Bed.Bed()
                outbed5.fields = ['.'] * 4
                outbed5.fromIntervals([item])
                outbed5.contig = novel_transcript[0].contig
                outbed5["name"] = novel_transcript[
                    0].transcript_id + ":" + second[0].transcript_id
                outbed5["strand"] = novel_transcript[0].strand
                outbed5["thickStart"] = ens_stop
                novel.append(outbed5)

    with IOTools.open_file(options.outfile, "w") as outf:
        for line in outlines:
            outf.write(str(line) + "\n")

    if options.indivfile is not None:
        with IOTools.open_file(options.indivfile, "w") as outf2:
            for line in individuals:
                outf2.write(str(line) + "\n")

    if options.partfile is not None:
        with IOTools.open_file(options.partfile, "w") as outf3:
            for line in partnered:
                outf3.write(str(line) + "\n")

    if options.indivpartfile is not None:
        with IOTools.open_file(options.indivpartfile, "w") as outf4:
            for line in individualpartnered:
                outf4.write(str(line) + "\n")

    if options.novelfile is not None:
        with IOTools.open_file(options.novelfile, "w") as outf5:
            for line in novel:
                outf5.write(str(line) + "\n")

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """Compare two GFF/GTF files and report overlapping and unique genes.

    Takes exactly two positional arguments (GFF/GTF filenames).  The second
    file is indexed into per-contig interval trees; exons of the first file
    are then queried against the index.  Writes three gene-level tables via
    ``getFile``: pairs of overlapping gene ids (``genes_ovl``), genes unique
    to each set (``genes_uniq1``/``genes_uniq2``) and summary counts with
    percentages (``genes_total``).

    :param argv: command line arguments; defaults to ``sys.argv``.
    :raises ValueError: if not exactly two input files are given.
    """
    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-e", "--output-equivalent", dest="write_equivalent",
                      action="store_true",
                      help="write equivalent entries [default=%default].")
    parser.add_option("-f", "--output-full", dest="write_full",
                      action="store_true",
                      help="write full gff entries [default=%default].")
    parser.add_option("-p", "--add-percent", dest="add_percent",
                      action="store_true",
                      help="add percentage columns [default=%default].")
    parser.add_option("-s", "--ignore-strand", dest="ignore_strand",
                      action="store_true",
                      help="ignore strand information [default=%default].")

    parser.set_defaults(
        write_equivalent=False,
        write_full=False,
        add_percent=False,
        ignore_strand=False,
        as_gtf=False,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) != 2:
        raise ValueError("two arguments required")

    input_filename1, input_filename2 = args

    # duplicated features cause a problem. Make sure
    # features are non-overlapping by running
    # gff_combine.py on GFF files first.

    E.info("reading data started")

    # Index the second file: set of its gene ids plus one interval tree
    # of exons per contig for fast overlap queries.
    idx, genes2 = {}, set()
    for e in GTF.readFromFile(iotools.open_file(input_filename2, "r")):
        genes2.add(e.gene_id)
        if e.contig not in idx:
            idx[e.contig] = quicksect.IntervalTree()
        idx[e.contig].add(e.start, e.end, e)

    E.info("reading data finished: %i contigs" % len(idx))

    # outfile_diff and outfile_overlap not implemented
    # outfile_diff = getFile( options, "diff" )
    # outfile_overlap = getFile( options, "overlap" )

    overlapping_genes = set()
    genes1 = set()

    # iterate over exons of the first file, recording which gene pairs overlap
    with iotools.open_file(input_filename1, "r") as infile:
        for this in GTF.iterator(infile):
            genes1.add(this.gene_id)
            try:
                intervals = idx[this.contig].find(
                    quicksect.Interval(this.start, this.end))
            except KeyError:
                # contig absent from second file - nothing can overlap
                continue

            others = [x.data for x in intervals]
            for other in others:
                overlapping_genes.add((this.gene_id, other.gene_id))

            # check for identical/half-identical matches
            # NOTE(review): `output`/`symbol` are computed but never used in
            # this chunk - presumably meant to feed --output-equivalent;
            # kept to preserve behavior, confirm against full script.
            output = None
            for other in others:
                if this.start == other.start and this.end == other.end:
                    output, symbol = other, "="
                    break
            else:
                for other in others:
                    if this.start == other.start or this.end == other.end:
                        output, symbol = other, "|"
                        break
                else:
                    symbol = "~"

    # if outfile_diff != options.stdout: outfile_diff.close()
    # if outfile_overlap != options.stdout: outfile_overlap.close()

    def _percent(part, total):
        """Return *part* as a percentage of *total*, 0.0 if *total* is 0."""
        return 100.0 * part / total if total else 0.0

    ##################################################################
    # print gene based information
    ##################################################################
    outfile = None
    if overlapping_genes:
        outfile = getFile(options, "genes_ovl")
        outfile.write("gene_id1\tgene_id2\n")
        for gene_id1, gene_id2 in sorted(overlapping_genes):
            outfile.write("%s\t%s\n" % (gene_id1, gene_id2))
        if outfile != options.stdout:
            outfile.close()

    outfile_total = getFile(options, "genes_total")
    outfile_total.write(
        "set\tngenes\tnoverlapping\tpoverlapping\tnunique\tpunique\n")

    # genes of set 1 without any overlap in set 2
    outfile = getFile(options, "genes_uniq1")
    b = set([x[0] for x in overlapping_genes])
    d = genes1.difference(b)
    outfile.write("gene_id1\n")
    outfile.write("\n".join(sorted(d)) + "\n")
    if outfile != options.stdout:
        outfile.close()

    # BUGFIX: percentages were previously divided by len(a), where `a` was a
    # leftover loop variable (a gene_id string) - and unbound if no overlaps.
    outfile_total.write(
        "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" %
        (os.path.basename(input_filename1),
         len(genes1),
         len(b),
         _percent(len(b), len(genes1)),
         len(d),
         _percent(len(d), len(genes1))))

    # genes of set 2 without any overlap in set 1
    outfile = getFile(options, "genes_uniq2")
    b = set([x[1] for x in overlapping_genes])
    d = genes2.difference(b)
    outfile.write("gene_id2\n")
    outfile.write("\n".join(sorted(d)) + "\n")
    if outfile != options.stdout:
        outfile.close()

    outfile_total.write(
        "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" %
        (os.path.basename(input_filename2),
         len(genes2),
         len(b),
         _percent(len(b), len(genes2)),
         len(d),
         _percent(len(d), len(genes2))))

    if outfile_total != options.stdout:
        outfile_total.close()

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads gff/gtf/bed intervals from stdin and computes, per the selected
    --method(s): size/distance histograms ("hist"), summary statistics
    ("stats"), raw value lists ("values") and a table of overlapping
    interval pairs ("overlaps").
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # NOTE(review): bin_size is declared type="string" and later passed as
    # Histogram.Calculate(increment=...) - presumably Histogram converts it;
    # confirm, otherwise this looks like it should be type="float".
    parser.add_option("-b", "--bin-size", dest="bin_size", type="string",
                      help="bin size.")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum value for histogram.")
    parser.add_option("--max-value", dest="max_value", type="float",
                      help="maximum value for histogram.")
    # the next two options deliberately share dest="no_empty_bins"
    parser.add_option("--no-empty-bins", dest="no_empty_bins",
                      action="store_true",
                      help="do not display empty bins.")
    parser.add_option("--with-empty-bins", dest="no_empty_bins",
                      action="store_false",
                      help="display empty bins.")
    parser.add_option(
        "--ignore-out-of-range", dest="ignore_out_of_range",
        action="store_true",
        # NOTE(review): help text is missing its closing parenthesis
        help="ignore values that are out of range (as opposed to truncating "
        "them to range border.")
    parser.add_option("--missing-value", dest="missing_value", type="string",
                      help="entry for missing values [%default].")
    parser.add_option("--use-dynamic-bins", dest="dynamic_bins",
                      action="store_true",
                      help="each value constitutes its own bin.")
    parser.add_option("--format", dest="format", type="choice",
                      choices=("gff", "gtf", "bed"),
                      help="input file format [%default].")
    parser.add_option("--method", dest="methods", type="choice",
                      action="append",
                      choices=("all", "hist", "stats", "overlaps", "values"),
                      help="methods to apply [%default].")
    parser.add_option("--output-section", dest="output_section", type="choice",
                      choices=("all", "size", "distance"),
                      help="data to compute [%default].")

    parser.set_defaults(
        no_empty_bins=True,
        bin_size=None,
        dynamic_bins=False,
        ignore_out_of_range=False,
        min_value=None,
        max_value=None,
        nonull=None,
        missing_value="na",
        output_filename_pattern="%s",
        methods=[],
        output_section="all",
        format="gff",
    )

    (options, args) = E.start(parser, add_output_options=True)

    # "all" expands to every counting method except "values"
    if "all" in options.methods:
        options.methods = ("hist", "stats", "overlaps")
        if not options.output_filename_pattern:
            options.output_filename_pattern = "%s"

    if len(options.methods) == 0:
        raise ValueError(
            "please provide counting method using --method option")

    # choose the input iterator based on the declared format
    if options.format in ("gff", "gtf"):
        gffs = GTF.iterator(options.stdin)
    elif options.format == "bed":
        gffs = Bed.iterator(options.stdin)

    values_between = []   # gaps between consecutive non-overlapping intervals
    values_within = []    # interval sizes (end - start)
    values_overlaps = []  # overlap length per pair (0 where no overlap)

    if "overlaps" in options.methods:
        if not options.output_filename_pattern:
            options.output_filename_pattern = "%s"
        outfile_overlaps = E.open_output_file("overlaps")
    else:
        outfile_overlaps = None

    last = None
    ninput, noverlaps = 0, 0
    # single pass over the (assumed position-sorted) input, comparing each
    # interval to the previous one on the same contig
    for this in gffs:
        ninput += 1
        values_within.append(this.end - this.start)

        if last and last.contig == this.contig:
            if this.start < last.end:
                # overlapping pair: record overlap length, keep whichever
                # interval extends further right as the comparison point
                noverlaps += 1
                if outfile_overlaps:
                    outfile_overlaps.write("%s\t%s\n" % (str(last), str(this)))
                values_overlaps.append(
                    min(this.end, last.end) - max(last.start, this.start))
                if this.end > last.end:
                    last = this
                continue
            else:
                # disjoint pair on same contig: record the gap
                values_between.append(this.start - last.end)
                # if this.start - last.end < 10:
                #     print str(last)
                #     print str(this)
                #     print "=="
                values_overlaps.append(0)

        last = this

    if "hist" in options.methods:
        outfile = E.open_output_file("hist")

        h_within = Histogram.Calculate(
            values_within,
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        h_between = Histogram.Calculate(
            values_between,
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        if "all" == options.output_section:
            outfile.write("residues\tsize\tdistance\n")
            combined_histogram = Histogram.Combine(
                [h_within, h_between], missing_value=options.missing_value)
            Histogram.Write(outfile, combined_histogram,
                            nonull=options.nonull)
        elif options.output_section == "size":
            outfile.write("residues\tsize\n")
            Histogram.Write(outfile, h_within, nonull=options.nonull)
        elif options.output_section == "distance":
            outfile.write("residues\tdistance\n")
            Histogram.Write(outfile, h_between, nonull=options.nonull)

        outfile.close()

    if "stats" in options.methods:
        outfile = E.open_output_file("stats")
        outfile.write("data\t%s\n" % Stats.Summary().getHeader())
        if options.output_section in ("size", "all"):
            outfile.write("size\t%s\n" % str(Stats.Summary(values_within)))
        if options.output_section in ("distance", "all"):
            outfile.write("distance\t%s\n" %
                          str(Stats.Summary(values_between)))
        outfile.close()

    if "values" in options.methods:
        outfile = E.open_output_file("distances")
        outfile.write("distance\n%s\n" % "\n".join(map(str, values_between)))
        outfile.close()
        outfile = E.open_output_file("sizes")
        outfile.write("size\n%s\n" % "\n".join(map(str, values_within)))
        outfile.close()
        # NOTE(review): re-opens the "overlaps" output name; if --method
        # overlaps is also active this presumably clobbers that file - verify.
        outfile = E.open_output_file("overlaps")
        outfile.write("overlap\n%s\n" % "\n".join(map(str, values_overlaps)))
        outfile.close()

    E.info("ninput=%i, ndistance=%i, nsize=%i, noverlap=%i" %
           (ninput,
            len(values_between),
            len(values_within),
            noverlaps))

    E.stop()