def main(argv=None):
    """Write the gene-structure disruptions of predictions read from stdin.

    Parses the command line, then reads one prediction per line from
    ``sys.stdin`` (lines starting with ``#`` are skipped).  For each
    prediction the genomic sequence is fetched from the indexed genome file
    and ``Genomics.CountGeneFeatures`` is called; every disruption it
    returns is written as one tab-separated line to ``options.stdout``:
    prediction id, disruption type, cds from/to and genome from/to (the
    genome positions are offset by the prediction's ``mSbjctGenomeFrom``).

    Parameters
    ----------
    argv : list of str, optional
        Command line; defaults to ``sys.argv``.
        NOTE(review): *argv* is not forwarded to ``E.Start`` in this
        function -- confirm whether ``E.Start`` should receive it.

    The process exits with status 2 if positional arguments are given and
    with status 1 if ``CountGeneFeatures`` raises ``ValueError``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/predictions2disruptions.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome pattern.")

    parser.add_option("--start-codon-boundary", dest="start_codon_boundary", type="int",
                      help="maximum extension for start codon (make divisible by 3).")

    parser.add_option("--stop-codon-boundary", dest="stop_codon_boundary", type="int",
                      help="maximum extension for stop codon (make divisible by 3).")

    parser.set_defaults(
        genome_file="genome.fasta",
        stop_codons=("TAG", "TAA", "TGA"),
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if args:
        # same text as the former ``print USAGE, "no arguments required."``
        sys.stdout.write("%s no arguments required.\n" % USAGE)
        sys.exit(2)

    p = PredictionParser.PredictionParserEntry()

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    for line in sys.stdin:

        if line.startswith("#"):
            continue

        p.Read(line)

        genomic_sequence = fasta.getSequence(p.mSbjctToken, p.mSbjctStrand,
                                             p.mSbjctGenomeFrom, p.mSbjctGenomeTo)

        if options.loglevel >= 2:
            options.stdlog.write("# parsing alignment %s\n" % p.mAlignmentString)

        try:
            nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = \
                Genomics.CountGeneFeatures(0,
                                           p.mMapPeptide2Genome,
                                           genomic_sequence,
                                           border_stop_codon=0,
                                           stop_codons=options.stop_codons)
        except ValueError as msg:
            # the message names the error first and the offending line second;
            # the arguments used to be passed in the opposite order.
            options.stderr.write("# parsing error: %s in line %s\n" % (msg, line[:-1]))
            sys.exit(1)

        for (disruption_type,
             cds_pos_from, cds_pos_to,
             genome_pos_from, genome_pos_to) in disruptions:
            fields = (p.mPredictionId,
                      disruption_type,
                      cds_pos_from, cds_pos_to,
                      # positions are relative to the fetched segment
                      genome_pos_from + p.mSbjctGenomeFrom,
                      genome_pos_to + p.mSbjctGenomeFrom)
            options.stdout.write("\t".join(map(str, fields)) + "\n")

        options.stdout.flush()
def main(argv=None):
    """Convert alignments read from stdin into predictions.

    Parses the command line, opens the indexed genome and, for the
    ``predictions`` output format, parses the input (``psl`` or ``exons``
    format) into prediction entries.  Each entry is then re-translated
    against the genome, its gene features are counted and, if peptide
    sequences were supplied, the translation is compared to the reference
    peptide; mismatching entries are optionally re-aligned with exonerate
    and optionally dropped.  The surviving entries are written to
    ``options.stdout``.

    Only the ``predictions`` output format is handled in this function as
    visible here.

    Parameters
    ----------
    argv : list of str, optional
        Command line; defaults to ``sys.argv``.
        NOTE(review): *argv* is not forwarded to ``E.Start`` in this
        function -- confirm whether ``E.Start`` should receive it.

    Raises
    ------
    ValueError
        If no genome file is given or the input format is not supported
        for the chosen output format.  (These used to be string exceptions,
        which Python >= 2.6 rejects with a ``TypeError``.)
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/gff2predictions.py 2021 2008-07-10 16:00:48Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t", "--trans", dest="trans",
                      help="input is translated DNA.", action="store_true")

    parser.add_option("-f", "--format", dest="format",
                      help="input format.", type="choice",
                      choices=("exons", "psl", "gff"))

    parser.add_option("-o", "--output-format", dest="output_format",
                      help="output format", type="choice",
                      choices=('exontable', 'exons', 'predictions', 'cds', 'fasta'))

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option("--predictions-file", dest="predictions_file", type="string",
                      help="filename with predictions. Use gene structures from this file if available.")

    parser.add_option("-i", "--gff-field-id", dest="gff_field_id", type="string",
                      help="field for the feature id in the gff info section.")

    parser.add_option("-p", "--filename-peptides", dest="filename_peptides", type="string",
                      help="Filename with peptide sequences. If given, it is used to check the predicted translated sequences.")

    parser.add_option("--no-realignment", dest="do_realignment", action="store_false",
                      help="do not re-align entries that do not parse correctly.")

    parser.add_option("--remove-unaligned", dest="remove_unaligned", action="store_true",
                      help="remove entries that have not been aligned correctly.")

    parser.add_option("--input-coordinates", dest="input_coordinates", type="string",
                      help="specify input format for input coordinates [forward|both-zero|one-closed|open].")

    parser.set_defaults(trans=False,
                        output_format="predictions",
                        format="psl",
                        gff_field_id='id',
                        input_coordinates="both-zero-open",
                        filename_peptides=None,
                        genome_file=None,
                        do_realignment=True,
                        predictions_file=None,
                        remove_unaligned=False)

    (options, args) = E.Start(parser)

    if not options.genome_file:
        raise ValueError("please specify a genome file.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contig_sizes = fasta.getContigSizes()

    ninput, noutput, nskipped = 0, 0, 0
    nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0, 0, 0, 0, 0, 0

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            IOTools.openFile(options.filename_peptides, "r"))
        predictor = Predictor.PredictorExonerate()
        predictor.mLogLevel = 0
    else:
        peptide_sequences = None
        predictor = None

    converter = IndexedFasta.getConverter(options.input_coordinates)

    # previously computed predictions, keyed by prediction id
    predictions = {}
    if options.predictions_file:
        parser = PredictionParser.iterator_predictions(
            IOTools.openFile(options.predictions_file, "r"))
        for p in parser:
            predictions[p.mPredictionId] = p

    if options.output_format == "predictions":

        if options.format == "psl":

            if options.trans:
                parser = PredictionParser.PredictionParserBlatTrans()
            else:
                parser = PredictionParser.PredictionParserBlatCDNA()

            nmatches = 1
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                if not re.match("^[0-9]", line):
                    continue

                try:
                    entries = parser.Parse((line, ))
                except PredictionParser.AlignmentError as e:
                    sys.stdout.write("# %s\n" % str(e))
                    sys.stdout.write("# %s\n" % line[:-1])
                    sys.exit(1)

                for entry in entries:
                    entry.mPredictionId = nmatches
                    nmatches += 1

                sys.stdout.write(str(entries) + "\n")

        elif options.format == "exons":
            parser = PredictionParser.PredictionParserExons(
                contig_sizes=contig_sizes)
        else:
            raise ValueError("unknown format %s for output option %s" % (
                options.format, options.output_format))

        if options.loglevel >= 2:
            options.stdlog.write("# parsing.\n")
            options.stdlog.flush()

        # NOTE(review): in the psl branch stdin has already been consumed by
        # the loop above when this line is reached -- confirm this is intended.
        results = parser.Parse(sys.stdin.readlines())

        if options.loglevel >= 2:
            options.stdlog.write("# parsing finished.\n")
            options.stdlog.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# parsing: ninput=%i, noutput=%i, nerrors=%i\n" %
                (parser.GetNumInput(), parser.GetNumOutput(),
                 parser.GetNumErrors()))

            for error, msg in parser.mErrors:
                options.stdlog.write("# %s : %s\n" % (str(error), msg))
                options.stdlog.flush()

        # if genomes are given: build translation
        if options.genome_file:

            results.Sort(lambda x, y: cmp(x.mSbjctToken, y.mSbjctToken))

            new_results = PredictionParser.Predictions()

            for entry in results:

                ninput += 1

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing entry %s:%s on %s:%s %i/%i.\n" %
                        (entry.mPredictionId, entry.mQueryToken,
                         entry.mSbjctToken, entry.mSbjctStrand,
                         ninput, len(results)))
                    options.stdlog.flush()

                try:
                    lgenome = fasta.getLength(entry.mSbjctToken)
                    # added 3 residues - was a problem at split codons just before the stop.
                    # See for example the chicken sequence ENSGALP00000002741
                    genomic_sequence = fasta.getSequence(
                        entry.mSbjctToken,
                        entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom,
                        min(entry.mSbjctGenomeTo + 3, lgenome))
                except KeyError:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# did not find entry for %s on %s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken))
                    nskipped += 1
                    continue

                # prefer a gene structure from the predictions file
                if predictions and entry.mPredictionId in predictions:
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# substituting entry %s on %s:%s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken,
                             entry.mSbjctStrand))
                        options.stdlog.flush()
                    entry = predictions[entry.mPredictionId]

                exons = Exons.Alignment2Exons(entry.mMapPeptide2Genome,
                                              0,
                                              entry.mSbjctGenomeFrom)

                entry.mMapPeptide2Translation, entry.mTranslation = \
                    Genomics.Alignment2PeptideAlignment(
                        Genomics.String2Alignment(entry.mAlignmentString),
                        entry.mQueryFrom, 0, genomic_sequence)

                entry.score = entry.mMapPeptide2Translation.getColTo() - \
                    entry.mMapPeptide2Translation.getColFrom() + 1

                (entry.mNIntrons, entry.mNFrameShifts, entry.mNGaps,
                 entry.mNSplits, entry.mNStopCodons, entry.mNDisruptions) = \
                    Genomics.CountGeneFeatures(0,
                                               entry.mMapPeptide2Genome,
                                               genomic_sequence)

                if peptide_sequences:

                    if str(entry.mPredictionId) in peptide_sequences:

                        reference = peptide_sequences[
                            str(entry.mPredictionId)].upper()

                        translation = entry.mTranslation
                        nfound += 1

                        is_identical, nmismatches = checkIdentity(
                            reference, translation, options)

                        if is_identical:
                            nidentical += 1
                        else:
                            nmismatch += 1

                            if options.do_realignment:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches..realigning in region %i:%i\n" %
                                        (entry.mPredictionId,
                                         entry.mSbjctGenomeFrom,
                                         entry.mSbjctGenomeTo))
                                    options.stdlog.flush()

                                result = predictor(
                                    entry.mPredictionId, reference,
                                    entry.mSbjctToken, genomic_sequence,
                                    "--subopt FALSE --score '%s'" % str(80))
                                # "--exhaustive --subopt FALSE --score '%s'" % str(80) )

                                if result:
                                    translation = result[0].mTranslation
                                    is_identical, nmismatches = checkIdentity(
                                        reference, translation, options)
                                else:
                                    if options.loglevel >= 2:
                                        options.stdlog.write(
                                            "# %s: realignment returned empty result\n" %
                                            (entry.mPredictionId))
                                        options.stdlog.flush()
                                    is_identical = False

                                if is_identical:
                                    naligned += 1
                                    # the realigned entry is relative to the
                                    # fetched segment: keep id, shift start.
                                    prediction_id = entry.mPredictionId
                                    sbjct_genome_from = entry.mSbjctGenomeFrom
                                    entry = result[0]
                                    entry.mPredictionId = prediction_id
                                    entry.mSbjctGenomeFrom += sbjct_genome_from
                                else:
                                    nunaligned += 1
                                    if options.loglevel >= 1:
                                        options.stdlog.write(
                                            "# %s: mismatch on %s:%s:%i-%i after realignment\n# reference =%s\n# translated=%s\n# realigned =%s\n" %
                                            (entry.mPredictionId,
                                             entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo,
                                             reference,
                                             entry.mTranslation,
                                             translation))
                                        options.stdlog.flush()
                                    if options.remove_unaligned:
                                        nskipped += 1
                                        continue
                            else:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches on %s ... no realignment\n" % (
                                            entry.mPredictionId,
                                            entry.mSbjctToken,
                                        ))
                                    if options.loglevel >= 3:
                                        options.stdlog.write(
                                            "# %s: mismatch before realignment\n# reference =%s\n# translated=%s\n" %
                                            (entry.mPredictionId,
                                             reference, translation))
                                    options.stdlog.flush()

                                if options.remove_unaligned:
                                    nskipped += 1
                                    continue
                    else:
                        nnotfound += 1

                new_results.append(entry)
                noutput += 1

            results = new_results

        if results:
            options.stdout.write(str(results) + "\n")
def main(argv=None):
    """Collect genomic segments for query/sbjct assignments read from stdin.

    Parses the command line, optionally loads peptide sequences, cluster
    membership and previously processed entries, then reads assignments
    from ``sys.stdin`` in the format selected by ``--input-format``
    (``minimal``, ``ensembl``, ``graph``, ``exons`` or, by default, a
    prediction line).  For every accepted line the matched genomic range is
    widened by the configured margins and recorded in ``segments`` and in
    ``map_query2segments``.

    The part of the function visible here only builds these structures.

    Parameters
    ----------
    argv : list of str, optional
        Command line; defaults to ``sys.argv``.
        NOTE(review): *argv* is not forwarded to ``E.Start`` in this
        function -- confirm whether ``E.Start`` should receive it.

    The process exits with status 1 if positional arguments are given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/assignments2pairs.py 2011 2008-07-04 10:40:51Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--peptides", dest="filename_peptides", type="string",
                      help="")
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")
    parser.add_option("-s", "--suffix", dest="suffix", type="string",
                      help="")
    parser.add_option("-p", "--prefix", dest="prefix", type="string",
                      help="")
    parser.add_option("-a", "--output-pattern", dest="filename_output_pattern", type="string",
                      help="")
    parser.add_option("-f", "--format", dest="format", type="string",
                      help="")
    parser.add_option("-i", "--input-format", dest="input_format", type="string",
                      help="")
    parser.add_option("-u", "--clusters", dest="filename_clusters", type="string",
                      help="")
    parser.add_option("--filename-previous", dest="filename_previous", type="string",
                      help="")
    parser.add_option("-m", "--max-margin", dest="max_margin", type="int",
                      help="")
    parser.add_option("-n", "--min-margin", dest="min_margin", type="int",
                      help="")
    parser.add_option("-d", "--default-margin", dest="default_margin", type="int",
                      help="")
    parser.add_option("-r", "--max-region", dest="max_region_nr", type="int",
                      help="")
    parser.add_option("-c", "--chunk", dest="chunk_size", type="int",
                      help="")
    parser.add_option("-k", "--offset-key", dest="offset_key", action="store_true",
                      help="")
    parser.add_option("-t", "--conserve-strand", dest="conserve_strand", action="store_true",
                      help="")
    parser.add_option("-o", "--forward-coordinates", dest="forward_coordinates", action="store_true",
                      help="")
    parser.add_option("--no-sequence", dest="no_sequence", action="store_true",
                      help="")
    parser.add_option("--combine-exons", dest="combine_exons", action="store_true",
                      help="")

    parser.set_defaults(
        # pattern for genomes, %s is substituted for the sbjct_token
        genome_file="genome_%s.fasta",
        filename_peptides=None,
        # margin to add to genomic segments
        max_margin=0,
        min_margin=0,
        default_margin=0,
        offset_key=0,
        chunk_size=100,
        report_step=1000,
        # whether to combine exons
        combine_exons=False,
        # output format
        format="single_fasta",
        # prefix/suffix for output files
        suffix=".fasta",
        prefix="",
        filename_clusters=None,
        output=None,
        # conserve strand
        conserve_strand=None,
        # input format
        input_format=None,
        forward_coordinates=None,
        # maximum number of predictions per region (0=all)
        max_region_nr=0,
        filename_output_pattern=None,
        # do not write sequences into output
        no_sequence=None,
        # filename with previous results
        filename_previous=None,
    )

    (options, args) = E.Start(parser)

    if args:
        # same text as the former ``print USAGE, "no arguments required."``
        sys.stdout.write("%s no arguments required.\n" % USAGE)
        sys.exit(1)

    if not options.filename_output_pattern:
        options.filename_output_pattern = options.prefix + "%i" + options.suffix

    # read peptide sequences
    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            open(options.filename_peptides, "r"))
    else:
        peptide_sequences = {}

    if options.loglevel >= 1:
        sys.stdout.write("# read %i peptide sequences.\n" % len(peptide_sequences))
        sys.stdout.flush()

    # read clustering information
    if options.filename_clusters:
        # Note: if there are no alternative transcripts, map_rep2mem and
        # map_mem2rep will be empty. Thus add some dummy variables so that
        # filtering will work.
        map_rep2mem, map_mem2rep = Genomics.ReadMap(
            open(options.filename_clusters, "r"))
        map_rep2mem['dummy'] = ["dummy", ]
        map_mem2rep['dummy'] = "dummy"
    else:
        map_rep2mem, map_mem2rep = {}, {}

    if options.loglevel >= 1:
        sys.stdout.write("# read members: mem2rep=%i, rep2mem=%i\n" %
                         (len(map_mem2rep), len(map_rep2mem)))
        sys.stdout.flush()

    # genomic ranges already processed, keyed by query/sbjct/strand
    map_previous = {}

    # read previous data
    if options.filename_previous:

        entry = PredictionParser.PredictionParserEntry()

        # ``with`` closes the file; it used to be left open.
        with open(options.filename_previous, "r") as infile:
            for line in infile:
                if line[0] == "#":
                    continue

                if options.input_format == "graph":

                    data = line.split("\t")

                    (region_id, region_nr, region_max_nr,
                     sbjct_token, sbjct_strand,
                     region_from, region_to,
                     query_token, weight) = data[:9]

                    entry.Read("\t".join(data[9:]))

                    key = "%s_vs_%s_%s" % (query_token, sbjct_token, sbjct_strand)

                    map_previous.setdefault(key, []).append(
                        (entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))

        if options.loglevel >= 1:
            sys.stdout.write("# read %i old entries.\n" % (len(map_previous)))
            sys.stdout.flush()

    # variables for file numbering
    global_nchunks = 0
    global_chunk_size = options.chunk_size
    global_outfile = None

    # counters of pairs/regions
    npairs = 0
    nregions = 0
    nskipped = 0

    region_id = None
    region_nr = None
    region_max_nr = None
    last_region_id = None

    last_margin_sbjct_from, last_margin_sbjct_to = None, None

    segments = []
    map_query2segments = {}

    entry = PredictionParser.PredictionParserEntry()

    for line in sys.stdin:

        if line[0] == "#":
            continue

        try:
            if options.input_format == "minimal":
                (entry.mQueryToken, entry.mSbjctToken, entry.mSbjctStrand,
                 entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo) = line[:-1].split("\t")
                entry.mSbjctGenomeFrom = int(entry.mSbjctGenomeFrom)
                entry.mSbjctGenomeTo = int(entry.mSbjctGenomeTo)

            elif options.input_format == "ensembl":
                (dummy,
                 entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo,
                 entry.mSbjctStrand, entry.mSbjctToken,
                 entry.mQueryToken) = line[:-1].split("\t")

                if entry.mSbjctStrand == "1":
                    entry.mSbjctStrand = "+"
                else:
                    entry.mSbjctStrand = "-"

                entry.mSbjctGenomeFrom = int(entry.mSbjctGenomeFrom)
                entry.mSbjctGenomeTo = int(entry.mSbjctGenomeTo)

            elif options.input_format == "graph":
                data = line.split("\t")

                (region_id, region_nr, region_max_nr,
                 sbjct_token, sbjct_strand,
                 region_from, region_to,
                 query_token, weight) = data[:9]

                entry.Read("\t".join(data[9:]))

                if map_previous:
                    key = "%s_vs_%s_%s" % (query_token, sbjct_token, sbjct_strand)

                    if key in map_previous:
                        found = False
                        # check for overlap
                        for a, b in map_previous[key]:
                            if min(b, entry.mSbjctGenomeTo) - max(entry.mSbjctGenomeFrom, a) > 0:
                                found = True
                                break

                        if found:
                            nskipped += 1
                            continue

                region_nr, region_max_nr = int(region_nr), int(region_max_nr)

                if last_region_id != region_id:
                    nregions += 1
                    last_region_id = region_id

                if options.max_region_nr:
                    region_max_nr = min(region_max_nr, options.max_region_nr)
                    if region_nr > options.max_region_nr:
                        continue

            elif options.input_format == "exons":
                (entry.mQueryToken, entry.mSbjctToken, entry.mSbjctStrand,
                 phase, entry.mRank,
                 peptide_from, peptide_to,
                 entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo) = line[:-1].split("\t")
                entry.mSbjctGenomeFrom = int(entry.mSbjctGenomeFrom)
                entry.mSbjctGenomeTo = int(entry.mSbjctGenomeTo)
                entry.mRank = int(entry.mRank)

            else:
                entry.Read(line)

            # normalize numeric strand codes
            if entry.mSbjctStrand == "1":
                entry.mSbjctStrand = "+"
            if entry.mSbjctStrand == "-1":
                entry.mSbjctStrand = "-"

        except (ValueError, IndexError):
            # a tuple is required to catch both; ``except ValueError, IndexError``
            # only caught ValueError and rebound the name IndexError.
            sys.stdout.write("# Parsing error line: %s\n" % line[:-1])
            continue

        # increase margin with minimal range
        if options.min_margin:
            min_sbjct_from = max(0, entry.mSbjctGenomeFrom - options.min_margin)
            min_sbjct_to = entry.mSbjctGenomeTo + options.min_margin
        else:
            min_sbjct_from = entry.mSbjctGenomeFrom
            min_sbjct_to = entry.mSbjctGenomeTo

        margin_sbjct_from = min_sbjct_from
        margin_sbjct_to = min_sbjct_to

        # increase margin around putative gene region
        if options.default_margin >= 0:
            margin_sbjct_from = max(0, min_sbjct_from - options.default_margin)
            margin_sbjct_to = min_sbjct_to + options.default_margin
        else:
            # negative default margin: only extend the side on which the
            # query is not fully covered
            if entry.mQueryFrom > 0:
                margin_sbjct_from = max(0, min_sbjct_from - options.max_margin)
            if entry.mQueryTo < entry.mQueryLength:
                margin_sbjct_to = min_sbjct_to + options.max_margin

        segments.append([region_id, region_nr, region_max_nr,
                         min_sbjct_from, min_sbjct_to,
                         margin_sbjct_from, margin_sbjct_to,
                         entry.mQueryToken, entry.mSbjctToken, entry.mSbjctStrand])

        if entry.mQueryToken not in map_query2segments:
            map_query2segments[entry.mQueryToken] = []

        # the last field is the index of this segment in ``segments``
        map_query2segments[entry.mQueryToken].append(
            [entry.mSbjctToken, entry.mSbjctStrand,
             margin_sbjct_from, margin_sbjct_to,
             len(segments) - 1])
else: # array with final predictions old_predictions = [] if param_loglevel >= 1: print "# reading predictions." sys.stdout.flush() nread = 0 ninput = 0 for line in sys.stdin: if line[0] == "#": continue entry = PredictionParser.PredictionParserEntry(expand=0) entry.Read(line) nread += 1 # set prediction id if not entry.mPredictionId: entry.mPredictionId = nread # filter bad predictions right here in order to save memory: if entry.score < param_min_total_score: if param_loglevel >= 2: print "# PRUNING: reason: score below minimum: removing: %s" % str( entry) continue elif entry.mQueryCoverage < param_min_coverage_query: if param_loglevel >= 2:
for k in exons.keys(): ee = exons[k] id = 0 for e in ee: id += 1 print "\t".join( map(str, (e.mQueryToken, id, e.mPeptideFrom, e.mPeptideTo, e.frame, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, e.mGenomeFrom, e.mGenomeTo))) elif options.output_format == "exons": if options.format == "exons": parser = PredictionParser.PredictionParserExons( contig_sizes=contig_sizes) else: raise "unknown format %s." % options.format results = parser.Parse(sys.stdin.readlines()) id = 0 for entry in results: exons = Exons.Alignment2Exons( entry.mMapPeptide2Genome, entry.mQueryFrom, entry.mSbjctGenomeFrom, ) for e in exons: id += 1 print "\t".join(
def main(argv=None):
    """Filter predictions read from stdin and group them into genomic regions.

    Steps performed, in order:

    1. parse the command line and read the optional query filter,
       benchmarking regions/synonyms and peptide sequences;
    2. read predictions from ``sys.stdin``, dropping those below the
       score, query-coverage, percent-identity or length thresholds;
    3. optionally join regions (``JoinRegions``) and compute the best
       prediction per query (``GetBestPredictions``);
    4. read regions to be filtered out from the ``--filter-regions`` files;
    5. sort the predictions by sbjct token, strand and genomic coordinates
       and walk through them, discarding suboptimal predictions and those
       overlapping a filtered region, and handing each run of overlapping
       predictions to ``ProcessRegion``.

    With ``--conserve-memory`` the predictions are kept in temporary
    ``PredictionFile`` objects instead of lists; the temporary files are
    removed at the end.

    Parameters
    ----------
    argv : list of str, optional
        Command line; defaults to ``sys.argv``.
        NOTE(review): *argv* is not forwarded to ``E.Start`` in this
        function -- confirm whether ``E.Start`` should receive it.

    The process exits with status 2 if positional arguments are given and
    with status 1 if no prediction survives the input filters.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/regions2graph.py 2754 2009-09-04 16:50:22Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-b", "--benchmark", dest="filename_benchmark", type="string",
                      help="")
    # The destination has to be ``filename_benchmark_synonyms``: that is the
    # attribute read below.  It used to be ``benchmark_synonyms``, which is
    # overwritten with a dictionary further down, so the option had no effect.
    parser.add_option("-y", "--benchmark-synonyms", dest="filename_benchmark_synonyms", type="string",
                      help="")
    parser.add_option("-p", "--peptides", dest="filename_peptides", type="string",
                      help="")
    parser.add_option("-c", "--min-coverage-query", dest="min_coverage_query", type="float",
                      help="")
    parser.add_option("-s", "--min-score", dest="min_total_score", type="float",
                      help="")
    parser.add_option("-i", "--min-percent-identity", dest="min_percent_identity", type="float",
                      help="")
    parser.add_option("-o", "--max-percent-overlap", dest="max_percent_overlap", type="float",
                      help="")
    parser.add_option("--overlap-min-score", dest="overlap_min_score", type="float",
                      help="")
    parser.add_option("--overlap-min-coverage", dest="overlap_min_coverage", type="float",
                      help="")
    parser.add_option("--overlap-min-identity", dest="overlap_min_identity", type="float",
                      help="")
    parser.add_option("--overlap-max-coverage", dest="overlap_max_coverage", type="float",
                      help="")
    parser.add_option("-m", "--max-matches", dest="max_matches", type="int",
                      help="")
    parser.add_option("-j", "--join-regions", dest="join_regions", type="int",
                      help="")
    parser.add_option("--join-regions-max-regions", dest="join_regions_max_regions", type="int",
                      help="")
    parser.add_option("--join-regions-max-coverage", dest="join_regions_max_coverage", type="float",
                      help="")
    parser.add_option("--min-length", dest="min_length", type="int",
                      help="")
    parser.add_option("--test", dest="test", type="int",
                      help="")
    parser.add_option("--filter-queries", dest="filename_filter_queries", type="string",
                      help="")
    parser.add_option("--filter-regions", dest="filter_regions", type="string",
                      help="")
    parser.add_option("--conserve-memory", dest="conserve_memory", action="store_true",
                      help="")
    parser.add_option("--filter-suboptimal", dest="filter_suboptimal", action="store_true",
                      help="")

    parser.set_defaults(
        # overlap allowed for matches on genomic region
        max_percent_overlap=20,
        gop=-10.0,
        gep=-2.0,
        # thresholds for joining regions
        overlap_min_score=80,
        overlap_min_coverage=80,
        overlap_max_coverage=90,
        overlap_min_identity=50,
        # threshold for filtering bad predictions:
        # minimum score
        min_total_score=80,
        # joining regions
        join_regions=0,
        # maximum coverage of query for predictions to be joined
        # (This is to ensure not to join duplications. A range check
        # would be better, but runs into trouble with repeats).
        join_regions_max_coverage=90,
        # minimum coverage of query
        min_coverage_query=10,
        # conserve memory
        conserve_memory=0,
        # minimum percent identity
        min_percent_identity=0,
        # minimum length
        min_length=0,
        max_matches=0,
        filename_peptides=None,
        filename_filter_queries=None,
        # turn on/off various filters
        filter_suboptimal=False,
        filter_regions=False,
        # parameters for filter of suboptimal predictions
        min_relative_coverage=0.5,
        min_relative_score=0.5,
        min_relative_percent_identity=0.5,
        # minimum difference between non-correlated conflicts to keep them both.
        conflicts_min_difference=0.1,
        # benchmarking data
        benchmarks=None,
        benchmark_synonyms=None,
        filename_benchmark=None,
        filename_benchmark_synonyms=None,
        test=None,
        max_intron=50000)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if args:
        # same text as the former ``print USAGE, "no arguments required."``
        sys.stdout.write("%s no arguments required.\n" % USAGE)
        sys.exit(2)

    ##########################################################################
    # read filtering
    filter_queries = {}
    if options.filename_filter_queries:
        with open(options.filename_filter_queries, "r") as infile:
            for line in infile:
                if line[0] == "#":
                    continue
                query_token = line[:-1].split("\t")[0]
                filter_queries[query_token] = True

    if options.loglevel >= 1:
        options.stdlog.write("# filtering for %i queries.\n" %
                             len(filter_queries))

    ##########################################################################
    # read benchmarking regions
    if options.filename_benchmark:
        options.benchmarks = ReadBenchmarkingRegions(
            open(options.filename_benchmark, "r"))
        if options.loglevel >= 1:
            options.stdlog.write(
                "# read benchmarking regions for %i tokens\n" %
                len(options.benchmarks))
            sys.stdout.flush()

        options.benchmark_synonyms = {}
        if options.filename_benchmark_synonyms:
            # each line holds "<value>\t<key>"
            with open(options.filename_benchmark_synonyms, "r") as infile:
                for line in infile:
                    if line[0] == "#":
                        continue
                    value, key = line[:-1].split("\t")
                    options.benchmark_synonyms[key] = value
    else:
        options.benchmarks = {}
        options.benchmark_synonyms = {}

    ##########################################################################
    # read peptide sequences
    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            open(options.filename_peptides, "r"))
    else:
        peptide_sequences = {}

    if options.conserve_memory:
        # mkstemp is only used to reserve a file name
        fd, filename_old_predictions = tempfile.mkstemp()
        os.close(fd)
        old_predictions = PredictionFile.PredictionFile()
        old_predictions.open(filename_old_predictions, "w")
    else:
        # array with final predictions
        old_predictions = []

    if options.loglevel >= 1:
        options.stdlog.write("# reading predictions.\n")
        sys.stdout.flush()

    nread = 0
    ninput = 0
    for line in sys.stdin:

        if line[0] == "#":
            continue

        entry = PredictionParser.PredictionParserEntry(expand=0)
        entry.Read(line)
        nread += 1

        # set prediction id
        if not entry.mPredictionId:
            entry.mPredictionId = nread

        # filter bad predictions right here in order to save memory:
        if entry.score < options.min_total_score:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: score below minimum: removing: %s\n" %
                    str(entry))
            continue
        elif entry.mQueryCoverage < options.min_coverage_query:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: coverage below minimum: removing: %s\n" %
                    str(entry))
            continue
        elif entry.mPercentIdentity < options.min_percent_identity:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: percent identity below minimum: removing: %s\n" %
                    str(entry))
            continue
        elif entry.mSbjctTo - entry.mSbjctFrom < options.min_length:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: length of transcript below minimum: removing: %s\n" %
                    str(entry))
            continue

        ninput += 1

        if options.test and ninput > options.test:
            break

        old_predictions.append(entry)

    if options.loglevel >= 1:
        options.stdlog.write("# predictions after input: %i\n" % ninput)
        sys.stdout.flush()

    if options.loglevel >= 10:
        options.stdlog.write(
            "############## start: predictions after input ###################################\n"
        )
        for x in old_predictions:
            options.stdlog.write("# %s\n" % str(x))
        options.stdlog.write(
            "############## end: predictions after input #####################################\n"
        )
        sys.stdout.flush()

    if ninput == 0:
        options.stdlog.write("# ERROR: no predictions\n")
        sys.exit(1)

    ##########################################################################
    # set up stacks of regions
    if options.conserve_memory:
        old_predictions.close()
        old_predictions.open(mode="r")

        fd, filename_removed_predictions = tempfile.mkstemp()
        os.close(fd)
        removed_predictions = PredictionFile.PredictionFile()
        removed_predictions.open(filename_removed_predictions, "w")

        fd, filename_new_predictions = tempfile.mkstemp()
        os.close(fd)
        new_predictions = PredictionFile.PredictionFile()
        new_predictions.open(filename_new_predictions, "w")
    else:
        removed_predictions = []
        new_predictions = []

    if options.benchmarks:
        EvaluateBenchmark(old_predictions)

    ##########################################################################
    # join regions
    if options.join_regions and options.join_regions_max_coverage:
        if options.loglevel >= 1:
            options.stdlog.write(
                "# joining regions: maximum distance between segments = %i and maximum query coverage = %i\n" %
                (options.join_regions, options.join_regions_max_coverage))
            sys.stdout.flush()

        njoined = JoinRegions(old_predictions, new_predictions)

        if options.conserve_memory:
            ExchangeStreams(old_predictions, new_predictions)
        else:
            old_predictions = new_predictions
            new_predictions = []

        if options.loglevel >= 1:
            options.stdlog.write("# predictions after joining: %i\n" % njoined)
            sys.stdout.flush()

        if options.loglevel >= 10:
            options.stdlog.write(
                "############## start: predictions after joining ###################################\n"
            )
            for x in old_predictions:
                # one entry per line, as in the dump after input; the
                # newline used to be missing here.
                options.stdlog.write("# %s\n" % str(x))
            options.stdlog.write(
                "############## end: predictions after joining #####################################\n"
            )
            sys.stdout.flush()
    else:
        if options.loglevel >= 1:
            options.stdlog.write("# joining regions: skipped\n")
            sys.stdout.flush()
        njoined = ninput

    ##########################################################################
    # build map of best predictions
    if options.filter_suboptimal:
        if options.loglevel >= 1:
            options.stdlog.write("# calculating best predictions\n")
            sys.stdout.flush()
        best_predictions = GetBestPredictions(old_predictions)
    else:
        best_predictions = {}

    if options.loglevel >= 1:
        options.stdlog.write("# calculated best predictions: %i\n" %
                             len(best_predictions))
        sys.stdout.flush()

    ##########################################################################
    # get regions to eliminate
    # maps "<sbjct_token>-<strand>" to a sorted list of (from, to) exons
    filter_regions = {}
    if options.filter_regions:

        entry = PredictionParser.PredictionParserEntry(expand=0)

        filenames = options.filter_regions.split(",")

        for filename in filenames:
            if options.loglevel >= 1:
                options.stdlog.write("# reading regions to filter from %s.\n" %
                                     (filename))
                sys.stdout.flush()

            if filename.endswith(".gz"):
                infile = gzip.open(filename, "r")
            else:
                infile = open(filename, "r")

            for line in infile:
                if line[0] == "#":
                    continue

                entry.Read(line)

                exons = Exons.Alignment2Exons(
                    Genomics.String2Alignment(entry.mAlignmentString),
                    entry.mQueryFrom, entry.mSbjctGenomeFrom)

                key = "%s-%s" % (entry.mSbjctToken, entry.mSbjctStrand)

                if key not in filter_regions:
                    filter_regions[key] = []

                for exon in exons:
                    filter_regions[key].append(
                        (exon.mGenomeFrom, exon.mGenomeTo))

            infile.close()

        for regions in filter_regions.values():
            regions.sort()

    ##########################################################################
    # bipartite graph construction
    ##########################################################################
    # sort predictions by genomic region
    if options.conserve_memory:
        old_predictions.sort(('mSbjctToken', 'mSbjctStrand',
                              'mSbjctGenomeFrom', 'mSbjctGenomeTo'))
    else:
        # key-based sort gives the same order as the former cmp() on the
        # same tuples and does not depend on the Python-2-only cmp().
        old_predictions.sort(key=lambda x: (x.mSbjctToken, x.mSbjctStrand,
                                            x.mSbjctGenomeFrom, x.mSbjctGenomeTo))

    ##########################################################################
    # filter predictions and resolve conflicts based on genomic overlap
    # deleted segments are put in a temporary storage space.
    min_from, max_from = None, None
    min_to, max_to = None, None
    region_id = 0
    noverlaps = 0
    last_prediction = None
    predictions = []
    region = Region()
    nclusters = 0
    neliminated_suboptimal = 0
    neliminated_overlap = 0
    noutput, nfiltered = 0, 0

    for this_prediction in old_predictions:

        # Filter 1: skip suboptimal predictions
        if this_prediction.mQueryToken in best_predictions:

            best_prediction = best_predictions[this_prediction.mQueryToken]

            # counted as eliminated up front; undone below if it survives
            neliminated_suboptimal += 1
            if float(this_prediction.mQueryCoverage) / best_prediction.mQueryCoverage < options.min_relative_coverage:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: coverage below best: removing %s\n" %
                        str(this_prediction))
                continue

            if float(this_prediction.score) / best_prediction.score < options.min_relative_score:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: score below best: removing %s\n" %
                        str(this_prediction))
                continue

            if float(this_prediction.mPercentIdentity) / best_prediction.mPercentIdentity < options.min_relative_percent_identity:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: percent identity below best: removing %s\n" %
                        str(this_prediction))
                continue

            neliminated_suboptimal -= 1

        # Filter 2: remove predictions overlapping with certain segments
        key = "%s-%s" % (this_prediction.mSbjctToken,
                         this_prediction.mSbjctStrand)

        if key in filter_regions:

            exons = Exons.Alignment2Exons(
                Genomics.String2Alignment(this_prediction.mAlignmentString),
                this_prediction.mQueryFrom,
                this_prediction.mSbjctGenomeFrom)

            if CheckOverlap([(x.mGenomeFrom, x.mGenomeTo) for x in exons],
                            filter_regions[key]):
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: overlapping with taboo region: removing %s\n" %
                        str(this_prediction))
                neliminated_overlap += 1
                continue

        # NOTE(review): the fields split off here are not used further down
        # in this function -- confirm whether this block is still needed.
        try:
            this_query_peptide, this_query_status, this_query_gene, this_query_transcript = \
                re.split(r"\s+", this_prediction.mQueryToken)
        except ValueError:
            this_query_gene = None

        # process first entry
        if min_from is None:
            min_from = this_prediction.mSbjctGenomeFrom
            max_from = this_prediction.mSbjctGenomeFrom
            max_to = this_prediction.mSbjctGenomeTo
            min_to = this_prediction.mSbjctGenomeTo
            predictions.append(this_prediction)
            last_prediction = this_prediction
            continue

        overlap = min_to > this_prediction.mSbjctGenomeFrom and \
            last_prediction.mSbjctToken == this_prediction.mSbjctToken and \
            last_prediction.mSbjctStrand == this_prediction.mSbjctStrand

        if options.loglevel >= 4:
            options.stdlog.write("# from=%i, to=%i, working on: %s\n" %
                                 (min_from, max_to, str(this_prediction)))
            options.stdlog.flush()

        # resolve overlap between different genes
        if overlap:
            noverlaps += 1
        else:
            # no overlap: the collected predictions form a finished region
            region.mSbjctToken = last_prediction.mSbjctToken
            region.mSbjctStrand = last_prediction.mSbjctStrand
            region.mSbjctGenomeFrom = min_from
            region.mSbjctGenomeTo = max_to

            region_id, nxoutput, nxfiltered = ProcessRegion(
                predictions, region_id, region,
                peptide_sequences, filter_queries)

            noutput += nxoutput
            nfiltered += nxfiltered
            nclusters += 1
            predictions = []

            min_from = this_prediction.mSbjctGenomeFrom
            max_from = this_prediction.mSbjctGenomeFrom
            min_to = this_prediction.mSbjctGenomeTo
            max_to = this_prediction.mSbjctGenomeTo

        predictions.append(this_prediction)

        min_from = min(min_from, this_prediction.mSbjctGenomeFrom)
        max_from = max(max_from, this_prediction.mSbjctGenomeFrom)
        min_to = min(min_to, this_prediction.mSbjctGenomeTo)
        max_to = max(max_to, this_prediction.mSbjctGenomeTo)

        last_prediction = this_prediction

    # flush the last open region
    if last_prediction:
        region.mSbjctToken = last_prediction.mSbjctToken
        region.mSbjctStrand = last_prediction.mSbjctStrand
        region.mSbjctGenomeFrom = min_from
        region.mSbjctGenomeTo = max_to

        region_id, nxoutput, nxfiltered = ProcessRegion(
            predictions, region_id, region,
            peptide_sequences, filter_queries)
        noutput += nxoutput
        nfiltered += nxfiltered
        nclusters += 1

    if options.conserve_memory:
        os.remove(filename_old_predictions)
        os.remove(filename_new_predictions)
        os.remove(filename_removed_predictions)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# pairs: nread=%i, input=%i, joined=%i, clusters=%i, regions=%i, eliminated_subopt=%i, eliminated_overlap=%i, noutput=%i, nfiltered=%i\n" %
            (nread, ninput, njoined, nclusters, region_id,
             neliminated_suboptimal, neliminated_overlap,
             noutput, nfiltered))

    E.Stop()
def __init__(self):
    """Initialise the base predictor and configure it to run genewise."""
    Predictor.__init__(self)

    # external program and the command line switches handed to it
    self.mExecutable = "genewise"
    self.mOptions = "-pseudo -init endbias"
    self.mOutputOptions = "-quiet -sum -gff -trans -pep -alb"

    # prediction parser (genewise flavour) used for this predictor
    self.mParser = PredictionParser.PredictionParserGenewise()
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Predictions are read line by line from ``options.stdin`` (lines
    starting with ``#`` are skipped). For every prediction the function

    1. extends the prediction upstream to a start codon and downstream to
       a stop codon, searching at most ``start_codon_boundary`` /
       ``stop_codon_boundary`` positions beyond the current borders,
    2. optionally merges exons across short introns whose length is a
       multiple of three and which contain no more than
       ``introns_max_stops`` stops/gaps (``--fill-introns``),
    3. rebuilds the translation and writes the prediction in the
       selected ``--output-format`` to ``options.stdout``.

    Summary statistics are reported through ``E.info`` at the end.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/predictions2transcripts.py 1841 2008-05-08 12:07:13Z andreas $",
                             usage = globals()["__doc__"] )

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome." )

    parser.add_option("-o", "--output-filename-summary", dest="output_filename_summary", type="string",
                      help="filename with summary information." )

    parser.add_option( "--skip-header", dest="skip_header", action="store_true",
                       help="skip header." )

    parser.add_option( "--start-codon-boundary", dest="start_codon_boundary", type="int",
                       help="maximum extension for start codon (make divisible by 3)." )

    parser.add_option( "--stop-codon-boundary", dest="stop_codon_boundary", type="int",
                       help="maximum extension for stop codon (make divisible by 3)." )

    parser.add_option( "--left-extension-mode", dest="left_extension_mode", type="choice",
                       choices=("first-start", "first-stop-backtrack"),
                       help="extension mode for 5' end.")

    parser.add_option( "--fill-introns", dest="fill_introns", type="int",
                       help="fill intron if divisible by three and no stop codon up to a maximum length of #." )

    parser.add_option( "--introns-max-stops", dest="introns_max_stops", type="int",
                       help="maximum number of stop codons to tolerate within an intron." )

    parser.add_option( "--output-format", dest="output_format", type="choice",
                       choices=("predictions", "extensions", "filled-introns"),
                       help="output format." )

    parser.set_defaults(
        genome_file = "genome",
        # must be a tuple: with a plain string, ``codon in start_codons``
        # is a substring test and also accepts "", "AT", "TG", ...
        start_codons = ("ATG",),
        stop_codons = ("TAG", "TAA", "TGA"),
        start_codon_boundary = 9999,
        stop_codon_boundary = 9999,
        fill_introns = 0,
        introns_max_stops = 0,
        left_splice_signals = ("GT",),
        right_splice_signals = ("AG",),
        output_format="extensions",
        left_extension_mode = "first-start",
        skip_header = False,
        output_filename_summary = None,
        )

    (options, args) = E.Start( parser, add_pipe_options = True )

    if len(args) > 0:
        sys.stdout.write( "%s no arguments required.\n" % USAGE )
        sys.exit(2)

    # floor division keeps the integer semantics independent of the
    # interpreter's "/" behaviour
    options.start_codon_boundary = int(options.start_codon_boundary // 3)
    options.stop_codon_boundary = int(options.stop_codon_boundary // 3)

    fasta = IndexedFasta.IndexedFasta( options.genome_file )

    p = PredictionParser.PredictionParserEntry()

    ninput, noutput = 0, 0
    nfilled = 0
    nseqs_filled = 0
    nseqs_extended = 0
    left_extensions = []
    right_extensions = []
    filled_introns = []

    if not options.skip_header:
        if options.output_format == "predictions":
            options.stdout.write( Prediction.Prediction().getHeader() + "\n" )
        elif options.output_format == "filled-introns":
            options.stdout.write("\t".join( ("prediction_id",
                                             "intron",
                                             "peptide_sequence",
                                             "genomic_sequence") ) + "\n" )

    if options.output_filename_summary:
        outfile_summary = open (options.output_filename_summary, "w" )
        outfile_summary.write( "id\ttype\tnumber\tlength\tfrom\tto\tsequence\n" )
    else:
        outfile_summary = None

    for line in options.stdin:

        if line[0] == "#":
            continue

        ninput += 1
        p.Read(line)

        lsequence = fasta.getLength( p.mSbjctToken )

        genome_from = max( 0, p.mSbjctGenomeFrom - options.start_codon_boundary)
        genome_to = min( lsequence, p.mSbjctGenomeTo + options.stop_codon_boundary)

        genomic_sequence = fasta.getSequence( p.mSbjctToken, p.mSbjctStrand,
                                              genome_from,
                                              genome_to ).upper()

        # per-prediction defaults; without them the "extensions" output
        # below would hit undefined names (or values of the previous
        # prediction) when no extension is attempted.
        found_start, found_stop = False, 0
        dstart, dstop = 0, 0

        ########################################################################
        ## Do extensions
        if options.start_codon_boundary or options.stop_codon_boundary:

            # offset of the prediction within the extended fragment
            extension_start = p.mSbjctGenomeFrom - genome_from

            fragment_to = extension_start + p.mSbjctGenomeTo - p.mSbjctGenomeFrom

            lfragment = len(genomic_sequence)

            ####################################################################
            ## find start codon
            start = extension_start
            found_start = False
            if options.left_extension_mode == "first-start":

                found_start, start = findCodonReverse( genomic_sequence,
                                                       start,
                                                       options.start_codons,
                                                       options.stop_codons )

            elif options.left_extension_mode == "first-stop-backtrack":

                if genomic_sequence[start:start+3] in options.start_codons:
                    found_start = True
                else:
                    found_start, start = findCodonReverse( genomic_sequence,
                                                           start,
                                                           options.stop_codons )

                    if found_start:
                        E.info("prediction %s: stop found at %i (%i) backtracking ..." % ( p.mPredictionId, start, extension_start - start) )

                        ## bracktrack to first start codon
                        found_start = False
                        while start < extension_start:
                            start += 3
                            if genomic_sequence[start:start+3] in options.start_codons:
                                found_start = True
                                break
                        else:
                            start = extension_start

                        if found_start:
                            E.info("start codon found at %i (%i)." % ( start, extension_start - start) )
                        else:
                            E.info("no start codon found." )
                    else:
                        E.info("prediction %s: no stop found ... backtracking to start codon." % ( p.mPredictionId ) )

                        found_start, start = findCodonReverse( genomic_sequence,
                                                               start,
                                                               options.start_codons )

                        # only report a failure if the search actually failed
                        if not found_start:
                            E.info("prediction %s: no start codon found." % ( p.mPredictionId ) )

            if found_start:
                start += genome_from
            else:
                start = p.mSbjctGenomeFrom

            dstart = p.mSbjctGenomeFrom - start

            ####################################################################
            ## find stop codon
            ## stop points to the beginning of the codon, thus the stop codon will
            ## not be part of the sequence.
            stop = fragment_to
            found_stop = 0
            while stop < lfragment and \
                    genomic_sequence[stop:stop+3] not in ("NNN", "XXX"):
                if genomic_sequence[stop:stop+3] in options.stop_codons:
                    found_stop = 1
                    break

                stop += 3

            if found_stop:
                stop += genome_from
            else:
                stop = p.mSbjctGenomeTo

            dstop = stop - p.mSbjctGenomeTo

            ####################################################################
            ## build new prediction
            map_peptide2genome = []
            if dstart:
                map_peptide2genome.append( ("G", 0, dstart) )
            map_peptide2genome += p.mMapPeptide2Genome
            if dstop:
                map_peptide2genome.append( ("G", 0, dstop) )

            E.info("prediction %s: extension: found_start=%i, found_stop=%i, left=%i, right=%i" % ( p.mPredictionId, found_start, found_stop, dstart, dstop ) )

            ## save results
            p.mMapPeptide2Genome = map_peptide2genome
            p.mAlignmentString = Genomics.Alignment2String( map_peptide2genome )
            p.mSbjctGenomeFrom -= dstart
            p.mSbjctGenomeTo += dstop
            p.mSbjctFrom += dstart // 3
            p.mSbjctTo += dstart // 3 + dstop // 3

            if dstart or dstop:
                if dstart:
                    left_extensions.append( dstart )
                if dstop:
                    right_extensions.append( dstop )

                nseqs_extended += 1

        ## update genomic sequence because borders might have changed.
        genomic_sequence = fasta.getSequence( p.mSbjctToken,
                                              p.mSbjctStrand,
                                              p.mSbjctGenomeFrom,
                                              p.mSbjctGenomeTo ).upper()

        if options.fill_introns:

            has_filled = False

            exons = Exons.Alignment2Exons( p.mMapPeptide2Genome,
                                           query_from = 0,
                                           sbjct_from = 0 )

            new_exons = []

            last_e = exons[0]

            nintron = 0

            for e in exons[1:]:

                nintron += 1
                lintron = e.mGenomeFrom - last_e.mGenomeTo

                if lintron > options.fill_introns or (lintron) % 3 != 0:
                    E.debug( "prediction %s: intron %i of size %i discarded." % \
                                 (p.mPredictionId,
                                  nintron, lintron ) )

                    new_exons.append(last_e)
                    last_e = e
                    continue

                ## get sequence, include also residues from split codons
                ## when checking for stop codons.
                if e.mAlignment[0][0] == "S":
                    offset_left = last_e.mAlignment[-1][2]
                    offset_right = e.mAlignment[0][2]
                else:
                    offset_left, offset_right = 0, 0

                sequence = genomic_sequence[last_e.mGenomeTo - offset_left:e.mGenomeFrom+offset_right]

                ## check for splice sites
                for signal in options.left_splice_signals:
                    if sequence[offset_left:offset_left+len(signal)] == signal:
                        left_signal = True
                        break
                else:
                    left_signal = False

                # use explicit positive indices: a slice ending at
                # "-offset_right" is empty whenever offset_right is 0.
                signal_end = len(sequence) - offset_right
                for signal in options.right_splice_signals:
                    if sequence[max(0, signal_end - len(signal)):signal_end] == signal:
                        right_signal = True
                        break
                else:
                    right_signal = False

                nstops, ngaps = 0, 0
                for codon in [ sequence[x:x+3] for x in range(0,len(sequence),3) ]:
                    if codon in options.stop_codons:
                        nstops += 1
                    if "N" in codon.upper():
                        ngaps += 1

                E.debug( "prediction %s: intron %i of size %i (%i-%i) (%s:%s:%i:%i): stops=%i, gaps=%i, signals=%s,%s." % \
                             (p.mPredictionId,
                              nintron, lintron,
                              offset_left, offset_right,
                              p.mSbjctToken, p.mSbjctStrand,
                              p.mSbjctGenomeFrom + last_e.mGenomeTo,
                              p.mSbjctGenomeFrom + e.mGenomeFrom,
                              nstops,
                              ngaps,
                              left_signal, right_signal ) )

                if nstops + ngaps > options.introns_max_stops:
                    new_exons.append(last_e)
                    last_e = e
                    continue

                E.info( "prediction %s: filling intron %i of size %i: stops=%i, gaps=%i, signals=%s,%s" % \
                            (p.mPredictionId,
                             nintron, lintron,
                             nstops,
                             ngaps,
                             left_signal, right_signal))

                e.Merge( last_e )
                has_filled = True
                nfilled += 1
                last_e = e

                if options.output_format == "filled-introns":
                    options.stdout.write( "\t".join( map(str, ( p.mPredictionId,
                                                                nintron,
                                                                Genomics.TranslateDNA2Protein( sequence ),
                                                                sequence ) ) ) + "\n" )

                filled_introns.append(lintron)
                p.mNIntrons -= 1

            new_exons.append(last_e)

            if has_filled:
                nseqs_filled += 1

            Exons.UpdatePeptideCoordinates( new_exons )

            p.mMapPeptide2Genome = Exons.Exons2Alignment( new_exons )
            p.mAlignmentString = Genomics.Alignment2String( p.mMapPeptide2Genome )

        ## build translated sequence
        p.mMapPeptide2Translation, p.mTranslation = Genomics.Alignment2PeptideAlignment( \
            p.mMapPeptide2Genome, p.mQueryFrom, 0, genomic_sequence )

        ## output info
        if options.output_format == "predictions":
            options.stdout.write( str(p) + "\n" )
        elif options.output_format == "extensions":
            if found_start:
                found_start = 1
            if found_stop:
                found_stop = 1
            options.stdout.write( "\t".join( map(str, ( p.mPredictionId,
                                                        found_start, found_stop,
                                                        dstart, dstop,
                                                        p.mTranslation,
                                                        p.mSbjctGenomeFrom, p.mSbjctGenomeTo,
                                                        p.mAlignmentString ))) + "\n" )

        noutput += 1
        options.stdout.flush()

    E.info("stats : %s" % "\t".join(Stats.DistributionalParameters().getHeaders() ))
    E.info("left : %s" % str(Stats.DistributionalParameters(left_extensions)) )
    E.info("right : %s" % str(Stats.DistributionalParameters(right_extensions)) )
    E.info("introns: %s" % str(Stats.DistributionalParameters(filled_introns)) )
    E.info("ninput=%i, noutput=%i, nextended=%i, nfilled=%i, nexons_filled=%i" % (\
            ninput, noutput, nseqs_extended, nseqs_filled, nfilled))

    # the summary file was opened above; release the handle explicitly
    if outfile_summary is not None:
        outfile_summary.close()

    E.Stop()
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Predictions are read from ``sys.stdin`` and written to
    ``options.stdout`` after

    * dropping predictions whose query is not part of the (new) peptide
      set / query map,
    * renaming the subject token via ``--pattern-old``/``--pattern-new``,
    * mapping the genomic coordinates, either through pairwise
      alignments or through segment offsets (``--filename-map``),
    * optionally checking the translation against ``--genome-file`` and
      re-predicting entries whose translation has changed.

    Counters and the lists of missed transcripts/genes are reported at
    the end.
    """

    if argv is None:
        argv = sys.argv

    # NOTE(review): ``parser`` is not created in this function; it is
    # presumably a module-level option parser -- confirm.
    parser.add_option("-m", "--filename-map", dest="filename_map", type="string",
                      help="filename with mapping information.")
    parser.add_option("-o", "--pattern-old", dest="pattern_old", type="string",
                      help="pattern for mapping new to old identifiers: extract string from old.")
    parser.add_option("-n", "--pattern-new", dest="pattern_new", type="string",
                      help="pattern for mapping new to old identifiers: put string into new.")
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="genome_file.")
    parser.add_option("-p", "--peptides", dest="filename_peptides", type = "string",
                      help="filename with peptide sequences.")
    parser.add_option("-f", "--input-format", dest="input_format", type="choice",
                      help="format of mapping file", choices=("alignment", "offsets") )
    parser.add_option("-i", "--write-missed", dest="write_missed", type="string",
                      help="write missed identifiers to separate file.")
    parser.add_option("-a", "--filename-genes", dest="filename_genes", type="string",
                      help="filename with gene information.")
    parser.add_option("--filename-old-peptides", dest="filename_old_peptides", type="string",
                      help="filename with old peptide information.")
    parser.add_option("--no-renumber", dest="renumber", action="store_false",
                      help="do not renumber predictions.")
    parser.add_option("--contig-sizes-old", dest="contig_sizes_old", type="string",
                      help="contig sizes for old data.")
    parser.add_option("--contig-sizes-new", dest="contig_sizes_new", type="string",
                      help="contig sizes for new data.")
    parser.add_option("--skip-errors", dest="skip_errors", action="store_true",
                      help="skip entries with errors.")

    parser.set_defaults(
        filename_map = None,
        pattern_old = "(.+)",
        pattern_new = "%s",
        genome_file = None,
        filename_peptides = None,
        write_missed = None,
        filename_genes = None,
        filename_old_peptides = None,
        renumber = True,
        input_format = "alignment",
        contig_sizes_old = None,
        contig_sizes_new = None,
        skip_errors = None )

    (options, args) = E.Start( parser, add_pipe_options = True)

    predictor = PredictorExonerate()

    ## the different mapping criteria
    map_sbjcts = {}
    breakpoints = {}
    endpoints = {}
    offsets = {}

    ################################################################################################
    map_transcript2gene = {}
    if options.filename_genes:
        infile = open(options.filename_genes, "r")
        try:
            for line in infile:
                if line[0] == "#":
                    continue
                gene, transcript = line[:-1].split("\t")[:2]
                map_transcript2gene[transcript] = gene
        finally:
            infile.close()

    ################################################################################################
    peptides = {}
    if options.filename_peptides:
        peptides = Genomics.ReadPeptideSequences( open(options.filename_peptides, "r"))
        options.stdlog.write( "# read %i peptide sequences.\n" % len(peptides))

    ################################################################################################
    ## read old query sequences and compare against new query sequences
    ## this can be used to build a map between old and new queries
    query_map_old2new = {}
    # defined up-front: ``unmapped`` is consulted when writing missed
    # identifiers even if no old peptides were supplied.
    unmappable, unmapped = [], []
    if options.filename_old_peptides:
        old_peptides = Genomics.ReadPeptideSequences( open(options.filename_old_peptides, "r"))
        options.stdlog.write( "# read %i old peptide sequences.\n" % len(old_peptides))
        query_map_old2new, unmappable, unmapped = Genomics.MapSequences( old_peptides, peptides)
        options.stdlog.write( "# built map: unmappable=%i unmapped=%i.\n" % (len(unmappable), len(unmapped)))
        if options.loglevel >= 2:
            options.stdlog.write( "# unmappable: %s.\n" % ";".join(unmappable))
            options.stdlog.write( "# unmapped: %s.\n" % ";".join(unmapped))

    ################################################################################################
    ## read old/new contig sizes for mapping positive/negative coordinates
    contig_sizes_old = {}
    contig_sizes_new = {}
    if options.contig_sizes_old:
        contig_sizes_old = Genomics.ReadContigSizes( open(options.contig_sizes_old, "r") )
    if options.contig_sizes_new:
        contig_sizes_new = Genomics.ReadContigSizes( open(options.contig_sizes_new, "r") )

    ################################################################################################
    if options.filename_map:

        infile = open(options.filename_map)
        try:
            # the option only admits "alignment"; the plural spelling is
            # accepted as well so that no caller setting it directly breaks.
            if options.input_format in ("alignment", "alignments"):
                for line in infile:
                    if line[0] == "#":
                        continue

                    x, old_token, old_from, old_to, old_ali, new_from, new_to, new_ali = line[:-1].split("\t")
                    map_sbjcts[old_token] = (old_from, old_ali, new_from, new_ali)

                if options.loglevel >= 1:
                    options.stdlog.write( "# read %i alignments.\n" % len(map_sbjcts))

            elif options.input_format == "offsets":
                ## input is a list of segments and their offsets.
                breakpoints, endpoints, offsets = ReadOffsets( infile )
                if options.loglevel >= 1:
                    options.stdlog.write( "# read breakpoints for %i chromosomes.\n" % len(breakpoints))
        finally:
            infile.close()

    ################################################################################################
    ## end of input section
    ################################################################################################

    rx = re.compile(options.pattern_old)
    last_sbjct_token = None
    ninput = 0
    nerrors = 0
    nerrors_map = 0
    nerrors_inconsistencies = 0
    nerrors_boundaries = 0
    nerrors_translation = 0
    nerrors_inconsequential = 0
    nerrors_realigned = 0
    nmapped = 0
    nfiltered = 0
    naligned = 0
    noutput = 0
    found_transcripts = {}
    nduplicates = 0
    output = {}

    for line in sys.stdin:
        if line[0] == "#":
            continue

        entry = PredictionParser.PredictionParserEntry()
        entry.Read( line )

        ninput += 1
        is_positive = entry.mSbjctStrand == "+"
        is_error = False

        ## check if query token is mappable: using sequence map
        if (query_map_old2new and entry.mQueryToken not in query_map_old2new):
            options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (entry.mPredictionId, entry.mQueryToken) )
            nfiltered += 1
            continue
        ## check if query token is mappable: using filter
        elif (peptides and entry.mQueryToken not in peptides):
            options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (entry.mPredictionId, entry.mQueryToken) )
            nfiltered += 1
            continue

        new_sbjct_token = options.pattern_new % rx.search(entry.mSbjctToken).groups()[0]

        # coordinates used in diagnostics and for re-prediction; start
        # from the unmapped values so that entries taking neither mapping
        # route below never see undefined or stale numbers.
        old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo
        f, t = old_f, old_t
        new_f, new_t = old_f, old_t

        ##########################################################################################################
        ## Map via alignments
        if entry.mSbjctToken in map_sbjcts:
            nmapped += 1
            if last_sbjct_token != entry.mSbjctToken:
                old_from, old_ali, new_from, new_ali = map_sbjcts[entry.mSbjctToken]
                map_a2b = alignlib_lite.makeAlignmentVector()
                alignlib_lite.AlignmentFormatExplicit(
                    int(old_from), old_ali,
                    int(new_from), new_ali).copy( map_a2b )

            last_sbjct_token = entry.mSbjctToken

            if options.loglevel >= 3:
                sys.stdout.write( "# %s\n" % str(entry) )
                sys.stdout.write( "# %s\n" % str(map_sbjcts[entry.mSbjctToken]) )
                sys.stdout.flush()

            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            ## convert to forward coordinates:
            if is_positive:
                f, t = old_f, old_t
                first_res, last_res = f + 1, t
            else:
                f, t = map_a2b.getRowTo() - old_f, map_a2b.getRowTo() - old_t
                first_res, last_res = f, t + 1

            ## map first and last residues
            mfirst_res = map_a2b.mapRowToCol( first_res )
            mlast_res = map_a2b.mapRowToCol( last_res )

            if (mfirst_res == 0 and old_f != 0) or (mlast_res == 0 and old_t != map_a2b.getRowTo() ):
                options.stderr.write("# mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,
                                      f, t))
                options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# %s\n" % str(entry))
                options.stderr.flush()
                nerrors_boundaries += 1
                is_error = True

            ## get extended boundaries for alignment later on
            while mfirst_res == 0 and first_res > 1:
                first_res -= 1
                mfirst_res = map_a2b.mapRowToCol(first_res)
            while mlast_res == 0 and last_res < map_a2b.getRowTo():
                last_res += 1
                mlast_res = map_a2b.mapRowToCol(last_res)

            ## convert to genomic coordinates
            ## convert negative strand coordinates
            if is_positive:
                new_f = mfirst_res - 1
                new_t = mlast_res
            else:
                new_f = mfirst_res
                new_t = mlast_res - 1

                new_f = map_a2b.getColTo() - new_f
                new_t = map_a2b.getColTo() - new_t

            ## Now map the alignment.
            try:
                MapAlignment( entry, map_a2b )
            except ValueError:
                options.stderr.write("# alignment mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,
                                      new_f, new_t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.flush()
                nerrors_map += 1
                is_error = True

            if new_f != entry.mSbjctGenomeFrom or new_t != entry.mSbjctGenomeTo:
                options.stderr.write("# mapping inconsistency for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i <> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,
                                      new_f, new_t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))

                nerrors_inconsistencies += 1
                is_error = True

        ##########################################################################################################
        ## Map via offsets
        if entry.mSbjctToken in breakpoints:

            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            ## convert to forward coordinates:
            if is_positive:
                f, t = old_f, old_t
            else:
                f, t = contig_sizes_old[entry.mSbjctToken] - old_t, contig_sizes_old[entry.mSbjctToken] - old_f

            o1 = GetOffset( f,
                            breakpoints[entry.mSbjctToken],
                            endpoints[entry.mSbjctToken],
                            offsets[entry.mSbjctToken] )
            o2 = GetOffset( t,
                            breakpoints[entry.mSbjctToken],
                            endpoints[entry.mSbjctToken],
                            offsets[entry.mSbjctToken] )

            if o1 != o2:
                options.stderr.write("# break within gene %s\n" % str(entry))
                nerrors_map += 1
                is_error = True

            f += o1
            t += o2

            if not is_positive:
                f, t = contig_sizes_new[entry.mSbjctToken] - t, contig_sizes_new[entry.mSbjctToken] - f

            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = f, t
            # keep the window used for re-prediction in sync with the
            # mapped coordinates
            new_f, new_t = f, t

            if entry.mSbjctGenomeFrom > entry.mSbjctGenomeTo:
                options.stderr.write("# mapping error: start after end %s\n" % str(entry))
                nerrors_map += 1
                is_error = True

        ##########################################################################################################
        ## do translation check, if genome is given
        if options.genome_file:
            genomic_sequence = Genomics.GetGenomicSequence( new_sbjct_token, entry.mSbjctStrand,
                                                            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo,
                                                            options.genome_file,
                                                            loglevel = 0)

            map_peptide2translation, translation = Genomics.Alignment2PeptideAlignment( \
                entry.mMapPeptide2Genome, entry.mQueryFrom, 0, genomic_sequence )

            if re.sub("X", "", translation) != re.sub("X", "", entry.mTranslation):
                options.stderr.write("# translation error for prediction %i on %s %s:%i-%i -> %i-%i <> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                # the token need not be part of the alignment map even
                # if the map itself is not empty
                if entry.mSbjctToken in map_sbjcts:
                    options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# old=%s\n# new=%s\n" % (entry.mTranslation, translation))
                options.stderr.write("# old=%s\n# new=%s\n" % (entry.mAlignmentString, Genomics.Alignment2String(entry.mMapPeptide2Genome)))
                nerrors_translation += 1
                is_error = True

                if peptides and entry.mQueryToken in peptides:
                    naligned += 1

                    options.stdlog.write( "# aligning: %s versus %s:%s: %i-%i\n" % ( \
                        entry.mQueryToken,
                        new_sbjct_token, entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))

                    # do a quick reprediction
                    genomic_sequence = Genomics.GetGenomicSequence( new_sbjct_token, entry.mSbjctStrand,
                                                                    0, 0,
                                                                    genome_file = options.genome_file,
                                                                    loglevel = 0)
                    predictor.mLogLevel = 0

                    result = predictor(entry.mQueryToken, peptides[entry.mQueryToken],
                                       entry.mSbjctToken, genomic_sequence,
                                       "--exhaustive --subopt FALSE --score '%s' " % str(80),
                                       new_f - 10, new_t + 10)
                    prediction_id = entry.mPredictionId
                    if result:
                        entry = result[0]
                        entry.mPredictionId = prediction_id
                        nerrors_realigned += 1
            else:
                # coordinates looked suspicious, but the translation is intact
                if is_error:
                    nerrors_inconsequential += 1

        if is_error:
            nerrors += 1

        entry.mSbjctToken = new_sbjct_token

        ## map query tokens
        if query_map_old2new:
            query_tokens = query_map_old2new[entry.mQueryToken]
        else:
            query_tokens = (entry.mQueryToken,)

        if options.skip_errors and is_error:
            continue

        # remember the id once: it is blanked while computing the hash
        # and must survive entries skipped as duplicates.
        prediction_id = entry.mPredictionId

        for query_token in query_tokens:

            entry.mQueryToken = query_token

            entry.mPredictionId = 0

            hid = Genomics.GetHID( str(entry) )
            if hid in output:
                nduplicates += 1
                continue
            # register the entry, otherwise duplicates are never detected
            output[hid] = 1

            noutput += 1
            if options.renumber:
                prediction_id = noutput

            entry.mPredictionId = prediction_id

            options.stdout.write( str(entry) + "\n")
            options.stdout.flush()
            found_transcripts[entry.mQueryToken] = 1

    ## write out found transcripts and genes
    nmissed_transcripts = 0
    missed_transcripts = []
    found_genes = {}
    if peptides:
        for x in peptides.keys():
            if x not in found_transcripts:
                nmissed_transcripts += 1
                missed_transcripts.append( x )
            elif x in map_transcript2gene:
                found_genes[map_transcript2gene[x]] = 1

    missed_genes = {}
    nmissed_genes = 0
    if map_transcript2gene:
        for transcript in missed_transcripts:
            g = map_transcript2gene.get( transcript )
            if g is not None and g not in found_genes:
                missed_genes[g] = 1
        nmissed_genes = len(missed_genes)

    if options.write_missed:
        outfile = open(options.write_missed, "w")
        try:
            for x in missed_transcripts:
                if x in unmapped:
                    status = "unmapped"
                else:
                    status = "mapped"
                outfile.write( "%s\t%s\t%s\n" % ("transcript", x, status ))
            for x in missed_genes:
                status = "unknown"
                outfile.write( "%s\t%s\t%s\n" % ("gene", x, status ))
        finally:
            outfile.close()

    options.stdlog.write("# input=%i, output=%i, filtered=%i, nduplicates=%i, mapped=%i, errors=%i\n" % (\
        ninput, noutput, nfiltered, nduplicates, nmapped, nerrors ))
    options.stdlog.write("# errors: inconsequental=%i, boundaries=%i, mapping=%i, inconsistencies=%i, translation=%i, realigned=%i\n" % (\
        nerrors_inconsequential, nerrors_boundaries, nerrors_map, nerrors_inconsistencies, nerrors_translation, nerrors_realigned ))
    options.stdlog.write("# peptides: input=%i, found=%i, missed=%i, found_genes=%i, missed_genes=%i\n" % (\
        len(peptides), len(found_transcripts), nmissed_transcripts, len(found_genes), nmissed_genes) )

    E.Stop()
def BuildLines(dbhandle, statement, genome_lengths, prefix="", default_color=None):
    """Execute *statement* and walk over the predictions it returns.

    dbhandle
       database connection; cursors are created from it and
       ``commit()`` is called after the gene lookup.
    statement
       SQL statement selecting the predictions; each result row is
       loaded with ``PredictionParserEntry.FillFromTable``.
    genome_lengths
       dictionary mapping a subject token to a ``(length, offset)``
       tuple. Missing tokens are added here by reading the sequence
       file named ``param_genome_file % token``.
    """

    c = dbhandle.cursor()
    c.execute(statement)

    if param_loglevel >= 2:
        print "# received %i results." % c.rowcount

    sbjct_token = ""
    sbjct_strand = None
    sbjct_from = 10000000000000000
    sbjct_to = 0

    lines = []
    nmatches = 0
    for line in c.fetchall():
        entry = PredictionParser.PredictionParserEntry()
        entry.FillFromTable(line)

        # look up the contig length, reading the sequence file on first use
        if not genome_lengths.has_key(entry.mSbjctToken):
            filename_genome = param_genome_file % entry.mSbjctToken
            forward_sequences, reverse_sequences = Genomics.ReadGenomicSequences(
                open(filename_genome, "r"))
            genome_lengths[entry.mSbjctToken] = (len(
                forward_sequences[entry.mSbjctToken]), 0)

        lgenome, offset = genome_lengths[entry.mSbjctToken]

        if param_loglevel >= 4:
            print "# lgenome=%i, offset=%i" % (lgenome, offset)

        # get cds information
        exons = []
        if param_tablename_exons:
            cc = dbhandle.cursor()
            statement = """SELECT exon_from, exon_to, exon_frame, genome_exon_from, genome_exon_to FROM %s WHERE prediction_id = %i""" % (
                param_tablename_exons, entry.mPredictionId, )
            if param_restrict_good_exons:
                statement += " AND is_ok = TRUE"
            try:
                cc.execute(statement)
                result = cc.fetchall()
            except pgdb.DatabaseError, msg:
                # a failed query is reported and treated as "no exons"
                print "# query failed with message", msg
                result = []

            exons = result
            cc.close()

        # no exons from the database: derive them from the alignment or
        # fall back to a single segment spanning the whole prediction.
        if not exons:
            if entry.mMapPeptide2Genome:
                exons = Genomics.Alignment2ExonBoundaries(
                    entry.mMapPeptide2Genome,
                    query_from=entry.mQueryFrom - 1,
                    sbjct_from=entry.mSbjctGenomeFrom,
                    add_stop_codon=1)
            else:
                exons = [("", "", 0, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo)]

        # select gene id
        if param_tablename_genes:
            cc = dbhandle.cursor()
            statement = """SELECT gene_id FROM %s WHERE prediction_id = %i""" % (param_tablename_genes, entry.mPredictionId)
            try:
                cc.execute(statement)
                result = cc.fetchone()
            except pgdb.DatabaseError, msg:
                print "# query failed with message", msg
                result = None

            # NOTE(review): result is None after a failed query (and
            # presumably when no row matches), so the subscript below
            # would raise -- confirm and guard.
            gene_id = result[0]
            dbhandle.commit()
            cc.close()
sbjct_genome_to, map_query2genome FROM %s AS p WHERE p.sbjct_token = '%s' AND p.sbjct_strand = '%s' AND OVERLAP( %i, %i, p.sbjct_genome_from, sbjct_genome_to) > 0 """ alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep) map_reference2target = alignlib_lite.makeAlignmentVector() assignment_id = 0 for line in cr.fetchall(): reference = PredictionParser.PredictionParserEntry() reference.FillFromTable(line) ct = dbhandle.cursor() ct.execute(statement % (param_tablename_predictions_target, reference.mSbjctToken, reference.mSbjctStrand, reference.mSbjctGenomeFrom, reference.mSbjctGenomeTo)) reference_exons = Exons.Alignment2Exons(reference.mMapPeptide2Genome, 0, reference.mSbjctFrom) for line2 in ct.fetchall(): target = PredictionParser.PredictionParserEntry() target.FillFromTable(line2)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    For every prediction read from ``options.stdin`` the CDS of the query is
    aligned against the predicted genomic fragment; frameshifts are removed,
    stop codons masked and both aligned rows are written to
    ``options.stdout`` as two consecutive FASTA records.  Predictions failing
    a length or translation sanity check are logged and skipped.

    :raises ValueError: if the genomic fragment of a prediction is empty.

    NOTE(review): *argv* is not forwarded to ``E.Start`` in this function.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/prediction2pairs.py 2031 2008-07-15 09:19:05Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option("-c", "--cds", dest="filename_cds", type="string",
                      help="filename with cds seguences.")

    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("paired_fasta", ),
                      help="output format, valid options are: paired_fasta: concatenated pairwise alignments in FASTA format")

    parser.set_defaults(
        genome_file="genome",
        filename_cds="cds.fasta",
        format="paired_fasta",
        filename_suffix=".fasta",
        filename_prefix="",
    )

    (options, args) = E.Start(parser, add_psql_options=True)

    if len(args) > 0:
        print("%s %s" % (USAGE, "no arguments required."))
        sys.exit(1)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    # reading CDS sequences; the file is closed as soon as it has been read
    if options.filename_cds:
        with open(options.filename_cds, "r") as infile:
            cds_sequences = Genomics.ReadPeptideSequences(infile)
    else:
        cds_sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i CDS sequences\n" % len(cds_sequences))

    p = PredictionParser.PredictionParserEntry()

    ninput, noutput, nsanity, n3, nlength = 0, 0, 0, 0, 0

    for line in options.stdin:

        # skip comments and quoted header lines
        if line[0] == "#":
            continue
        if line[0] == '"':
            continue

        p.Read(line)

        ninput += 1

        genomic_fragment = fasta.getSequence(p.mSbjctToken, p.mSbjctStrand,
                                             p.mSbjctGenomeFrom, p.mSbjctGenomeTo)

        if len(genomic_fragment) == 0:
            # The original raised a string exception, which itself fails
            # with a TypeError on Python >= 2.6.
            raise ValueError(
                "ERROR: empty fragment %s:%s for line %s" %
                (p.mSbjctGenomeFrom, p.mSbjctGenomeTo, line))

        try:
            cds_fragment = cds_sequences[p.mQueryToken]
        except KeyError:
            options.stdlog.write(
                "# ERROR: cds not found: query %s.\n" % p.mQueryToken)
            continue

        map_query2sbjct, genomic_fragment = Genomics.Alignment2CDNA(
            p.mMapPeptide2Genome,
            query_from=p.mQueryFrom,
            sbjct_from=0,
            genome=genomic_fragment)

        # check for errors: a boundary shift is only reported, the two
        # length mismatches below cause the prediction to be skipped.
        if map_query2sbjct.getRowTo() != p.mQueryTo * 3:
            options.stdlog.write(
                "# ERROR: boundary shift in query at line %s\n# %i %i\n" %
                (line, map_query2sbjct.getRowTo(), p.mQueryTo * 3))

        if map_query2sbjct.getColTo() > len(genomic_fragment):
            options.stdlog.write(
                "# ERROR: length mismatch in line %s\n# genomic fragment (%i) shorter than last aligned residue (%i)\n" %
                (line, len(genomic_fragment), map_query2sbjct.getColTo()))
            options.stdlog.write(
                "# cds %s\n# genomic %s\n" %
                (str(cds_fragment), genomic_fragment))
            nlength += 1
            continue

        if map_query2sbjct.getRowTo() > len(cds_fragment):
            options.stdlog.write(
                "# ERROR: length mismatch in line %s\n# cds fragment (%i) shorter than last aligned residue (%i)\n" %
                (line, len(cds_fragment), map_query2sbjct.getRowTo()))
            options.stdlog.write(
                "# cds %s\n# genomic %s\n" %
                (str(cds_fragment), genomic_fragment))
            nlength += 1
            continue

        cds_seq = alignlib_lite.makeSequence(cds_fragment)
        genomic_seq = alignlib_lite.makeSequence(genomic_fragment)

        f = alignlib_lite.AlignmentFormatExplicit(
            map_query2sbjct, cds_seq, genomic_seq)

        row_ali = f.mRowAlignment
        col_ali = f.mColAlignment

        row_ali, col_ali = Genomics.RemoveFrameShiftsFromAlignment(
            row_ali, col_ali)

        row_ali = Genomics.MaskStopCodons(row_ali)
        col_ali = Genomics.MaskStopCodons(col_ali)

        if len(row_ali) != len(col_ali):
            options.stdlog.write("# ERROR: wrong alignment lengths.\n")
            sys.exit(1)

        # Not a multiple of three: logged and counted, but still processed.
        if len(row_ali) % 3 or len(col_ali) % 3:
            options.stdlog.write(
                "# ERROR: sequences are not a multiple of 3 in line: %s\n" % line)
            options.stdlog.write(
                "# %6i %s\n# %6i %s\n" %
                (len(row_ali), str(row_ali), len(col_ali), str(col_ali)))
            n3 += 1

        # Sanity check: the stored translation has to match the translation
        # of the aligned genomic row once gaps and masked residues are gone.
        translated_input = re.sub("[-X]", "", p.mTranslation)
        ref = re.sub("[-X]", "", Genomics.TranslateDNA2Protein(col_ali))
        if translated_input != ref:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sanity check failed for %s - %s\n# %6i %s\n# %6i %s\n" %
                    (p.mPredictionId, p.mQueryToken,
                     len(translated_input), translated_input,
                     len(ref), ref))
            nsanity += 1
            continue

        options.stdout.write(">%s\n%s\n" % (p.mPredictionId, row_ali))
        options.stdout.write(
            ">%s_vs_%s_%s_%i_%i\n%s\n" %
            (p.mQueryToken, p.mSbjctToken, p.mSbjctStrand,
             p.mSbjctGenomeFrom, p.mSbjctGenomeTo, col_ali))
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nsanity=%i, nlength=%i, n3=%i\n" %
            (ninput, noutput, nsanity, nlength, n3))

    E.Stop()
# read peptide sequences if param_filename_peptides: peptide_sequences = Genomics.ReadPeptideSequences( open(param_filename_peptides, "r")) else: peptide_sequences = {} # print HEADER if param_loglevel >= 2: print SHORT_HEADER_SUMMARY # aligned entries from exonerate entries = [] parser = PredictionParser.PredictionParserExonerate() if param_format == "exonerate": for line in sys.stdin: if line[0] == "#": continue if line[:3] != "diy": continue data = string.split(line[:-1], "\t") query_token = data[1] # parser has to go inside, because GetBestMatch returns reference # copy
if o in ("-v", "--verbose"): param_loglevel = int(a) elif o in ("--version", ): print "version=" sys.exit(0) elif o in ("-h", "--help"): print USAGE sys.exit(0) elif o in ("-t", "--trans"): param_trans = 1 print E.GetHeader() print E.GetParams() if param_trans: parser = PredictionParser.PredictionParserBlatTrans() else: parser = PredictionParser.PredictionParserBlatCDNA() nmatches = 1 for line in sys.stdin: if line[0] == "#": continue if not re.match("^[0-9]", line): continue try: entries = parser.Parse((line, )) except PredictionParser.AlignmentError, e: print "# %s" % str(e) print "#", line[:-1]
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads alignments from ``sys.stdin``, converts them into predictions,
    recomputes translation and gene-feature counts from the genomic sequence
    and, if peptide sequences are supplied, compares each translation with
    its reference peptide (optionally re-aligning mismatching entries).
    The resulting predictions are written to ``options.stdout``.

    :raises ValueError: if no genome file is given or the input format is
        not supported for the chosen output format.

    NOTE(review): *argv* is not forwarded to ``E.Start`` in this function.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/gff2predictions.py 2021 2008-07-10 16:00:48Z andreas $",
        usage=globals()["__doc__"],
    )

    parser.add_option("-t", "--trans", dest="trans",
                      help="input is translated DNA.", action="store_true")

    parser.add_option("-f", "--format", dest="format",
                      help="input format.", type="choice",
                      choices=("exons", "psl", "gff"))

    parser.add_option("-o", "--output-format", dest="output_format",
                      help="output format", type="choice",
                      choices=("exontable", "exons", "predictions", "cds", "fasta"))

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option("--predictions-file", dest="predictions_file", type="string",
                      help="filename with predictions. Use gene structures from this file if available.")

    parser.add_option("-i", "--gff-field-id", dest="gff_field_id", type="string",
                      help="field for the feature id in the gff info section.")

    parser.add_option("-p", "--filename-peptides", dest="filename_peptides", type="string",
                      help="Filename with peptide sequences. If given, it is used to check the predicted translated sequences.")

    parser.add_option("--no-realignment", dest="do_realignment", action="store_false",
                      help="do not re-align entries that do not parse correctly.")

    parser.add_option("--remove-unaligned", dest="remove_unaligned", action="store_true",
                      help="remove entries that have not been aligned correctly.")

    parser.add_option("--input-coordinates", dest="input_coordinates", type="string",
                      help="specify input format for input coordinates [forward|both-zero|one-closed|open].")

    parser.set_defaults(
        trans=False,
        output_format="predictions",
        format="psl",
        gff_field_id="id",
        input_coordinates="both-zero-open",
        filename_peptides=None,
        genome_file=None,
        do_realignment=True,
        predictions_file=None,
        remove_unaligned=False,
    )

    (options, args) = E.Start(parser)

    if not options.genome_file:
        # The original raised a string exception, which itself fails with a
        # TypeError on Python >= 2.6.
        raise ValueError("please specify a genome file.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contig_sizes = fasta.getContigSizes()

    ninput, noutput, nskipped = 0, 0, 0
    nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0, 0, 0, 0, 0, 0

    # A predictor is only needed when translations can be checked against
    # reference peptides.
    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            IOTools.openFile(options.filename_peptides, "r"))
        predictor = Predictor.PredictorExonerate()
        predictor.mLogLevel = 0
    else:
        peptide_sequences = None
        predictor = None

    # The converter is not used below; the call is kept as in the original.
    converter = IndexedFasta.getConverter(options.input_coordinates)

    # Gene structures that replace parsed entries with the same id.
    predictions = {}
    if options.predictions_file:
        parser = PredictionParser.iterator_predictions(
            IOTools.openFile(options.predictions_file, "r"))
        for p in parser:
            predictions[p.mPredictionId] = p

    if options.output_format == "predictions":

        if options.format == "psl":

            if options.trans:
                parser = PredictionParser.PredictionParserBlatTrans()
            else:
                parser = PredictionParser.PredictionParserBlatCDNA()

            nmatches = 1
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                if not re.match("^[0-9]", line):
                    continue

                try:
                    entries = parser.Parse((line,))
                except PredictionParser.AlignmentError as e:
                    print("# %s" % str(e))
                    print("# %s" % line[:-1])
                    sys.exit(1)

                for entry in entries:
                    entry.mPredictionId = nmatches
                    nmatches += 1

                print(str(entries))

        elif options.format == "exons":
            parser = PredictionParser.PredictionParserExons(
                contig_sizes=contig_sizes)
        else:
            raise ValueError(
                "unknown format %s for output option %s" %
                (options.format, options.output_format))

        if options.loglevel >= 2:
            options.stdlog.write("# parsing.\n")
            options.stdlog.flush()

        # NOTE(review): in the "psl" case stdin has already been consumed by
        # the loop above, so this call receives no lines - confirm intent.
        results = parser.Parse(sys.stdin.readlines())

        if options.loglevel >= 2:
            options.stdlog.write("# parsing finished.\n")
            options.stdlog.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# parsing: ninput=%i, noutput=%i, nerrors=%i\n" %
                (parser.GetNumInput(), parser.GetNumOutput(), parser.GetNumErrors()))

            for error, msg in parser.mErrors:
                options.stdlog.write("# %s : %s\n" % (str(error), msg))
                options.stdlog.flush()

        # if genomes are given: build translation
        if options.genome_file:

            results.Sort(lambda x, y: cmp(x.mSbjctToken, y.mSbjctToken))

            new_results = PredictionParser.Predictions()

            for entry in results:

                ninput += 1

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing entry %s:%s on %s:%s %i/%i.\n" %
                        (entry.mPredictionId,
                         entry.mQueryToken,
                         entry.mSbjctToken,
                         entry.mSbjctStrand,
                         ninput,
                         len(results)))
                    options.stdlog.flush()

                try:
                    lgenome = fasta.getLength(entry.mSbjctToken)
                    # added 3 residues - was a problem at split codons just
                    # before the stop.
                    # See for example the chicken sequence ENSGALP00000002741
                    genomic_sequence = fasta.getSequence(
                        entry.mSbjctToken,
                        entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom,
                        min(entry.mSbjctGenomeTo + 3, lgenome))
                except KeyError:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# did not find entry for %s on %s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken))
                    nskipped += 1
                    continue

                if predictions and entry.mPredictionId in predictions:
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# substituting entry %s on %s:%s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand))
                        options.stdlog.flush()
                    entry = predictions[entry.mPredictionId]

                # The exon list is not used below; the call is kept as in
                # the original.
                exons = Exons.Alignment2Exons(
                    entry.mMapPeptide2Genome, 0, entry.mSbjctGenomeFrom)

                entry.mMapPeptide2Translation, entry.mTranslation = \
                    Genomics.Alignment2PeptideAlignment(
                        Genomics.String2Alignment(entry.mAlignmentString),
                        entry.mQueryFrom, 0, genomic_sequence)

                entry.score = entry.mMapPeptide2Translation.getColTo() - \
                    entry.mMapPeptide2Translation.getColFrom() + 1

                (entry.mNIntrons,
                 entry.mNFrameShifts,
                 entry.mNGaps,
                 entry.mNSplits,
                 entry.mNStopCodons,
                 entry.mNDisruptions) = Genomics.CountGeneFeatures(
                     0, entry.mMapPeptide2Genome, genomic_sequence)

                if peptide_sequences:

                    if str(entry.mPredictionId) in peptide_sequences:

                        reference = peptide_sequences[
                            str(entry.mPredictionId)].upper()
                        translation = entry.mTranslation
                        nfound += 1

                        is_identical, nmismatches = checkIdentity(
                            reference, translation, options)

                        if is_identical:
                            nidentical += 1
                        else:
                            nmismatch += 1

                            if options.do_realignment:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches..realigning in region %i:%i\n" %
                                        (entry.mPredictionId, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                                    options.stdlog.flush()

                                result = predictor(
                                    entry.mPredictionId,
                                    reference,
                                    entry.mSbjctToken,
                                    genomic_sequence,
                                    "--subopt FALSE --score '%s'" % str(80))
                                # "--exhaustive --subopt FALSE --score '%s'" % str(80) )

                                if result:
                                    translation = result[0].mTranslation
                                    is_identical, nmismatches = checkIdentity(
                                        reference, translation, options)
                                else:
                                    if options.loglevel >= 2:
                                        options.stdlog.write(
                                            "# %s: realignment returned empty result\n" %
                                            (entry.mPredictionId))
                                        options.stdlog.flush()
                                    is_identical = False

                                if is_identical:
                                    # Take over the realigned entry but keep
                                    # the original id; the realigned start is
                                    # shifted by the original genomic start.
                                    naligned += 1
                                    prediction_id = entry.mPredictionId
                                    sbjct_genome_from = entry.mSbjctGenomeFrom
                                    entry = result[0]
                                    entry.mPredictionId = prediction_id
                                    entry.mSbjctGenomeFrom += sbjct_genome_from
                                else:
                                    nunaligned += 1
                                    if options.loglevel >= 1:
                                        options.stdlog.write(
                                            "# %s: mismatch on %s:%s:%i-%i after realignment\n# reference =%s\n# translated=%s\n# realigned =%s\n" %
                                            (entry.mPredictionId,
                                             entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo,
                                             reference,
                                             entry.mTranslation,
                                             translation))
                                        options.stdlog.flush()
                                    if options.remove_unaligned:
                                        nskipped += 1
                                        continue
                            else:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches on %s ... no realignment\n" %
                                        (entry.mPredictionId, entry.mSbjctToken))
                                    if options.loglevel >= 3:
                                        options.stdlog.write(
                                            "# %s: mismatch before realignment\n# reference =%s\n# translated=%s\n" %
                                            (entry.mPredictionId, reference, translation))
                                    options.stdlog.flush()

                                if options.remove_unaligned:
                                    nskipped += 1
                                    continue
                    else:
                        nnotfound += 1

                new_results.append(entry)
                noutput += 1

            results = new_results

        if results:
            options.stdout.write(str(results) + "\n")
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads predictions from ``sys.stdin``, splits each one into its coding
    segments with ``Exons.Alignment2Exons``, attaches the genomic sequence of
    every segment and prints the result in the format selected with
    ``--format``.

    NOTE(review): ``--reset-query`` is parsed but never consulted below; the
    peptide coordinates are shifted under ``--reset-to-start`` instead.
    NOTE(review): the option choices offer "gff", while the branches below
    test for "gff-match" and "gff-exon"; "gff" therefore produces the
    default table.
    NOTE(review): *argv* is not forwarded to ``E.Start`` in this function.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/predictions2cds.py 1858 2008-05-13 15:07:05Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("-o", "--forward-coordinates", dest="forward_coordinates",
                      action="store_true",
                      help="input uses forward coordinates.")

    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("default", "cds", "cdnas", "map", "gff",
                               "intron-fasta", "exons"),
                      help="output format.")

    parser.add_option("-r", "--reset-to-start", dest="reset_to_start",
                      action="store_true",
                      help="move genomic coordinates to begin from 0.")

    parser.add_option("--reset-query", dest="reset_query", action="store_true",
                      help="move peptide coordinates to begin from 0.")

    parser.set_defaults(genome_file=None,
                        forward_coordinates=False,
                        format="default",
                        reset_to_start=False,
                        reset_query=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print("%s %s" % (USAGE, "no arguments required."))
        sys.exit(2)

    cds_id = 1

    entry = PredictionParser.PredictionParserEntry()

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    ninput, noutput, nskipped, nerrors = 0, 0, 0, 0

    for line in sys.stdin:

        # skip comments and the header line
        if line[0] == "#":
            continue
        if line.startswith("id"):
            continue

        ninput += 1

        try:
            entry.Read(line)
        except ValueError as msg:
            options.stdlog.write(
                "# parsing failed with msg %s in line %s" % (msg, line))
            nerrors += 1
            continue

        cds = Exons.Alignment2Exons(entry.mMapPeptide2Genome,
                                    query_from=entry.mQueryFrom,
                                    sbjct_from=entry.mSbjctGenomeFrom,
                                    add_stop_codon=0)

        for cd in cds:
            cd.mSbjctToken = entry.mSbjctToken
            cd.mSbjctStrand = entry.mSbjctStrand

        # The last segment is expected to end where the prediction ends.
        if cds[-1].mGenomeTo != entry.mSbjctGenomeTo:
            options.stdlog.write(
                "# WARNING: discrepancy in exon calculation!!!\n")
            for cd in cds:
                options.stdlog.write("# %s\n" % str(cd))
            options.stdlog.write("# %s\n" % entry)

        lsequence = fasta.getLength(entry.mSbjctToken)

        genomic_sequence = fasta.getSequence(entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo)

        # deal with forward coordinates: convert them to negative strand
        # coordinates
        if options.forward_coordinates and entry.mSbjctStrand == "-":
            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = (
                lsequence - entry.mSbjctGenomeTo,
                lsequence - entry.mSbjctGenomeFrom)
            for cd in cds:
                cd.InvertGenomicCoordinates(lsequence)

        # attach sequence to cds
        for cd in cds:
            start = cd.mGenomeFrom - entry.mSbjctGenomeFrom
            end = cd.mGenomeTo - entry.mSbjctGenomeFrom
            cd.mSequence = genomic_sequence[start:end]

        # reset coordinates for query
        if options.reset_to_start:
            offset = entry.mPeptideFrom
            for cd in cds:
                cd.mPeptideFrom -= offset
                cd.mPeptideTo -= offset

        # play with coordinates
        if options.reset_to_start:
            offset = entry.mSbjctGenomeFrom
            for cd in cds:
                cd.mGenomeFrom -= offset
                cd.mGenomeTo -= offset
        else:
            offset = 0

        # Exactly one output format is emitted per prediction.  "cds" used
        # to be a stand-alone ``if`` in front of this chain, so it fell
        # through to the final ``else`` and printed the default table too.
        if options.format == "cds":
            rank = 0
            for cd in cds:
                rank += 1
                cd.mQueryToken = entry.mQueryToken
                cd.mSbjctToken = entry.mSbjctToken
                cd.mSbjctStrand = entry.mSbjctStrand
                cd.mRank = rank
                print(str(cd))

        elif options.format == "exons":
            rank = 0
            for cd in cds:
                rank += 1
                options.stdout.write("\t".join(map(str, (
                    entry.mPredictionId,
                    cd.mSbjctToken,
                    cd.mSbjctStrand,
                    rank,
                    cd.frame,
                    cd.mPeptideFrom,
                    cd.mPeptideTo,
                    cd.mGenomeFrom,
                    cd.mGenomeTo))) + "\n")

        elif options.format == "cdnas":
            print("\t".join(map(str, (
                entry.mPredictionId,
                entry.mQueryToken,
                entry.mSbjctToken,
                entry.mSbjctStrand,
                entry.mSbjctGenomeFrom - offset,
                entry.mSbjctGenomeTo - offset,
                genomic_sequence))))

        elif options.format == "map":
            map_prediction2genome = alignlib_lite.makeAlignmentSet()
            for cd in cds:
                alignlib_lite.addDiagonal2Alignment(
                    map_prediction2genome,
                    cd.mPeptideFrom + 1,
                    cd.mPeptideTo,
                    (cd.mGenomeFrom - offset) - cd.mPeptideFrom)

            print("\t".join(map(str, (
                entry.mPredictionId,
                entry.mSbjctToken,
                entry.mSbjctStrand,
                alignlib_lite.AlignmentFormatEmissions(map_prediction2genome)))))

        elif options.format == "intron-fasta":
            rank = 0
            # a single segment has no introns
            if len(cds) == 1:
                nskipped += 1
                continue

            last = cds[0].mGenomeTo
            for cd in cds[1:]:
                rank += 1
                key = "%s %i %s:%s:%i:%i" % (
                    entry.mPredictionId, rank,
                    entry.mSbjctToken, entry.mSbjctStrand,
                    last, entry.mSbjctGenomeFrom)
                # sequence between the end of the previous segment and the
                # start of this one
                sequence = genomic_sequence[
                    last - entry.mSbjctGenomeFrom:
                    cd.mGenomeFrom - entry.mSbjctGenomeFrom]
                options.stdout.write(">%s\n%s\n" % (key, sequence))
                last = cd.mGenomeTo

        elif options.format == "gff-match":
            print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tTarget \"%s\" %i %i; Score %i; Introns %i; Frameshifts %i; Stops %i" %
                  (entry.mSbjctToken,
                   "gpipe", "similarity",
                   entry.mSbjctGenomeFrom,
                   entry.mSbjctGenomeTo,
                   entry.mPercentIdentity,
                   entry.mSbjctStrand,
                   ".",
                   entry.mQueryToken,
                   entry.mQueryFrom,
                   entry.mQueryTo,
                   entry.score,
                   entry.mNIntrons,
                   entry.mNFrameShifts,
                   entry.mNStopCodons))

        elif options.format == "gff-exon":
            rank = 0
            for cd in cds:
                rank += 1
                print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tTarget \"%s\" %i %i; Score %i; Rank %i/%i; Prediction %i" %
                      (entry.mSbjctToken,
                       "gpipe", "similarity",
                       cd.mGenomeFrom,
                       cd.mGenomeTo,
                       entry.mPercentIdentity,
                       entry.mSbjctStrand,
                       ".",
                       entry.mQueryToken,
                       cd.mPeptideFrom // 3 + 1,
                       cd.mPeptideTo // 3 + 1,
                       entry.score,
                       rank,
                       len(cds),
                       entry.mPredictionId))

        else:
            # default table: peptide coordinates are renumbered so that the
            # segments are contiguous, starting at 0
            exon_from = 0
            for cd in cds:
                cd.mPeptideFrom = exon_from
                exon_from += cd.mGenomeTo - cd.mGenomeFrom
                cd.mPeptideTo = exon_from
                print("\t".join(map(str, (
                    cds_id,
                    entry.mPredictionId,
                    cd.mPeptideFrom,
                    cd.mPeptideTo,
                    cd.frame,
                    cd.mGenomeFrom,
                    cd.mGenomeTo,
                    cd.mSequence))))
                cds_id += 1

        noutput += 1
# main() of the compare_predictions2exons script. The code below is Python 2
# source that has been flattened onto a few physical lines in this file; it
# is reproduced unchanged.
#
# What the visible code does:
#   * builds an E.OptionParser with the options -g/--genome-file,
#     -b/--boundaries, -e/--exons, -p/--peptides, -w/--write-notfound and
#     -q/--quality-pide, starts the script with E.Start and exits with
#     status 2 if positional arguments are given;
#   * if --boundaries is given, loads reference exon boundaries with
#     Exons.ReadExonBoundaries; if --peptides is given, loads peptide
#     sequences with Genomics.ReadPeptideSequences;
#   * if --exons is given, opens that file for writing and writes a
#     tab-separated header line to it;
#   * reads predictions from sys.stdin (lines starting with "#" are
#     skipped), derives their exons with Genomics.Alignment2ExonBoundaries
#     and steps through the predicted exons (index e) and the reference
#     exons of the same query (index r) side by side, counting identical,
#     inserted and deleted exons, inserted and deleted introns, truncated
#     terminal exons and boundary deviations;
#   * writes one tab-separated summary row per prediction to options.stdout
#     and, if the exon file is open, one row per compared exon to it.
#
# NOTE(review): the --write-notfound branch writes to outfile_exons without
#   checking that --exons was supplied (outfile_exons is None otherwise).
# NOTE(review): outfile_exons is opened but never closed in the visible code.
# NOTE(review): uses Python-2-only constructs (print statement,
#   "except ValueError, msg", dict.has_key, string.join).
# NOTE(review): *argv* is not forwarded to E.Start.
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv == None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: gpipe/compare_predictions2exons.py 2011 2008-07-04 10:40:51Z andreas $", usage=globals()["__doc__"]) parser.add_option("-g", "--genome-file", dest="genome_file", type="string", help="filename with genome.") parser.add_option("-b", "--boundaries", dest="filename_boundaries", type="string", help="filename with exon boundaries.") parser.add_option("-e", "--exons", dest="filename_exons", type="string", help="filename with exons (output).") parser.add_option("-p", "--peptides", dest="filename_peptides", type="string", help="filename with peptide sequences.") parser.add_option( "-w", "--write-notfound", dest="write_notfound", action="store_true", help="print exons for predictions not found in reference.") parser.add_option("-q", "--quality-pide", dest="quality_threshold_pide", type="int", help="quality threshold (pide) for exons.") parser.set_defaults( genome_file="genome", filename_boundaries=None, filename_exons=None, filename_peptides=None, quality_threshold_pide=0, write_notfound=False, ## allowed number of nucleotides for exon boundaries to ## be considered equivalent. slipping_exon_boundary=9, ## stop codons to search for stop_codons=("TAG", "TAA", "TGA"), ) (options, args) = E.Start(parser, add_pipe_options=True) if len(args) > 0: print USAGE, "no arguments required." 
sys.exit(2) reference_exon_boundaries = {} if options.filename_boundaries: reference_exon_boundaries = Exons.ReadExonBoundaries(open( options.filename_boundaries, "r"), do_invert=1, remove_utr=1) E.info("read exon boundaries for %i queries" % len(reference_exon_boundaries)) if options.filename_exons: outfile_exons = open(options.filename_exons, "w") outfile_exons.write("%s\n" % "\t".join( ("prediction_id", "exon_id", "exon_from", "exon_to", "exon_frame", "reference_id", "reference_from", "reference_to", "reference_phase", "pidentity", "psimilarity", "nframeshifts", "ngaps", "nstopcodons", "is_ok", "genome_exon_from", "genome_exon_to"))) else: outfile_exons = None if options.filename_peptides: peptide_sequences = Genomics.ReadPeptideSequences( open(options.filename_peptides, "r")) E.info("read peptide sequences for %i queries" % len(peptide_sequences)) else: peptide_sequences = {} entry = PredictionParser.PredictionParserEntry() last_filename_genome = None nfound, nmissed_exons, nmissed_length = 0, 0, 0 nempty_alignments = 0 fasta = IndexedFasta.IndexedFasta(options.genome_file) options.stdout.write("%s\n" % "\t".join( ("prediction_id", "number", "dubious_exons", "boundaries_sum", "boundaries_max", "identical_exons", "inserted_exons", "deleted_exons", "inserted_introns", "deleted_introns", "truncated_Nterminus", "truncated_Cterminus", "deleted_Nexons", "deleted_Cexons", "inserted_Nexons", "inserted_Cexons"))) for line in sys.stdin: if line[0] == "#": continue try: entry.Read(line) except ValueError, msg: print "# parsing failed with msg %s in line %s" % (msg, line[:-1]) sys.exit(1) exons = Genomics.Alignment2ExonBoundaries( entry.mMapPeptide2Genome, query_from=entry.mQueryFrom, sbjct_from=entry.mSbjctGenomeFrom, add_stop_codon=0) if exons[-1][4] != entry.mSbjctGenomeTo: print "# WARNING: discrepancy in exon calculation!!!" 
for e in exons: print "#", str(e) print "#", str(entry) if options.loglevel >= 5: for e in exons: print "#", str(e) genomic_fragment = fasta.getSequence(entry.mSbjctToken, entry.mSbjctStrand, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo) skip = False if peptide_sequences.has_key(entry.mQueryToken): query_sequence = alignlib_lite.makeSequence( peptide_sequences[entry.mQueryToken]) sbjct_sequence = alignlib_lite.makeSequence(entry.mTranslation) percent_similarity, percent_identity = 0, 0 if query_sequence.getLength( ) < entry.mMapPeptide2Translation.getRowTo(): print "# WARNING: query sequence %s is too short: %i %i" % ( entry.mQueryToken, query_sequence.getLength(), entry.mMapPeptide2Translation.getRowTo()) sys.stdout.flush() nmissed_length += 1 skip = True elif sbjct_sequence.getLength( ) < entry.mMapPeptide2Translation.getColTo(): print "# WARNING: sbjct sequence %s is too short: %i %i" % ( entry.mSbjctToken, sbjct_sequence.getLength(), entry.mMapPeptide2Translation.getColTo()) sys.stdout.flush() nmissed_length += 1 skip = True else: alignlib_lite.rescoreAlignment( entry.mMapPeptide2Translation, query_sequence, sbjct_sequence, alignlib_lite.makeScorer(query_sequence, sbjct_sequence)) percent_identity = alignlib_lite.calculatePercentIdentity( entry.mMapPeptide2Translation, query_sequence, sbjct_sequence) * 100 percent_similarity = alignlib_lite.calculatePercentSimilarity( entry.mMapPeptide2Translation) * 100 E.debug( "prediction %s: percent identity/similarity: before=%5.2f/%5.2f, realigned=%5.2f/%5.2f" % (str(entry.mPredictionId), entry.mPercentSimilarity, entry.mPercentIdentity, percent_similarity, percent_identity)) else: query_sequence = None sbjct_sequence = None # default values exons_num_exons = "na" exons_boundaries_sum = "na" exons_boundaries_max = "na" dubious_exons = "na" ndeleted_exons, ninserted_exons, ndeleted_introns, ninserted_introns, nidentical_exons = 0, 0, 0, 0, 0 truncated_Nterminal_exon, truncated_Cterminal_exon = 0, 0 ndeleted_Nexons, 
ndeleted_Cexons = 0, 0 ninserted_Nexons, ninserted_Cexons = 0, 0 exons_offset = exons[0][3] if not reference_exon_boundaries.has_key(entry.mQueryToken): print "# WARNING: sequence %s has no exon boundaries" % ( entry.mQueryToken) sys.stdout.flush() nmissed_exons += 1 skip = True if not skip: nfound += 1 ref_exons = reference_exon_boundaries[entry.mQueryToken] ref_exons_offset = ref_exons[0].mGenomeFrom exons_num_exons = len(ref_exons) - len(exons) exons_boundaries_sum = 0 exons_phase = 0 exons_boundaries_max = 0 dubious_exons = 0 inserted_exons = 0 temp_inserted_exons = 0 if options.loglevel >= 3: for e in exons: options.stdlog.write("# %s\n" % str(e)) for e in ref_exons: options.stdlog.write("# %s\n" % str(e)) min_pide = entry.mPercentIdentity * options.quality_threshold_pide / 100 in_sync = 0 e, r = 0, 0 while e < len(exons) and r < len(ref_exons): this_e, this_r = e + 1, r + 1 percent_identity = 0 percent_similarity = 0 is_good_exon = 0 if options.loglevel >= 4: options.stdlog.write("# current exons: %i and %i\n" % (e, r)) sys.stdout.flush() exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exons[ e][0:6] ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = ( ref_exons[r].mPeptideFrom, ref_exons[r].mPeptideTo, ref_exons[r].frame, ref_exons[r].mGenomeFrom, ref_exons[r].mGenomeTo) ref_genome_from -= ref_exons_offset ref_genome_to -= ref_exons_offset ## get percent identity for exon exon_percent_identity = 0 exon_percent_similarity = 0 if query_sequence and sbjct_sequence: tmp_ali = alignlib_lite.makeAlignmentVector() xquery_from = exon_from / 3 xquery_to = exon_to / 3 alignlib_lite.copyAlignment(tmp_ali, entry.mMapPeptide2Translation, xquery_from, xquery_to) if tmp_ali.getLength() == 0: options.stdlog.write( "# WARNING: empty alignment %s\n" % str( (ref_from, exon_from, ref_to, exon_to, xquery_from, xquery_to))) nempty_alignments += 1 else: if options.loglevel >= 5: options.stdlog.write("# %s\n" % str( 
alignlib_lite.AlignmentFormatExplicit( tmp_ali, query_sequence, sbjct_sequence))) exon_percent_identity = alignlib_lite.calculatePercentIdentity( tmp_ali, query_sequence, sbjct_sequence) * 100 exon_percent_similarity = alignlib_lite.calculatePercentSimilarity( tmp_ali) * 100 if exon_percent_identity >= min_pide: is_good_exon = 1 else: is_good_exon = 0 if e < len(exons) - 1: (next_exon_from, next_exon_to, next_exon_phase, next_exon_genome_from, next_exon_genome_to, next_exon_ali) = exons[e + 1][0:6] else: (next_exon_from, next_exon_to, next_exon_phase, next_exon_genome_from, next_exon_genome_to, next_exon_ali) = 0, 0, 0, 0, 0, [] if r < len(ref_exons) - 1: next_ref_from, next_ref_to, next_ref_phase = ( ref_exons[r + 1].mPeptideFrom, ref_exons[r + 1].mPeptideTo, ref_exons[r + 1].frame) else: next_ref_from, next_ref_to, next_ref_phase = 0, 0, 0 if options.loglevel >= 2: options.stdlog.write("# %s\n" % "\t".join( map(str, (entry.mQueryToken, exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, ref_from, ref_to, ref_phase)))) sys.stdout.flush() # beware of small exons. # if less than options.slipping_exon_boundary: boundary is 0 # check if end is more than options.splipping_exon_boundary apart as well. 
if exon_to - exon_from <= options.slipping_exon_boundary or \ ref_to - ref_from <= options.slipping_exon_boundary: boundary = 0 else: boundary = options.slipping_exon_boundary if ref_to <= exon_from + boundary and \ ref_to <= exon_to - options.slipping_exon_boundary: ## no overlap is_good_exon = 0 if e == 0: ndeleted_Nexons += 1 else: ndeleted_exons += 1 r += 1 exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = 0, 0, 0, 0, 0 overlap = 0 elif exon_to <= ref_from + boundary and \ exon_to <= ref_to - options.slipping_exon_boundary: ## no overlap is_good_exon = 0 if r == 0: ninserted_Nexons += 1 else: ninserted_exons += 1 e += 1 ref_from, ref_to, ref_phase = 0, 0, 0 overlap = 0 else: ## overlap overlap = 1 dfrom = int(math.fabs(exon_from - ref_from)) dto = int(math.fabs(exon_to - ref_to)) ## get percent identity for overlapping fragment if query_sequence and sbjct_sequence: ## this the problem tmp_ali = alignlib_lite.makeAlignmentVector() xquery_from = max(ref_from / 3, exon_from / 3) xquery_to = min(ref_to / 3, exon_to / 3) alignlib_lite.copyAlignment( tmp_ali, entry.mMapPeptide2Translation, xquery_from, xquery_to) if tmp_ali.getLength() == 0: options.stdlog.write( "# warning: empty alignment %s\n" % str( (ref_from, exon_from, ref_to, exon_to, xquery_from, xquery_to))) percent_identity = 0 percent_similarity = 0 else: if options.loglevel >= 5: print str( alignlib_lite.AlignmentFormatExplicit( tmp_ali, query_sequence, sbjct_sequence)) percent_identity = alignlib_lite.calculatePercentIdentity( tmp_ali, query_sequence, sbjct_sequence) * 100 percent_similarity = alignlib_lite.calculatePercentSimilarity( tmp_ali) * 100 if percent_identity >= min_pide: is_good_exon = 1 else: is_good_exon = 0 dubious_exons += 1 ## adjust regions for terminal exons if e == 0 and r == 0 and dfrom <= (entry.mQueryFrom - 1) * 3 and dfrom > 0: if is_good_exon: truncated_Nterminal_exon = dfrom dfrom = 0 ## truncated terminal exons if e == len(exons) - 1 and r == len( ref_exons) - 1 
and dto <= ( entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0: if is_good_exon: truncated_Cterminal_exon = dto dto = 0 ## do not count deviations for terminal query exons if e == 0 and dfrom <= entry.mQueryFrom * 3 and dfrom > 0: dfrom = 0 if e == len(exons) - 1 and dto <= ( entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0: dto = 0 ## permit difference of one codon (assumed to be stop) if e == len(exons) - 1 and r == len( ref_exons) - 1 and dto == 3: dto = 0 ## deal with different boundary conditions: if dfrom == 0 and dto == 0: if is_good_exon: nidentical_exons += 1 e += 1 r += 1 ## next exon within this ref_exon elif exon_to < ref_to and next_exon_to and next_exon_to <= ref_to + options.slipping_exon_boundary: if is_good_exon: ninserted_introns += 1 e += 1 in_sync = 1 dto = 0 ## next ref_exon within this exon elif ref_to < exon_to and next_ref_to and next_ref_to <= exon_to + options.slipping_exon_boundary: if is_good_exon: ndeleted_introns += 1 r += 1 in_sync = 1 dto = 0 else: e += 1 r += 1 if in_sync: dfrom = 0 if is_good_exon: exons_boundaries_sum += dfrom + dto exons_boundaries_max = max(dfrom, exons_boundaries_max) exons_boundaries_max = max(dto, exons_boundaries_max) ########################################################### ## count inserted/deleted introns and misplaced boundaries ## ## if exon and next_exon in ref_exon: inserted intron ## if ref_exon and next_ref_exon in exon: deleted intron if outfile_exons: if genomic_fragment and exon_genome_to: nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures( exon_genome_from - entry.mSbjctGenomeFrom, exon_ali, genomic_fragment, border_stop_codon=0) else: nintrons, nframeshifts, ngaps, nsplits, nstopcodons = 0, 0, 0, 0, 0 if exon_to == 0: this_e = 0 if ref_to == 0: this_r = 0 outfile_exons.write( string.join( map(str, ( entry.mPredictionId, this_e, exon_from, exon_to, exon_phase, this_r, ref_from, ref_to, ref_phase, percent_identity, percent_similarity, 
nframeshifts, ngaps, nstopcodons, is_good_exon, exon_genome_from, exon_genome_to, )), "\t") + "\n") while e < len(exons): exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = exons[ e][0:5] e += 1 ninserted_Cexons += 1 if outfile_exons: outfile_exons.write( string.join( map(str, ( entry.mPredictionId, e, exon_from, exon_to, exon_phase, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, exon_genome_from, exon_genome_to, )), "\t") + "\n") while r < len(ref_exons): ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = ( ref_exons[r].mPeptideFrom, ref_exons[r].mPeptideTo, ref_exons[r].frame, ref_exons[r].mGenomeFrom, ref_exons[r].mGenomeTo) ndeleted_Cexons += 1 ref_genome_from -= ref_exons_offset ref_genome_to -= ref_exons_offset r += 1 if outfile_exons: outfile_exons.write( string.join( map(str, ( entry.mPredictionId, 0, 0, 0, 0, r, ref_from, ref_to, ref_phase, 0, 0, 0, 0, 0, 0, 0, 0, )), "\t") + "\n") else: if options.write_notfound: this_e = 0 ## use prediction's identity/similarity for exons. 
## This will still then flag stop-codons in later analysis percent_identity = entry.mPercentIdentity percent_similarity = entry.mPercentSimilarity for exon in exons: this_e += 1 exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exon[ 0:6] if genomic_fragment: nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures( exon_genome_from - entry.mSbjctGenomeFrom, exon_ali, genomic_fragment) outfile_exons.write( string.join( map(str, ( entry.mPredictionId, this_e, exon_from, exon_to, exon_phase, 0, 0, 0, 0, percent_identity, percent_similarity, nframeshifts, ngaps, nstopcodons, 1, exon_genome_from, exon_genome_to, )), "\t") + "\n") options.stdout.write("\t".join( map(str, (entry.mPredictionId, exons_num_exons, dubious_exons, exons_boundaries_sum, exons_boundaries_max, nidentical_exons, ninserted_exons, ndeleted_exons, ninserted_introns, ndeleted_introns, truncated_Nterminal_exon, truncated_Cterminal_exon, ndeleted_Nexons, ndeleted_Cexons, ninserted_Nexons, ninserted_Cexons))) + "\n")
print E.GetHeader() print E.GetParams() # reading CDS sequences if param_filename_cds: cds_sequences = Genomics.ReadPeptideSequences( open(param_filename_cds, "r")) else: cds_sequences = {} if param_loglevel >= 1: print "# read %i CDS sequences" % len(cds_sequences) last_filename_genome = None p = PredictionParser.PredictionParserEntry() for line in sys.stdin: if line[0] == "#": continue if line[0] == '"': continue p.Read(line) # read genomic sequence if "%s" in param_genome_file: filename_genome = param_genome_file % p.mSbjctToken else: filename_genome = param_genome_file
def main(argv=None):
    """script main.

    Reads predictions from stdin (one per line; lines starting with ``#``
    are skipped), splits each prediction into exons and writes one
    tab-separated row per intron to ``options.stdout`` with the columns

        prediction_id, intron, contig, strand, start, end, length,
        nstops, type, prime5, prime3

    ``start``/``end`` are the intron boundaries shifted by the prediction's
    ``mSbjctGenomeFrom``; ``type``, ``prime5`` and ``prime3`` are whatever
    ``Genomics.GetIntronType`` returns for the intron sequence.

    The process exits with status 2 if positional arguments are supplied.

    NOTE(review): *argv* is defaulted to ``sys.argv`` here but is not
    forwarded to ``E.Start`` in this function -- confirm that ``E.Start``
    reads the command line itself.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/predictions2introns.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-o", "--output-filename-summary",
                      dest="output_filename_summary", type="string",
                      help="filename with summary information.")

    parser.add_option("--skip-header", dest="skip_header",
                      action="store_true",
                      help="skip header.")

    parser.add_option(
        "--fill-introns", dest="fill_introns", type="int",
        help="fill intron if divisible by three and no stop codon up to a maximum length of #.")

    parser.add_option(
        "--introns-max-stops", dest="introns_max_stops", type="int",
        help="maximum number of stop codons to tolerate within an intron.")

    parser.add_option("--output-format", dest="output_format",
                      type="choice",
                      choices=("predictions", "extensions", "filled-introns"),
                      help="output format.")

    parser.set_defaults(
        genome_file="genome",
        # 1-tuple: a bare ("ATG") is just a string, so ``x in start_codons``
        # would silently perform substring matching.
        start_codons=("ATG",),
        stop_codons=("TAG", "TAA", "TGA"),
        skip_header=False,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        # Same text as the former ``print USAGE, "..."`` statement, written
        # in a form that parses under both Python 2 and Python 3.
        sys.stdout.write("%s no arguments required.\n" % (USAGE,))
        sys.exit(2)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    p = PredictionParser.PredictionParserEntry()

    ninput, noutput = 0, 0

    if not options.skip_header:
        options.stdout.write("\t".join((
            "prediction_id",
            "intron",
            "contig",
            "strand",
            "start",
            "end",
            "length",
            "nstops",
            "type",
            "prime5",
            "prime3",
        )) + "\n")

    for line in sys.stdin:

        if line.startswith("#"):
            continue

        ninput += 1
        p.Read(line)

        # The result is not used below; the lookup is kept in case it
        # validates the contig name -- TODO confirm and drop if not needed.
        lsequence = fasta.getLength(p.mSbjctToken)

        # Upper-cased so that codons compare equal to options.stop_codons.
        genomic_sequence = fasta.getSequence(p.mSbjctToken,
                                             p.mSbjctStrand,
                                             p.mSbjctGenomeFrom,
                                             p.mSbjctGenomeTo).upper()

        exons = Exons.Alignment2Exons(p.mMapPeptide2Genome,
                                      query_from=0,
                                      sbjct_from=0)

        if not exons:
            # Nothing to report; previously this raised IndexError on
            # exons[0] and aborted the whole run.
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# warning: no exons in prediction %s - skipped.\n" %
                    str(p.mPredictionId))
            continue

        last_e = exons[0]
        nintron = 0

        for e in exons[1:]:

            nintron += 1
            lintron = e.mGenomeFrom - last_e.mGenomeTo

            # NOTE(review): despite its name this flag is true when the
            # intron length is NOT a multiple of three, so stop codons are
            # only counted for such introns.  This looks inverted relative
            # to the --fill-introns help text; behaviour is preserved here,
            # please confirm the intent.
            intron_is_l3 = lintron % 3 != 0

            if intron_is_l3:
                # get sequence, include also residues from split codons
                # when checking for stop codons.
                # note that e.mAlignment can sometimes be empty. This might
                # be an exonerate bug. In the alignment string there are two
                # consecutive exons.
                if (e.mAlignment and last_e.mAlignment and
                        e.mAlignment[0][0] == "S"):
                    offset_left = last_e.mAlignment[-1][2]
                    offset_right = e.mAlignment[0][2]
                else:
                    offset_left, offset_right = 0, 0

                sequence = genomic_sequence[last_e.mGenomeTo - offset_left:
                                            e.mGenomeFrom + offset_right]

                # Count stop codons in consecutive, non-overlapping triplets.
                intron_nstops = sum(
                    1 for x in range(0, len(sequence), 3)
                    if sequence[x:x + 3] in options.stop_codons)
            else:
                intron_nstops = 0

            # check for splice signals
            sequence = genomic_sequence[last_e.mGenomeTo:e.mGenomeFrom]

            intron_type, prime5, prime3 = Genomics.GetIntronType(sequence)

            if options.loglevel >= 2:
                # Clamp at 0: a negative slice start would wrap around to
                # the end of the fragment and yield an empty/wrong context.
                upstream_start = max(0, last_e.mGenomeTo - 6)
                context = (
                    genomic_sequence[upstream_start:last_e.mGenomeTo].lower() +
                    "|" + sequence[:5] + "..." + sequence[-5:] + "|" +
                    genomic_sequence[e.mGenomeFrom:e.mGenomeFrom + 6].lower())
                options.stdlog.write("\t".join(map(str, (
                    p.mPredictionId,
                    nintron,
                    lintron,
                    intron_nstops,
                    intron_type,
                    context))) + "\n")

            options.stdout.write("\t".join(map(str, (
                p.mPredictionId,
                nintron,
                p.mSbjctToken,
                p.mSbjctStrand,
                last_e.mGenomeTo + p.mSbjctGenomeFrom,
                e.mGenomeFrom + p.mSbjctGenomeFrom,
                lintron,
                intron_nstops,
                intron_type,
                prime5,
                prime3))) + "\n")

            last_e = e

        # counts processed predictions, not intron rows
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i.\n" % (ninput, noutput))

    E.Stop()
ninput, noutput, nskipped = 0,0,0 nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0,0,0,0,0,0 if options.filename_peptides: peptide_sequences = Genomics.ReadPeptideSequences( IOTools.openFile( options.filename_peptides, "r")) predictor = PredictorExonerate() predictor.mLogLevel = 0 else: peptide_sequences = None predictor = None converter = IndexedFasta.getConverter( options.input_coordinates ) predictions = {} if options.predictions_file: parser = PredictionParser.iterator_predictions( IOTools.openFile( options.predictions_file, "r") ) for p in parser: predictions[p.mPredictionId] = p if options.output_format == "predictions": if options.format == "psl": if options.trans: parser = PredictionParser.PredictionParserBlatTrans() else: parser = PredictionParser.PredictionParserBlatCDNA() nmatches = 1 for line in sys.stdin: if line[0] == "#": continue