def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Predictions are read line by line from ``sys.stdin``. The subject
    coordinates of each prediction are mapped either via explicit
    alignments (``--input-format=alignment``) or via segment offsets
    (``--input-format=offsets``). If a genome file is given, the
    translation of the mapped prediction is compared against the stored
    translation; on a mismatch the query peptide (if available) is
    re-predicted. Mapped predictions are written to ``options.stdout``
    and summary counts to ``options.stdlog``.
    """
    # NOTE(review): block nesting was reconstructed from a copy of this file
    # with collapsed whitespace - verify against the canonical source.
    # NOTE(review): `parser` is not created in this function; presumably it
    # is a module-level E.OptionParser - confirm.

    if argv is None:
        argv = sys.argv

    def _read_file(reader, filename):
        """Apply *reader* to the opened *filename*, always closing the handle."""
        handle = open(filename, "r")
        try:
            return reader(handle)
        finally:
            handle.close()

    parser.add_option("-m", "--filename-map", dest="filename_map", type="string",
                      help="filename with mapping information.")
    parser.add_option("-o", "--pattern-old", dest="pattern_old", type="string",
                      help="pattern for mapping new to old identifiers: extract string from old.")
    parser.add_option("-n", "--pattern-new", dest="pattern_new", type="string",
                      help="pattern for mapping new to old identifiers: put string into new.")
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="genome_file.")
    parser.add_option("-p", "--peptides", dest="filename_peptides", type = "string",
                      help="filename with peptide sequences.")
    parser.add_option("-f", "--input-format", dest="input_format", type="choice",
                      help="format of mapping file",
                      choices=("alignment", "offsets") )
    parser.add_option("-i", "--write-missed", dest="write_missed", type="string",
                      help="write missed identifiers to separate file.")
    parser.add_option("-a", "--filename-genes", dest="filename_genes", type="string",
                      help="filename with gene information.")
    parser.add_option("--filename-old-peptides", dest="filename_old_peptides", type="string",
                      help="filename with old peptide information.")
    parser.add_option("--no-renumber", dest="renumber", action="store_false",
                      help="do not renumber predictions.")
    parser.add_option("--contig-sizes-old", dest="contig_sizes_old", type="string",
                      help="contig sizes for old data.")
    parser.add_option("--contig-sizes-new", dest="contig_sizes_new", type="string",
                      help="contig sizes for new data.")
    parser.add_option("--skip-errors", dest="skip_errors", action="store_true",
                      help="skip entries with errors.")

    parser.set_defaults(
        filename_map = None,
        pattern_old = "(.+)",
        pattern_new = "%s",
        genome_file = None,
        filename_peptides = None,
        write_missed = None,
        filename_genes = None,
        filename_old_peptides = None,
        renumber = True,
        input_format = "alignment",
        contig_sizes_old = None,
        contig_sizes_new = None,
        skip_errors = None )

    (options, args) = E.Start( parser, add_pipe_options = True)

    predictor = PredictorExonerate()

    # the different mapping criteria
    map_sbjcts = {}
    breakpoints = {}
    endpoints = {}
    offsets = {}

    ############################################################################
    # transcript -> gene map (first two tab-separated columns: gene, transcript)
    map_transcript2gene = {}
    if options.filename_genes:
        infile = open(options.filename_genes, "r")
        try:
            for line in infile:
                if line[0] == "#":
                    continue
                gene, transcript = line[:-1].split("\t")[:2]
                map_transcript2gene[transcript] = gene
        finally:
            infile.close()

    ############################################################################
    peptides = {}
    if options.filename_peptides:
        peptides = _read_file(Genomics.ReadPeptideSequences, options.filename_peptides)
        options.stdlog.write( "# read %i peptide sequences.\n" % len(peptides))

    ############################################################################
    # read old query sequences and compare against new query sequences
    # this can be used to build a map between old and new queries
    query_map_old2new = {}
    # defined up front: `unmapped` is consulted when writing missed
    # identifiers even if no old peptides were supplied.
    unmappable, unmapped = [], []
    if options.filename_old_peptides:
        old_peptides = _read_file(Genomics.ReadPeptideSequences, options.filename_old_peptides)
        options.stdlog.write( "# read %i old peptide sequences.\n" % len(old_peptides))
        query_map_old2new, unmappable, unmapped = Genomics.MapSequences( old_peptides, peptides)
        options.stdlog.write( "# built map: unmappable=%i unmapped=%i.\n" % (len(unmappable), len(unmapped)))
        if options.loglevel >= 2:
            options.stdlog.write( "# unmappable: %s.\n" % ";".join(unmappable))
            options.stdlog.write( "# unmapped: %s.\n" % ";".join(unmapped))

    ############################################################################
    # read old/new contig sizes for mapping positive/negative coordinates
    contig_sizes_old = {}
    contig_sizes_new = {}
    if options.contig_sizes_old:
        contig_sizes_old = _read_file(Genomics.ReadContigSizes, options.contig_sizes_old)
    if options.contig_sizes_new:
        contig_sizes_new = _read_file(Genomics.ReadContigSizes, options.contig_sizes_new)

    ############################################################################
    if options.filename_map:
        infile = open(options.filename_map)
        try:
            # the option's declared choices are "alignment" and "offsets";
            # comparing against "alignments" never matched.
            if options.input_format == "alignment":
                for line in infile:
                    if line[0] == "#":
                        continue
                    x, old_token, old_from, old_to, old_ali, new_from, new_to, new_ali = line[:-1].split("\t")
                    map_sbjcts[old_token] = (old_from, old_ali, new_from, new_ali)

                if options.loglevel >= 1:
                    options.stdlog.write( "# read %i alignments.\n" % len(map_sbjcts))

            elif options.input_format == "offsets":
                # input is a list of segments and their offsets.
                breakpoints, endpoints, offsets = ReadOffsets( infile )
                if options.loglevel >= 1:
                    options.stdlog.write( "# read breakpoints for %i chromosomes.\n" % len(breakpoints))
        finally:
            infile.close()

    ############################################################################
    # end of input section
    ############################################################################

    rx = re.compile(options.pattern_old)
    last_sbjct_token = None
    ninput = 0
    nerrors = 0
    nerrors_map = 0
    nerrors_inconsistencies = 0
    nerrors_boundaries = 0
    nerrors_translation = 0
    nerrors_inconsequential = 0
    nerrors_realigned = 0
    nmapped = 0
    nfiltered = 0
    naligned = 0
    noutput = 0
    found_transcripts = {}
    nduplicates = 0
    # hash identifiers of all predictions written so far
    output = set()

    for line in sys.stdin:
        if line[0] == "#":
            continue

        entry = PredictionParser.PredictionParserEntry()
        entry.Read( line )

        ninput += 1
        is_positive = entry.mSbjctStrand == "+"
        is_error = False

        # check if query token is mappable: using sequence map
        if (query_map_old2new and entry.mQueryToken not in query_map_old2new):
            options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (entry.mPredictionId, entry.mQueryToken) )
            nfiltered += 1
            continue
        else:
            # check if query token is mappable: using filter
            if (peptides and entry.mQueryToken not in peptides):
                options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (entry.mPredictionId, entry.mQueryToken) )
                nfiltered += 1
                continue

        new_sbjct_token = options.pattern_new % rx.search(entry.mSbjctToken).groups()[0]

        # Per-entry defaults so that later diagnostics and the re-prediction
        # window never see undefined values or values of a previous entry.
        old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo
        f, t = old_f, old_t
        new_f, new_t = old_f, old_t

        ########################################################################
        # Map via alignments
        if entry.mSbjctToken in map_sbjcts:
            nmapped += 1
            # rebuild the alignment only when the subject changes
            if last_sbjct_token != entry.mSbjctToken:
                old_from, old_ali, new_from, new_ali = map_sbjcts[entry.mSbjctToken]
                map_a2b = alignlib.makeAlignmentVector()
                alignlib.AlignmentFormatExplicit(
                    int(old_from), old_ali,
                    int(new_from), new_ali).copy( map_a2b )
                last_sbjct_token = entry.mSbjctToken

            if options.loglevel >= 3:
                sys.stdout.write("# %s\n" % str(entry))
                sys.stdout.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                sys.stdout.flush()

            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            # convert to forward coordinates:
            if is_positive:
                f, t= old_f, old_t
                first_res, last_res = f + 1, t
            else:
                f, t = map_a2b.getRowTo() - old_f, map_a2b.getRowTo() - old_t
                first_res, last_res = f, t + 1

            # map first and last residues
            mfirst_res = map_a2b.mapRowToCol( first_res )
            mlast_res = map_a2b.mapRowToCol( last_res )

            if (mfirst_res == 0 and old_f != 0) or (mlast_res == 0 and old_t != map_a2b.getRowTo() ):
                options.stderr.write("# mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t, f, t, first_res, last_res, mfirst_res, mlast_res, f, t))
                options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# %s\n" % str(entry))
                options.stderr.flush()
                nerrors_boundaries += 1
                is_error = True

                # get extended boundaries for alignment later on
                while mfirst_res == 0 and first_res > 1:
                    first_res -= 1
                    mfirst_res = map_a2b.mapRowToCol(first_res)
                while mlast_res == 0 and last_res < map_a2b.getRowTo():
                    last_res += 1
                    mlast_res = map_a2b.mapRowToCol(last_res)

            # convert to genomic coordinates
            # convert negative strand coordinates
            if is_positive:
                new_f = mfirst_res - 1
                new_t = mlast_res
            else:
                new_f = mfirst_res
                new_t = mlast_res - 1
                new_f = map_a2b.getColTo() - new_f
                new_t = map_a2b.getColTo() - new_t

            # Now map the alignment.
            try:
                MapAlignment( entry, map_a2b )
            except ValueError:
                options.stderr.write("# alignment mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t, f, t, first_res, last_res, mfirst_res, mlast_res,
                                      new_f, new_t, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.flush()
                nerrors_map += 1
                is_error= True

            if new_f != entry.mSbjctGenomeFrom or new_t != entry.mSbjctGenomeTo:
                options.stderr.write("# mapping inconsistency for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i <> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t, f, t, first_res, last_res, mfirst_res, mlast_res,
                                      new_f, new_t, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                nerrors_inconsistencies += 1
                is_error = True

        ########################################################################
        # Map via offsets
        if entry.mSbjctToken in breakpoints:
            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            # convert to forward coordinates:
            if is_positive:
                f, t= old_f, old_t
            else:
                f, t = contig_sizes_old[entry.mSbjctToken] - old_t, contig_sizes_old[entry.mSbjctToken] - old_f

            o1 = GetOffset( f,
                            breakpoints[entry.mSbjctToken],
                            endpoints[entry.mSbjctToken],
                            offsets[entry.mSbjctToken] )
            o2 = GetOffset( t,
                            breakpoints[entry.mSbjctToken],
                            endpoints[entry.mSbjctToken],
                            offsets[entry.mSbjctToken] )

            # different offsets for start and end: a breakpoint lies inside
            if o1 != o2:
                options.stderr.write("# break within gene %s\n" % str(entry))
                nerrors_map += 1
                is_error = True

            f += o1
            t += o2

            if not is_positive:
                f, t = contig_sizes_new[entry.mSbjctToken] - t, contig_sizes_new[entry.mSbjctToken] - f

            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = f, t
            # keep the re-prediction window in sync with the mapped location
            new_f, new_t = f, t

        if entry.mSbjctGenomeFrom > entry.mSbjctGenomeTo:
            options.stderr.write("# mapping error: start after end %s\n" % str(entry))
            nerrors_map += 1
            is_error = True

        ########################################################################
        # do translation check, if genome is given
        if options.genome_file:
            genomic_sequence = Genomics.GetGenomicSequence( new_sbjct_token, entry.mSbjctStrand,
                                                            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo,
                                                            options.genome_file,
                                                            loglevel = 0)

            map_peptide2translation, translation = Genomics.Alignment2PeptideAlignment( \
                entry.mMapPeptide2Genome, entry.mQueryFrom, 0, genomic_sequence )

            if re.sub("X", "", translation) != re.sub("X", "", entry.mTranslation):
                options.stderr.write("# translation error for prediction %i on %s %s:%i-%i -> %i-%i <> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t, f, t, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                # only subjects that were mapped via alignment have an entry
                if entry.mSbjctToken in map_sbjcts:
                    options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# old=%s\n# new=%s\n" % (entry.mTranslation, translation))
                options.stderr.write("# old=%s\n# new=%s\n" % (entry.mAlignmentString, Genomics.Alignment2String(entry.mMapPeptide2Genome)))
                nerrors_translation += 1
                is_error = True

                if peptides and entry.mQueryToken in peptides:
                    naligned += 1
                    options.stdlog.write( "# aligning: %s versus %s:%s: %i-%i\n" % ( \
                        entry.mQueryToken,
                        new_sbjct_token, entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))

                    # do a quick reprediction
                    # (the genome is given by --genome-file; no
                    # `genome_pattern` option is declared)
                    genomic_sequence = Genomics.GetGenomicSequence( new_sbjct_token, entry.mSbjctStrand,
                                                                    0, 0,
                                                                    genome_file = options.genome_file,
                                                                    loglevel = 0)
                    predictor.mLogLevel = 0
                    result = predictor(entry.mQueryToken, peptides[entry.mQueryToken],
                                       entry.mSbjctToken, genomic_sequence,
                                       "--exhaustive --subopt FALSE --score '%s' " % str(80),
                                       new_f - 10, new_t + 10)
                    prediction_id = entry.mPredictionId
                    if result:
                        entry = result[0]
                        entry.mPredictionId = prediction_id
                        nerrors_realigned += 1
            else:
                # mapping problems that did not change the translation
                if is_error:
                    nerrors_inconsequential += 1

        entry.mSbjctToken = new_sbjct_token

        # map query tokens
        if query_map_old2new:
            query_tokens = query_map_old2new[entry.mQueryToken]
        else:
            query_tokens = (entry.mQueryToken,)

        if is_error:
            nerrors += 1

        if options.skip_errors and is_error:
            continue

        # remember the id before it is zeroed for hashing, so that a skipped
        # duplicate does not leave a 0 behind for the following tokens.
        prediction_id = entry.mPredictionId

        for query_token in query_tokens:
            entry.mQueryToken = query_token
            # hash the entry without its id to detect identical predictions
            entry.mPredictionId = 0
            hid = Genomics.GetHID( str(entry) )
            if hid in output:
                nduplicates += 1
                continue
            output.add(hid)

            noutput += 1
            if options.renumber:
                prediction_id = noutput
            entry.mPredictionId = prediction_id
            options.stdout.write( str(entry) + "\n")
            options.stdout.flush()
            found_transcripts[entry.mQueryToken] = 1

    # write out found transcripts and genes
    nmissed_transcripts = 0
    missed_transcripts = []
    found_genes = {}
    if peptides:
        for x in peptides.keys():
            if x not in found_transcripts:
                nmissed_transcripts += 1
                missed_transcripts.append( x )
            elif x in map_transcript2gene:
                found_genes[map_transcript2gene[x]] = 1

    missed_genes = {}
    nmissed_genes = 0
    if map_transcript2gene:
        for t in missed_transcripts:
            # transcripts without gene information cannot be assigned
            if t not in map_transcript2gene:
                continue
            g = map_transcript2gene[t]
            if g not in found_genes:
                missed_genes[g] = 1
        nmissed_genes = len(missed_genes)

    if options.write_missed:
        outfile = open(options.write_missed, "w")
        try:
            for x in missed_transcripts:
                if x in unmapped:
                    status = "unmapped"
                else:
                    status = "mapped"
                outfile.write( "%s\t%s\t%s\n" % ("transcript", x, status ))
            for x in missed_genes:
                status = "unknown"
                outfile.write( "%s\t%s\t%s\n" % ("gene", x, status ))
        finally:
            outfile.close()

    options.stdlog.write("# input=%i, output=%i, filtered=%i, nduplicates=%i, mapped=%i, errors=%i\n" % (\
        ninput, noutput, nfiltered, nduplicates, nmapped, nerrors ))
    options.stdlog.write("# errors: inconsequental=%i, boundaries=%i, mapping=%i, inconsistencies=%i, translation=%i, realigned=%i\n" % (\
        nerrors_inconsequential, nerrors_boundaries, nerrors_map, nerrors_inconsistencies, nerrors_translation, nerrors_realigned ))
    options.stdlog.write("# peptides: input=%i, found=%i, missed=%i, found_genes=%i, missed_genes=%i\n" % (\
        len(peptides), len(found_transcripts), nmissed_transcripts, len(found_genes), nmissed_genes) )

    E.Stop()
# read peptide sequences
if param_filename_peptides:
    infile = open(param_filename_peptides, "r")
    try:
        peptide_sequences = Genomics.ReadPeptideSequences(infile)
    finally:
        # do not leak the handle, even if parsing fails
        infile.close()
else:
    peptide_sequences = {}

# read a two-column, tab-separated map (first column -> second column)
map_a2b = {}
if param_filename_map:
    infile = open(param_filename_map, "r")
    try:
        for line in infile:
            # str.split replaces the deprecated string.split function
            a, b = line[:-1].split("\t")
            map_a2b[a] = b
    finally:
        infile.close()

predictor = PredictorExonerate()

nmissed, nfound, nfailed = 0, 0, 0

# each input line holds six whitespace-separated fields
for line in sys.stdin:
    gene, sbjct_genome_from, sbjct_genome_to, sbjct_strand, chromosome, query_token = re.split(r"\s+", line[:-1])

    sbjct_token = "chr" + chromosome
    sbjct_genome_from, sbjct_genome_to = map(int, (sbjct_genome_from, sbjct_genome_to))

    # strand "1" becomes "+", every other value becomes "-"
    if sbjct_strand == "1":
        sbjct_strand = "+"
    else:
        sbjct_strand = "-"
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Predictions are read line by line from ``sys.stdin``. The subject
    coordinates of each prediction are mapped either via explicit
    alignments (``--input-format=alignment``) or via segment offsets
    (``--input-format=offsets``). If a genome file is given, the
    translation of the mapped prediction is compared against the stored
    translation; on a mismatch the query peptide (if available) is
    re-predicted. Mapped predictions are written to ``options.stdout``
    and summary counts to ``options.stdlog``.
    """
    # NOTE(review): block nesting was reconstructed from a copy of this file
    # with collapsed whitespace - verify against the canonical source.
    # NOTE(review): `parser` is not created in this function; presumably it
    # is a module-level E.OptionParser - confirm.

    if argv is None:
        argv = sys.argv

    def _read_file(reader, filename):
        """Apply *reader* to the opened *filename*, always closing the handle."""
        handle = open(filename, "r")
        try:
            return reader(handle)
        finally:
            handle.close()

    parser.add_option("-m", "--filename-map", dest="filename_map", type="string",
                      help="filename with mapping information.")
    parser.add_option("-o", "--pattern-old", dest="pattern_old", type="string",
                      help="pattern for mapping new to old identifiers: extract string from old.")
    parser.add_option("-n", "--pattern-new", dest="pattern_new", type="string",
                      help="pattern for mapping new to old identifiers: put string into new.")
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="genome_file.")
    parser.add_option("-p", "--peptides", dest="filename_peptides", type = "string",
                      help="filename with peptide sequences.")
    parser.add_option("-f", "--input-format", dest="input_format", type="choice",
                      help="format of mapping file",
                      choices=("alignment", "offsets") )
    parser.add_option("-i", "--write-missed", dest="write_missed", type="string",
                      help="write missed identifiers to separate file.")
    parser.add_option("-a", "--filename-genes", dest="filename_genes", type="string",
                      help="filename with gene information.")
    parser.add_option("--filename-old-peptides", dest="filename_old_peptides", type="string",
                      help="filename with old peptide information.")
    parser.add_option("--no-renumber", dest="renumber", action="store_false",
                      help="do not renumber predictions.")
    parser.add_option("--contig-sizes-old", dest="contig_sizes_old", type="string",
                      help="contig sizes for old data.")
    parser.add_option("--contig-sizes-new", dest="contig_sizes_new", type="string",
                      help="contig sizes for new data.")
    parser.add_option("--skip-errors", dest="skip_errors", action="store_true",
                      help="skip entries with errors.")

    parser.set_defaults(
        filename_map = None,
        pattern_old = "(.+)",
        pattern_new = "%s",
        genome_file = None,
        filename_peptides = None,
        write_missed = None,
        filename_genes = None,
        filename_old_peptides = None,
        renumber = True,
        input_format = "alignment",
        contig_sizes_old = None,
        contig_sizes_new = None,
        skip_errors = None )

    (options, args) = E.Start( parser, add_pipe_options = True)

    predictor = PredictorExonerate()

    # the different mapping criteria
    map_sbjcts = {}
    breakpoints = {}
    endpoints = {}
    offsets = {}

    ############################################################################
    # transcript -> gene map (first two tab-separated columns: gene, transcript)
    map_transcript2gene = {}
    if options.filename_genes:
        infile = open(options.filename_genes, "r")
        try:
            for line in infile:
                if line[0] == "#":
                    continue
                gene, transcript = line[:-1].split("\t")[:2]
                map_transcript2gene[transcript] = gene
        finally:
            infile.close()

    ############################################################################
    peptides = {}
    if options.filename_peptides:
        peptides = _read_file(Genomics.ReadPeptideSequences, options.filename_peptides)
        options.stdlog.write( "# read %i peptide sequences.\n" % len(peptides))

    ############################################################################
    # read old query sequences and compare against new query sequences
    # this can be used to build a map between old and new queries
    query_map_old2new = {}
    # defined up front: `unmapped` is consulted when writing missed
    # identifiers even if no old peptides were supplied.
    unmappable, unmapped = [], []
    if options.filename_old_peptides:
        old_peptides = _read_file(Genomics.ReadPeptideSequences, options.filename_old_peptides)
        options.stdlog.write( "# read %i old peptide sequences.\n" % len(old_peptides))
        query_map_old2new, unmappable, unmapped = Genomics.MapSequences( old_peptides, peptides)
        options.stdlog.write( "# built map: unmappable=%i unmapped=%i.\n" % (len(unmappable), len(unmapped)))
        if options.loglevel >= 2:
            options.stdlog.write( "# unmappable: %s.\n" % ";".join(unmappable))
            options.stdlog.write( "# unmapped: %s.\n" % ";".join(unmapped))

    ############################################################################
    # read old/new contig sizes for mapping positive/negative coordinates
    contig_sizes_old = {}
    contig_sizes_new = {}
    if options.contig_sizes_old:
        contig_sizes_old = _read_file(Genomics.ReadContigSizes, options.contig_sizes_old)
    if options.contig_sizes_new:
        contig_sizes_new = _read_file(Genomics.ReadContigSizes, options.contig_sizes_new)

    ############################################################################
    if options.filename_map:
        infile = open(options.filename_map)
        try:
            # the option's declared choices are "alignment" and "offsets";
            # comparing against "alignments" never matched.
            if options.input_format == "alignment":
                for line in infile:
                    if line[0] == "#":
                        continue
                    x, old_token, old_from, old_to, old_ali, new_from, new_to, new_ali = line[:-1].split("\t")
                    map_sbjcts[old_token] = (old_from, old_ali, new_from, new_ali)

                if options.loglevel >= 1:
                    options.stdlog.write( "# read %i alignments.\n" % len(map_sbjcts))

            elif options.input_format == "offsets":
                # input is a list of segments and their offsets.
                breakpoints, endpoints, offsets = ReadOffsets( infile )
                if options.loglevel >= 1:
                    options.stdlog.write( "# read breakpoints for %i chromosomes.\n" % len(breakpoints))
        finally:
            infile.close()

    ############################################################################
    # end of input section
    ############################################################################

    rx = re.compile(options.pattern_old)
    last_sbjct_token = None
    ninput = 0
    nerrors = 0
    nerrors_map = 0
    nerrors_inconsistencies = 0
    nerrors_boundaries = 0
    nerrors_translation = 0
    nerrors_inconsequential = 0
    nerrors_realigned = 0
    nmapped = 0
    nfiltered = 0
    naligned = 0
    noutput = 0
    found_transcripts = {}
    nduplicates = 0
    # hash identifiers of all predictions written so far
    output = set()

    for line in sys.stdin:
        if line[0] == "#":
            continue

        entry = PredictionParser.PredictionParserEntry()
        entry.Read( line )

        ninput += 1
        is_positive = entry.mSbjctStrand == "+"
        is_error = False

        # check if query token is mappable: using sequence map
        if (query_map_old2new and entry.mQueryToken not in query_map_old2new):
            options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (entry.mPredictionId, entry.mQueryToken) )
            nfiltered += 1
            continue
        else:
            # check if query token is mappable: using filter
            if (peptides and entry.mQueryToken not in peptides):
                options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (entry.mPredictionId, entry.mQueryToken) )
                nfiltered += 1
                continue

        new_sbjct_token = options.pattern_new % rx.search(entry.mSbjctToken).groups()[0]

        # Per-entry defaults so that later diagnostics and the re-prediction
        # window never see undefined values or values of a previous entry.
        old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo
        f, t = old_f, old_t
        new_f, new_t = old_f, old_t

        ########################################################################
        # Map via alignments
        if entry.mSbjctToken in map_sbjcts:
            nmapped += 1
            # rebuild the alignment only when the subject changes
            if last_sbjct_token != entry.mSbjctToken:
                old_from, old_ali, new_from, new_ali = map_sbjcts[entry.mSbjctToken]
                map_a2b = alignlib_lite.makeAlignmentVector()
                alignlib_lite.AlignmentFormatExplicit(
                    int(old_from), old_ali,
                    int(new_from), new_ali).copy( map_a2b )
                last_sbjct_token = entry.mSbjctToken

            if options.loglevel >= 3:
                sys.stdout.write("# %s\n" % str(entry))
                sys.stdout.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                sys.stdout.flush()

            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            # convert to forward coordinates:
            if is_positive:
                f, t= old_f, old_t
                first_res, last_res = f + 1, t
            else:
                f, t = map_a2b.getRowTo() - old_f, map_a2b.getRowTo() - old_t
                first_res, last_res = f, t + 1

            # map first and last residues
            mfirst_res = map_a2b.mapRowToCol( first_res )
            mlast_res = map_a2b.mapRowToCol( last_res )

            if (mfirst_res == 0 and old_f != 0) or (mlast_res == 0 and old_t != map_a2b.getRowTo() ):
                options.stderr.write("# mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t, f, t, first_res, last_res, mfirst_res, mlast_res, f, t))
                options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# %s\n" % str(entry))
                options.stderr.flush()
                nerrors_boundaries += 1
                is_error = True

                # get extended boundaries for alignment later on
                while mfirst_res == 0 and first_res > 1:
                    first_res -= 1
                    mfirst_res = map_a2b.mapRowToCol(first_res)
                while mlast_res == 0 and last_res < map_a2b.getRowTo():
                    last_res += 1
                    mlast_res = map_a2b.mapRowToCol(last_res)

            # convert to genomic coordinates
            # convert negative strand coordinates
            if is_positive:
                new_f = mfirst_res - 1
                new_t = mlast_res
            else:
                new_f = mfirst_res
                new_t = mlast_res - 1
                new_f = map_a2b.getColTo() - new_f
                new_t = map_a2b.getColTo() - new_t

            # Now map the alignment.
            try:
                MapAlignment( entry, map_a2b )
            except ValueError:
                options.stderr.write("# alignment mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t, f, t, first_res, last_res, mfirst_res, mlast_res,
                                      new_f, new_t, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.flush()
                nerrors_map += 1
                is_error= True

            if new_f != entry.mSbjctGenomeFrom or new_t != entry.mSbjctGenomeTo:
                options.stderr.write("# mapping inconsistency for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i <> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t, f, t, first_res, last_res, mfirst_res, mlast_res,
                                      new_f, new_t, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                nerrors_inconsistencies += 1
                is_error = True

        ########################################################################
        # Map via offsets
        if entry.mSbjctToken in breakpoints:
            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            # convert to forward coordinates:
            if is_positive:
                f, t= old_f, old_t
            else:
                f, t = contig_sizes_old[entry.mSbjctToken] - old_t, contig_sizes_old[entry.mSbjctToken] - old_f

            o1 = GetOffset( f,
                            breakpoints[entry.mSbjctToken],
                            endpoints[entry.mSbjctToken],
                            offsets[entry.mSbjctToken] )
            o2 = GetOffset( t,
                            breakpoints[entry.mSbjctToken],
                            endpoints[entry.mSbjctToken],
                            offsets[entry.mSbjctToken] )

            # different offsets for start and end: a breakpoint lies inside
            if o1 != o2:
                options.stderr.write("# break within gene %s\n" % str(entry))
                nerrors_map += 1
                is_error = True

            f += o1
            t += o2

            if not is_positive:
                f, t = contig_sizes_new[entry.mSbjctToken] - t, contig_sizes_new[entry.mSbjctToken] - f

            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = f, t
            # keep the re-prediction window in sync with the mapped location
            new_f, new_t = f, t

        if entry.mSbjctGenomeFrom > entry.mSbjctGenomeTo:
            options.stderr.write("# mapping error: start after end %s\n" % str(entry))
            nerrors_map += 1
            is_error = True

        ########################################################################
        # do translation check, if genome is given
        if options.genome_file:
            genomic_sequence = Genomics.GetGenomicSequence( new_sbjct_token, entry.mSbjctStrand,
                                                            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo,
                                                            options.genome_file,
                                                            loglevel = 0)

            map_peptide2translation, translation = Genomics.Alignment2PeptideAlignment( \
                entry.mMapPeptide2Genome, entry.mQueryFrom, 0, genomic_sequence )

            if re.sub("X", "", translation) != re.sub("X", "", entry.mTranslation):
                options.stderr.write("# translation error for prediction %i on %s %s:%i-%i -> %i-%i <> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t, f, t, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                # only subjects that were mapped via alignment have an entry
                if entry.mSbjctToken in map_sbjcts:
                    options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# old=%s\n# new=%s\n" % (entry.mTranslation, translation))
                options.stderr.write("# old=%s\n# new=%s\n" % (entry.mAlignmentString, Genomics.Alignment2String(entry.mMapPeptide2Genome)))
                nerrors_translation += 1
                is_error = True

                if peptides and entry.mQueryToken in peptides:
                    naligned += 1
                    options.stdlog.write( "# aligning: %s versus %s:%s: %i-%i\n" % ( \
                        entry.mQueryToken,
                        new_sbjct_token, entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))

                    # do a quick reprediction
                    # (the genome is given by --genome-file; no
                    # `genome_pattern` option is declared)
                    genomic_sequence = Genomics.GetGenomicSequence( new_sbjct_token, entry.mSbjctStrand,
                                                                    0, 0,
                                                                    genome_file = options.genome_file,
                                                                    loglevel = 0)
                    predictor.mLogLevel = 0
                    result = predictor(entry.mQueryToken, peptides[entry.mQueryToken],
                                       entry.mSbjctToken, genomic_sequence,
                                       "--exhaustive --subopt FALSE --score '%s' " % str(80),
                                       new_f - 10, new_t + 10)
                    prediction_id = entry.mPredictionId
                    if result:
                        entry = result[0]
                        entry.mPredictionId = prediction_id
                        nerrors_realigned += 1
            else:
                # mapping problems that did not change the translation
                if is_error:
                    nerrors_inconsequential += 1

        entry.mSbjctToken = new_sbjct_token

        # map query tokens
        if query_map_old2new:
            query_tokens = query_map_old2new[entry.mQueryToken]
        else:
            query_tokens = (entry.mQueryToken,)

        if is_error:
            nerrors += 1

        if options.skip_errors and is_error:
            continue

        # remember the id before it is zeroed for hashing, so that a skipped
        # duplicate does not leave a 0 behind for the following tokens.
        prediction_id = entry.mPredictionId

        for query_token in query_tokens:
            entry.mQueryToken = query_token
            # hash the entry without its id to detect identical predictions
            entry.mPredictionId = 0
            hid = Genomics.GetHID( str(entry) )
            if hid in output:
                nduplicates += 1
                continue
            output.add(hid)

            noutput += 1
            if options.renumber:
                prediction_id = noutput
            entry.mPredictionId = prediction_id
            options.stdout.write( str(entry) + "\n")
            options.stdout.flush()
            found_transcripts[entry.mQueryToken] = 1

    # write out found transcripts and genes
    nmissed_transcripts = 0
    missed_transcripts = []
    found_genes = {}
    if peptides:
        for x in peptides.keys():
            if x not in found_transcripts:
                nmissed_transcripts += 1
                missed_transcripts.append( x )
            elif x in map_transcript2gene:
                found_genes[map_transcript2gene[x]] = 1

    missed_genes = {}
    nmissed_genes = 0
    if map_transcript2gene:
        for t in missed_transcripts:
            # transcripts without gene information cannot be assigned
            if t not in map_transcript2gene:
                continue
            g = map_transcript2gene[t]
            if g not in found_genes:
                missed_genes[g] = 1
        nmissed_genes = len(missed_genes)

    if options.write_missed:
        outfile = open(options.write_missed, "w")
        try:
            for x in missed_transcripts:
                if x in unmapped:
                    status = "unmapped"
                else:
                    status = "mapped"
                outfile.write( "%s\t%s\t%s\n" % ("transcript", x, status ))
            for x in missed_genes:
                status = "unknown"
                outfile.write( "%s\t%s\t%s\n" % ("gene", x, status ))
        finally:
            outfile.close()

    options.stdlog.write("# input=%i, output=%i, filtered=%i, nduplicates=%i, mapped=%i, errors=%i\n" % (\
        ninput, noutput, nfiltered, nduplicates, nmapped, nerrors ))
    options.stdlog.write("# errors: inconsequental=%i, boundaries=%i, mapping=%i, inconsistencies=%i, translation=%i, realigned=%i\n" % (\
        nerrors_inconsequential, nerrors_boundaries, nerrors_map, nerrors_inconsistencies, nerrors_translation, nerrors_realigned ))
    options.stdlog.write("# peptides: input=%i, found=%i, missed=%i, found_genes=%i, missed_genes=%i\n" % (\
        len(peptides), len(found_transcripts), nmissed_transcripts, len(found_genes), nmissed_genes) )

    E.Stop()
def main(argv=None):
    """script main.

    Builds the option parser, reads alignments from stdin and writes the
    resulting predictions to ``options.stdout``.

    For output format ``predictions`` the input is parsed (``psl`` or
    ``exons``), every entry is re-translated against the indexed genome
    and, if peptide sequences were supplied, the translation is compared
    with the reference peptide and optionally re-aligned.

    NOTE(review): *argv* is defaulted to ``sys.argv`` but is not forwarded
    to ``E.Start`` in this block -- confirm whether ``E.Start`` should
    receive it.

    Raises:
        ValueError: if no genome file was given, or if the input format is
            not supported for the ``predictions`` output format.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/gff2predictions.py 2021 2008-07-10 16:00:48Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t", "--trans", dest="trans",
                      help="input is translated DNA.",
                      action="store_true")

    parser.add_option("-f", "--format", dest="format",
                      help="input format.",
                      type="choice",
                      choices=("exons", "psl", "gff"))

    parser.add_option("-o", "--output-format", dest="output_format",
                      help="output format",
                      type="choice",
                      choices=('exontable', 'exons', 'predictions', 'cds', 'fasta'))

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option("--predictions-file", dest="predictions_file", type="string",
                      help="filename with predictions. Use gene structures from this file if available.")

    parser.add_option("-i", "--gff-field-id", dest="gff_field_id", type="string",
                      help="field for the feature id in the gff info section.")

    parser.add_option("-p", "--filename-peptides", dest="filename_peptides", type="string",
                      help="Filename with peptide sequences. If given, it is used to check the predicted translated sequences.")

    parser.add_option("--no-realignment", dest="do_realignment", action="store_false",
                      help="do not re-align entries that do not parse correctly.")

    parser.add_option("--remove-unaligned", dest="remove_unaligned", action="store_true",
                      help="remove entries that have not been aligned correctly.")

    parser.add_option("--input-coordinates", dest="input_coordinates", type="string",
                      help="specify input format for input coordinates [forward|both-zero|one-closed|open].")

    parser.set_defaults(trans=False,
                        output_format="predictions",
                        format="psl",
                        gff_field_id='id',
                        input_coordinates="both-zero-open",
                        filename_peptides=None,
                        genome_file=None,
                        do_realignment=True,
                        predictions_file=None,
                        remove_unaligned=False)

    (options, args) = E.Start(parser)

    # String exceptions are not raisable on Python >= 2.6 (they turn into a
    # TypeError), so a real exception class carries the message instead.
    if not options.genome_file:
        raise ValueError("please specify a genome file.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contig_sizes = fasta.getContigSizes()

    # Counters; they are only accumulated within the visible part of main.
    ninput, noutput, nskipped = 0, 0, 0
    nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0, 0, 0, 0, 0, 0

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            IOTools.openFile(options.filename_peptides, "r"))
        predictor = PredictorExonerate()
        predictor.mLogLevel = 0
    else:
        peptide_sequences = None
        predictor = None

    # The converter is not used below; the call is kept because it is made
    # with the user-supplied --input-coordinates value.
    converter = IndexedFasta.getConverter(options.input_coordinates)

    # Optional gene structures, keyed by prediction id, that replace parsed
    # entries carrying the same id.
    predictions = {}
    if options.predictions_file:
        prediction_iterator = PredictionParser.iterator_predictions(
            IOTools.openFile(options.predictions_file, "r"))
        for p in prediction_iterator:
            predictions[p.mPredictionId] = p

    if options.output_format == "predictions":

        if options.format == "psl":

            if options.trans:
                parser = PredictionParser.PredictionParserBlatTrans()
            else:
                parser = PredictionParser.PredictionParserBlatCDNA()

            nmatches = 1
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                # only lines starting with a digit are treated as data
                if not re.match("^[0-9]", line):
                    continue

                try:
                    entries = parser.Parse((line,))
                except PredictionParser.AlignmentError as e:
                    print("# %s" % str(e))
                    print("# %s" % line[:-1])
                    sys.exit(1)

                # number the matches consecutively across all input lines
                for entry in entries:
                    entry.mPredictionId = nmatches
                    nmatches += 1

                print(str(entries))

            # NOTE(review): stdin has been read to the end by the loop
            # above, so the batch parse below receives no lines in psl
            # mode -- confirm that this is intended.

        elif options.format == "exons":
            parser = PredictionParser.PredictionParserExons(
                contig_sizes=contig_sizes)
        else:
            raise ValueError("unknown format %s for output option %s" %
                             (options.format, options.output_format))

        if options.loglevel >= 2:
            options.stdlog.write("# parsing.\n")
            options.stdlog.flush()

        results = parser.Parse(sys.stdin.readlines())

        if options.loglevel >= 2:
            options.stdlog.write("# parsing finished.\n")
            options.stdlog.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# parsing: ninput=%i, noutput=%i, nerrors=%i\n" %
                (parser.GetNumInput(),
                 parser.GetNumOutput(),
                 parser.GetNumErrors()))

            for error, msg in parser.mErrors:
                options.stdlog.write("# %s : %s\n" % (str(error), msg))
            options.stdlog.flush()

        # build translations; a genome file is guaranteed at this point
        # because its absence raises above.
        results.Sort(lambda x, y: cmp(x.mSbjctToken, y.mSbjctToken))

        new_results = PredictionParser.Predictions()

        for entry in results:

            ninput += 1

            if options.loglevel >= 2:
                options.stdlog.write(
                    "# processing entry %s:%s on %s:%s %i/%i.\n" %
                    (entry.mPredictionId,
                     entry.mQueryToken,
                     entry.mSbjctToken,
                     entry.mSbjctStrand,
                     ninput, len(results)))
                options.stdlog.flush()

            try:
                lgenome = fasta.getLength(entry.mSbjctToken)
                # added 3 residues - was a problem at split codons just before the stop.
                # See for example the chicken sequence ENSGALP00000002741
                genomic_sequence = fasta.getSequence(
                    entry.mSbjctToken,
                    entry.mSbjctStrand,
                    entry.mSbjctGenomeFrom,
                    min(entry.mSbjctGenomeTo + 3, lgenome))
            except KeyError:
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# did not find entry for %s on %s.\n" %
                        (entry.mPredictionId, entry.mSbjctToken))
                nskipped += 1
                continue

            if predictions and entry.mPredictionId in predictions:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# substituting entry %s on %s:%s.\n" %
                        (entry.mPredictionId,
                         entry.mSbjctToken,
                         entry.mSbjctStrand))
                    options.stdlog.flush()
                entry = predictions[entry.mPredictionId]

            # The exon list is not used below; the call is kept unchanged.
            exons = Exons.Alignment2Exons(entry.mMapPeptide2Genome,
                                          0,
                                          entry.mSbjctGenomeFrom)

            entry.mMapPeptide2Translation, entry.mTranslation = \
                Genomics.Alignment2PeptideAlignment(
                    Genomics.String2Alignment(entry.mAlignmentString),
                    entry.mQueryFrom, 0, genomic_sequence)

            # score = number of columns spanned by the peptide alignment
            entry.score = entry.mMapPeptide2Translation.getColTo() - \
                entry.mMapPeptide2Translation.getColFrom() + 1

            (entry.mNIntrons, entry.mNFrameShifts, entry.mNGaps,
             entry.mNSplits, entry.mNStopCodons, entry.mNDisruptions) = \
                Genomics.CountGeneFeatures(0,
                                           entry.mMapPeptide2Genome,
                                           genomic_sequence)

            if peptide_sequences:

                if str(entry.mPredictionId) in peptide_sequences:

                    reference = peptide_sequences[
                        str(entry.mPredictionId)].upper()
                    translation = entry.mTranslation
                    nfound += 1

                    is_identical, nmismatches = checkIdentity(
                        reference, translation, options)

                    if is_identical:
                        nidentical += 1
                    else:
                        nmismatch += 1

                        if options.do_realignment:
                            if options.loglevel >= 2:
                                options.stdlog.write(
                                    "# %s: mismatches..realigning in region %i:%i\n" %
                                    (entry.mPredictionId,
                                     entry.mSbjctGenomeFrom,
                                     entry.mSbjctGenomeTo))
                                options.stdlog.flush()

                            result = predictor(
                                entry.mPredictionId, reference,
                                entry.mSbjctToken, genomic_sequence,
                                "--subopt FALSE --score '%s'" % str(80))
                            # "--exhaustive --subopt FALSE --score '%s'" % str(80) )

                            if result:
                                translation = result[0].mTranslation
                                is_identical, nmismatches = checkIdentity(
                                    reference, translation, options)
                            else:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: realignment returned empty result\n" %
                                        (entry.mPredictionId))
                                    options.stdlog.flush()
                                is_identical = False

                            if is_identical:
                                naligned += 1
                                # keep the id and add the start of the
                                # original entry to the start of the
                                # realigned entry
                                prediction_id = entry.mPredictionId
                                sbjct_genome_from = entry.mSbjctGenomeFrom
                                entry = result[0]
                                entry.mPredictionId = prediction_id
                                entry.mSbjctGenomeFrom += sbjct_genome_from
                            else:
                                nunaligned += 1
                                if options.loglevel >= 1:
                                    options.stdlog.write(
                                        "# %s: mismatch on %s:%s:%i-%i after realignment\n# reference =%s\n# translated=%s\n# realigned =%s\n" %
                                        (entry.mPredictionId,
                                         entry.mSbjctToken,
                                         entry.mSbjctStrand,
                                         entry.mSbjctGenomeFrom,
                                         entry.mSbjctGenomeTo,
                                         reference,
                                         entry.mTranslation,
                                         translation))
                                    options.stdlog.flush()
                                if options.remove_unaligned:
                                    nskipped += 1
                                    continue
                        else:
                            if options.loglevel >= 2:
                                options.stdlog.write(
                                    "# %s: mismatches on %s ... no realignment\n" %
                                    (entry.mPredictionId, entry.mSbjctToken,))
                                if options.loglevel >= 3:
                                    options.stdlog.write(
                                        "# %s: mismatch before realignment\n# reference =%s\n# translated=%s\n" %
                                        (entry.mPredictionId,
                                         reference,
                                         translation))
                                options.stdlog.flush()
                            if options.remove_unaligned:
                                nskipped += 1
                                continue
                else:
                    nnotfound += 1

            new_results.append(entry)
            noutput += 1

        results = new_results

        # written inside this branch so that other output formats never
        # touch an unbound `results`
        if results:
            options.stdout.write(str(results) + "\n")
remove_unaligned = False ) (options, args) = E.Start( parser ) if not options.genome_file: raise "please specify a genome file." fasta = IndexedFasta.IndexedFasta( options.genome_file ) contig_sizes = fasta.getContigSizes() ninput, noutput, nskipped = 0,0,0 nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0,0,0,0,0,0 if options.filename_peptides: peptide_sequences = Genomics.ReadPeptideSequences( IOTools.openFile( options.filename_peptides, "r")) predictor = PredictorExonerate() predictor.mLogLevel = 0 else: peptide_sequences = None predictor = None converter = IndexedFasta.getConverter( options.input_coordinates ) predictions = {} if options.predictions_file: parser = PredictionParser.iterator_predictions( IOTools.openFile( options.predictions_file, "r") ) for p in parser: predictions[p.mPredictionId] = p if options.output_format == "predictions":
def main(argv=None):
    """script main.

    Builds the option parser, reads alignments from stdin and writes the
    resulting predictions to ``options.stdout``.

    For output format ``predictions`` the input is parsed (``psl`` or
    ``exons``), every entry is re-translated against the indexed genome
    and, if peptide sequences were supplied, the translation is compared
    with the reference peptide and optionally re-aligned.

    NOTE(review): *argv* is defaulted to ``sys.argv`` but is not forwarded
    to ``E.Start`` in this block -- confirm whether ``E.Start`` should
    receive it.

    Raises:
        ValueError: if no genome file was given, or if the input format is
            not supported for the ``predictions`` output format.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/gff2predictions.py 2021 2008-07-10 16:00:48Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t", "--trans", dest="trans",
                      help="input is translated DNA.",
                      action="store_true")

    parser.add_option("-f", "--format", dest="format",
                      help="input format.",
                      type="choice",
                      choices=("exons", "psl", "gff"))

    parser.add_option("-o", "--output-format", dest="output_format",
                      help="output format",
                      type="choice",
                      choices=('exontable', 'exons', 'predictions', 'cds', 'fasta'))

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option("--predictions-file", dest="predictions_file", type="string",
                      help="filename with predictions. Use gene structures from this file if available.")

    parser.add_option("-i", "--gff-field-id", dest="gff_field_id", type="string",
                      help="field for the feature id in the gff info section.")

    parser.add_option("-p", "--filename-peptides", dest="filename_peptides", type="string",
                      help="Filename with peptide sequences. If given, it is used to check the predicted translated sequences.")

    parser.add_option("--no-realignment", dest="do_realignment", action="store_false",
                      help="do not re-align entries that do not parse correctly.")

    parser.add_option("--remove-unaligned", dest="remove_unaligned", action="store_true",
                      help="remove entries that have not been aligned correctly.")

    parser.add_option("--input-coordinates", dest="input_coordinates", type="string",
                      help="specify input format for input coordinates [forward|both-zero|one-closed|open].")

    parser.set_defaults(trans=False,
                        output_format="predictions",
                        format="psl",
                        gff_field_id='id',
                        input_coordinates="both-zero-open",
                        filename_peptides=None,
                        genome_file=None,
                        do_realignment=True,
                        predictions_file=None,
                        remove_unaligned=False)

    (options, args) = E.Start(parser)

    # String exceptions are not raisable on Python >= 2.6 (they turn into a
    # TypeError), so a real exception class carries the message instead.
    if not options.genome_file:
        raise ValueError("please specify a genome file.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contig_sizes = fasta.getContigSizes()

    # Counters; they are only accumulated within the visible part of main.
    ninput, noutput, nskipped = 0, 0, 0
    nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0, 0, 0, 0, 0, 0

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            IOTools.openFile(options.filename_peptides, "r"))
        predictor = PredictorExonerate()
        predictor.mLogLevel = 0
    else:
        peptide_sequences = None
        predictor = None

    # The converter is not used below; the call is kept because it is made
    # with the user-supplied --input-coordinates value.
    converter = IndexedFasta.getConverter(options.input_coordinates)

    # Optional gene structures, keyed by prediction id, that replace parsed
    # entries carrying the same id.
    predictions = {}
    if options.predictions_file:
        prediction_iterator = PredictionParser.iterator_predictions(
            IOTools.openFile(options.predictions_file, "r"))
        for p in prediction_iterator:
            predictions[p.mPredictionId] = p

    if options.output_format == "predictions":

        if options.format == "psl":

            if options.trans:
                parser = PredictionParser.PredictionParserBlatTrans()
            else:
                parser = PredictionParser.PredictionParserBlatCDNA()

            nmatches = 1
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                # only lines starting with a digit are treated as data
                if not re.match("^[0-9]", line):
                    continue

                try:
                    entries = parser.Parse((line,))
                except PredictionParser.AlignmentError as e:
                    print("# %s" % str(e))
                    print("# %s" % line[:-1])
                    sys.exit(1)

                # number the matches consecutively across all input lines
                for entry in entries:
                    entry.mPredictionId = nmatches
                    nmatches += 1

                print(str(entries))

            # NOTE(review): stdin has been read to the end by the loop
            # above, so the batch parse below receives no lines in psl
            # mode -- confirm that this is intended.

        elif options.format == "exons":
            parser = PredictionParser.PredictionParserExons(
                contig_sizes=contig_sizes)
        else:
            raise ValueError("unknown format %s for output option %s" %
                             (options.format, options.output_format))

        if options.loglevel >= 2:
            options.stdlog.write("# parsing.\n")
            options.stdlog.flush()

        results = parser.Parse(sys.stdin.readlines())

        if options.loglevel >= 2:
            options.stdlog.write("# parsing finished.\n")
            options.stdlog.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# parsing: ninput=%i, noutput=%i, nerrors=%i\n" %
                (parser.GetNumInput(),
                 parser.GetNumOutput(),
                 parser.GetNumErrors()))

            for error, msg in parser.mErrors:
                options.stdlog.write("# %s : %s\n" % (str(error), msg))
            options.stdlog.flush()

        # build translations; a genome file is guaranteed at this point
        # because its absence raises above.
        results.Sort(lambda x, y: cmp(x.mSbjctToken, y.mSbjctToken))

        new_results = PredictionParser.Predictions()

        for entry in results:

            ninput += 1

            if options.loglevel >= 2:
                options.stdlog.write(
                    "# processing entry %s:%s on %s:%s %i/%i.\n" %
                    (entry.mPredictionId,
                     entry.mQueryToken,
                     entry.mSbjctToken,
                     entry.mSbjctStrand,
                     ninput, len(results)))
                options.stdlog.flush()

            try:
                lgenome = fasta.getLength(entry.mSbjctToken)
                # added 3 residues - was a problem at split codons just before the stop.
                # See for example the chicken sequence ENSGALP00000002741
                genomic_sequence = fasta.getSequence(
                    entry.mSbjctToken,
                    entry.mSbjctStrand,
                    entry.mSbjctGenomeFrom,
                    min(entry.mSbjctGenomeTo + 3, lgenome))
            except KeyError:
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# did not find entry for %s on %s.\n" %
                        (entry.mPredictionId, entry.mSbjctToken))
                nskipped += 1
                continue

            if predictions and entry.mPredictionId in predictions:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# substituting entry %s on %s:%s.\n" %
                        (entry.mPredictionId,
                         entry.mSbjctToken,
                         entry.mSbjctStrand))
                    options.stdlog.flush()
                entry = predictions[entry.mPredictionId]

            # The exon list is not used below; the call is kept unchanged.
            exons = Exons.Alignment2Exons(entry.mMapPeptide2Genome,
                                          0,
                                          entry.mSbjctGenomeFrom)

            entry.mMapPeptide2Translation, entry.mTranslation = \
                Genomics.Alignment2PeptideAlignment(
                    Genomics.String2Alignment(entry.mAlignmentString),
                    entry.mQueryFrom, 0, genomic_sequence)

            # score = number of columns spanned by the peptide alignment
            entry.score = entry.mMapPeptide2Translation.getColTo() - \
                entry.mMapPeptide2Translation.getColFrom() + 1

            (entry.mNIntrons, entry.mNFrameShifts, entry.mNGaps,
             entry.mNSplits, entry.mNStopCodons, entry.mNDisruptions) = \
                Genomics.CountGeneFeatures(0,
                                           entry.mMapPeptide2Genome,
                                           genomic_sequence)

            if peptide_sequences:

                if str(entry.mPredictionId) in peptide_sequences:

                    reference = peptide_sequences[
                        str(entry.mPredictionId)].upper()
                    translation = entry.mTranslation
                    nfound += 1

                    is_identical, nmismatches = checkIdentity(
                        reference, translation, options)

                    if is_identical:
                        nidentical += 1
                    else:
                        nmismatch += 1

                        if options.do_realignment:
                            if options.loglevel >= 2:
                                options.stdlog.write(
                                    "# %s: mismatches..realigning in region %i:%i\n" %
                                    (entry.mPredictionId,
                                     entry.mSbjctGenomeFrom,
                                     entry.mSbjctGenomeTo))
                                options.stdlog.flush()

                            result = predictor(
                                entry.mPredictionId, reference,
                                entry.mSbjctToken, genomic_sequence,
                                "--subopt FALSE --score '%s'" % str(80))
                            # "--exhaustive --subopt FALSE --score '%s'" % str(80) )

                            if result:
                                translation = result[0].mTranslation
                                is_identical, nmismatches = checkIdentity(
                                    reference, translation, options)
                            else:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: realignment returned empty result\n" %
                                        (entry.mPredictionId))
                                    options.stdlog.flush()
                                is_identical = False

                            if is_identical:
                                naligned += 1
                                # keep the id and add the start of the
                                # original entry to the start of the
                                # realigned entry
                                prediction_id = entry.mPredictionId
                                sbjct_genome_from = entry.mSbjctGenomeFrom
                                entry = result[0]
                                entry.mPredictionId = prediction_id
                                entry.mSbjctGenomeFrom += sbjct_genome_from
                            else:
                                nunaligned += 1
                                if options.loglevel >= 1:
                                    options.stdlog.write(
                                        "# %s: mismatch on %s:%s:%i-%i after realignment\n# reference =%s\n# translated=%s\n# realigned =%s\n" %
                                        (entry.mPredictionId,
                                         entry.mSbjctToken,
                                         entry.mSbjctStrand,
                                         entry.mSbjctGenomeFrom,
                                         entry.mSbjctGenomeTo,
                                         reference,
                                         entry.mTranslation,
                                         translation))
                                    options.stdlog.flush()
                                if options.remove_unaligned:
                                    nskipped += 1
                                    continue
                        else:
                            if options.loglevel >= 2:
                                options.stdlog.write(
                                    "# %s: mismatches on %s ... no realignment\n" %
                                    (entry.mPredictionId, entry.mSbjctToken,))
                                if options.loglevel >= 3:
                                    options.stdlog.write(
                                        "# %s: mismatch before realignment\n# reference =%s\n# translated=%s\n" %
                                        (entry.mPredictionId,
                                         reference,
                                         translation))
                                options.stdlog.flush()
                            if options.remove_unaligned:
                                nskipped += 1
                                continue
                else:
                    nnotfound += 1

            new_results.append(entry)
            noutput += 1

        results = new_results

        # written inside this branch so that other output formats never
        # touch an unbound `results`
        if results:
            options.stdout.write(str(results) + "\n")