def ProcessChunk(chunk, eliminated_predictions, exons):
    """Flag predictions in a cluster that span another prediction.

    ``chunk`` is a sequence of ``(from, to, id, quality)`` tuples sorted by
    start position, so only an earlier entry can span a later one.  An entry
    is eliminated when it strictly contains a later entry, the two do not
    overlap according to ``Exons.CheckOverlap`` and the quality of the later
    entry is listed in ``options.quality_remove_gene_spanners``.  Entries
    whose own quality is in ``options.quality_keep_gene_spanners`` are never
    eliminated.

    Eliminated ids are marked in ``eliminated_predictions`` (value ``0``)
    and returned as a list of ``(id, "g")`` tuples.
    """
    removed = []

    for position, (span_from, span_to, span_id, span_quality) in \
            enumerate(chunk[:-1]):

        if span_quality in options.quality_keep_gene_spanners:
            continue

        for inner_from, inner_to, inner_id, inner_quality in \
                chunk[position + 1:]:

            # the candidate has to contain the later prediction completely
            if not (span_from < inner_from and span_to > inner_to):
                continue
            # predictions sharing exons are not treated as spanners
            if Exons.CheckOverlap(exons[str(span_id)], exons[str(inner_id)]):
                continue
            if inner_quality not in options.quality_remove_gene_spanners:
                continue

            eliminated_predictions[span_id] = 0
            removed.append((span_id, "g"))
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# elimination: %s(%s) spans %s(%s)\n" %
                    (str(span_id), span_quality, str(inner_id), inner_quality))
            # one spanned prediction is sufficient to eliminate the spanner
            break

    return removed
def ResolveExonOverlaps(gene_id, predictions):
    """Group predictions into genes by exonic overlap and print the result.

    Predictions whose exons overlap (directly or through a chain of other
    predictions) are assigned to the same gene.  For every gene one line
    ``<gene_id>\\t<prediction_id>`` is printed per member prediction.

    Arguments
    ---------
    gene_id : int
        Identifier given to the first gene that is written.
    predictions : sequence
        Prediction objects providing ``mAlignmentString``,
        ``mSbjctGenomeFrom`` and ``mPredictionId``.

    Returns
    -------
    int
        The next unused gene identifier.  With no predictions this is
        ``gene_id`` itself.
    """
    # collect (from, to, prediction number) for every exon; prediction
    # numbers are 1-based positions in ``predictions``
    all_exons = []
    for n, p in enumerate(predictions, 1):
        exons = Exons.Alignment2ExonBoundaries(
            Genomics.String2Alignment(p.mAlignmentString),
            query_from=0,
            sbjct_from=p.mSbjctGenomeFrom)
        all_exons.extend((exon.mGenomeFrom, exon.mGenomeTo, n)
                         for exon in exons)

    # every prediction starts out as its own gene; slot 0 is a placeholder
    # so that prediction numbers can be used as indices directly.  A real
    # list is required because entries are reassigned below.
    map_prediction2gene = list(range(0, len(predictions) + 1))
    map_gene2predictions = [None] + \
        [[x] for x in range(1, len(predictions) + 1)]

    all_exons.sort()

    # without exons there is nothing to merge (previously an IndexError)
    if all_exons:
        last_exon_from, last_exon_to, last_p = all_exons[0]

        for exon_from, exon_to, p in all_exons[1:]:

            if min(exon_to, last_exon_to) - max(exon_from, last_exon_from) > 0:
                # overlap: move all predictions of this exon's gene into
                # the gene of the previous prediction
                new_g = map_prediction2gene[last_p]
                old_g = map_prediction2gene[p]
                if new_g != old_g:
                    for x in map_gene2predictions[old_g]:
                        map_gene2predictions[new_g].append(x)
                        map_prediction2gene[x] = new_g
                    map_gene2predictions[old_g] = []
            # no overlap: nothing to do, as every prediction already has a
            # gene (entries of map_prediction2gene are always >= 1)

            last_exon_to = max(last_exon_to, exon_to)
            last_p = p

    for members in map_gene2predictions[1:]:
        # genes emptied by a merge are skipped and consume no identifier
        if members:
            for p in members:
                print("%i\t%i" % (gene_id, predictions[p - 1].mPredictionId))
            gene_id += 1

    return gene_id
def ClusterByExonCorrespondence(lengths=None, peptide_sequences=None):
    """Cluster transcripts that share at least one identical exon.

    Exon boundaries are read from ``sys.stdin``.  Two transcripts are linked
    if they have an exon with the same contig, strand, start and end.
    Clusters are collected with ``CollectCluster`` and written with
    ``PrintCluster``.

    Arguments
    ---------
    lengths : dict, optional
        Maps transcript id to length.  If it is missing or empty, lengths
        are derived from the largest ``mPeptideTo`` of each transcript; an
        empty dictionary supplied by the caller is filled in place.
    peptide_sequences : optional
        Passed through to ``PrintCluster``.
    """
    # a fresh dictionary per call: a shared default would keep the lengths
    # computed by a previous invocation
    if lengths is None:
        lengths = {}

    exons = Exons.ReadExonBoundaries(sys.stdin)

    if param_loglevel >= 1:
        print("# read exons for %i transcripts" % len(exons))

    if not lengths:
        for k in exons:
            # floor division, as for integer coordinates under Python 2
            # (assumes mPeptideTo is an integer - TODO confirm)
            lengths[k] = max((e.mPeptideTo // 3) + 1 for e in exons[k])
        if param_loglevel >= 1:
            print("# lengths for %i transcripts" % len(lengths))

    map_region2transcript = {}
    map_transcript2region = {}

    # build map of regions to transcripts
    for t in exons:
        map_transcript2region[t] = []
        for e in exons[t]:
            r = "%s-%s-%i-%i" % (e.mSbjctToken, e.mSbjctStrand,
                                 e.mGenomeFrom, e.mGenomeTo)
            if r not in map_region2transcript:
                map_region2transcript[r] = []
            map_region2transcript[r].append(t)
            map_transcript2region[t].append(r)

    # build map of transcript to transcript: sorted, unique neighbours
    # sharing a region, excluding the transcript itself
    map_transcript2transcript = {}
    for t in map_transcript2region:
        neighbours = set()
        for r in map_transcript2region[t]:
            neighbours.update(map_region2transcript[r])
        neighbours.discard(t)
        map_transcript2transcript[t] = sorted(neighbours)

    # cluster greedily, take longest transcript
    cluster_id = 1
    for t in map_transcript2region:
        # presumably CollectCluster removes the transcripts it has
        # collected from the map - verify against its definition
        if t not in map_transcript2transcript:
            continue
        cluster = CollectCluster(map_transcript2transcript, t)
        PrintCluster(cluster, cluster_id, lengths, peptide_sequences,
                     param_regex_preferred)
        cluster_id += 1

    if param_loglevel >= 1:
        print("# RESULT: %i transcripts in %i genes" % (
            len(map_transcript2region), cluster_id - 1))
def CheckSuboptimal(rep_id, exons, eliminated_predictions, other_ids,
                    map_prediction2data, options):
    """Check whether a prediction is a suboptimal join of two others.

    Candidates are those entries of ``other_ids`` that are neither the
    representative nor eliminated, overlap the representative
    (``Exons.CheckOverlap``) but are not covered by it
    (``Exons.CheckCoverage``).  The function returns ``True`` as soon as it
    finds two candidates that

    * both have a quality in ``options.quality_remove_suboptimal``,
    * do not overlap each other,
    * together contain the representative (``Exons.CheckContainedAinB``),
    * each have an ``mPid`` greater than the representative's ``mPid`` plus
      ``options.suboptimal_min_identity_difference``.

    Otherwise ``False`` is returned.
    """
    # predictions overlapping the representative by exons, but not completely
    partial = []
    for candidate in other_ids:
        if candidate == rep_id or candidate in eliminated_predictions:
            continue
        if not Exons.CheckOverlap(exons[rep_id], exons[candidate]):
            continue
        if Exons.CheckCoverage(exons[rep_id], exons[candidate],
                               max_slippage=options.max_slippage):
            continue
        partial.append(candidate)

    rep = map_prediction2data[rep_id]
    identity = rep.mPid + options.suboptimal_min_identity_difference

    # examine every unordered pair of candidates
    for first, id1 in enumerate(partial[:-1]):
        d1 = map_prediction2data[id1]

        for id2 in partial[first + 1:]:
            d2 = map_prediction2data[id2]

            if options.loglevel >= 3:
                options.stdlog.write(
                    "# suboptimal: %s ? %s + %s: %s %s %s %s %i %i %i\n" % (
                        rep_id, id1, id2,
                        d1.mQuality in options.quality_remove_suboptimal,
                        d2.mQuality in options.quality_remove_suboptimal,
                        not Exons.CheckOverlap(exons[id1], exons[id2]),
                        Exons.CheckCoverageAinB(
                            exons[rep_id],
                            exons[id1] + exons[id2],
                            min_terminal_exon_coverage=0.0),
                        rep.mPid, d1.mPid, d2.mPid,
                    ))

            both_removable = (
                d1.mQuality in options.quality_remove_suboptimal and
                d2.mQuality in options.quality_remove_suboptimal)
            if not both_removable:
                continue
            if Exons.CheckOverlap(exons[id1], exons[id2]):
                continue
            if not Exons.CheckContainedAinB(exons[rep_id],
                                            exons[id1] + exons[id2],
                                            min_terminal_exon_coverage=0.0):
                continue
            if identity < d1.mPid and identity < d2.mPid:
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# elimination: %s(%s) joins %s(%s) and %s(%s)\n" %
                        (rep_id, rep.mPid, id1, d1.mPid, id2, d2.mPid))
                return True

    return False
def WriteExons(token1, peptide1, cds1, transcript1,
               token2, peptide2, cds2, transcript2,
               peptide_map_a2b):
    """Print unaligned exon and intron pairs of two aligned transcripts.

    Exons of the two transcripts are matched with
    ``Exons.CompareGeneStructures``.  For every equivalence whose coverage
    exceeds ``param_min_exon_coverage`` the two exon sequences are printed
    as an ``AlignedPairs.UnalignedPair`` of category ``exon`` (if
    ``param_write_exons`` is set).  If ``param_write_introns`` is set, the
    intron between two consecutive equivalent exon pairs is printed as a
    pair of category ``intron``.

    Arguments
    ---------
    token1, token2
        Identifiers stored in the written pairs.
    peptide1, peptide2
        Not used by this function.
    cds1, cds2
        Exon lists; ``mGenomeFrom`` / ``mGenomeTo`` of their elements are
        used as slice coordinates into the transcript sequences.
    transcript1, transcript2
        Sequences from which exon and intron fragments are sliced.
    peptide_map_a2b
        Alignment handed to ``Genomics.AlignmentProtein2CDNA`` and
        ``Exons.CompareGeneStructures``.

    Returns
    -------
    tuple
        ``(nerrors, nskipped)``.  ``nerrors`` is always 0 in the current
        implementation; ``nskipped`` counts introns whose length is outside
        ``param_min_intron_length`` / ``param_max_intron_length``.
    """
    if param_loglevel >= 3:
        for cd in cds1:
            print("# %s" % str(cd))
        for cd in cds2:
            print("# %s" % str(cd))
        print("# peptide_map_a2b %s" % str(
            alignlib_lite.AlignmentFormatExplicit(peptide_map_a2b)))
        sys.stdout.flush()

    # only needed for the debugging output below
    dna_map_a2b = Genomics.AlignmentProtein2CDNA(peptide_map_a2b,
                                                 cds1, cds2)

    if len(cds1) != len(cds2):
        if param_loglevel >= 4:
            # WARNING: different number of exons!
            print("")

    nskipped, nerrors = 0, 0

    if param_loglevel >= 5:
        nmapped = 0
        for x in range(dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo() + 1):
            if dna_map_a2b.mapRowToCol(x) >= 0:
                nmapped += 1
        print("# nmapped= %s" % nmapped)
        print(str(alignlib_lite.AlignmentFormatEmissions(dna_map_a2b)))

    result = Exons.CompareGeneStructures(cds1, cds2,
                                         map_cmp2ref=peptide_map_a2b)

    if param_loglevel >= 2:
        print(result.Pretty("#"))

    nskipped_exons, nskipped_introns = 0, 0

    last_e1, last_e2 = None, None

    for link in result.mEquivalences:

        if link.mCoverage <= param_min_exon_coverage:
            nskipped_exons += 1
            continue

        e1, e2 = link.mId1, link.mId2

        c1 = cds1[e1]
        c2 = cds2[e2]
        exon_fragment1 = transcript1[c1.mGenomeFrom:c1.mGenomeTo]
        exon_fragment2 = transcript2[c2.mGenomeFrom:c2.mGenomeTo]

        ###################################################################
        # write unaligned exons
        if param_write_exons:
            pair = AlignedPairs.UnalignedPair()

            pair.mCategory = "exon"
            pair.mToken1 = token1
            pair.mId1 = e1 + 1
            pair.mNum1 = len(cds1)
            pair.mLen1 = len(exon_fragment1)
            pair.mSequence1 = exon_fragment1
            pair.mToken2 = token2
            pair.mId2 = e2 + 1
            pair.mNum2 = len(cds2)
            pair.mLen2 = len(exon_fragment2)
            pair.mSequence2 = exon_fragment2
            pair.mFrom1, pair.mTo1 = c1.mGenomeFrom, c1.mGenomeTo
            pair.mFrom2, pair.mTo2 = c2.mGenomeFrom, c2.mGenomeTo

            print(str(pair))
            sys.stdout.flush()

        ###################################################################
        # write introns between orthologous exons: only if the current and
        # the previously accepted exon pair are adjacent in both transcripts
        if param_write_introns and last_e1 is not None:

            if e1 - last_e1 != 1 or e2 - last_e2 != 1:
                nskipped_introns += 1
            else:
                pair = AlignedPairs.UnalignedPair()

                intron_from1 = cds1[e1 - 1].mGenomeTo
                intron_to1 = cds1[e1].mGenomeFrom
                intron_from2 = cds2[e2 - 1].mGenomeTo
                intron_to2 = cds2[e2].mGenomeFrom

                intron_fragment1 = transcript1[intron_from1:intron_to1]
                intron_fragment2 = transcript2[intron_from2:intron_to2]

                if len(intron_fragment1) == 0 or len(intron_fragment2) == 0:
                    print("## ERROR: empty intron fragments: %i-%i out of %i and %i-%i out of %i." %
                          (intron_from1, intron_to1, len(transcript1),
                           intron_from2, intron_to2, len(transcript2)))
                    # note: this also leaves last_e1/last_e2 unchanged
                    continue

                pair.mCategory = "intron"
                pair.mToken1 = token1
                pair.mId1 = e1 + 1
                pair.mNum1 = len(cds1) - 1
                pair.mLen1 = len(intron_fragment1)
                pair.mFrom1 = intron_from1
                pair.mTo1 = intron_to1
                pair.mSequence1 = intron_fragment1
                pair.mToken2 = token2
                pair.mId2 = e2 + 1
                # number of introns in the second transcript belongs to
                # mNum2 (it used to overwrite mNum1)
                pair.mNum2 = len(cds2) - 1
                pair.mLen2 = len(intron_fragment2)
                pair.mFrom2 = intron_from2
                pair.mTo2 = intron_to2
                pair.mSequence2 = intron_fragment2

                if (param_min_intron_length and len(intron_fragment1) < param_min_intron_length) or \
                   (param_min_intron_length and len(intron_fragment2) < param_min_intron_length) or \
                   (param_max_intron_length and len(intron_fragment1) > param_max_intron_length) or \
                   (param_max_intron_length and len(intron_fragment2) > param_max_intron_length):
                    if param_loglevel >= 1:
                        print("# skipped: fragment lengths out of bounds for: %s\t%s\t%s\t%s\t%i\t%i" %
                              (token1, e1, token2, e2,
                               len(intron_fragment1),
                               len(intron_fragment2)))
                        sys.stdout.flush()
                    nskipped += 1

                # NOTE(review): pairs counted as skipped above are still
                # written here - confirm whether they should be suppressed.
                print(str(pair))

        last_e1, last_e2 = e1, e2

    if param_loglevel >= 3:
        print("# skipped_exons=%i, skipped_introns=%i" %
              (nskipped_exons, nskipped_introns))

    return nerrors, nskipped
def ReadTranscriptsAndCds(transcript_ids1, transcript_ids2):
    """Read exon boundaries and genomic sequences for two transcript sets.

    Exon boundaries are read from ``param_filename_cds1`` and
    ``param_filename_cds2``.  Sequences are read from
    ``param_filename_transcripts1`` / ``param_filename_transcripts2`` if
    these are set, either through an index (``param_mode_genome*`` equal to
    ``"indexed"``) or from a plain file.  All readers are restricted to the
    given transcript ids.

    Arguments
    ---------
    transcript_ids1, transcript_ids2 : dict
        Identifiers of the left and right transcripts to read.

    Returns
    -------
    tuple
        ``(transcripts1, transcripts2, cds1, cds2)``.  The transcript
        dictionaries are empty if no sequence file is configured.
    """
    if param_loglevel >= 1:
        print("# reading %i left and %i right transcripts" % (
            len(transcript_ids1), len(transcript_ids2)))
        sys.stdout.flush()

    if param_loglevel >= 1:
        print("# reading exon boundaries.")
        sys.stdout.flush()

    # files are read completely by the readers, so they are closed
    # as soon as the reader returns
    with open(param_filename_cds1, "r") as infile:
        cds1 = Exons.ReadExonBoundaries(infile,
                                        filter=transcript_ids1,
                                        reset=True)
    with open(param_filename_cds2, "r") as infile:
        cds2 = Exons.ReadExonBoundaries(infile,
                                        filter=transcript_ids2,
                                        reset=True)

    if param_loglevel >= 1:
        print("# read %i left and %i right cds" % (len(cds1), len(cds2)))
        sys.stdout.flush()

    if param_loglevel >= 2:
        if len(cds1) != len(transcript_ids1):
            print("# missed in left: %s" % ":".join(
                set(transcript_ids1.keys()).difference(cds1.keys())))
        if len(cds2) != len(transcript_ids2):
            print("# missed in right: %s" % ":".join(
                set(transcript_ids2.keys()).difference(cds2.keys())))

    if param_loglevel >= 1:
        print("# reading genomic sequences.")
        sys.stdout.flush()

    transcripts1 = {}
    if param_filename_transcripts1:
        if param_mode_genome1 == "indexed":
            transcripts1 = Genomics.ParseFasta2HashFromIndex(
                param_filename_transcripts1, filter=transcript_ids1)
        else:
            with open(param_filename_transcripts1, "r") as infile:
                transcripts1 = Genomics.ReadGenomicSequences(
                    infile,
                    do_reverse=0,
                    filter=transcript_ids1,
                    mask=param_mask)

    transcripts2 = {}
    if param_filename_transcripts2:
        if param_mode_genome2 == "indexed":
            transcripts2 = Genomics.ParseFasta2HashFromIndex(
                param_filename_transcripts2, filter=transcript_ids2)
        else:
            with open(param_filename_transcripts2, "r") as infile:
                transcripts2 = Genomics.ReadGenomicSequences(
                    infile,
                    do_reverse=0,
                    filter=transcript_ids2,
                    mask=param_mask)

    if param_loglevel >= 1:
        print("# read %i left and %i right transcript sequences" % (
            len(transcripts1), len(transcripts2)))
        sys.stdout.flush()

    return transcripts1, transcripts2, cds1, cds2
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads exon boundaries in gtf format from stdin and writes them, one
    exon per line, to ``options.stdout``.  With ``--forward-coordinates``
    exons on the negative strand of a contig with known size are converted
    to forward coordinates.  Output for a transcript stops at the first
    exon with a negative start coordinate; such transcripts are counted as
    errors.
    """
    # NOTE(review): argv is not handed on to E.Start() - confirm whether
    # E.Start accepts it.
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gtf2exons.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option("--coordinate-format", dest="coordinate_format", type="string",
                      help="input type of coordinates.")

    parser.add_option("--forward-coordinates", dest="forward_coordinates", action="store_true",
                      help="output forward coordinates.")

    # raw string: \S is meant literally as part of the example regex
    parser.add_option("-e", "--extract-id", dest="extract_id", type="string",
                      help=r"""regular expression to extract id from id column, e.g. 'transcript_id "(\S+)"'.""")

    parser.set_defaults(
        coordinate_format="zero-forward",
        forward_coordinates=False,
        genome_file=None,
        extract_id=None)

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()
    else:
        contig_sizes = {}

    if options.extract_id:
        extract_id = re.compile(options.extract_id)
    else:
        extract_id = None

    converter = IndexedFasta.getConverter(options.coordinate_format)

    exons = Exons.ReadExonBoundaries(sys.stdin,
                                     contig_sizes=contig_sizes,
                                     converter=converter,
                                     do_invert=True,
                                     format="gtf",
                                     gtf_extract_id=extract_id)

    ntranscripts, nexons, nerrors = 0, 0, 0

    for transcript_exons in exons.values():

        ntranscripts += 1
        has_error = False

        for e in transcript_exons:
            if options.forward_coordinates and \
                    e.mSbjctToken in contig_sizes and \
                    e.mSbjctStrand == "-":
                contig_size = contig_sizes[e.mSbjctToken]
                e.mGenomeFrom, e.mGenomeTo = \
                    contig_size - e.mGenomeTo, contig_size - e.mGenomeFrom

            if e.mGenomeFrom < 0:
                has_error = True
                if options.loglevel >= 1:
                    options.stderr.write("# Error: %s\n" % str(e))
                # remaining exons of this transcript are not written
                break

            options.stdout.write(str(e) + "\n")
            nexons += 1

        if has_error:
            nerrors += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ntranscripts=%i, nexons=%i, nerrors=%i\n" %
                             (ntranscripts, nexons, nerrors))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Parses predictions from stdin (``psl`` or ``exons`` format) and, for
    output format ``predictions``, re-derives the translation and gene
    feature counts of every entry from the genomic sequence.  If peptide
    sequences are supplied, translations are compared against them and
    mismatching entries are optionally realigned or removed.  The resulting
    predictions are written to ``options.stdout``.

    Raises
    ------
    ValueError
        If no genome file is given or the input format is not supported
        for the chosen output format.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/gff2predictions.py 2021 2008-07-10 16:00:48Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t", "--trans", dest="trans",
                      help="input is translated DNA.", action="store_true")

    parser.add_option("-f", "--format", dest="format",
                      help="input format.", type="choice",
                      choices=("exons", "psl", "gff"))

    parser.add_option("-o", "--output-format", dest="output_format",
                      help="output format", type="choice",
                      choices=('exontable', 'exons', 'predictions', 'cds', 'fasta'))

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option("--predictions-file", dest="predictions_file", type="string",
                      help="filename with predictions. Use gene structures from this file if available.")

    parser.add_option("-i", "--gff-field-id", dest="gff_field_id", type="string",
                      help="field for the feature id in the gff info section.")

    parser.add_option("-p", "--filename-peptides", dest="filename_peptides", type="string",
                      help="Filename with peptide sequences. If given, it is used to check the predicted translated sequences.")

    parser.add_option("--no-realignment", dest="do_realignment", action="store_false",
                      help="do not re-align entries that do not parse correctly.")

    parser.add_option("--remove-unaligned", dest="remove_unaligned", action="store_true",
                      help="remove entries that have not been aligned correctly.")

    parser.add_option("--input-coordinates", dest="input_coordinates", type="string",
                      help="specify input format for input coordinates [forward|both-zero|one-closed|open].")

    parser.set_defaults(trans=False,
                        output_format="predictions",
                        format="psl",
                        gff_field_id='id',
                        input_coordinates="both-zero-open",
                        filename_peptides=None,
                        genome_file=None,
                        do_realignment=True,
                        predictions_file=None,
                        remove_unaligned=False)

    (options, args) = E.Start(parser)

    # string exceptions are not valid exceptions; raise a proper one
    if not options.genome_file:
        raise ValueError("please specify a genome file.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contig_sizes = fasta.getContigSizes()

    ninput, noutput, nskipped = 0, 0, 0
    nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0, 0, 0, 0, 0, 0

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            IOTools.openFile(options.filename_peptides, "r"))
        predictor = Predictor.PredictorExonerate()
        predictor.mLogLevel = 0
    else:
        peptide_sequences = None
        predictor = None

    converter = IndexedFasta.getConverter(options.input_coordinates)

    # gene structures that replace parsed entries with the same id
    predictions = {}
    if options.predictions_file:
        prediction_iterator = PredictionParser.iterator_predictions(
            IOTools.openFile(options.predictions_file, "r"))
        for p in prediction_iterator:
            predictions[p.mPredictionId] = p

    if options.output_format == "predictions":

        if options.format == "psl":

            if options.trans:
                parser = PredictionParser.PredictionParserBlatTrans()
            else:
                parser = PredictionParser.PredictionParserBlatCDNA()

            nmatches = 1
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                if not re.match("^[0-9]", line):
                    continue

                try:
                    entries = parser.Parse((line, ))
                except PredictionParser.AlignmentError as e:
                    print("# %s" % str(e))
                    print("# %s" % line[:-1])
                    sys.exit(1)

                for entry in entries:
                    entry.mPredictionId = nmatches
                    nmatches += 1

                print(str(entries))

        elif options.format == "exons":
            parser = PredictionParser.PredictionParserExons(
                contig_sizes=contig_sizes)
        else:
            raise ValueError("unknown format %s for output option %s" % (
                options.format, options.output_format))

        if options.loglevel >= 2:
            options.stdlog.write("# parsing.\n")
            options.stdlog.flush()

        results = parser.Parse(sys.stdin.readlines())

        if options.loglevel >= 2:
            options.stdlog.write("# parsing finished.\n")
            options.stdlog.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# parsing: ninput=%i, noutput=%i, nerrors=%i\n" %
                (parser.GetNumInput(), parser.GetNumOutput(),
                 parser.GetNumErrors()))

            for error, msg in parser.mErrors:
                options.stdlog.write("# %s : %s\n" % (str(error), msg))
            options.stdlog.flush()

        # if genomes are given: build translation
        if options.genome_file:

            results.Sort(lambda x, y: cmp(x.mSbjctToken, y.mSbjctToken))

            new_results = PredictionParser.Predictions()

            for entry in results:

                ninput += 1

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing entry %s:%s on %s:%s %i/%i.\n" %
                        (entry.mPredictionId, entry.mQueryToken,
                         entry.mSbjctToken, entry.mSbjctStrand,
                         ninput, len(results)))
                    options.stdlog.flush()

                try:
                    lgenome = fasta.getLength(entry.mSbjctToken)
                    # added 3 residues - was a problem at split codons just before the stop.
                    # See for example the chicken sequence ENSGALP00000002741
                    genomic_sequence = fasta.getSequence(
                        entry.mSbjctToken,
                        entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom,
                        min(entry.mSbjctGenomeTo + 3, lgenome))
                except KeyError:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# did not find entry for %s on %s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken))
                    nskipped += 1
                    continue

                if predictions and entry.mPredictionId in predictions:
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# substituting entry %s on %s:%s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken,
                             entry.mSbjctStrand))
                        options.stdlog.flush()
                    entry = predictions[entry.mPredictionId]

                # result is not used further below
                exons = Exons.Alignment2Exons(entry.mMapPeptide2Genome,
                                              0,
                                              entry.mSbjctGenomeFrom)

                entry.mMapPeptide2Translation, entry.mTranslation = \
                    Genomics.Alignment2PeptideAlignment(
                        Genomics.String2Alignment(entry.mAlignmentString),
                        entry.mQueryFrom, 0, genomic_sequence)

                entry.score = entry.mMapPeptide2Translation.getColTo() - \
                    entry.mMapPeptide2Translation.getColFrom() + 1

                (entry.mNIntrons, entry.mNFrameShifts, entry.mNGaps,
                 entry.mNSplits, entry.mNStopCodons, entry.mNDisruptions) = \
                    Genomics.CountGeneFeatures(0,
                                               entry.mMapPeptide2Genome,
                                               genomic_sequence)

                if peptide_sequences:

                    if str(entry.mPredictionId) in peptide_sequences:

                        reference = peptide_sequences[
                            str(entry.mPredictionId)].upper()
                        translation = entry.mTranslation
                        nfound += 1

                        is_identical, nmismatches = checkIdentity(
                            reference, translation, options)

                        if is_identical:
                            nidentical += 1
                        else:
                            nmismatch += 1

                            if options.do_realignment:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches..realigning in region %i:%i\n" %
                                        (entry.mPredictionId,
                                         entry.mSbjctGenomeFrom,
                                         entry.mSbjctGenomeTo))
                                    options.stdlog.flush()

                                result = predictor(
                                    entry.mPredictionId, reference,
                                    entry.mSbjctToken, genomic_sequence,
                                    "--subopt FALSE --score '%s'" % str(80))
                                # "--exhaustive --subopt FALSE --score '%s'" % str(80) )

                                if result:
                                    translation = result[0].mTranslation
                                    is_identical, nmismatches = checkIdentity(
                                        reference, translation, options)
                                else:
                                    if options.loglevel >= 2:
                                        options.stdlog.write(
                                            "# %s: realignment returned empty result\n" %
                                            (entry.mPredictionId))
                                        options.stdlog.flush()
                                    is_identical = False

                                if is_identical:
                                    naligned += 1
                                    # the realigned entry replaces the
                                    # original but keeps its id; its start
                                    # is shifted by the original genomic
                                    # start
                                    prediction_id = entry.mPredictionId
                                    sbjct_genome_from = entry.mSbjctGenomeFrom
                                    entry = result[0]
                                    entry.mPredictionId = prediction_id
                                    entry.mSbjctGenomeFrom += sbjct_genome_from
                                else:
                                    nunaligned += 1
                                    if options.loglevel >= 1:
                                        options.stdlog.write(
                                            "# %s: mismatch on %s:%s:%i-%i after realignment\n# reference =%s\n# translated=%s\n# realigned =%s\n" %
                                            (entry.mPredictionId,
                                             entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo,
                                             reference,
                                             entry.mTranslation,
                                             translation))
                                        options.stdlog.flush()
                                    if options.remove_unaligned:
                                        nskipped += 1
                                        continue
                            else:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches on %s ... no realignment\n" % (
                                            entry.mPredictionId,
                                            entry.mSbjctToken,
                                        ))
                                    if options.loglevel >= 3:
                                        options.stdlog.write(
                                            "# %s: mismatch before realignment\n# reference =%s\n# translated=%s\n" %
                                            (entry.mPredictionId,
                                             reference, translation))
                                    options.stdlog.flush()
                                if options.remove_unaligned:
                                    nskipped += 1
                                    continue
                    else:
                        nnotfound += 1

                new_results.append(entry)
                noutput += 1

            results = new_results

        if results:
            options.stdout.write(str(results) + "\n")
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Loads the optional inputs named on the command line (peptide
    sequences, exon boundaries, old-to-new maps and a fasta file of
    previously written pairs) and then reads links from stdin via
    ``BlastAlignments.iterator_links``. For every link whose query and
    sbjct tokens both have a sequence, the alignment is rebuilt,
    optionally scaled from peptide to nucleotide coordinates, optionally
    combined with the old-to-new maps, bounds-checked and handed to
    ``Write`` -- once per matching exon pair if exon boundaries were
    given, otherwise once for the whole alignment.

    Raises ValueError if an alignment extends beyond a sequence or, with
    ``--codons``, if the number of aligned positions is not a multiple
    of three. Raises IndexError if exon boundaries were given but a
    token of a link is not among them.

    NOTE(review): *argv* is normalised here but is not forwarded to
    ``E.Start`` in this block.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s", "--sequences", dest="filename_sequences",
                      type="string",
                      help="peptide sequence [Default=%default]")

    parser.add_option("-f", "--format", dest="format", type="string",
                      help="output format [Default=%default]")

    parser.add_option(
        "-e", "--expand", dest="expand", action="store_true",
        help="expand positions from peptide to nucleotide alignment [Default=%default]")

    parser.add_option("-m", "--map", dest="filename_map", type="string",
                      help="map alignments [Default=%default]")

    parser.add_option("-c", "--codons", dest="require_codons",
                      action="store_true",
                      help="require codons [Default=%default]")

    parser.add_option(
        "--one-based-coordinates", dest="one_based_coordinates",
        action="store_true",
        help="expect one-based coordinates. The default are zero based coordinates [Default=%default].")

    parser.add_option("--no-identical", dest="no_identical",
                      action="store_true",
                      help="do not output identical pairs [Default=%default]")

    parser.add_option(
        "-g", "--no-gaps", dest="no_gaps", action="store_true",
        help="remove all gaps from aligned sequences [Default=%default]")

    parser.add_option("-x", "--exons", dest="filename_exons", type="string",
                      help="filename with exon boundaries [Default=%default]")

    parser.add_option("-o", "--outfile", dest="filename_outfile",
                      type="string",
                      help="filename to save links [Default=%default]")

    parser.add_option("--min-length", dest="min_length", type="int",
                      help="minimum length of alignment [Default=%default]")

    parser.add_option(
        "--filter", dest="filename_filter", type="string",
        help="given a set of previous alignments, only write new pairs [Default=%default].")

    parser.set_defaults(filename_sequences=None,
                        filename_exons=None,
                        filename_map=None,
                        filename_outfile=None,
                        no_gaps=False,
                        format="fasta",
                        expand=False,
                        require_codons=False,
                        no_identical=False,
                        min_length=0,
                        report_step=100,
                        one_based_coordinates=False,
                        filename_filter=None)

    (options, args) = E.Start(parser, add_mysql_options=True)

    t0 = time.time()

    # ------------------------------------------------------------------
    # read optional input files; each handle is closed once consumed
    if options.filename_sequences:
        with open(options.filename_sequences, "r") as infile:
            sequences = Genomics.ReadPeptideSequences(infile)
    else:
        sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i sequences\n" % len(sequences))
        sys.stdout.flush()

    if options.filename_exons:
        with open(options.filename_exons, "r") as infile:
            exons = Exons.ReadExonBoundaries(infile)
    else:
        exons = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i exons\n" % len(exons))
        sys.stdout.flush()

    map_old2new = {}
    if options.filename_map:
        with open(options.filename_map, "r") as infile:
            for line in infile:
                if line[0] == "#":
                    continue
                m = Map()
                m.read(line)
                map_old2new[m.mToken] = m

    if options.loglevel >= 1:
        options.stdlog.write("# read %i maps\n" % len(map_old2new))
        sys.stdout.flush()

    if options.filename_filter:
        if options.loglevel >= 1:
            options.stdlog.write("# reading filtering information.\n")
            sys.stdout.flush()

        map_pair2hids = {}

        if os.path.exists(options.filename_filter):
            with open(options.filename_filter, "r") as infile:
                iterator = FastaIterator.FastaIterator(infile)
                # records come in consecutive pairs; the iterator signals
                # the end of input by returning None
                while True:
                    record1 = iterator.next()
                    if record1 is None:
                        break
                    record2 = iterator.next()
                    if record2 is None:
                        break

                    identifier1 = re.match(
                        r"(\S+)", record1.title).groups()[0]
                    identifier2 = re.match(
                        r"(\S+)", record2.title).groups()[0]
                    pair_id = "%s-%s" % (identifier1, identifier2)
                    hid = Genomics.GetHID(
                        record1.sequence + ";" + record2.sequence)
                    map_pair2hids.setdefault(pair_id, []).append(hid)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# read filtering information for %i pairs.\n" %
                len(map_pair2hids))
            sys.stdout.flush()
    else:
        map_pair2hids = None

    if options.loglevel >= 1:
        options.stdlog.write("# finished input in %i seconds.\n" %
                             (time.time() - t0))

    if options.filename_outfile:
        outfile = open(options.filename_outfile, "w")
    else:
        outfile = None

    # ------------------------------------------------------------------
    # process links
    map_row2col = alignlib_lite.py_makeAlignmentVector()
    tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector()
    counts = {}

    iterations = 0

    t1 = time.time()
    ninput, nskipped, noutput = 0, 0, 0

    for link in BlastAlignments.iterator_links(sys.stdin):

        iterations += 1
        ninput += 1

        if options.loglevel >= 1:
            if iterations % options.report_step == 0:
                options.stdlog.write("# iterations: %i in %i seconds.\n" %
                                     (iterations, time.time() - t1))
                sys.stdout.flush()

        # links without a sequence for both partners are skipped
        if link.mQueryToken not in sequences or \
                link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        if options.loglevel >= 3:
            options.stdlog.write("# read link %s\n" % str(link))

        row_seq = alignlib_lite.py_makeSequence(sequences[link.mQueryToken])
        col_seq = alignlib_lite.py_makeSequence(sequences[link.mSbjctToken])

        if options.one_based_coordinates:
            link.mQueryFrom -= 1
            link.mSbjctFrom -= 1

        if options.expand:
            link.mQueryFrom = link.mQueryFrom * 3
            link.mSbjctFrom = link.mSbjctFrom * 3
            link.mQueryAli = ScaleAlignment(link.mQueryAli, 3)
            link.mSbjctAli = ScaleAlignment(link.mSbjctAli, 3)

        map_row2col.clear()

        alignlib_lite.py_AlignmentFormatEmissions(
            link.mQueryFrom, link.mQueryAli,
            link.mSbjctFrom, link.mSbjctAli).copy(map_row2col)

        if link.mQueryToken in map_old2new:
            query_map = map_old2new[link.mQueryToken]
            tmp1_map_row2col.clear()
            query_map.expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in row with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        query_map.mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col, query_map.mMapOld2New, map_row2col,
                alignlib_lite.py_RR)
            query_map.clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        if link.mSbjctToken in map_old2new:
            sbjct_map = map_old2new[link.mSbjctToken]
            tmp1_map_row2col.clear()
            sbjct_map.expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in col with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        sbjct_map.mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col, map_row2col, sbjct_map.mMapOld2New,
                alignlib_lite.py_CR)
            sbjct_map.clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        # the alignment must not extend beyond either sequence
        dr = row_seq.getLength() - map_row2col.getRowTo()
        dc = col_seq.getLength() - map_row2col.getColTo()
        if dr < 0 or dc < 0:
            raise ValueError(
                "out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s"
                % (link.mQueryToken, link.mSbjctToken,
                   row_seq.getLength(), col_seq.getLength(),
                   str(alignlib_lite.py_AlignmentFormatEmissions(
                       map_row2col))))

        if options.loglevel >= 2:
            options.stdlog.write(
                str(alignlib_lite.py_AlignmentFormatExplicit(
                    map_row2col, row_seq, col_seq)) + "\n")

        # check for incomplete codons
        if options.require_codons:

            naligned = map_row2col.getNumAligned()

            # turned off, while fixing alignlib_lite
            if naligned % 3 != 0:
                options.stdlog.write("# %s\n" % str(map_row2col))
                options.stdlog.write("# %s\n" % str(link))
                # .get(): a token without a map must not turn the
                # intended ValueError below into a KeyError
                options.stdlog.write(
                    "# %s\n" % str(map_old2new.get(link.mQueryToken)))
                options.stdlog.write(
                    "# %s\n" % str(map_old2new.get(link.mSbjctToken)))
                options.stdlog.write(
                    "#\n%s\n" % alignlib_lite.py_AlignmentFormatExplicit(
                        map_row2col, row_seq, col_seq))

                raise ValueError(
                    "incomplete codons %i in pair %s - %s"
                    % (naligned, link.mQueryToken, link.mSbjctToken))

        # if so desired, write on a per exon level:
        if exons:
            if link.mQueryToken not in exons:
                raise IndexError("%s not found in exons" %
                                 (link.mQueryToken))
            if link.mSbjctToken not in exons:
                raise IndexError("%s not found in exons" %
                                 (link.mSbjctToken))
            exons1 = exons[link.mQueryToken]
            exons2 = exons[link.mSbjctToken]

            # Get overlapping segments
            segments = Exons.MatchExons(map_row2col, exons1, exons2)

            for a, b in segments:
                tmp1_map_row2col.clear()

                # make sure you got codon boundaries. Note that frameshifts
                # in previous exons will cause the codons to start at
                # positions different from mod 3. The problem is that I
                # don't know where the frameshifts occur exactly. The exon
                # boundaries are given with respect to the cds, which
                # include the frame shifts. Unfortunately, phase
                # information seems to be incomplete in the input files.
                from1, to1 = GetAdjustedBoundaries(a, exons1)
                from2, to2 = GetAdjustedBoundaries(b, exons2)

                alignlib_lite.py_copyAlignment(tmp1_map_row2col, map_row2col,
                                               from1 + 1, to1,
                                               from2 + 1, to2)

                # fixed: the filter used to be passed as the undefined
                # name map_pair2hid, which raised a NameError here
                mode = Write(tmp1_map_row2col, row_seq, col_seq, link,
                             no_gaps=options.no_gaps,
                             no_identical=options.no_identical,
                             min_length=options.min_length,
                             suffix1="_%s" % str(a),
                             suffix2="_%s" % str(b),
                             outfile=outfile,
                             pair_filter=map_pair2hids,
                             format=options.format)

                counts[mode] = counts.get(mode, 0) + 1

        else:
            mode = Write(map_row2col, row_seq, col_seq, link,
                         min_length=options.min_length,
                         no_gaps=options.no_gaps,
                         no_identical=options.no_identical,
                         outfile=outfile,
                         pair_filter=map_pair2hids,
                         format=options.format)

            counts[mode] = counts.get(mode, 0) + 1

        noutput += 1

    if outfile:
        outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# %s\n" % ", ".join(
            ["%s=%i" % (mode, count) for mode, count in counts.items()]))
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads predictions from stdin, drops those below the score, query
    coverage, percent identity and length thresholds, optionally joins
    regions (``JoinRegions``), optionally filters predictions that are
    suboptimal compared to the best prediction of the same query or that
    overlap regions read from ``--filter-regions``, sorts the remainder
    by genomic location and passes each cluster of overlapping
    predictions to ``ProcessRegion``.

    Exits with status 2 if positional arguments are given and with
    status 1 if no prediction passes the input filters.

    NOTE(review): *argv* is normalised here but is not forwarded to
    ``E.Start`` in this block.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/regions2graph.py 2754 2009-09-04 16:50:22Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-b", "--benchmark", dest="filename_benchmark",
                      type="string", help="")

    # fixed: this used to store into ``benchmark_synonyms``, but the file
    # name is read from ``filename_benchmark_synonyms`` below, so the
    # option could never take effect.
    parser.add_option("-y", "--benchmark-synonyms",
                      dest="filename_benchmark_synonyms",
                      type="string", help="")

    parser.add_option("-p", "--peptides", dest="filename_peptides",
                      type="string", help="")

    parser.add_option("-c", "--min-coverage-query",
                      dest="min_coverage_query", type="float", help="")

    parser.add_option("-s", "--min-score", dest="min_total_score",
                      type="float", help="")

    parser.add_option("-i", "--min-percent-identity",
                      dest="min_percent_identity", type="float", help="")

    parser.add_option("-o", "--max-percent-overlap",
                      dest="max_percent_overlap", type="float", help="")

    parser.add_option("--overlap-min-score", dest="overlap_min_score",
                      type="float", help="")

    parser.add_option("--overlap-min-coverage", dest="overlap_min_coverage",
                      type="float", help="")

    parser.add_option("--overlap-min-identity", dest="overlap_min_identity",
                      type="float", help="")

    parser.add_option("--overlap-max-coverage", dest="overlap_max_coverage",
                      type="float", help="")

    parser.add_option("-m", "--max-matches", dest="max_matches",
                      type="int", help="")

    parser.add_option("-j", "--join-regions", dest="join_regions",
                      type="int", help="")

    parser.add_option("--join-regions-max-regions",
                      dest="join_regions_max_regions", type="int", help="")

    parser.add_option("--join-regions-max-coverage",
                      dest="join_regions_max_coverage", type="float",
                      help="")

    parser.add_option("--min-length", dest="min_length", type="int",
                      help="")

    parser.add_option("--test", dest="test", type="int", help="")

    parser.add_option("--filter-queries", dest="filename_filter_queries",
                      type="string", help="")

    parser.add_option("--filter-regions", dest="filter_regions",
                      type="string", help="")

    parser.add_option("--conserve-memory", dest="conserve_memory",
                      action="store_true", help="")

    parser.add_option("--filter-suboptimal", dest="filter_suboptimal",
                      action="store_true", help="")

    parser.set_defaults(
        # overlap allowed for matches on genomic region
        max_percent_overlap=20,
        gop=-10.0,
        gep=-2.0,
        # thresholds for joining regions
        overlap_min_score=80,
        overlap_min_coverage=80,
        overlap_max_coverage=90,
        overlap_min_identity=50,
        # threshold for filtering bad predictions:
        # minimum score
        min_total_score=80,
        # joining regions
        join_regions=0,
        # maximum coverage of query for predictions to be joined
        # (This is to ensure not to join duplications. A range check
        # would be better, but runs into trouble with repeats).
        join_regions_max_coverage=90,
        # minimum coverage of query
        min_coverage_query=10,
        # conserve memory
        conserve_memory=0,
        # minimum percent identity
        min_percent_identity=0,
        # minimum length
        min_length=0,
        max_matches=0,
        filename_peptides=None,
        filename_filter_queries=None,
        # turn on/off various filters
        filter_suboptimal=False,
        filter_regions=False,
        # parameters for filter of suboptimal predictions
        min_relative_coverage=0.5,
        min_relative_score=0.5,
        min_relative_percent_identity=0.5,
        # minimum difference between non-correlated conflicts to keep them
        # both.
        conflicts_min_difference=0.1,
        # benchmarking data
        benchmarks=None,
        benchmark_synonyms=None,
        filename_benchmark=None,
        filename_benchmark_synonyms=None,
        test=None,
        max_intron=50000)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        sys.stdout.write("%s no arguments required.\n" % USAGE)
        sys.exit(2)

    ##########################################################################
    # read filtering
    filter_queries = {}
    if options.filename_filter_queries:
        with open(options.filename_filter_queries, "r") as infile:
            for line in infile:
                if line[0] == "#":
                    continue
                query_token = line[:-1].split("\t")[0]
                filter_queries[query_token] = True

        if options.loglevel >= 1:
            options.stdlog.write("# filtering for %i queries.\n" %
                                 len(filter_queries))

    ##########################################################################
    # read benchmarking regions
    if options.filename_benchmark:
        with open(options.filename_benchmark, "r") as infile:
            options.benchmarks = ReadBenchmarkingRegions(infile)
        if options.loglevel >= 1:
            options.stdlog.write(
                "# read benchmarking regions for %i tokens\n" %
                len(options.benchmarks))
            sys.stdout.flush()

        options.benchmark_synonyms = {}
        if options.filename_benchmark_synonyms:
            with open(options.filename_benchmark_synonyms, "r") as infile:
                for line in infile:
                    if line[0] == "#":
                        continue
                    # two tab-separated columns: value, then key
                    value, key = line[:-1].split("\t")
                    options.benchmark_synonyms[key] = value
    else:
        options.benchmarks = {}
        options.benchmark_synonyms = {}

    ##########################################################################
    # read peptide sequences
    if options.filename_peptides:
        with open(options.filename_peptides, "r") as infile:
            peptide_sequences = Genomics.ReadPeptideSequences(infile)
    else:
        peptide_sequences = {}

    if options.conserve_memory:
        # only the temporary file name is needed; PredictionFile opens
        # the file itself
        fd, filename_old_predictions = tempfile.mkstemp()
        os.close(fd)
        old_predictions = PredictionFile.PredictionFile()
        old_predictions.open(filename_old_predictions, "w")
    else:
        # array with final predictions
        old_predictions = []

    if options.loglevel >= 1:
        options.stdlog.write("# reading predictions.\n")
        sys.stdout.flush()

    nread = 0
    ninput = 0
    for line in sys.stdin:

        if line[0] == "#":
            continue

        entry = PredictionParser.PredictionParserEntry(expand=0)
        entry.Read(line)
        nread += 1

        # set prediction id
        if not entry.mPredictionId:
            entry.mPredictionId = nread

        # filter bad predictions right here in order to save memory:
        if entry.score < options.min_total_score:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: score below minimum: removing: %s\n"
                    % str(entry))
            continue
        elif entry.mQueryCoverage < options.min_coverage_query:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: coverage below minimum: removing: %s\n"
                    % str(entry))
            continue
        elif entry.mPercentIdentity < options.min_percent_identity:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: percent identity below minimum: removing: %s\n"
                    % str(entry))
            continue
        elif entry.mSbjctTo - entry.mSbjctFrom < options.min_length:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: length of transcript below minimum: removing: %s\n"
                    % str(entry))
            continue

        # fixed: check the --test limit before counting, so that ninput
        # equals the number of predictions actually kept
        if options.test and ninput >= options.test:
            break

        ninput += 1
        old_predictions.append(entry)

    if options.loglevel >= 1:
        options.stdlog.write("# predictions after input: %i\n" % ninput)
        sys.stdout.flush()

    if options.loglevel >= 10:
        options.stdlog.write(
            "############## start: predictions after input ###################################\n")
        for x in old_predictions:
            options.stdlog.write("# %s\n" % str(x))
        options.stdlog.write(
            "############## end: predictions after input #####################################\n")
        sys.stdout.flush()

    if ninput == 0:
        options.stdlog.write("# ERROR: no predictions\n")
        sys.exit(1)

    ##########################################################################
    # set up stacks of regions
    if options.conserve_memory:
        old_predictions.close()
        old_predictions.open(mode="r")

        fd, filename_removed_predictions = tempfile.mkstemp()
        os.close(fd)
        removed_predictions = PredictionFile.PredictionFile()
        removed_predictions.open(filename_removed_predictions, "w")

        fd, filename_new_predictions = tempfile.mkstemp()
        os.close(fd)
        new_predictions = PredictionFile.PredictionFile()
        new_predictions.open(filename_new_predictions, "w")
    else:
        removed_predictions = []
        new_predictions = []

    if options.benchmarks:
        EvaluateBenchmark(old_predictions)

    ##########################################################################
    # join regions
    if options.join_regions and options.join_regions_max_coverage:
        if options.loglevel >= 1:
            options.stdlog.write(
                "# joining regions: maximum distance between segments = %i and maximum query coverage = %i\n"
                % (options.join_regions, options.join_regions_max_coverage))
            sys.stdout.flush()

        njoined = JoinRegions(old_predictions, new_predictions)

        if options.conserve_memory:
            ExchangeStreams(old_predictions, new_predictions)
        else:
            old_predictions = new_predictions
            new_predictions = []

        if options.loglevel >= 1:
            options.stdlog.write("# predictions after joining: %i\n" %
                                 njoined)
            sys.stdout.flush()

        if options.loglevel >= 10:
            options.stdlog.write(
                "############## start: predictions after joining ###################################\n")
            for x in old_predictions:
                # fixed: newline was missing, running all entries together
                options.stdlog.write("# %s\n" % str(x))
            options.stdlog.write(
                "############## end: predictions after joining #####################################\n")
            sys.stdout.flush()
    else:
        if options.loglevel >= 1:
            options.stdlog.write("# joining regions: skipped\n")
            sys.stdout.flush()
        njoined = ninput

    ##########################################################################
    # build map of best predictions
    if options.filter_suboptimal:
        if options.loglevel >= 1:
            options.stdlog.write("# calculating best predictions\n")
            sys.stdout.flush()
        best_predictions = GetBestPredictions(old_predictions)
    else:
        best_predictions = {}

    if options.loglevel >= 1:
        options.stdlog.write("# calculated best predictions: %i\n" %
                             len(best_predictions))
        sys.stdout.flush()

    ##########################################################################
    # get regions to eliminate
    filter_regions = {}
    if options.filter_regions:

        entry = PredictionParser.PredictionParserEntry(expand=0)

        for filename in options.filter_regions.split(","):
            if options.loglevel >= 1:
                options.stdlog.write("# reading regions to filter from %s.\n"
                                     % (filename))
                sys.stdout.flush()

            if filename.endswith(".gz"):
                infile = gzip.open(filename, "r")
            else:
                infile = open(filename, "r")

            try:
                for line in infile:
                    if line[0] == "#":
                        continue
                    entry.Read(line)

                    exons = Exons.Alignment2Exons(
                        Genomics.String2Alignment(entry.mAlignmentString),
                        entry.mQueryFrom, entry.mSbjctGenomeFrom)

                    key = "%s-%s" % (entry.mSbjctToken, entry.mSbjctStrand)

                    regions = filter_regions.setdefault(key, [])
                    for exon in exons:
                        regions.append((exon.mGenomeFrom, exon.mGenomeTo))
            finally:
                infile.close()

        for regions in filter_regions.values():
            regions.sort()

    ##########################################################################
    # bipartite graph construction
    ##########################################################################
    # sort predictions by genomic region
    if options.conserve_memory:
        old_predictions.sort(('mSbjctToken', 'mSbjctStrand',
                              'mSbjctGenomeFrom', 'mSbjctGenomeTo'))
    else:
        # same ordering as the former cmp-based sort; both are stable
        old_predictions.sort(
            key=lambda x: (x.mSbjctToken, x.mSbjctStrand,
                           x.mSbjctGenomeFrom, x.mSbjctGenomeTo))

    ##########################################################################
    # filter predictions and resolve conflicts based on genomic overlap
    # deleted segments are put in a temporary storage space.
    min_from, max_from = None, None
    min_to, max_to = None, None
    region_id = 0
    noverlaps = 0
    last_prediction = None
    predictions = []
    region = Region()
    nclusters = 0
    neliminated_suboptimal = 0
    neliminated_overlap = 0

    noutput, nfiltered = 0, 0

    for this_prediction in old_predictions:

        # Filter 1: skip suboptimal predictions
        if this_prediction.mQueryToken in best_predictions:

            best_prediction = best_predictions[this_prediction.mQueryToken]

            # counted as eliminated up front; undone below if the
            # prediction survives all three relative thresholds
            neliminated_suboptimal += 1
            if float(this_prediction.mQueryCoverage) / \
                    best_prediction.mQueryCoverage < \
                    options.min_relative_coverage:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: coverage below best: removing %s\n"
                        % str(this_prediction))
                continue

            if float(this_prediction.score) / best_prediction.score < \
                    options.min_relative_score:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: score below best: removing %s\n"
                        % str(this_prediction))
                continue

            if float(this_prediction.mPercentIdentity) / \
                    best_prediction.mPercentIdentity < \
                    options.min_relative_percent_identity:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: percent identity below best: removing %s\n"
                        % str(this_prediction))
                continue

            neliminated_suboptimal -= 1

        # Filter 2: remove predictions overlapping with certain segments
        key = "%s-%s" % (this_prediction.mSbjctToken,
                         this_prediction.mSbjctStrand)

        if key in filter_regions:

            exons = Exons.Alignment2Exons(
                Genomics.String2Alignment(this_prediction.mAlignmentString),
                this_prediction.mQueryFrom,
                this_prediction.mSbjctGenomeFrom)

            if CheckOverlap([(x.mGenomeFrom, x.mGenomeTo) for x in exons],
                            filter_regions[key]):
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: overlapping with taboo region: removing %s\n"
                        % str(this_prediction))
                neliminated_overlap += 1
                continue

        # NOTE(review): the parsed fields are not used further down in
        # this function; kept as in the original.
        try:
            this_query_peptide, this_query_status, this_query_gene, this_query_transcript = \
                re.split(r"\s+", this_prediction.mQueryToken)
        except ValueError:
            this_query_gene = None

        # process first entry
        if min_from is None:
            min_from = this_prediction.mSbjctGenomeFrom
            max_from = this_prediction.mSbjctGenomeFrom
            max_to = this_prediction.mSbjctGenomeTo
            min_to = this_prediction.mSbjctGenomeTo
            predictions.append(this_prediction)
            last_prediction = this_prediction
            continue

        overlap = min_to > this_prediction.mSbjctGenomeFrom and \
            last_prediction.mSbjctToken == this_prediction.mSbjctToken and \
            last_prediction.mSbjctStrand == this_prediction.mSbjctStrand

        if options.loglevel >= 4:
            options.stdlog.write("# from=%i, to=%i, working on: %s\n" %
                                 (min_from, max_to, str(this_prediction)))
            options.stdlog.flush()

        # resolve overlap between different genes
        if overlap:
            noverlaps += 1
        else:
            # the current cluster is complete: process it and start a
            # new one with this prediction
            region.mSbjctToken = last_prediction.mSbjctToken
            region.mSbjctStrand = last_prediction.mSbjctStrand
            region.mSbjctGenomeFrom = min_from
            region.mSbjctGenomeTo = max_to

            region_id, nxoutput, nxfiltered = ProcessRegion(
                predictions, region_id, region,
                peptide_sequences, filter_queries)

            noutput += nxoutput
            nfiltered += nxfiltered
            nclusters += 1
            predictions = []

            min_from = this_prediction.mSbjctGenomeFrom
            max_from = this_prediction.mSbjctGenomeFrom
            min_to = this_prediction.mSbjctGenomeTo
            max_to = this_prediction.mSbjctGenomeTo

        predictions.append(this_prediction)

        min_from = min(min_from, this_prediction.mSbjctGenomeFrom)
        max_from = max(max_from, this_prediction.mSbjctGenomeFrom)
        min_to = min(min_to, this_prediction.mSbjctGenomeTo)
        max_to = max(max_to, this_prediction.mSbjctGenomeTo)

        last_prediction = this_prediction

    # flush the final cluster
    if last_prediction:
        region.mSbjctToken = last_prediction.mSbjctToken
        region.mSbjctStrand = last_prediction.mSbjctStrand
        region.mSbjctGenomeFrom = min_from
        region.mSbjctGenomeTo = max_to

        region_id, nxoutput, nxfiltered = ProcessRegion(
            predictions, region_id, region,
            peptide_sequences, filter_queries)
        noutput += nxoutput
        nfiltered += nxfiltered

        nclusters += 1

    if options.conserve_memory:
        os.remove(filename_old_predictions)
        os.remove(filename_new_predictions)
        os.remove(filename_removed_predictions)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# pairs: nread=%i, input=%i, joined=%i, clusters=%i, regions=%i, eliminated_subopt=%i, eliminated_overlap=%i, noutput=%i, nfiltered=%i\n"
            % (nread, ninput, njoined, nclusters, region_id,
               neliminated_suboptimal, neliminated_overlap,
               noutput, nfiltered))

    E.Stop()
elif o == "--report-step": param_report_step = int(a) if len(args) > 0: print USAGE, "no arguments required." sys.exit(2) print E.GetHeader() print E.GetParams() sys.stdout.flush() if param_loglevel >= 1: print "# reading exon boundaries." sys.stdout.flush() cds = Exons.ReadExonBoundaries(open(param_filename_cds, "r")) if param_loglevel >= 1: print "# read %i cds" % (len(cds)) sys.stdout.flush() ninput, npairs, nskipped = 0, 0, 0 for line in sys.stdin: if line[0] == "#": continue if line[0] == ">": print line[:-1] continue ninput += 1
if line[0] == "#": continue if line[0] == ">": continue a, b = line[:-1].split("\t")[:2] if b not in components: components[b] = [] components[b].append(a) if param_loglevel >= 1: print "# read %i components." % len(components) else: components = {'all': all_identifiers} if param_filename_exons: exons = Exons.ReadExonBoundaries( open(param_filename_exons, "r"), filter=all_mali) if param_loglevel >= 2: print "# read %i exons." % len(exons) else: exons = {} print "# PREFIX\tsummary\tNSEQUENCES\tNASSIGNED\tNCLUSTERS\tNASSIGNED\tUNASSIGNED" print "# PREFIX\tcluster\tNMEMBERS\tMEMBERS" print "# PREFIX\tfragments\tNFRAGMENTS\tFRAGMENTS" print "# PREFIX\tpide\tNPAIRS\tNAMIN\tNAMAX\tNAMEAN\tNAMEDIAN\tNASTDDEV\tAAMIN\tAAMAX\tAAMEAN\tAAMEDIAN\tAASTDDEV" print string.join(("# PREFIX", "codons", "NCLEAN", "NNOSTOPS", "ALIGNED_MIN", "ALIGNED_MAX", "ALIGNED_MEAN", "ALIGNED_MEDIAN", "ALIGNED_STDDEV", "CODONS_MIN", "CODONS_MAX", "CODONS_MEAN", "CODONS_MEDIAN", "CODONS_STDDEV",
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads predictions from stdin (lines starting with ``#`` or ``id``
    are ignored), converts each into a list of exons with
    ``Exons.Alignment2Exons``, attaches the genomic sequence of every
    exon and prints the result in the format chosen with ``--format``.
    Lines that fail to parse with a ValueError are logged and counted as
    errors.

    Exits with status 2 if positional arguments are given.

    NOTE(review): *argv* is normalised here but is not forwarded to
    ``E.Start`` in this block.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/predictions2cds.py 1858 2008-05-13 15:07:05Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-o", "--forward-coordinates",
                      dest="forward_coordinates", action="store_true",
                      help="input uses forward coordinates.")

    # fixed: "gff-match" and "gff-exon" are implemented below but were
    # missing from the choices and therefore unreachable. "gff" is kept
    # for backward compatibility; it has no branch of its own and
    # produces the default output.
    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("default", "cds", "cdnas", "map", "gff",
                               "gff-match", "gff-exon",
                               "intron-fasta", "exons"),
                      help="output format.")

    parser.add_option("-r", "--reset-to-start", dest="reset_to_start",
                      action="store_true",
                      help="move genomic coordinates to begin from 0.")

    parser.add_option("--reset-query", dest="reset_query",
                      action="store_true",
                      help="move peptide coordinates to begin from 0.")

    parser.set_defaults(genome_file=None,
                        forward_coordinates=False,
                        format="default",
                        reset_to_start=False,
                        reset_query=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        sys.stdout.write("%s no arguments required.\n" % USAGE)
        sys.exit(2)

    cds_id = 1

    entry = PredictionParser.PredictionParserEntry()

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    ninput, noutput, nskipped, nerrors = 0, 0, 0, 0

    for line in sys.stdin:

        if line[0] == "#":
            continue
        if line.startswith("id"):
            continue

        ninput += 1

        try:
            entry.Read(line)
        except ValueError as msg:
            options.stdlog.write("# parsing failed with msg %s in line %s" %
                                 (msg, line))
            nerrors += 1
            continue

        cds = Exons.Alignment2Exons(entry.mMapPeptide2Genome,
                                    query_from=entry.mQueryFrom,
                                    sbjct_from=entry.mSbjctGenomeFrom,
                                    add_stop_codon=0)

        for cd in cds:
            cd.mSbjctToken = entry.mSbjctToken
            cd.mSbjctStrand = entry.mSbjctStrand

        if cds[-1].mGenomeTo != entry.mSbjctGenomeTo:
            options.stdlog.write(
                "# WARNING: discrepancy in exon calculation!!!\n")
            for cd in cds:
                options.stdlog.write("# %s\n" % str(cd))
            options.stdlog.write("# %s\n" % entry)

        lsequence = fasta.getLength(entry.mSbjctToken)

        genomic_sequence = fasta.getSequence(entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo)

        # deal with forward coordinates: convert them to negative strand
        # coordinates
        if options.forward_coordinates and \
                entry.mSbjctStrand == "-":
            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = \
                lsequence - entry.mSbjctGenomeTo, \
                lsequence - entry.mSbjctGenomeFrom
            for cd in cds:
                cd.InvertGenomicCoordinates(lsequence)

        # attach sequence to cds
        for cd in cds:
            start = cd.mGenomeFrom - entry.mSbjctGenomeFrom
            end = cd.mGenomeTo - entry.mSbjctGenomeFrom
            cd.mSequence = genomic_sequence[start:end]

        # reset coordinates for query
        # NOTE(review): this is gated on reset_to_start while the
        # --reset-query option is never read; it looks like it should
        # test options.reset_query -- confirm before changing.
        if options.reset_to_start:
            offset = entry.mPeptideFrom
            for cd in cds:
                cd.mPeptideFrom -= offset
                cd.mPeptideTo -= offset

        # play with coordinates
        if options.reset_to_start:
            offset = entry.mSbjctGenomeFrom
            for cd in cds:
                cd.mGenomeFrom -= offset
                cd.mGenomeTo -= offset
        else:
            offset = 0

        # fixed: "cds" used to be a separate ``if`` in front of the
        # chain, so its records were followed by the default output.
        if options.format == "cds":
            for rank, cd in enumerate(cds, 1):
                cd.mQueryToken = entry.mQueryToken
                cd.mSbjctToken = entry.mSbjctToken
                cd.mSbjctStrand = entry.mSbjctStrand
                cd.mRank = rank
                sys.stdout.write(str(cd) + "\n")

        elif options.format == "exons":
            for rank, cd in enumerate(cds, 1):
                options.stdout.write("\t".join(
                    map(str, (entry.mPredictionId,
                              cd.mSbjctToken,
                              cd.mSbjctStrand,
                              rank,
                              cd.frame,
                              cd.mPeptideFrom,
                              cd.mPeptideTo,
                              cd.mGenomeFrom,
                              cd.mGenomeTo))) + "\n")

        elif options.format == "cdnas":
            sys.stdout.write("\t".join(
                map(str, (entry.mPredictionId,
                          entry.mQueryToken,
                          entry.mSbjctToken,
                          entry.mSbjctStrand,
                          entry.mSbjctGenomeFrom - offset,
                          entry.mSbjctGenomeTo - offset,
                          genomic_sequence))) + "\n")

        elif options.format == "map":
            map_prediction2genome = alignlib_lite.makeAlignmentSet()
            for cd in cds:
                alignlib_lite.addDiagonal2Alignment(
                    map_prediction2genome,
                    cd.mPeptideFrom + 1,
                    cd.mPeptideTo,
                    (cd.mGenomeFrom - offset) - cd.mPeptideFrom)

            sys.stdout.write("\t".join(
                map(str, (entry.mPredictionId,
                          entry.mSbjctToken,
                          entry.mSbjctStrand,
                          alignlib_lite.AlignmentFormatEmissions(
                              map_prediction2genome)))) + "\n")

        elif options.format == "intron-fasta":
            # a single exon has no introns
            if len(cds) == 1:
                nskipped += 1
                continue

            last = cds[0].mGenomeTo
            for rank, cd in enumerate(cds[1:], 1):
                # NOTE(review): the last field is the start of the
                # prediction, not the end of the intron
                # (cd.mGenomeFrom) -- confirm whether this is intended.
                key = "%s %i %s:%s:%i:%i" % (
                    entry.mPredictionId, rank,
                    entry.mSbjctToken, entry.mSbjctStrand,
                    last, entry.mSbjctGenomeFrom)
                sequence = genomic_sequence[
                    last - entry.mSbjctGenomeFrom:
                    cd.mGenomeFrom - entry.mSbjctGenomeFrom]
                options.stdout.write(">%s\n%s\n" % (key, sequence))
                last = cd.mGenomeTo

        elif options.format == "gff-match":
            sys.stdout.write(
                "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tTarget \"%s\" %i %i; Score %i; Introns %i; Frameshifts %i; Stops %i" %
                (entry.mSbjctToken,
                 "gpipe", "similarity",
                 entry.mSbjctGenomeFrom,
                 entry.mSbjctGenomeTo,
                 entry.mPercentIdentity,
                 entry.mSbjctStrand,
                 ".",
                 entry.mQueryToken,
                 entry.mQueryFrom,
                 entry.mQueryTo,
                 entry.score,
                 entry.mNIntrons,
                 entry.mNFrameShifts,
                 entry.mNStopCodons) + "\n")

        elif options.format == "gff-exon":
            for rank, cd in enumerate(cds, 1):
                sys.stdout.write(
                    "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tTarget \"%s\" %i %i; Score %i; Rank %i/%i; Prediction %i" %
                    (entry.mSbjctToken,
                     "gpipe", "similarity",
                     cd.mGenomeFrom,
                     cd.mGenomeTo,
                     entry.mPercentIdentity,
                     entry.mSbjctStrand,
                     ".",
                     entry.mQueryToken,
                     cd.mPeptideFrom // 3 + 1,
                     cd.mPeptideTo // 3 + 1,
                     entry.score,
                     rank,
                     len(cds),
                     entry.mPredictionId) + "\n")

        else:
            # default output: peptide coordinates are recomputed as a
            # running offset over the exon lengths
            exon_from = 0
            for cd in cds:
                cd.mPeptideFrom = exon_from
                exon_from += cd.mGenomeTo - cd.mGenomeFrom
                cd.mPeptideTo = exon_from
                sys.stdout.write("\t".join(
                    map(str, (cds_id, entry.mPredictionId,
                              cd.mPeptideFrom, cd.mPeptideTo,
                              cd.frame,
                              cd.mGenomeFrom, cd.mGenomeTo,
                              cd.mSequence))) + "\n")
                cds_id += 1

        noutput += 1

    # report the tallies and close the session opened by E.Start, as the
    # other script mains do
    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i, nerrors=%i\n" %
            (ninput, noutput, nskipped, nerrors))

    E.Stop()
def GetOrthologTranscripts(transcripts1, peptides1, cds1, transcripts2, peptides2, cds2):
    """sort out ortholog relationships between transcripts of orthologous genes.

    Orthologs have:
        the same number of exons
        compatible intron/exon boundaries

    For the remaining transcript pairs, take reciprocal best hits.

    I see the following:
    0: 0(100%), 1: 0(94%), 2: 0,1(100%)
    0: 0(100%), 1: 0,1,2(100%)

    Selecting 1-0 first, would result in a suboptimal match, because one
    transcript is longer than the other, while matching up 0-0 and 2-1 would
    be better.

    Objective function: it is the maximal matching/assignment problem. Use
    greedy implementation instead. Assign as much as possible according to
    descending weights.

    Arguments
    ---------
    transcripts1, transcripts2 :
        sequences of 5-tuples ``(prediction_id, sbjct_token, sbjct_strand,
        sbjct_from, sbjct_to)``.
    peptides1, peptides2 :
        mappings from prediction id to the sequence handed to
        ``alignlib_lite.makeSequence``.
    cds1, cds2 :
        mappings from prediction id to the exon list passed to
        ``Exons.GetExonsRange``.

    Returns
    -------
    list of ``(transcript1, transcript2, weight, alignment)`` tuples, where
    *weight* is the smaller of the two alignment coverages (in percent).

    All thresholds are read from the module level ``param_*`` settings.
    """

    def first_mapped_col(alignment, row):
        # Walk forward from *row* until a row is aligned to a column;
        # 0 means that nothing up to the end of the alignment is aligned.
        while row <= alignment.getRowTo():
            col = alignment.mapRowToCol(row)
            if col:
                return col
            row += 1
        return 0

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0)

    # for long sequence: use dot alignment with tuple size of three
    dottor = alignlib_lite.makeAlignatorTuples(3)
    alignator_dots = alignlib_lite.makeAlignatorDotsSquared(
        param_gop, param_gep, dottor)

    seqs1 = [alignlib_lite.makeSequence(peptides1[t[0]]) for t in transcripts1]
    seqs2 = [alignlib_lite.makeSequence(peptides2[t[0]]) for t in transcripts2]

    if param_loglevel >= 4:
        print("# building sequence 1")
    for i in range(len(seqs1)):
        if transcripts1[i][0] not in cds1:
            if param_loglevel >= 4:
                print("# %s not found" % transcripts1[i][0])

    if param_loglevel >= 4:
        print("# building sequence 2")
    for i in range(len(seqs2)):
        if transcripts2[i][0] not in cds2:
            if param_loglevel >= 4:
                # report the id from the second set (the original looked it
                # up in transcripts1, which is the wrong list).
                print("# %s not found" % transcripts2[i][0])

    if param_loglevel >= 4:
        print("# all-vs-all alignment")

    # do all versus all alignment
    alis1 = [[] for _ in seqs1]
    alis2 = [[] for _ in seqs2]

    if param_loglevel >= 3:
        print("#################################")
        for i in range(len(seqs1)):
            for cd in cds1[transcripts1[i][0]]:
                print("# " + str(cd))
        print("# versus")
        for i in range(len(seqs2)):
            for cd in cds2[transcripts2[i][0]]:
                print("# " + str(cd))
        sys.stdout.flush()

    # maps min coverage -> list of (i, j, alignment)
    weights = {}

    for i in range(len(seqs1)):
        (prediction_id1, sbjct_token1, sbjct_strand1,
         sbjct_from1, sbjct_to1) = transcripts1[i]
        length1 = seqs1[i].getLength()

        for j in range(len(seqs2)):
            (prediction_id2, sbjct_token2, sbjct_strand2,
             sbjct_from2, sbjct_to2) = transcripts2[j]
            length2 = seqs2[j].getLength()

            map_a2b = alignlib_lite.makeAlignmentVector()

            m = length1 * length2

            if param_loglevel >= 3:
                print("# Starting alignment of pair (%i,%i) of lengths %s:%i and %s:%i" %
                      (i, j, prediction_id1, length1, prediction_id2, length2))
                sys.stdout.flush()

            if m > param_max_matrix_size:
                # switch to tuple alignment if sequences are too large
                if param_loglevel >= 2:
                    print("# WARNING: sequences are of length %i and %i: switching to dot alignment." %
                          (length1, length2))
                    sys.stdout.flush()
                alignator_dots.align(map_a2b, seqs1[i], seqs2[j])
            else:
                alignator.align(map_a2b, seqs1[i], seqs2[j])

            coverage_a = 100.0 * \
                (map_a2b.getRowTo() - map_a2b.getRowFrom() + 1) / length1
            coverage_b = 100.0 * \
                (map_a2b.getColTo() - map_a2b.getColFrom() + 1) / length2

            # get copy of cds, but only those overlapping with alignment
            c1 = Exons.GetExonsRange(
                cds1[prediction_id1],
                (map_a2b.getRowFrom() - 1) * 3,
                (map_a2b.getRowTo()) * 3 + 1,
                full=False,
                min_overlap=param_min_alignment_exon_overlap,
                min_exon_size=param_min_exon_size)
            c2 = Exons.GetExonsRange(
                cds2[prediction_id2],
                (map_a2b.getColFrom() - 1) * 3,
                (map_a2b.getColTo()) * 3 + 1,
                full=False,
                min_overlap=param_min_alignment_exon_overlap,
                min_exon_size=param_min_exon_size)

            # check exon boundaries, look at starts, skip first exon.
            # ``//`` keeps the integer division of the original on any
            # interpreter version.
            mapped_boundaries = sorted(
                first_mapped_col(map_a2b, x.mPeptideFrom // 3 + 1)
                for x in c1[1:])
            reference_boundaries = sorted(
                x.mPeptideFrom // 3 + 1 for x in c2[1:])

            nmissed_cmp2ref = Exons.CountMissedBoundaries(
                mapped_boundaries, reference_boundaries,
                param_boundaries_max_slippage)
            nmissed_ref2cmp = Exons.CountMissedBoundaries(
                reference_boundaries, mapped_boundaries,
                param_boundaries_max_slippage)

            min_nmissed = min(nmissed_cmp2ref, nmissed_ref2cmp)

            # set is_ok for the whole thing
            is_ok = 0
            if len(c1) == 1 and len(c2) == 1:
                # no intron: is ok
                is_ok = 1
            elif min_nmissed == 0:
                is_ok = 1
            elif param_boundaries_allow_missed and \
                    len(mapped_boundaries) >= param_boundaries_allow_missed and \
                    min_nmissed <= param_boundaries_max_missed:
                # allow for missed boundaries, if
                # param_boundaries_allow_missed > 0
                is_ok = 1

            cc = min(coverage_a, coverage_b)
            if cc >= param_min_coverage:
                is_ok_coverage = 1
            else:
                is_ok_coverage = 0

            # check for missing introns
            is_ok_exons = 1
            exon_difference = abs(len(c1) - len(c2))
            if exon_difference != 0:
                if param_missing_max_missing:
                    if exon_difference > param_missing_max_missing or \
                            min(len(c1), len(c2)) < param_missing_min_present:
                        is_ok_exons = 0
                else:
                    is_ok_exons = 0

            if param_loglevel >= 3:
                print(" ".join(map(str, (
                    "# i=", i, "li=", len(c1), "j=", j, "lj=", len(c2),
                    "boundaries_ok=", is_ok,
                    "nexons_ok=", is_ok_exons,
                    "missed_c2r=", nmissed_cmp2ref,
                    "missed_r2c=", nmissed_ref2cmp,
                    "min_cov=", cc,
                    "mapped=", mapped_boundaries,
                    "reference=", reference_boundaries))))
                print("# " + "\t".join(
                    map(str, (alignlib_lite.AlignmentFormatEmissions(map_a2b),
                              map_a2b.getNumGaps(),
                              coverage_a, coverage_b))))
                sys.stdout.flush()

            # dump out pairs
            # NOTE(review): row_ali and col_ali are not assigned anywhere in
            # this function; unless they exist at module level the "all" and
            # "alignment" methods raise a NameError - confirm.
            for method in param_write_pairs:
                if method == "all":
                    print("\t".join(
                        map(str, ("pair", method,
                                  prediction_id1, prediction_id2,
                                  sbjct_token1, sbjct_strand1,
                                  sbjct_from1, sbjct_to1, length1,
                                  sbjct_token2, sbjct_strand2,
                                  sbjct_from2, sbjct_to2, length2,
                                  map_a2b.getRowFrom(), map_a2b.getRowTo(),
                                  row_ali,
                                  map_a2b.getColFrom(), map_a2b.getColTo(),
                                  col_ali,
                                  map_a2b.getNumGaps(),
                                  coverage_a, coverage_b,
                                  nmissed_cmp2ref, mapped_boundaries,
                                  nmissed_ref2cmp, reference_boundaries,
                                  i, j, len(c1), len(c2),
                                  cc, is_ok, is_ok_exons, is_ok_coverage))))
                elif method == "alignment":
                    print("\t".join(
                        map(str, ("pair", method,
                                  prediction_id1, prediction_id2,
                                  map_a2b.getRowFrom(), map_a2b.getRowTo(),
                                  row_ali,
                                  map_a2b.getColFrom(), map_a2b.getColTo(),
                                  col_ali,
                                  map_a2b.getNumGaps(),
                                  coverage_a, coverage_b))))
                elif method == "location":
                    print("\t".join(
                        map(str, ("pair", method,
                                  prediction_id1, prediction_id2,
                                  sbjct_token1, sbjct_strand1,
                                  sbjct_from1, sbjct_to1, length1,
                                  sbjct_token2, sbjct_strand2,
                                  sbjct_from2, sbjct_to2, length2))))

            if not is_ok_exons:
                if param_loglevel >= 4:
                    print("# rejected %i and %i: too many exons difference." % (i, j))
                continue

            if param_check_exon_boundaries and not is_ok:
                continue

            if cc < param_min_coverage:
                continue

            alis1[i].append((coverage_a, j))
            alis2[j].append((coverage_b, i))
            weights.setdefault(cc, []).append((i, j, map_a2b))

    # sort out alignments: greedy assignment by descending weight
    pairs = []
    assigned1 = set()
    assigned2 = set()

    if param_loglevel >= 3:
        print("# alis1= " + str(alis1))
        print("# alis2= " + str(alis2))
        print("# --------------------------------------")

    for w in sorted(weights, reverse=True):
        for i, j, map_a2b in weights[w]:
            if i not in assigned1 and j not in assigned2:
                pairs.append((transcripts1[i], transcripts2[j], w, map_a2b))
                assigned1.add(i)
                assigned2.add(j)
        # nothing left to assign once one side is exhausted
        if len(assigned1) == len(transcripts1):
            break
        if len(assigned2) == len(transcripts2):
            break

    return pairs
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads predictions line by line from ``options.stdin``, optionally extends
    each one to the neighbouring start/stop codon, optionally merges exons
    across short in-frame introns, and writes the result to
    ``options.stdout`` in the format selected by ``--output-format``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/predictions2transcripts.py 1841 2008-05-08 12:07:13Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("-o", "--output-filename-summary", dest="output_filename_summary", type="string",
                      help="filename with summary information.")

    parser.add_option("--skip-header", dest="skip_header", action="store_true",
                      help="skip header.")

    parser.add_option("--start-codon-boundary", dest="start_codon_boundary", type="int",
                      help="maximum extension for start codon (make divisible by 3).")

    parser.add_option("--stop-codon-boundary", dest="stop_codon_boundary", type="int",
                      help="maximum extension for stop codon (make divisible by 3).")

    parser.add_option("--left-extension-mode", dest="left_extension_mode", type="choice",
                      choices=("first-start", "first-stop-backtrack"),
                      help="extension mode for 5' end.")

    parser.add_option("--fill-introns", dest="fill_introns", type="int",
                      help="fill intron if divisible by three and no stop codon up to a maximum length of #.")

    parser.add_option("--introns-max-stops", dest="introns_max_stops", type="int",
                      help="maximum number of stop codons to tolerate within an intron.")

    parser.add_option("--output-format", dest="output_format", type="choice",
                      choices=("predictions", "extensions", "filled-introns"),
                      help="output format.")

    parser.set_defaults(
        genome_file="genome",
        # one-element tuple: a bare ("ATG") is a plain string, on which ``in``
        # performs substring matching (so "" or "AT" would count as a start).
        start_codons=("ATG",),
        stop_codons=("TAG", "TAA", "TGA"),
        start_codon_boundary=9999,
        stop_codon_boundary=9999,
        fill_introns=0,
        introns_max_stops=0,
        left_splice_signals=("GT",),
        right_splice_signals=("AG",),
        output_format="extensions",
        left_extension_mode="first-start",
        skip_header=False,
        output_filename_summary=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print(" ".join((str(USAGE), "no arguments required.")))
        sys.exit(2)

    options.start_codon_boundary = int(options.start_codon_boundary / 3)
    options.stop_codon_boundary = int(options.stop_codon_boundary / 3)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    p = PredictionParser.PredictionParserEntry()

    ninput, noutput = 0, 0
    nfilled = 0
    nseqs_filled = 0
    nseqs_extended = 0
    left_extensions = []
    right_extensions = []
    filled_introns = []

    if not options.skip_header:
        if options.output_format == "predictions":
            options.stdout.write(Prediction.Prediction().getHeader() + "\n")
        elif options.output_format == "filled-introns":
            options.stdout.write("\t".join(("prediction_id",
                                            "intron",
                                            "peptide_sequence",
                                            "genomic_sequence")) + "\n")

    if options.output_filename_summary:
        outfile_summary = open(options.output_filename_summary, "w")
        outfile_summary.write("id\ttype\tnumber\tlength\tfrom\tto\tsequence\n")
    else:
        outfile_summary = None

    try:
        for line in options.stdin:

            if line.startswith("#"):
                continue

            ninput += 1
            p.Read(line)

            lsequence = fasta.getLength(p.mSbjctToken)

            genome_from = max(0, p.mSbjctGenomeFrom - options.start_codon_boundary)
            genome_to = min(lsequence, p.mSbjctGenomeTo + options.stop_codon_boundary)

            genomic_sequence = fasta.getSequence(p.mSbjctToken, p.mSbjctStrand,
                                                 genome_from,
                                                 genome_to).upper()

            # Per-prediction defaults, so that the "extensions" output below
            # is well defined (and not stale from the previous prediction)
            # when no extension is attempted.
            found_start, found_stop = False, 0
            dstart, dstop = 0, 0

            ####################################################################
            # Do extensions
            if options.start_codon_boundary or options.stop_codon_boundary:

                extension_start = p.mSbjctGenomeFrom - genome_from
                extension_stop = genome_to - p.mSbjctGenomeTo

                fragment_to = extension_start + p.mSbjctGenomeTo - p.mSbjctGenomeFrom

                lfragment = len(genomic_sequence)

                ################################################################
                # find start codon
                start = extension_start
                found_start = False
                if options.left_extension_mode == "first-start":

                    found_start, start = findCodonReverse(genomic_sequence,
                                                          start,
                                                          options.start_codons,
                                                          options.stop_codons)

                elif options.left_extension_mode == "first-stop-backtrack":

                    if genomic_sequence[start:start + 3] in options.start_codons:
                        found_start = True
                    else:
                        found_start, start = findCodonReverse(genomic_sequence,
                                                              start,
                                                              options.stop_codons)

                        if found_start:
                            E.info("prediction %s: stop found at %i (%i) backtracking ..." % (
                                p.mPredictionId, start, extension_start - start))

                            # backtrack to first start codon
                            found_start = False
                            while start < extension_start:
                                start += 3
                                if genomic_sequence[start:start + 3] in options.start_codons:
                                    found_start = True
                                    break
                            else:
                                start = extension_start

                            if found_start:
                                E.info("start codon found at %i (%i)." % (
                                    start, extension_start - start))
                            else:
                                E.info("no start codon found.")
                        else:
                            E.info("prediction %s: no stop found ... backtracking to start codon." % (
                                p.mPredictionId))

                            found_start, start = findCodonReverse(genomic_sequence,
                                                                  start,
                                                                  options.start_codons)

                            # only report a failure if the search failed
                            if not found_start:
                                E.info("prediction %s: no start codon found." % (
                                    p.mPredictionId))

                if found_start:
                    start += genome_from
                else:
                    start = p.mSbjctGenomeFrom

                dstart = p.mSbjctGenomeFrom - start

                ################################################################
                # find stop codon
                # stop points to the beginning of the codon, thus the stop
                # codon will not be part of the sequence.
                stop = fragment_to
                found_stop = 0
                while stop < lfragment and \
                        genomic_sequence[stop:stop + 3] not in ("NNN", "XXX"):
                    if genomic_sequence[stop:stop + 3] in options.stop_codons:
                        found_stop = 1
                        break
                    stop += 3

                if found_stop:
                    stop += genome_from
                else:
                    stop = p.mSbjctGenomeTo

                dstop = stop - p.mSbjctGenomeTo

                ################################################################
                # build new prediction
                map_peptide2genome = []
                if dstart:
                    map_peptide2genome.append(("G", 0, dstart))
                map_peptide2genome += p.mMapPeptide2Genome
                if dstop:
                    map_peptide2genome.append(("G", 0, dstop))

                E.info("prediction %s: extension: found_start=%i, found_stop=%i, left=%i, right=%i" % (
                    p.mPredictionId, found_start, found_stop, dstart, dstop))

                # save results
                p.mMapPeptide2Genome = map_peptide2genome
                p.mAlignmentString = Genomics.Alignment2String(map_peptide2genome)
                p.mSbjctGenomeFrom -= dstart
                p.mSbjctGenomeTo += dstop
                # ``//`` keeps the integer division of the original
                p.mSbjctFrom += dstart // 3
                p.mSbjctTo += dstart // 3 + dstop // 3

                if dstart or dstop:
                    if dstart:
                        left_extensions.append(dstart)
                    if dstop:
                        right_extensions.append(dstop)
                    nseqs_extended += 1

                # update genomic sequence because borders might have changed.
                genomic_sequence = fasta.getSequence(p.mSbjctToken,
                                                     p.mSbjctStrand,
                                                     p.mSbjctGenomeFrom,
                                                     p.mSbjctGenomeTo).upper()

            ####################################################################
            # fill introns
            if options.fill_introns:

                has_filled = False

                exons = Exons.Alignment2Exons(p.mMapPeptide2Genome,
                                              query_from=0,
                                              sbjct_from=0)

                new_exons = []

                last_e = exons[0]

                nintron = 0

                for e in exons[1:]:

                    nintron += 1
                    lintron = e.mGenomeFrom - last_e.mGenomeTo

                    if lintron > options.fill_introns or lintron % 3 != 0:
                        E.debug("prediction %s: intron %i of size %i discarded." %
                                (p.mPredictionId, nintron, lintron))
                        new_exons.append(last_e)
                        last_e = e
                        continue

                    # get sequence, include also residues from split codons
                    # when checking for stop codons.
                    if e.mAlignment[0][0] == "S":
                        offset_left = last_e.mAlignment[-1][2]
                        offset_right = e.mAlignment[0][2]
                    else:
                        offset_left, offset_right = 0, 0

                    sequence = genomic_sequence[last_e.mGenomeTo - offset_left:
                                                e.mGenomeFrom + offset_right]

                    # check for splice sites
                    left_signal = any(
                        sequence[offset_left:offset_left + len(signal)] == signal
                        for signal in options.left_splice_signals)

                    # Use an explicit end index: the original slice
                    # [-(n + off):-off] is always empty for off == 0.
                    signal_end = len(sequence) - offset_right
                    right_signal = any(
                        sequence[max(0, signal_end - len(signal)):signal_end] == signal
                        for signal in options.right_splice_signals)

                    nstops, ngaps = 0, 0
                    for x in range(0, len(sequence), 3):
                        codon = sequence[x:x + 3]
                        if codon in options.stop_codons:
                            nstops += 1
                        if "N" in codon.upper():
                            ngaps += 1

                    E.debug("prediction %s: intron %i of size %i (%i-%i) (%s:%s:%i:%i): stops=%i, gaps=%i, signals=%s,%s." %
                            (p.mPredictionId, nintron, lintron,
                             offset_left, offset_right,
                             p.mSbjctToken, p.mSbjctStrand,
                             p.mSbjctGenomeFrom + last_e.mGenomeTo,
                             p.mSbjctGenomeFrom + e.mGenomeFrom,
                             nstops, ngaps,
                             left_signal, right_signal))

                    if nstops + ngaps > options.introns_max_stops:
                        new_exons.append(last_e)
                        last_e = e
                        continue

                    E.info("prediction %s: filling intron %i of size %i: stops=%i, gaps=%i, signals=%s,%s" %
                           (p.mPredictionId, nintron, lintron,
                            nstops, ngaps,
                            left_signal, right_signal))

                    e.Merge(last_e)
                    has_filled = True
                    nfilled += 1
                    last_e = e

                    if options.output_format == "filled-introns":
                        options.stdout.write("\t".join(map(str, (
                            p.mPredictionId,
                            nintron,
                            Genomics.TranslateDNA2Protein(sequence),
                            sequence))) + "\n")

                    filled_introns.append(lintron)
                    p.mNIntrons -= 1

                new_exons.append(last_e)

                # NOTE(review): the source's indentation is lost here; the
                # alignment is rebuilt only when an intron was merged.
                if has_filled:
                    nseqs_filled += 1

                    Exons.UpdatePeptideCoordinates(new_exons)

                    p.mMapPeptide2Genome = Exons.Exons2Alignment(new_exons)
                    p.mAlignmentString = Genomics.Alignment2String(
                        p.mMapPeptide2Genome)

            # build translated sequence
            p.mMapPeptide2Translation, p.mTranslation = Genomics.Alignment2PeptideAlignment(
                p.mMapPeptide2Genome, p.mQueryFrom, 0, genomic_sequence)

            # output info
            if options.output_format == "predictions":
                options.stdout.write(str(p) + "\n")
            elif options.output_format == "extensions":
                # emit the flags as 0/1
                if found_start:
                    found_start = 1
                if found_stop:
                    found_stop = 1
                options.stdout.write("\t".join(map(str, (
                    p.mPredictionId,
                    found_start, found_stop,
                    dstart, dstop,
                    p.mTranslation,
                    p.mSbjctGenomeFrom, p.mSbjctGenomeTo,
                    p.mAlignmentString))) + "\n")

            noutput += 1
            options.stdout.flush()
    finally:
        # the summary file was opened above but never released
        if outfile_summary is not None:
            outfile_summary.close()

    E.info("stats  : %s" % "\t".join(Stats.DistributionalParameters().getHeaders()))
    E.info("left   : %s" % str(Stats.DistributionalParameters(left_extensions)))
    E.info("right  : %s" % str(Stats.DistributionalParameters(right_extensions)))
    E.info("introns: %s" % str(Stats.DistributionalParameters(filled_introns)))
    E.info("ninput=%i, noutput=%i, nextended=%i, nfilled=%i, nexons_filled=%i" % (
        ninput, noutput, nseqs_extended, nseqs_filled, nfilled))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a multiple alignment from stdin, optionally annotates it with exon
    information, splits misaligned exons, removes/masks frameshifted columns
    and stop codons, and writes the pruned alignment in fasta format to
    ``options.stdout``.

    Raises ValueError if the multiple alignment read from stdin is empty.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/prune_multiple_alignment.py 2654 2009-05-06 13:51:22Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--master", dest="master", type="string",
                      help="master sequence.")

    parser.add_option("-p", "--master-pattern", dest="master_pattern", type="string",
                      help="master pattern.")

    parser.add_option("--master-species", dest="master_species", type="string",
                      help="species to use as master sequences.")

    parser.add_option("-t", "--translate", dest="filename_translation", type="string",
                      help="filename on where to store translated sequences.")

    parser.add_option("-e", "--exons", dest="filename_exons", type="string",
                      help="filename on where to exon information.")

    parser.add_option("-c", "--mark-codons", dest="mark_codons", action="store_true",
                      help="mark codons.")

    parser.add_option("-i", "--ignore-case", dest="ignore_case", action="store_true",
                      help="ignore case (otherwise: lowercase are unaligned chars).")

    parser.add_option("--remove-stops", dest="remove_stops", action="store_true",
                      help="remove stop codons.")

    parser.add_option("--mask-stops", dest="mask_stops", action="store_true",
                      help="mask stop codons.")

    parser.add_option("--mask-char", dest="mask_char", type="string",
                      help="masking character to use.")

    parser.add_option("-f", "--remove-frameshifts", dest="remove_frameshifts", action="store_true",
                      help="remove columns corresponding to frameshifts.")

    parser.add_option("--mask-master", dest="mask_master", action="store_true",
                      help="columns in master to be removed are masked to keep residue numbering.")

    parser.add_option("-s", "--split-exons", dest="split_exons", action="store_true",
                      help="split columns aligned to different exons in the same gene.")

    parser.add_option("-a", "--target", dest="target", type="choice",
                      choices=("paml", ),
                      help="perform cleaning up for certain targets.")

    parser.set_defaults(
        gap_char="-",
        mask_char="n",
        gap_chars="-.",
        separator="|",
        master=None,
        master_species=None,
        filename_translation=None,
        filename_exons=None,
        master_pattern=None,
        remove_stops=False,
        mark_codons=False,
        mask_unaligned=False,
        split_exons=False,
        remove_frameshifts=False,
        min_segment_length=5,
        ignore_case=False,
        mask_stops=False,
        target=None,
        mask_master=False,
    )

    (options, args) = E.Start(parser)

    if options.target == "paml":
        options.mask_stops = True
        options.mask_char = "n"
        options.remove_frameshifts = True
        if options.loglevel >= 1:
            options.stdlog.write(
                "# setting output to paml : removing frameshifts, masking stops with '%s'.\n" %
                (options.mask_char))

    # 1. read multiple alignment in fasta format
    mali = Mali.Mali()

    mali.readFromFile(sys.stdin)

    if options.loglevel >= 1:
        options.stdlog.write("# read mali with %i entries.\n" % len(mali))

    if len(mali) == 0:
        # string exceptions are rejected by Python >= 2.6; raise a real one
        raise ValueError("empty multiple alignment")

    identifiers = mali.getIdentifiers()

    masters = []
    if options.master:
        masters = options.master.split(",")
    elif options.master_pattern:
        for seq_id in identifiers:
            if re.search(options.master_pattern, seq_id):
                masters.append(seq_id)
    elif options.master_species:
        for seq_id in identifiers:
            if options.master_species == seq_id.split(options.separator)[0]:
                masters.append(seq_id)
    else:
        masters.append(identifiers[0])

    if options.loglevel >= 2:
        options.stdlog.write("# master sequences are: %s\n" % str(masters))
        options.stdlog.flush()

    if options.filename_exons:
        infile = open(options.filename_exons, "r")
        try:
            exons = Exons.ReadExonBoundaries(infile,
                                             filter=set(identifiers),
                                             from_zero=True)
        finally:
            infile.close()

        if options.loglevel >= 2:
            options.stdlog.write("# read exons %i sequences.\n" % len(exons))
    else:
        exons = {}

    ##########################################################################
    # translate characters to upper/lower case according to exon info.
    ##########################################################################
    if exons:
        for seq_id in identifiers:
            if seq_id in exons:
                mali.getSequence(seq_id).mString = AddExonInformation(
                    mali[seq_id], exons[seq_id], mask_char=options.mask_char)
    elif options.ignore_case:
        # convert all to uppercase
        mali.upper()

    ##########################################################################
    # untangle misaligned exons
    ##########################################################################
    if exons and options.split_exons:

        # first split with masters
        if len(masters) > 0:
            SplitExons(mali, exons, masters=masters, options=options)

            if options.loglevel >= 4:
                logfile = open("log_mali1", "w")
                try:
                    mali.writeToFile(logfile, format="fasta")
                finally:
                    logfile.close()

        SplitExons(mali, exons, options)

    ##########################################################################
    # remove frameshifts
    ##########################################################################
    if options.remove_frameshifts:

        out_of_frame_columns = []
        if len(masters) == 1:

            frame_columns = GetFrameColumns(mali, masters[0],
                                            gap_chars=options.gap_chars)

        else:

            columns = []

            for seq_id in masters:
                columns += GetFrameColumns(mali, seq_id,
                                           gap_chars=options.gap_chars)

            if len(columns) == 0:
                columns += GetFrameColumns(mali, identifiers[0],
                                           gap_chars=options.gap_chars)

            # sort all columns by tuple. The "shortest" codon will be first:
            # (1,2,3) before (1,2,100), and (1,2,100) before (1,3,4).
            # (key sort on (first, last) column; stable like the cmp version)
            columns.sort(key=lambda codon: (codon[0], codon[2]))

            # select codons
            frame_columns = []
            last_codon = columns[0]

            for codon in columns[1:]:
                # skip identical codons
                if codon == last_codon:
                    continue

                # take first (shortest) codon in case of identical first
                # residue
                if codon[0] == last_codon[0]:
                    continue

                # if not overlapping, keep
                if codon[0] > last_codon[2]:
                    frame_columns.append(last_codon)
                else:
                    # if overlapping, but out of register: skip
                    out_of_frame_columns += last_codon

                last_codon = codon

            frame_columns.append(last_codon)

        # build set of skipped columns
        frame_set = set()
        for column in frame_columns:
            for c in column:
                frame_set.add(c)

        # columns that contain a master sequence that is out of
        # frame
        out_of_frame_set = set(out_of_frame_columns)
        out_of_frame_set = out_of_frame_set.difference(frame_set)

        if options.loglevel >= 1:
            options.stdlog.write("# found %i/%i columns in frame\n" %
                                 (len(frame_columns) * 3, mali.getWidth()))

        if options.loglevel >= 5:
            options.stdlog.write("# frame columns: %i\n" % (len(frame_columns)))
            x = 0
            for column in frame_columns:
                options.stdlog.write("# %i\t%s\n" %
                                     (x, ",".join(map(str, column))))
                x += 1

        if options.loglevel >= 5:
            options.stdlog.write(
                "# Out-of frame columns with residue of masters: %i\n" %
                (len(out_of_frame_set)))
            options.stdlog.write("# %s" % ",".join(map(str, out_of_frame_columns)))

        to_delete = []

        for seq_id in identifiers:

            ngaps, nmasked = 0, 0

            sequence = mali.getSequence(seq_id).mString

            if options.loglevel >= 7:
                options.stdlog.write(
                    "# processing sequence %s of length %i with gaps\n" %
                    (seq_id, len(sequence)))

            # simply mask all characters that are to be skipped
            fragments = []
            nstops, ncodons, naligned = 0, 0, 0

            codon = []
            chars = []

            for x in range(len(sequence)):
                c = sequence[x]

                # delete columns that do not align to
                # a master.
                if x not in frame_set and x not in out_of_frame_set:
                    continue

                chars.append(c)
                if c not in options.gap_chars:
                    codon.append(c)

                # Also true for an empty codon, i.e. for gap columns seen
                # before the first residue of the next codon.
                if len(codon) % 3 == 0:
                    codon = "".join(codon)
                    codon_is_ok, codon_is_aligned, codon_is_all_gaps = checkCodon(
                        codon, options)

                    if codon_is_aligned:
                        naligned += 1

                    to_mask = False
                    if codon_is_all_gaps:
                        ngaps += len(chars)
                    elif codon_is_ok:
                        ncodons += 1
                        if codon.upper() in ("TAG", "TAA", "TGA"):
                            nstops += 1
                            to_mask = True
                    else:
                        to_mask = True
                        nmasked += 1

                    if to_mask:
                        for i in range(len(chars)):
                            if chars[i] not in options.gap_chars:
                                chars[i] = options.mask_char

                    fragments.append("".join(chars))
                    chars = []
                    codon = []

            # mask incomplete codons at the end
            if chars:
                for i in range(len(chars)):
                    if chars[i] not in options.gap_chars:
                        chars[i] = options.mask_char
                fragments.append("".join(chars))

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sequence: %s\tpositions: %i\taligned:%i\tcodons: %i\t stops: %i\tgaps: %i\tnmasked: %i\n" %
                    (seq_id, len(fragments), naligned, ncodons, nstops, ngaps, nmasked))
                options.stdlog.flush()

            # postpone deletion in order to not
            # confuse the iteration of ids
            if naligned == 0:
                options.stdlog.write(
                    "# sequence: %s removed because there are no aligned nucleotides.\n" % seq_id)
                to_delete.append(seq_id)
            elif ncodons == 0:
                options.stdlog.write(
                    "# sequence: %s removed because there are no aligned codons.\n" % seq_id)
                to_delete.append(seq_id)
            else:
                mali.setSequence(seq_id, "".join(fragments))

        for seq_id in to_delete:
            del mali[seq_id]

    for seq_id in identifiers:
        if options.mark_codons:
            a = mali[seq_id]
            s = " ".join([a[x:x + 3] for x in range(0, len(a), 3)])
        else:
            s = mali[seq_id]
        options.stdout.write(">%s\n%s\n" % (seq_id, s))

    if options.filename_translation:
        outfile = open(options.filename_translation, "w")
        try:
            for seq_id in mali.keys():
                outfile.write(">%s\n%s\n" %
                              (seq_id, Genomics.TranslateDNA2Protein(mali[seq_id])))
        finally:
            outfile.close()

    E.Stop()
alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep) map_reference2target = alignlib_lite.makeAlignmentVector() assignment_id = 0 for line in cr.fetchall(): reference = PredictionParser.PredictionParserEntry() reference.FillFromTable(line) ct = dbhandle.cursor() ct.execute(statement % (param_tablename_predictions_target, reference.mSbjctToken, reference.mSbjctStrand, reference.mSbjctGenomeFrom, reference.mSbjctGenomeTo)) reference_exons = Exons.Alignment2Exons(reference.mMapPeptide2Genome, 0, reference.mSbjctFrom) for line2 in ct.fetchall(): target = PredictionParser.PredictionParserEntry() target.FillFromTable(line2) target_exons = Exons.Alignment2Exons(target.mMapPeptide2Genome, 0, target.mSbjctFrom) ## check for exon overlap rr, tt = 0, 0 overlap = 0 while rr < len(reference_exons) and tt < len(target_exons): r = reference_exons[rr] t = target_exons[tt]
def main(argv=None):
    """Script main: select representative transcripts from a set of predictions.

    Parses command line options in sys.argv, unless *argv* is given.

    The function works through the following stages:

    1. Read transcripts in GTF format from stdin. Each entry must carry the
       attributes ``pid``, ``qcov``, ``gene_id``, ``transcript_id``,
       ``xstart``, ``xend`` and ``class``.
    2. Open the optional indexed peptide, extended peptide and genome files.
    3. Read the exon table (``--exons``) and cluster the transcripts by exon
       identity and by exon overlap.
    4. Optionally eliminate transcripts using the filter set
       (``--filter-transcripts``) and gene spanners
       (``--remove-gene-spanners``).
    5. Sort the transcripts by position of their class in
       ``quality_hierarchy``, length of the extended peptide and
       query coverage * percent identity.
    6. Optionally eliminate exon swoppers and suboptimal predictions.
    7. Walk through the sorted transcripts; every transcript that has not
       been eliminated is reported as a representative and the entries
       made redundant by it are eliminated.

    Output:
        * ``options.stdout``: a table with the columns ``rep`` and
          ``comment`` (number of entries eliminated by the representative).
        * ``--output-members`` file (or stdout): one line per representative
          plus whatever ``PrintMembers`` writes for eliminated entries.
        * ``options.stdlog``: progress messages and the final counts.

    Raises:
        ValueError: if the exon table contains no exons, or if
            ``--remove-exon-swoppers`` / ``--remove-gene-spanners`` is
            requested without exon information.
    """

    if argv is None:
        argv = sys.argv
    # NOTE(review): argv is not forwarded to E.Start() below - confirm
    # that E.Start falls back to sys.argv on its own.

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/select_transcripts.py 2263 2008-11-17 16:36:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--overlap", dest="overlap_residues", type="int",
                      help="overlap residues.")
    parser.add_option("-t", "--filter-tokens", dest="filename_filter_tokens",
                      type="string",
                      help="filename to filter tokens.")
    parser.add_option("-i", "--exon-identity", dest="exon_identity",
                      action="store_true",
                      help="exon identity.")
    parser.add_option("--exons", dest="filename_exons", type="string",
                      help="filename with exon information.")
    parser.add_option("-m", "--output-members", dest="filename_members",
                      type="string",
                      help="output filename with members.")
    parser.add_option("--overlap-id", dest="overlap_id", action="store_true",
                      help="overlap id.")
    parser.add_option("-s", "--remove-spanning",
                      dest="remove_spanning_predictions", action="store_true",
                      help="remove spanning predictions.")
    parser.add_option("-c", "--remove-complement",
                      dest="remove_complementary_predictions",
                      action="store_true",
                      help="remove complementary predictions.")
    parser.add_option("--remove-exon-swoppers", dest="remove_exon_swoppers",
                      action="store_true",
                      help="remove exon swoppers.")
    parser.add_option("--remove-gene-spanners", dest="remove_gene_spanners",
                      action="store_true",
                      help="remove gene spanners.")
    parser.add_option("--remove-suboptimal", dest="remove_suboptimal",
                      action="store_true",
                      help="remove suboptimal predictions.")
    parser.add_option("-p", "--peptides", dest="filename_peptides",
                      type="string",
                      help="filename with peptide information.")
    parser.add_option("--extended-peptides", dest="filename_extended_peptides",
                      type="string",
                      help="filename with peptide information - after extension.")
    parser.add_option("--test", dest="test_nids", type="string",
                      help="test nids.")

    # filter options
    parser.add_option("--filter-transcripts",
                      dest="filter_filename_transcripts", type="string",
                      help="filename with transcripts that are used to filter.")
    parser.add_option("--filter-remove-spanning",
                      dest="filter_remove_spanning", action="store_true",
                      help="remove all transcripts that span the filter set.")
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic data (indexed).")
    parser.add_option("--discard-large-clusters",
                      dest="discard_large_clusters", type="int",
                      help="if set discard clusters bigger than this size (patch) [default=%default].")

    parser.set_defaults(
        filename_members=None,
        filename_peptides=None,
        filename_extended_peptides=None,
        filename_exons=None,
        quality_hierarchy=("CG", "PG", "SG", "RG", "CP", "PP", "SP", "RP",
                           "CF", "PF", "SF", "UG", "UP", "UF", "BF", "UK"),
        # Classes, where redundancy is removed by similarity. When exon
        # structure is not conserved, I can't predict alternative splice
        # variants, so remove the redundancy.
        quality_exclude_same=("UG", "UP", "UF", "BF", "UK"),
        quality_genes=("CG", "SG", "PG", "RG", "UG"),
        # class that can be removed in spanning/complementary predictions
        quality_remove_dubious=("UG", "UP", "UF", "BF", "UK"),
        # class that is required for defining exon swopper event
        quality_remove_exon_swopper=("CG", "PG"),
        # class that will be kept, in spite of being an exon swopper.
        quality_keep_exon_swopper=(),
        # class that is required for removing gene spanners.
        # The trailing comma matters: ("CG") is just the string "CG" and
        # membership tests against it would match substrings.
        quality_remove_gene_spanners=("CG",),
        # class that will be kept, in spite of being a gene spanner
        quality_keep_gene_spanners=(),
        # class that is required for defining suboptimal matches
        quality_remove_suboptimal=("CG", "PG"),
        # class that will be kept, in spite of being a suboptimal match
        quality_keep_suboptimal=(),
        # gap penalties
        gop=-10.0,
        gep=-1.0,
        # maximum number of gaps to allow in alignment
        max_gaps=20,
        # threshold of percent identity that allows to remove a prediction
        # of a lower class.
        # This allows for insertions/deletions
        min_identity=98,
        # threshold of percent identity that allows to remove a prediction
        # of a non-gene by a gene
        min_identity_non_genes=80,
        # safety threshold: do not remove, if coverage of member is by x
        # better than representative
        safety_pide=10,
        safety_coverage=10,
        overlap_id=False,
        remove_spanning_predictions=False,
        remove_exon_swoppers=False,
        remove_gene_spanners=False,
        remove_suboptimal=False,
        # nids to use for testing
        test_nids=None,
        # remove members with less than maximum coverage
        max_member_coverage=90,
        # maximum allowable exon slippage
        max_slippage=9,
        # minimum difference in identity for suboptimal predictions to be
        # removed.
        suboptimal_min_identity_difference=10,
        # filter options
        filter_filename_transcripts=None,
        filter_remove_spanning=True,
        filter_remove_spanning_both_strands=True,
        genome_file=None,
        discard_large_clusters=None)

    (options, args) = E.Start(parser, add_psql_options=True)

    if options.test_nids:
        options.test_nids = options.test_nids.split(",")

    # list of eliminated predictions
    eliminated_predictions = {}

    if options.filename_members:
        outfile_members = open(options.filename_members, "w")
    else:
        outfile_members = sys.stdout

    ######################################################
    # data
    ######################################################
    data = []

    class Entry:
        """The attributes of one GTF record that are used for selection."""

        def __init__(self, gff):
            self.mPid = float(gff["pid"])
            self.mQueryCoverage = float(gff["qcov"])
            self.gene_id = gff['gene_id']
            self.transcript_id = gff['transcript_id']
            self.mExtendedStart = int(gff['xstart'])
            self.mExtendedEnd = int(gff['xend'])
            self.start = gff.start
            self.contig = gff.contig
            self.strand = gff.strand
            self.end = gff.end
            self.mQuality = gff['class']

    for gff in GTF.iterator(sys.stdin):
        data.append(Entry(gff))

    if options.loglevel >= 1:
        options.stdlog.write("# read %i transcripts.\n" % len(data))
        options.stdlog.flush()

    ######################################################
    # read peptide sequences
    ######################################################
    if options.loglevel >= 1:
        options.stdlog.write("# loading peptide databases ... ")
        options.stdlog.flush()

    if options.filename_peptides:
        peptides = IndexedFasta.IndexedFasta(options.filename_peptides)
        peptide_lengths = peptides.getContigSizes()
    else:
        peptide_lengths = {}
        peptides = {}

    ######################################################
    # read extended peptide sequences
    ######################################################
    if options.filename_extended_peptides:
        extended_peptides = IndexedFasta.IndexedFasta(
            options.filename_extended_peptides)
    else:
        extended_peptides = {}

    if options.loglevel >= 1:
        options.stdlog.write("finished\n")
        options.stdlog.flush()

    ######################################################
    # open genome file
    ######################################################
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()
    else:
        contig_sizes = {}

    ######################################################
    # reading exons, clustering and formatting them.
    ######################################################
    if options.filename_exons:
        if options.loglevel >= 1:
            options.stdlog.write("# reading exon boundaries ... ")
            options.stdlog.flush()

        ids = [x.transcript_id for x in data]

        exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"),
                                         contig_sizes=contig_sizes,
                                         filter=set(ids))

        if options.loglevel >= 1:
            options.stdlog.write(
                "done - read exons for %i transcripts\n" % (len(exons)))

        if len(exons) == 0:
            raise ValueError("no exons found in table.")

        # flag terminal exons
        Exons.SetRankToPositionFlag(exons)

        identity_map_cluster2transcripts, identity_map_transcript2cluster = \
            Exons.ClusterByExonIdentity(exons,
                                        max_terminal_num_exons=3,
                                        max_slippage=options.max_slippage,
                                        loglevel=options.loglevel)

        overlap_map_cluster2transcripts, overlap_map_transcript2cluster = \
            Exons.ClusterByExonOverlap(exons,
                                       min_overlap=10,
                                       loglevel=options.loglevel)
    else:
        exons = {}

    ######################################################
    nrepresentatives, nmembers, neliminated = 0, 0, 0
    eliminated_by_method = {}

    ######################################################
    # read filter transcripts and apply filters
    ######################################################
    if options.filter_filename_transcripts:
        if options.loglevel >= 1:
            options.stdlog.write(
                "# reading exon boundaries for filter set ... ")
            options.stdlog.flush()

        filter_exons = Exons.ReadExonBoundaries(
            open(options.filter_filename_transcripts, "r"),
            delete_missing=True,
            contig_sizes=contig_sizes)

        if options.loglevel >= 1:
            options.stdlog.write(
                "done - read exons for %i transcripts\n" % (len(filter_exons)))

        t = time.time()
        eliminated = FilterEliminateOverlappingTranscripts(
            exons, filter_exons, eliminated_predictions, contig_sizes, options)

        n = PrintMembers(0, outfile_members, eliminated, eliminated_by_method)
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write(
                "# removed %i transcripts overlapping or spanning transcripts in %i seconds.\n" %
                (n, time.time() - t))
            options.stdlog.flush()

    # both removal steps below need exon information
    if options.remove_exon_swoppers and not exons:
        raise ValueError(
            "please specify exon table if using --remove-exon-swoppers.")

    if options.remove_gene_spanners and not exons:
        raise ValueError(
            "please specify exon table if using --remove-gene-spanners.")

    ###########################################################################
    # remove predictions spanning other predictions but do not overlap with
    # them on an exon level.
    if options.remove_gene_spanners and exons:
        if options.loglevel >= 1:
            options.stdlog.write("# removing gene spanners\n")
            options.stdlog.flush()

        t = time.time()
        eliminated = EliminateGeneSpanners(
            data, eliminated_predictions, exons, options)

        n = PrintMembers(0, outfile_members, eliminated, eliminated_by_method)
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write(
                "# removed %i gene spanners in %i seconds\n" %
                (n, time.time() - t))
            options.stdlog.flush()

    ###########################################################################
    # sort data by quality, length of prediction and coverage * pid
    if options.loglevel >= 1:
        options.stdlog.write("# sorting data\n")
        options.stdlog.flush()

    # rank of each quality class within the hierarchy
    map2pos = dict((quality, pos)
                   for pos, quality in enumerate(options.quality_hierarchy))

    # NOTE(review): extended_peptides is an empty dict when
    # --extended-peptides is not given, in which case the lookup below
    # raises KeyError for the first transcript - confirm whether the
    # option is meant to be mandatory.
    data.sort(key=lambda x: (map2pos[x.mQuality],
                             len(extended_peptides[x.transcript_id]),
                             x.mQueryCoverage * x.mPid))

    # build map of prediction to quality
    map_prediction2data = {}
    for d in data:
        map_prediction2data[d.transcript_id] = d

    if options.loglevel >= 1:
        options.stdlog.write("# sorting data finished\n")
        options.stdlog.flush()

    ###########################################################################
    # remove predictions joining two other complete non-overlapping
    # predictions
    if options.remove_exon_swoppers and exons:
        if options.loglevel >= 1:
            options.stdlog.write("# removing exon swoppers\n")
            options.stdlog.flush()

        eliminated = EliminateExonSwoppers(
            data,
            eliminated_predictions,
            identity_map_transcript2cluster,
            identity_map_cluster2transcripts,
            map_prediction2data,
            exons,
            options)

        n = PrintMembers(0, outfile_members, eliminated, eliminated_by_method)
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write("# removed %i exon swoppers\n" % n)
            options.stdlog.flush()

    ###########################################################################
    # remove suboptimal predictions
    if options.remove_suboptimal and exons:
        if options.loglevel >= 1:
            options.stdlog.write("# removing suboptimal predictions\n")
            options.stdlog.flush()

        t = time.time()
        eliminated = EliminateSuboptimalPredictions(
            data,
            eliminated_predictions,
            overlap_map_transcript2cluster,
            overlap_map_cluster2transcripts,
            map_prediction2data,
            exons,
            options)

        n = PrintMembers(0, outfile_members, eliminated, eliminated_by_method)
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write(
                "# removed %i suboptimal predictions in %i seconds\n" %
                (n, time.time() - t))
            options.stdlog.flush()

    ###########################################################################
    # remove redundant predictions
    l = len(data)
    options.report_step = max(1, int(l / 100))

    t2 = time.time()

    # quality classes that have been completely processed so far
    last_quality = None
    qualities = []

    options.stdout.write("%s\t%s\n" % ("rep", "comment"))

    for x in range(l):

        if options.loglevel >= 1:
            if x % options.report_step == 0:
                options.stdlog.write(
                    "# process: %i/%i = %i %%, %i/%i = %i %% in %i seconds\n" %
                    (x + 1, l,
                     int(100 * (x + 1) / l),
                     len(eliminated_predictions), l,
                     100 * len(eliminated_predictions) / l,
                     time.time() - t2))
                options.stdlog.flush()

        rep = data[x]
        rep_id, rep_quality = rep.transcript_id, rep.mQuality

        if rep_id in eliminated_predictions:
            continue

        # a change of class closes the previous class
        if rep_quality != last_quality:
            if last_quality:
                qualities.append(last_quality)
            last_quality = rep_quality

        if options.loglevel >= 2:
            options.stdlog.write(
                "# processing prediction %s|%s\n" % (rep_id, rep_quality))
            options.stdlog.flush()

        eliminated = []

        if options.overlap_id:
            eliminated += EliminateRedundantEntriesByOverlap(
                rep,
                data[x + 1:],
                eliminated_predictions,
                options,
                peptides,
                extended_peptides,
                filter_quality=qualities,
                this_quality=rep_quality)
        else:
            eliminated += EliminateRedundantEntriesByRange(
                rep,
                data,
                eliminated_predictions,
                options,
                peptides,
                extended_peptides,
                filter_quality=qualities,
                this_quality=rep_quality)

        options.stdout.write("%s\t%i\n" % (rep_id, len(eliminated)))

        if outfile_members:
            outfile_members.write(
                "%s\t%s\tm\n" % (str(rep_id), str(rep_id)))
            nrepresentatives += 1
            nmembers += PrintMembers(
                rep_id, outfile_members, eliminated, eliminated_by_method)

    if outfile_members != sys.stdout:
        outfile_members.close()

    options.stdlog.write(
        "# representatives=%i, members=%i, eliminated=%i, total=%i\n" %
        (nrepresentatives, nmembers, neliminated,
         nrepresentatives + nmembers + neliminated))

    options.stdlog.write("# elimination by method:\n")

    for v, c in eliminated_by_method.items():
        options.stdlog.write("# method=%s, count=%i\n" % (v, c))

    E.Stop()
def CheckExonSwop(rep_id, exons, eliminated_predictions, other_ids,
                  map_prediction2data, options):
    """Check whether prediction *rep_id* is an exon swopper.

    An exon swop occurs if this prediction joins two other predictions
    that do not overlap each other. Both of the joined predictions need to
    be of a class listed in ``options.quality_remove_exon_swopper`` and
    neither of them may be fully covered by *rep_id*.

    Arguments:
        rep_id: identifier of the prediction to analyse.
        exons: map of prediction identifier to its exons.
        eliminated_predictions: container of identifiers that have been
            eliminated already; these are ignored.
        other_ids: identifiers of the predictions to check against.
        map_prediction2data: map of prediction identifier to an object
            with an ``mQuality`` attribute.
        options: object providing ``loglevel``, ``stdlog``,
            ``max_slippage`` and ``quality_remove_exon_swopper``.

    Returns:
        True if *rep_id* joins two predictions from *other_ids*, False
        otherwise.
    """

    def has_required_class(prediction_id):
        return map_prediction2data[prediction_id].mQuality in \
            options.quality_remove_exon_swopper

    def are_disjoint(id1, id2):
        return not Exons.CheckOverlap(exons[id1], exons[id2])

    def are_joined_by_rep(id1, id2):
        return Exons.CheckCoverageAinB(exons[rep_id],
                                       exons[id1] + exons[id2],
                                       min_terminal_num_exons=0,
                                       min_terminal_exon_coverage=0.7,
                                       max_slippage=options.max_slippage)

    # get predictions which overlap by exons (but not completely):
    overlaps = []
    for other_id in other_ids:
        if other_id == rep_id or other_id in eliminated_predictions:
            continue
        if Exons.CheckOverlap(exons[rep_id], exons[other_id]) and \
                not Exons.CheckCoverage(exons[rep_id], exons[other_id],
                                        max_slippage=options.max_slippage):
            overlaps.append(other_id)

    if options.loglevel >= 3:
        options.stdlog.write(
            "# exon swop: %s overlaps with %i out of %i predictions\n" %
            (rep_id, len(overlaps), len(other_ids)))
        options.stdlog.flush()

    # test all unordered pairs of overlapping predictions
    for pos, id1 in enumerate(overlaps):
        for id2 in overlaps[pos + 1:]:

            if options.loglevel >= 4:
                # All criteria are needed for the log line, so compute them
                # once and reuse them for the decision.
                # NOTE(review): this assumes the Exons.Check* helpers have
                # no side effects - confirm.
                criteria = (has_required_class(id1),
                            has_required_class(id2),
                            are_disjoint(id1, id2),
                            are_joined_by_rep(id1, id2))
                options.stdlog.write(
                    "# exon swop: %s ? %s + %s: %s %s %s %s\n" %
                    ((rep_id, id1, id2) + criteria))
                is_swop = all(criteria)
            else:
                # evaluate lazily: the exon comparisons are only run for
                # pairs of the required class
                is_swop = has_required_class(id1) and \
                    has_required_class(id2) and \
                    are_disjoint(id1, id2) and \
                    are_joined_by_rep(id1, id2)

            if is_swop:
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# elimination: %s(%s) joins %s(%s) and %s(%s)\n" %
                        (rep_id, map_prediction2data[rep_id].mQuality,
                         id1, map_prediction2data[id1].mQuality,
                         id2, map_prediction2data[id2].mQuality))
                return True

    return False
def main(argv=None):
    """Script main: compare predicted exons with reference exon boundaries.

    Parses command line options in sys.argv, unless *argv* is given.

    Predictions are read line by line from stdin (lines starting with
    ``#`` are skipped). For each prediction the exons derived from its
    peptide-to-genome alignment are walked in parallel with the reference
    exons of the query (``--boundaries``) and identical, inserted and
    deleted exons/introns as well as truncated termini are counted.

    Output:
        * ``options.stdout``: one summary row per prediction.
        * ``--exons`` file (optional): one row per compared exon pair.
        * warnings are written to stdout / ``options.stdlog``.

    The script exits with status 2 if positional arguments are given and
    with status 1 if an input line can not be parsed.
    """

    if argv is None:
        argv = sys.argv
    # NOTE(review): argv is not forwarded to E.Start() below - confirm
    # that E.Start falls back to sys.argv on its own.

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/compare_predictions2exons.py 2011 2008-07-04 10:40:51Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")
    parser.add_option("-b", "--boundaries", dest="filename_boundaries",
                      type="string",
                      help="filename with exon boundaries.")
    parser.add_option("-e", "--exons", dest="filename_exons", type="string",
                      help="filename with exons (output).")
    parser.add_option("-p", "--peptides", dest="filename_peptides",
                      type="string",
                      help="filename with peptide sequences.")
    parser.add_option(
        "-w", "--write-notfound", dest="write_notfound", action="store_true",
        help="print exons for predictions not found in reference.")
    parser.add_option("-q", "--quality-pide", dest="quality_threshold_pide",
                      type="int",
                      help="quality threshold (pide) for exons.")

    parser.set_defaults(
        genome_file="genome",
        filename_boundaries=None,
        filename_exons=None,
        filename_peptides=None,
        quality_threshold_pide=0,
        write_notfound=False,
        # allowed number of nucleotides for exon boundaries to
        # be considered equivalent.
        slipping_exon_boundary=9,
        # stop codons to search for
        stop_codons=("TAG", "TAA", "TGA"),
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print("%s no arguments required." % USAGE)
        sys.exit(2)

    reference_exon_boundaries = {}
    if options.filename_boundaries:
        reference_exon_boundaries = Exons.ReadExonBoundaries(
            open(options.filename_boundaries, "r"),
            do_invert=1,
            remove_utr=1)
        E.info("read exon boundaries for %i queries" %
               len(reference_exon_boundaries))

    if options.filename_exons:
        outfile_exons = open(options.filename_exons, "w")
        outfile_exons.write("%s\n" % "\t".join(
            ("prediction_id", "exon_id", "exon_from", "exon_to",
             "exon_frame", "reference_id", "reference_from", "reference_to",
             "reference_phase", "pidentity", "psimilarity", "nframeshifts",
             "ngaps", "nstopcodons", "is_ok", "genome_exon_from",
             "genome_exon_to")))
    else:
        outfile_exons = None

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            open(options.filename_peptides, "r"))
        E.info("read peptide sequences for %i queries" %
               len(peptide_sequences))
    else:
        peptide_sequences = {}

    entry = PredictionParser.PredictionParserEntry()
    last_filename_genome = None

    nfound, nmissed_exons, nmissed_length = 0, 0, 0
    nempty_alignments = 0

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    options.stdout.write("%s\n" % "\t".join(
        ("prediction_id", "number", "dubious_exons", "boundaries_sum",
         "boundaries_max", "identical_exons", "inserted_exons",
         "deleted_exons", "inserted_introns", "deleted_introns",
         "truncated_Nterminus", "truncated_Cterminus", "deleted_Nexons",
         "deleted_Cexons", "inserted_Nexons", "inserted_Cexons")))

    for line in sys.stdin:

        if line[0] == "#":
            continue

        try:
            entry.Read(line)
        except ValueError as msg:
            print("# parsing failed with msg %s in line %s" %
                  (msg, line[:-1]))
            sys.exit(1)

        exons = Genomics.Alignment2ExonBoundaries(
            entry.mMapPeptide2Genome,
            query_from=entry.mQueryFrom,
            sbjct_from=entry.mSbjctGenomeFrom,
            add_stop_codon=0)

        if exons[-1][4] != entry.mSbjctGenomeTo:
            print("# WARNING: discrepancy in exon calculation!!!")
            for e in exons:
                print("# %s" % str(e))
            print("# %s" % str(entry))

        if options.loglevel >= 5:
            for e in exons:
                print("# %s" % str(e))

        genomic_fragment = fasta.getSequence(entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo)

        skip = False
        if entry.mQueryToken in peptide_sequences:

            query_sequence = alignlib_lite.makeSequence(
                peptide_sequences[entry.mQueryToken])
            sbjct_sequence = alignlib_lite.makeSequence(entry.mTranslation)

            percent_similarity, percent_identity = 0, 0
            if query_sequence.getLength() < \
                    entry.mMapPeptide2Translation.getRowTo():
                print("# WARNING: query sequence %s is too short: %i %i" % (
                    entry.mQueryToken,
                    query_sequence.getLength(),
                    entry.mMapPeptide2Translation.getRowTo()))
                sys.stdout.flush()
                nmissed_length += 1
                skip = True

            elif sbjct_sequence.getLength() < \
                    entry.mMapPeptide2Translation.getColTo():
                print("# WARNING: sbjct sequence %s is too short: %i %i" % (
                    entry.mSbjctToken,
                    sbjct_sequence.getLength(),
                    entry.mMapPeptide2Translation.getColTo()))
                sys.stdout.flush()
                nmissed_length += 1
                skip = True
            else:
                alignlib_lite.rescoreAlignment(
                    entry.mMapPeptide2Translation,
                    query_sequence,
                    sbjct_sequence,
                    alignlib_lite.makeScorer(query_sequence, sbjct_sequence))
                percent_identity = alignlib_lite.calculatePercentIdentity(
                    entry.mMapPeptide2Translation,
                    query_sequence,
                    sbjct_sequence) * 100
                percent_similarity = alignlib_lite.calculatePercentSimilarity(
                    entry.mMapPeptide2Translation) * 100

            # values are given in the order announced by the message:
            # identity first, then similarity
            E.debug(
                "prediction %s: percent identity/similarity: before=%5.2f/%5.2f, realigned=%5.2f/%5.2f" %
                (str(entry.mPredictionId),
                 entry.mPercentIdentity,
                 entry.mPercentSimilarity,
                 percent_identity,
                 percent_similarity))

        else:
            query_sequence = None
            sbjct_sequence = None

        # default values
        exons_num_exons = "na"
        exons_boundaries_sum = "na"
        exons_boundaries_max = "na"
        dubious_exons = "na"

        ndeleted_exons, ninserted_exons, ndeleted_introns, \
            ninserted_introns, nidentical_exons = 0, 0, 0, 0, 0
        truncated_Nterminal_exon, truncated_Cterminal_exon = 0, 0
        ndeleted_Nexons, ndeleted_Cexons = 0, 0
        ninserted_Nexons, ninserted_Cexons = 0, 0

        if entry.mQueryToken not in reference_exon_boundaries:
            print("# WARNING: sequence %s has no exon boundaries" % (
                entry.mQueryToken))
            sys.stdout.flush()
            nmissed_exons += 1
            skip = True

        if not skip:

            nfound += 1

            ref_exons = reference_exon_boundaries[entry.mQueryToken]
            ref_exons_offset = ref_exons[0].mGenomeFrom

            exons_num_exons = len(ref_exons) - len(exons)
            exons_boundaries_sum = 0
            exons_boundaries_max = 0
            dubious_exons = 0

            if options.loglevel >= 3:
                for e in exons:
                    options.stdlog.write("# %s\n" % str(e))
                for e in ref_exons:
                    options.stdlog.write("# %s\n" % str(e))

            min_pide = entry.mPercentIdentity * \
                options.quality_threshold_pide / 100

            # in_sync is set once an inserted or deleted intron has been
            # seen and is not reset for the remainder of this prediction
            in_sync = 0
            # e: index of current predicted exon, r: index of current
            # reference exon
            e, r = 0, 0

            while e < len(exons) and r < len(ref_exons):

                # 1-based exon numbers for output
                this_e, this_r = e + 1, r + 1
                percent_identity = 0
                percent_similarity = 0
                is_good_exon = 0

                if options.loglevel >= 4:
                    options.stdlog.write(
                        "# current exons: %i and %i\n" % (e, r))
                    sys.stdout.flush()

                exon_from, exon_to, exon_phase, exon_genome_from, \
                    exon_genome_to, exon_ali = exons[e][0:6]
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (
                    ref_exons[r].mPeptideFrom,
                    ref_exons[r].mPeptideTo,
                    ref_exons[r].frame,
                    ref_exons[r].mGenomeFrom,
                    ref_exons[r].mGenomeTo)

                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset

                # get percent identity for exon
                exon_percent_identity = 0
                exon_percent_similarity = 0

                if query_sequence and sbjct_sequence:

                    tmp_ali = alignlib_lite.makeAlignmentVector()

                    # convert nucleotide to residue coordinates; floor
                    # division keeps the integer semantics on python 3
                    # (assumes integer coordinates - TODO confirm)
                    xquery_from = exon_from // 3
                    xquery_to = exon_to // 3

                    alignlib_lite.copyAlignment(
                        tmp_ali, entry.mMapPeptide2Translation,
                        xquery_from, xquery_to)

                    if tmp_ali.getLength() == 0:
                        options.stdlog.write(
                            "# WARNING: empty alignment %s\n" % str(
                                (ref_from, exon_from, ref_to, exon_to,
                                 xquery_from, xquery_to)))
                        nempty_alignments += 1
                    else:
                        if options.loglevel >= 5:
                            options.stdlog.write("# %s\n" % str(
                                alignlib_lite.AlignmentFormatExplicit(
                                    tmp_ali, query_sequence, sbjct_sequence)))

                        exon_percent_identity = \
                            alignlib_lite.calculatePercentIdentity(
                                tmp_ali, query_sequence, sbjct_sequence) * 100
                        exon_percent_similarity = \
                            alignlib_lite.calculatePercentSimilarity(
                                tmp_ali) * 100

                if exon_percent_identity >= min_pide:
                    is_good_exon = 1
                else:
                    is_good_exon = 0

                # end coordinates of the following exons, 0 if there is
                # no following exon
                if e < len(exons) - 1:
                    next_exon_to = exons[e + 1][1]
                else:
                    next_exon_to = 0

                if r < len(ref_exons) - 1:
                    next_ref_to = ref_exons[r + 1].mPeptideTo
                else:
                    next_ref_to = 0

                if options.loglevel >= 2:
                    options.stdlog.write("# %s\n" % "\t".join(
                        map(str, (entry.mQueryToken,
                                  exon_from, exon_to, exon_phase,
                                  exon_genome_from, exon_genome_to,
                                  ref_from, ref_to, ref_phase))))
                    sys.stdout.flush()

                # beware of small exons.
                # if less than options.slipping_exon_boundary: boundary is 0
                # check if end is more than options.slipping_exon_boundary
                # apart as well.
                if exon_to - exon_from <= options.slipping_exon_boundary or \
                        ref_to - ref_from <= options.slipping_exon_boundary:
                    boundary = 0
                else:
                    boundary = options.slipping_exon_boundary

                if ref_to <= exon_from + boundary and \
                        ref_to <= exon_to - options.slipping_exon_boundary:
                    # no overlap: reference exon lies before the predicted
                    # exon
                    is_good_exon = 0
                    if e == 0:
                        ndeleted_Nexons += 1
                    else:
                        ndeleted_exons += 1
                    r += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, \
                        exon_genome_to = 0, 0, 0, 0, 0
                elif exon_to <= ref_from + boundary and \
                        exon_to <= ref_to - options.slipping_exon_boundary:
                    # no overlap: predicted exon lies before the reference
                    # exon
                    is_good_exon = 0
                    if r == 0:
                        ninserted_Nexons += 1
                    else:
                        ninserted_exons += 1
                    e += 1
                    ref_from, ref_to, ref_phase = 0, 0, 0
                else:
                    # overlap
                    dfrom = int(math.fabs(exon_from - ref_from))
                    dto = int(math.fabs(exon_to - ref_to))

                    # get percent identity for overlapping fragment
                    if query_sequence and sbjct_sequence:
                        # this the problem
                        tmp_ali = alignlib_lite.makeAlignmentVector()

                        xquery_from = max(ref_from // 3, exon_from // 3)
                        xquery_to = min(ref_to // 3, exon_to // 3)

                        alignlib_lite.copyAlignment(
                            tmp_ali, entry.mMapPeptide2Translation,
                            xquery_from, xquery_to)

                        if tmp_ali.getLength() == 0:
                            options.stdlog.write(
                                "# warning: empty alignment %s\n" % str(
                                    (ref_from, exon_from, ref_to, exon_to,
                                     xquery_from, xquery_to)))
                            percent_identity = 0
                            percent_similarity = 0
                        else:
                            if options.loglevel >= 5:
                                print(str(
                                    alignlib_lite.AlignmentFormatExplicit(
                                        tmp_ali, query_sequence,
                                        sbjct_sequence)))

                            percent_identity = \
                                alignlib_lite.calculatePercentIdentity(
                                    tmp_ali, query_sequence,
                                    sbjct_sequence) * 100
                            percent_similarity = \
                                alignlib_lite.calculatePercentSimilarity(
                                    tmp_ali) * 100

                    if percent_identity >= min_pide:
                        is_good_exon = 1
                    else:
                        is_good_exon = 0
                        dubious_exons += 1

                    # adjust regions for terminal exons
                    if e == 0 and r == 0 and \
                            dfrom <= (entry.mQueryFrom - 1) * 3 and dfrom > 0:
                        if is_good_exon:
                            truncated_Nterminal_exon = dfrom
                        dfrom = 0

                    # truncated terminal exons
                    if e == len(exons) - 1 and r == len(ref_exons) - 1 and \
                            dto <= (entry.mQueryLength - entry.mQueryTo) * 3 \
                            and dto > 0:
                        if is_good_exon:
                            truncated_Cterminal_exon = dto
                        dto = 0

                    # do not count deviations for terminal query exons
                    if e == 0 and dfrom <= entry.mQueryFrom * 3 and dfrom > 0:
                        dfrom = 0

                    if e == len(exons) - 1 and \
                            dto <= (entry.mQueryLength - entry.mQueryTo) * 3 \
                            and dto > 0:
                        dto = 0

                    # permit difference of one codon (assumed to be stop)
                    if e == len(exons) - 1 and r == len(ref_exons) - 1 and \
                            dto == 3:
                        dto = 0

                    # deal with different boundary conditions:
                    if dfrom == 0 and dto == 0:
                        if is_good_exon:
                            nidentical_exons += 1
                        e += 1
                        r += 1
                    # next exon within this ref_exon
                    elif exon_to < ref_to and next_exon_to and \
                            next_exon_to <= \
                            ref_to + options.slipping_exon_boundary:
                        if is_good_exon:
                            ninserted_introns += 1
                        e += 1
                        in_sync = 1
                        dto = 0
                    # next ref_exon within this exon
                    elif ref_to < exon_to and next_ref_to and \
                            next_ref_to <= \
                            exon_to + options.slipping_exon_boundary:
                        if is_good_exon:
                            ndeleted_introns += 1
                        r += 1
                        in_sync = 1
                        dto = 0
                    else:
                        e += 1
                        r += 1
                        if in_sync:
                            dfrom = 0

                    if is_good_exon:
                        exons_boundaries_sum += dfrom + dto
                        exons_boundaries_max = max(dfrom,
                                                   exons_boundaries_max)
                        exons_boundaries_max = max(dto, exons_boundaries_max)

                ###########################################################
                # count inserted/deleted introns and misplaced boundaries
                #
                # if exon and next_exon in ref_exon: inserted intron
                # if ref_exon and next_ref_exon in exon: deleted intron
                if outfile_exons:

                    if genomic_fragment and exon_genome_to:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, \
                            disruptions = Genomics.CountGeneFeatures(
                                exon_genome_from - entry.mSbjctGenomeFrom,
                                exon_ali,
                                genomic_fragment,
                                border_stop_codon=0)
                    else:
                        nintrons, nframeshifts, ngaps, nsplits, \
                            nstopcodons = 0, 0, 0, 0, 0

                    # coordinates have been zeroed above if the exon has no
                    # counterpart
                    if exon_to == 0:
                        this_e = 0
                    if ref_to == 0:
                        this_r = 0

                    outfile_exons.write("\t".join(map(str, (
                        entry.mPredictionId,
                        this_e, exon_from, exon_to, exon_phase,
                        this_r, ref_from, ref_to, ref_phase,
                        percent_identity, percent_similarity,
                        nframeshifts, ngaps, nstopcodons,
                        is_good_exon,
                        exon_genome_from, exon_genome_to,
                    ))) + "\n")

            # predicted exons left over after the last reference exon
            while e < len(exons):
                exon_from, exon_to, exon_phase, exon_genome_from, \
                    exon_genome_to = exons[e][0:5]
                e += 1
                ninserted_Cexons += 1

                if outfile_exons:
                    outfile_exons.write("\t".join(map(str, (
                        entry.mPredictionId,
                        e, exon_from, exon_to, exon_phase,
                        0, 0, 0, 0,
                        0, 0,
                        0, 0, 0,
                        1,
                        exon_genome_from, exon_genome_to,
                    ))) + "\n")

            # reference exons left over after the last predicted exon
            while r < len(ref_exons):
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (
                    ref_exons[r].mPeptideFrom,
                    ref_exons[r].mPeptideTo,
                    ref_exons[r].frame,
                    ref_exons[r].mGenomeFrom,
                    ref_exons[r].mGenomeTo)
                ndeleted_Cexons += 1
                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset
                r += 1

                if outfile_exons:
                    outfile_exons.write("\t".join(map(str, (
                        entry.mPredictionId,
                        0, 0, 0, 0,
                        r, ref_from, ref_to, ref_phase,
                        0, 0,
                        0, 0, 0,
                        0,
                        0, 0,
                    ))) + "\n")

        elif options.write_notfound and outfile_exons:
            # Exons can only be written if an exon file was requested.
            this_e = 0
            # use prediction's identity/similarity for exons.
            # This will still then flag stop-codons in later analysis
            percent_identity = entry.mPercentIdentity
            percent_similarity = entry.mPercentSimilarity

            for exon in exons:
                this_e += 1
                exon_from, exon_to, exon_phase, exon_genome_from, \
                    exon_genome_to, exon_ali = exon[0:6]

                # report zero counts if there is no genomic sequence to
                # derive them from
                nframeshifts, ngaps, nstopcodons = 0, 0, 0
                if genomic_fragment:
                    nintrons, nframeshifts, ngaps, nsplits, nstopcodons, \
                        disruptions = Genomics.CountGeneFeatures(
                            exon_genome_from - entry.mSbjctGenomeFrom,
                            exon_ali,
                            genomic_fragment)

                outfile_exons.write("\t".join(map(str, (
                    entry.mPredictionId,
                    this_e, exon_from, exon_to, exon_phase,
                    0, 0, 0, 0,
                    percent_identity, percent_similarity,
                    nframeshifts, ngaps, nstopcodons,
                    1,
                    exon_genome_from, exon_genome_to,
                ))) + "\n")

        options.stdout.write("\t".join(map(str, (
            entry.mPredictionId,
            exons_num_exons,
            dubious_exons,
            exons_boundaries_sum,
            exons_boundaries_max,
            nidentical_exons,
            ninserted_exons, ndeleted_exons,
            ninserted_introns, ndeleted_introns,
            truncated_Nterminal_exon, truncated_Cterminal_exon,
            ndeleted_Nexons, ndeleted_Cexons,
            ninserted_Nexons, ninserted_Cexons))) + "\n")
elif param_filename_contigs: # read contigs contig_sizes = Genomics.ReadContigSizes( open(param_filename_contigs, "r")) delete_missing = True else: contig_sizes = {"dummy": 1000000000} delete_missing = False if param_loglevel >= 1: print "# read %i peptide sequences" % len(peptide_sequences) sys.stdout.flush() exons = Exons.ReadExonBoundaries( sys.stdin, contig_sizes=contig_sizes, delete_missing=delete_missing, ) if param_loglevel >= 1: print "# read exon information for %i transcripts" % len(exons) sys.stdout.flush() if len(exons) == 0: raise IOError("no exons in exon list.") Exons.SetRankToPositionFlag(exons) if param_use_genome_length: lengths = Exons.GetGenomeLengths(exons) else:
def main(argv=None):
    """script main.

    Reads predictions from stdin (lines starting with ``#`` are skipped)
    and writes one tab-separated row per intron to ``options.stdout``.
    The columns are those of the header row written below: prediction_id,
    intron, contig, strand, start, end, length, nstops, type, prime5 and
    prime3.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/predictions2introns.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-o", "--output-filename-summary",
                      dest="output_filename_summary", type="string",
                      help="filename with summary information.")

    parser.add_option("--skip-header", dest="skip_header",
                      action="store_true",
                      help="skip header.")

    parser.add_option(
        "--fill-introns", dest="fill_introns", type="int",
        help="fill intron if divisible by three and no stop codon up to a maximum length of #.")

    parser.add_option(
        "--introns-max-stops", dest="introns_max_stops", type="int",
        help="maximum number of stop codons to tolerate within an intron.")

    parser.add_option("--output-format", dest="output_format",
                      type="choice",
                      choices=("predictions", "extensions", "filled-introns"),
                      help="output format.")

    # NOTE(review): ("ATG") is a plain string, not a one-element tuple.
    # start_codons is not read anywhere in this function, so it is left
    # untouched here.
    parser.set_defaults(
        genome_file="genome",
        start_codons=("ATG"),
        stop_codons=("TAG", "TAA", "TGA"),
        skip_header=False,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        # same output as the former ``print USAGE, "..."`` statement
        print("%s no arguments required." % (USAGE,))
        sys.exit(2)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    p = PredictionParser.PredictionParserEntry()

    ninput, noutput = 0, 0

    if not options.skip_header:
        options.stdout.write("\t".join((
            "prediction_id",
            "intron",
            "contig",
            "strand",
            "start",
            "end",
            "length",
            "nstops",
            "type",
            "prime5",
            "prime3",
        )) + "\n")

    for line in sys.stdin:

        if line[0] == "#":
            continue

        ninput += 1
        p.Read(line)

        genomic_sequence = fasta.getSequence(p.mSbjctToken,
                                             p.mSbjctStrand,
                                             p.mSbjctGenomeFrom,
                                             p.mSbjctGenomeTo).upper()

        exons = Exons.Alignment2Exons(p.mMapPeptide2Genome,
                                      query_from=0,
                                      sbjct_from=0)

        # A prediction without exons has no introns to report; without
        # this guard ``exons[0]`` below raises IndexError.
        if not exons:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# skipping prediction %s: no exons\n" %
                    str(p.mPredictionId))
            continue

        last_e = exons[0]

        nintron = 0

        for e in exons[1:]:

            nintron += 1
            lintron = e.mGenomeFrom - last_e.mGenomeTo

            # NOTE(review): despite its name this flag is true when the
            # intron length is NOT a multiple of three; stop codons are
            # only counted in that case. Behaviour kept as is -- confirm
            # the intended condition.
            intron_is_l3 = lintron % 3 != 0

            if intron_is_l3:
                # get sequence, include also residues from split codons
                # when checking for stop codons.
                # note that e.mAlignment can sometimes be empty. This might
                # be an exonerate bug. In the alignment string there are two
                # consecutive exons.
                if e.mAlignment and last_e.mAlignment and \
                        e.mAlignment[0][0] == "S":
                    offset_left = last_e.mAlignment[-1][2]
                    offset_right = e.mAlignment[0][2]
                else:
                    offset_left, offset_right = 0, 0

                sequence = genomic_sequence[
                    last_e.mGenomeTo - offset_left:
                    e.mGenomeFrom + offset_right]

                # count stop codons in consecutive, non-overlapping triplets
                intron_nstops = 0
                for start in range(0, len(sequence), 3):
                    if sequence[start:start + 3] in options.stop_codons:
                        intron_nstops += 1
            else:
                intron_nstops = 0

            # check for splice signals
            sequence = genomic_sequence[last_e.mGenomeTo:e.mGenomeFrom]

            intron_type, prime5, prime3 = Genomics.GetIntronType(sequence)

            if options.loglevel >= 2:
                # intron ends in upper case, flanked by six exonic
                # residues in lower case on either side
                context = genomic_sequence[
                    last_e.mGenomeTo - 6:last_e.mGenomeTo].lower() + \
                    "|" + sequence[:5] + "..." + sequence[-5:] + "|" + \
                    genomic_sequence[
                        e.mGenomeFrom:e.mGenomeFrom + 6].lower()
                options.stdlog.write(
                    "\t".join(map(str, (p.mPredictionId,
                                        nintron,
                                        lintron,
                                        intron_nstops,
                                        intron_type,
                                        context))) + "\n")

            # coordinates are shifted by the start of the prediction on
            # the contig
            options.stdout.write(
                "\t".join(map(str, (p.mPredictionId,
                                    nintron,
                                    p.mSbjctToken,
                                    p.mSbjctStrand,
                                    last_e.mGenomeTo + p.mSbjctGenomeFrom,
                                    e.mGenomeFrom + p.mSbjctGenomeFrom,
                                    lintron,
                                    intron_nstops,
                                    intron_type,
                                    prime5,
                                    prime3))) + "\n")

            last_e = e
            noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i.\n" %
                             (ninput, noutput))

    E.Stop()
def main(argv=None):
    """script main.

    Loads exon boundaries (from a database table when the input format is
    ``table``, from stdin otherwise), computes per-transcript statistics
    with ``Exons.CalculateStats`` and writes them to stdout as a table
    whose columns are given by ``options.fields``.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # NOTE(review): ``parser`` is not created inside this function;
    # presumably it is a module-level option parser -- confirm.
    option_specs = (
        (("-q", "--quality"),
         dict(dest="quality", type="string",
              help="quality categories to take into account.")),
        (("-f", "--format="),
         dict(dest="format", type="string",
              help="input format [exons|gff|table]")),
        (("-e", "--exons="),
         dict(dest="tablename_exons", type="string",
              help="table name with exons.")),
        (("-p", "--predictions="),
         dict(dest="tablename_predictions", type="string",
              help="table name with predictions.")),
        (("-n", "--non-redundant"),
         dict(dest="non_redundant", action="store_true",
              help="only non-redundant predictions.")),
        (("-s", "--schema"),
         dict(dest="schema", type="string",
              help="schema to use.")),
    )
    for flags, settings in option_specs:
        parser.add_option(*flags, **settings)

    parser.set_defaults(
        fields=["Id", "NumExons", "GeneLength", "MinExonLength",
                "MaxExonLength", "MinIntronLength", "MaxIntronLength"],
        tablename_exons="exons",
        tablename_predictions="predictions",
        quality=None,
        non_redundant=False,
        schema=None,
        tablename_redundant="redundant",
        tablename_quality="quality",
        format="exons",
    )

    (options, args) = E.Start(parser,
                              add_csv_options=True,
                              add_psql_options=True)

    # quality categories arrive as a comma-separated list
    if options.quality:
        options.quality = options.quality.split(",")

    if options.format != "table":
        exons = Exons.ReadExonBoundaries(sys.stdin)
    else:
        dbhandle = pgdb.connect(options.psql_connection)
        exons = Exons.GetExonBoundariesFromTable(
            dbhandle,
            options.tablename_predictions,
            options.tablename_exons,
            non_redundant_filter=options.non_redundant,
            quality_filter=options.quality,
            table_name_quality=options.tablename_quality,
            table_name_redundant=options.tablename_redundant,
            schema=options.schema)

    stats = Exons.CalculateStats(exons)

    # header row, followed by one row per transcript
    print("\t".join(options.fields))

    writer = csv.DictWriter(sys.stdout,
                            options.fields,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator,
                            extrasaction='ignore')

    for transcript_id, row in stats.items():
        row["Id"] = transcript_id
        writer.writerow(row)

    E.Stop()
def _safe_ratio(numerator, denominator):
    """return numerator / denominator as float, 0.0 if the denominator is 0."""
    if denominator == 0:
        return 0.0
    return float(numerator) / denominator


def WriteGeneStructureCorrespondence(mali,
                                     identifiers,
                                     exons,
                                     param_master_pattern,
                                     gap_char="-",
                                     prefix=""):
    """compare the gene structures of all pairs of aligned sequences.

    For every ordered pair (key1, key2) of distinct identifiers the exon
    boundaries of key1 are mapped onto key2 via the multiple alignment and
    compared with ``Exons.CompareGeneStructures``. Summary rows
    (``allstructure``, ``refstructure``, ``nexons`` and, at log level >= 2,
    one ``genepair`` row per pair) are printed to stdout, each preceded by
    *prefix*.

    Arguments:
        mali: mapping of identifier to aligned sequence.
        identifiers: list of identifiers to compare.
        exons: mapping of identifier to list of exons.
        param_master_pattern: regular expression; identifiers matching it
            are counted separately as reference sequences.
        gap_char: gap character of the alignment.
        prefix: first column of every row printed.

    Returns:
        matrix of gene structure compatibility, indexed by position in
        *identifiers* (row: compared sequence, column: reference sequence):

        0   : perfect compatibility (exact match)
              ratio of missed exon boundaries to total exon boundaries.
        100 : no compatibility

        The diagonal is left at 0. Returns None if *identifiers* is empty.

    Raises:
        ValueError: if comparisons against reference sequences were made
            but no exons were counted for them.
    """

    # The guard has to come before anything indexes identifiers[0].
    if len(identifiers) == 0:
        return None

    wmali = len(identifiers)

    matrix_compatibility = numpy.zeros((wmali, wmali))

    nok = 0
    nperfect = 0
    ntotal_exons = 0
    nidentical_exons = 0
    nskipped_exons = 0

    ref_nok = 0
    ref_nperfect = 0
    ref_ntotal_exons = 0
    ref_nidentical_exons = 0
    ref_nskipped_exons = 0
    ref_ntotal = 0

    rx = re.compile(param_master_pattern)

    # list of number of exons
    anexons = []

    # exons in reference
    ref_nexons = 0

    for x, key1 in enumerate(identifiers):

        seq = mali[key1]

        anexons.append(len(exons[key1]))
        if rx.search(key1):
            ref_nexons = len(exons[key1])

        # enumerate keeps the column index aligned with *identifiers* even
        # though the self-comparison is skipped.
        for y, key2 in enumerate(identifiers):

            if key2 == key1:
                continue

            if param_loglevel >= 3:
                print("#############################################")
                print("# comparing %s to %s" % (key1, key2))

            seq_master = mali[key2]
            ref_exons = exons[key2]

            map_cmp2ref = MaliIO.getMapFromMali(seq, seq_master, gap_char)

            # map exon boundaries to reference sequence
            cmp_exons = []

            if param_loglevel >= 5:
                print(alignlib_lite.py_writeAlignataTable(map_cmp2ref))

            for e in exons[key1]:
                ne = e.GetCopy()
                ne.mPeptideFrom = MyMap(map_cmp2ref, e.mPeptideFrom + 1, 3, -1)
                ne.mPeptideTo = MyMap(map_cmp2ref, e.mPeptideTo, 3, 0)
                cmp_exons.append(ne)

            # massage boundaries for terminal exons:
            if cmp_exons[0].mPeptideFrom <= 0:
                cmp_exons[0].mPeptideFrom = ref_exons[0].mPeptideFrom
            if cmp_exons[-1].mPeptideTo <= 0:
                cmp_exons[-1].mPeptideTo = ref_exons[-1].mPeptideTo

            if param_loglevel >= 4:
                for e in exons[key1]:
                    print("# exon %s" % str(e))

            if param_loglevel >= 3:
                for e in cmp_exons:
                    print("# exon %s" % str(e))
                for e in ref_exons:
                    print("# exon %s" % str(e))

            # do exon comparison
            comparison = Exons.CompareGeneStructures(
                cmp_exons,
                ref_exons,
                threshold_min_pide=0,
                threshold_slipping_exon_boundary=param_threshold_splipping_exon_boundary)

            if param_loglevel >= 3:
                print(comparison.Pretty(prefix="# EVAL: "))

            # analyse results
            min_nexons = min(len(cmp_exons), len(ref_exons))
            max_nexons = max(len(cmp_exons), len(ref_exons))

            is_perfect = False
            is_ok = False
            status = []

            # non-equivalent exon pairs
            ne = len(cmp_exons) - comparison.mNumIdenticalExons - \
                comparison.mNumSkippedExons

            # first status character: degree of conservation
            if comparison.mNumIdenticalExons == 0:
                # F: complete and utter failure, no excuses
                status.append("F")
            else:
                if ne == 0:
                    # P: perfect conservation
                    status.append("=")
                    is_ok = True
                    is_perfect = True
                elif ne == min_nexons - comparison.mNumSkippedExons:
                    # D: completely different predictions
                    status.append("D")
                elif ne in (1, 2):
                    # A: almost conserved
                    status.append("A")
                    is_ok = True
                elif ne > 2:
                    # M : mostly conserved (in case of long proteins that is
                    # good enough).
                    if (100 * comparison.mNumIdenticalExons) / max_nexons > \
                            param_evaluate_min_percent_exon_identity:
                        status.append("M")
                    else:
                        # S : spuriously conserved
                        status.append("S")
                else:
                    # U: unconserved
                    status.append("U")

            # second status character: relative number of exons. The "<"
            # branch used to repeat the ">" condition and never fired.
            if len(cmp_exons) > len(ref_exons):
                status.append(">")
            elif len(cmp_exons) < len(ref_exons):
                status.append("<")
            else:
                status.append("=")

            # third status character: single/double/multi exon genes
            if min_nexons == max_nexons and min_nexons == 1:
                status.append("S")
            elif min_nexons == 1 and max_nexons == 2:
                status.append("s")
            elif min_nexons == 2 and max_nexons == 2:
                status.append("D")
            elif min_nexons == 2 and max_nexons > 2:
                status.append("d")
            elif min_nexons == max_nexons:
                status.append("M")
            elif min_nexons > 2 and max_nexons > 2:
                status.append("m")
            else:
                status.append("U")

            status = "".join(status)

            structure_compatibility = 100

            if is_ok:
                nok += 1
                structure_compatibility = 100 - 100 * \
                    (comparison.mNumIdenticalExons +
                     comparison.mNumSkippedExons) / len(cmp_exons)
            if is_perfect:
                nperfect += 1
                structure_compatibility = 0

            # NOTE(review): the value chosen here is unconditionally
            # replaced by the boundary based ratio computed right below.
            # Kept to preserve the statement order of the original.
            if abs(comparison.mNumDifferenceExons) > param_max_exons_difference:
                compatibility_value = 100
            else:
                compatibility_value = structure_compatibility

            t = comparison.mNumRefBoundaries + comparison.mNumCmpBoundaries

            if t == 0:
                compatibility_value = 0
            else:
                compatibility_value = 100 * \
                    (comparison.mNumMissedRefBoundaries +
                     comparison.mNumMissedCmpBoundaries) / t

            matrix_compatibility[x][y] = compatibility_value

            nidentical_exons += comparison.mNumIdenticalExons
            nskipped_exons += comparison.mNumSkippedExons
            ntotal_exons += len(cmp_exons)

            if param_loglevel >= 2:
                print("%s\tgenepair\t%s\t%s\t%s\t%i\t%i\t%i\t%s" %
                      (prefix, key1, key2, status, compatibility_value,
                       len(cmp_exons), len(ref_exons), str(comparison)))

            # comparison to reference: count separately:
            if rx.search(key2):
                ref_nidentical_exons += comparison.mNumIdenticalExons
                ref_nskipped_exons += comparison.mNumSkippedExons
                ref_ntotal_exons += len(cmp_exons)
                if is_ok:
                    ref_nok += 1
                if is_perfect:
                    ref_nperfect += 1
                ref_ntotal += 1

    # number of ordered pairs; 0 for a single sequence, hence _safe_ratio
    ntotal = wmali * (wmali - 1)

    print("%s\tallstructure\t%i\t%i\t%i\t%6.4f\t%6.4f\t%i\t%i\t%i\t%6.4f\t%6.4f" %
          (prefix,
           ntotal, nperfect, nok,
           _safe_ratio(nperfect, ntotal),
           _safe_ratio(nok, ntotal),
           ntotal_exons, nidentical_exons, nskipped_exons,
           _safe_ratio(nidentical_exons, ntotal_exons),
           _safe_ratio(nidentical_exons + nskipped_exons, ntotal_exons)))

    if ref_ntotal > 0:
        if ref_ntotal_exons == 0:
            # string exceptions are not valid; raise a proper exception
            raise ValueError(
                "no exons in reference : ref_ntotal_exons = 0, ref_ntotal = %i" %
                (ref_ntotal))

        print("%s\trefstructure\t%i\t%i\t%i\t%6.4f\t%6.4f\t%i\t%i\t%i\t%6.4f\t%6.4f" %
              (prefix,
               ref_ntotal, ref_nperfect, ref_nok,
               float(ref_nperfect) / ref_ntotal,
               float(ref_nok) / ref_ntotal,
               ref_ntotal_exons, ref_nidentical_exons, ref_nskipped_exons,
               float(ref_nidentical_exons) / ref_ntotal_exons,
               float(ref_nidentical_exons + ref_nskipped_exons) / ref_ntotal_exons))

    # numpy provides the functions that scipy.mean/median/std aliased
    nexons_stats = (min(anexons),
                    max(anexons),
                    numpy.mean(anexons),
                    numpy.median(anexons),
                    numpy.std(anexons))

    print("%s\tnexons\t%i\t%i\t" % (prefix, len(anexons), ref_nexons) +
          "\t".join(["%.2f" % value for value in nexons_stats]))

    return matrix_compatibility
"NEXONS_MEAN", "NEXONS_MEDIAN", "NEXONS_STDDEV")), "\t".join(("species", "NSPECIES", "SPECIES_MAX", "MAX_PER_SPECIES", "UNKNOWN")), "\t".join(("failed", "NFAILED_SEQS", "NTOTAL_SEQS", "PFAILED_SEQS", "NFAILED_PAIRS", "NTOTAL_PAIRS", "PFAILED_PAIRS")), "\t".join(("npairs", "NPAIRS_MIN", "NPAIRS_MAX", "NPAIRS_MEAN", "NPAIRS_MEDIAN", "NPAIRS_STDDEV")), "\t".join(("ppairs", "PPAIRS_MIN", "PPAIRS_MAX", "PPAIRS_MEAN", "PPAIRS_MEDIAN", "PPAIRS_STDDEV")), "\t".join(("cov", "COV_MIN", "COV_MAX", "COV_MEAN", "COV_MEDIAN", "COV_STDDEV")), "\t".join(("pcov", "PCOV_MIN", "PCOV_MAX", "PCOV_MEAN", "PCOV_MEDIAN", "PCOV_STDDEV")), "\t".join( ("genepair", "STATUS", "COMPATIBILITY", "CMP_NEXONS", "REF_NEXONS", Exons.ComparisonResult().GetHeader())), "\t".join(("bootstrap", "NORGS", "NOTUS", "PTEST", "PTOTAL", "FTOTAL", evaluate_bootstrap.Results().printHeader())), ] if param_only_headers: print "PREFIX\t" + "\nPREFIX\t".join(headers) print E.GetFooter() sys.exit(0) else: print "# PREFIX\t" + "\n# PREFIX\t".join(headers) # 1. read multiple alignment in fasta format all_mali, all_identifiers = MaliIO.readFasta(sys.stdin) if len(all_identifiers) == 0:
def main(argv=None):
    """script main.

    Reads exon boundaries from stdin, optionally removes a terminal stop
    codon from each transcript (method ``remove-stop``), optionally
    converts the exons to forward coordinates and writes all exons to
    ``options.stdout``.

    ``ninput`` and ``noutput`` in the final log line count transcripts.

    parses command line options in sys.argv, unless *argv* is given.

    Raises:
        ValueError: if a genome file is required (``remove-stop`` or
            ``--forward-coordinates``) but was not supplied.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/exons2exons.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method",
                      help="method to apply.",
                      type="choice",
                      choices=("remove-stop", ))

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option("--forward-coordinates", dest="forward_coordinates",
                      action="store_true",
                      help="work in forward coordinates.")

    parser.set_defaults(method=None,
                        forward_coordinates=False,
                        genome_file=None)

    (options, args) = E.Start(parser)

    if options.method == "remove-stop" and not options.genome_file:
        # string exceptions are not valid; raise a proper exception
        raise ValueError(
            "please supply genome file for method %s" % options.method)

    fasta, contig_sizes = None, None
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()
        exons = Exons.ReadExonBoundaries(sys.stdin,
                                         contig_sizes=contig_sizes)
    else:
        exons = Exons.ReadExonBoundaries(sys.stdin)

    ninput, noutput, nremoved_stops, nremoved_exons = 0, 0, 0, 0

    for transcript_id, ee in exons.items():

        # was never incremented, so the log always reported ninput=0
        ninput += 1

        if options.loglevel >= 3:
            for e in ee:
                options.stdlog.write("# %s\n" % str(e))

        if options.method == "remove-stop":
            e = ee[-1]
            # number of residues of the last codon within the last exon
            d = min(3, e.mPeptideTo - e.mPeptideFrom)

            if d < 3:
                # The last codon is split: take the missing residues from
                # the end of the previous exon.
                # NOTE(review): ee[-2] raises IndexError for a transcript
                # consisting of a single exon shorter than three residues.
                codon2 = fasta.getSequence(e.mSbjctToken,
                                           e.mSbjctStrand,
                                           e.mGenomeTo - d,
                                           e.mGenomeTo)
                prev_e = ee[-2]
                codon1 = fasta.getSequence(prev_e.mSbjctToken,
                                           prev_e.mSbjctStrand,
                                           prev_e.mGenomeTo - (3 - d),
                                           prev_e.mGenomeTo)
                codon = codon1 + codon2
            else:
                codon = fasta.getSequence(e.mSbjctToken,
                                          e.mSbjctStrand,
                                          e.mGenomeTo - d,
                                          e.mGenomeTo)

            if codon.upper() in Genomics.StopCodons:

                if d < 3:
                    # The last exon only holds part of the stop codon:
                    # drop it and trim the remainder off the previous one.
                    nremoved_exons += 1
                    d = 3 - d
                    del ee[-1]
                    e = ee[-1]

                e.mGenomeTo -= d
                e.mPeptideTo -= d
                nremoved_stops += 1

                if e.mGenomeTo == e.mGenomeFrom:
                    # exon became empty after trimming
                    nremoved_exons += 1
                    del ee[-1]
                    e = ee[-1]

                assert (e.mGenomeTo > e.mGenomeFrom)
                assert (e.mPeptideTo > e.mPeptideFrom)

        if options.forward_coordinates:
            if contig_sizes is None:
                # previously a NameError on ``contig_sizes``
                raise ValueError(
                    "please supply genome file for forward coordinates")
            contig_length = contig_sizes[ee[0].mSbjctToken]
            for e in ee:
                e.InvertGenomicCoordinates(contig_length)

        for e in ee:
            options.stdout.write(str(e) + "\n")
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nremoved_stops=%i, nremoved_exons=%i\n" %
            (ninput, noutput, nremoved_stops, nremoved_exons))

    E.Stop()
continue else: nnotfound += 1 new_results.append(entry) noutput += 1 results = new_results if results: options.stdout.write(str(results) + "\n") elif options.output_format == "exontable": if options.format == "exons": exons = Exons.ReadExonBoundaries(sys.stdin, contig_sizes=contig_sizes, delete_missing=True) else: raise "unknown format." for k in exons.keys(): ee = exons[k] id = 0 for e in ee: id += 1 print "\t".join( map(str, (e.mQueryToken, id, e.mPeptideFrom, e.mPeptideTo, e.frame, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, e.mGenomeFrom, e.mGenomeTo)))
for o, a in optlist: if o in ("-v", "--verbose"): param_loglevel = int(a) elif o in ("--version", ): print "version=" sys.exit(0) elif o in ("-h", "--help"): print USAGE sys.exit(0) elif o in ("-c", "--contigs"): param_filename_contigs = a print E.GetHeader() print E.GetParams() last_exon = Exons.Exon() contig_sizes = {} if param_filename_contigs: infile = open(param_filename_contigs, "r") for line in infile: if line[0] == "#": continue sbjct_token, size = line[:-1].split("\t")[:2] contig_sizes[sbjct_token] = int(size) map_prediction2genome = alignlib_lite.makeAlignmentSet() nexons, npairs = 0, 0 for line in sys.stdin:
def main(argv=None):
    """script main.

    Reads a multiple alignment from stdin, removes joining or split
    transcripts (depending on ``--mode``), groups the remaining sequences
    into connected components of overlapping sequences and writes the
    cleaned alignment to ``options.stdout``. Optional output files list
    the components, the removed transcripts and a per-component summary.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: mali2cleaned_mali.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-m", "--genome-master", dest="genome_master",
                      type="string",
                      help="genome to use as master.")

    parser.add_option("-s", "--filename-removed", dest="filename_removed",
                      type="string",
                      help="output filename for deleted entries.")

    parser.add_option("-e", "--filename-exons", dest="filename_exons",
                      type="string",
                      help="filename on where to exon information.")

    parser.add_option("-u", "--filename-summary", dest="filename_summary",
                      type="string",
                      help="output filename of component summary.")

    parser.add_option("-c", "--filename-components",
                      dest="filename_components", type="string",
                      help="output filename for components.")

    parser.add_option(
        "--min-percent-overlap", dest="min_percent_overlap", type="float",
        help="minimum percent overlap for splitting multiple alignment into components.")

    parser.add_option("--max-percent-overlap", dest="max_percent_overlap",
                      type="float",
                      help="maximum percent overlap for split genes.")

    parser.add_option(
        "--min-genomic-distance", dest="min_genomic_distance", type="int",
        help="minimum genomic distance for adjacent genes to be considered dodgy.")

    parser.add_option("-o", "--mode", dest="mode", type="choice",
                      choices=("joining", "split"),
                      help="""how to filter the alignment.
joining: remove joining transcripts (spindly genes)
split: remove split transcripts""")

    parser.add_option(
        "-g", "--gene-mode", dest="gene_mode", action="store_true",
        help="""the aligned sequences are genes.
This forces the exon boundaries to collated by genes.""")

    parser.set_defaults(
        genome_master=None,
        filename_removed=None,
        filename_components=None,
        filename_summary=None,
        filename_exons=None,
        mode="joining",
        input_format="fasta",
        output_format="fasta",
        max_percent_overlap=0,
        min_percent_overlap=0,
        gene_mode=False,
        separator="|")

    (options, args) = E.Start(parser)

    ###############################################################
    # input
    ###############################################################
    mali = Mali.Mali()
    mali.readFromFile(sys.stdin, format=options.input_format)

    all_identifiers = mali.getIdentifiers()

    if options.filename_exons:
        # read exon boundaries and keep forward coordinates
        infile = open(options.filename_exons, "r")
        try:
            if options.gene_mode:
                exons = Exons.ReadExonBoundaries(infile, from_zero=True)
            else:
                exons = Exons.ReadExonBoundaries(
                    infile,
                    filter=set(all_identifiers),
                    from_zero=True)
        finally:
            infile.close()

        if options.gene_mode:
            # Collate the exons of all transcripts of a gene under one
            # identifier built from fields 0 and 2 of the transcript id.
            gene_exons = {}
            for transcript_id, ee in exons.items():
                data = transcript_id.split(options.separator)
                new_id = options.separator.join((data[0], data[2]))
                if new_id not in gene_exons:
                    gene_exons[new_id] = []
                for e in ee:
                    e.mQueryToken = new_id
                gene_exons[new_id] += ee

            for ee in gene_exons.values():
                ee.sort(key=lambda exon: exon.mGenomeFrom)

            exons = gene_exons
    else:
        exons = {}

    ###############################################################
    # collect all transcripts for a species together with their
    # aligned length
    ###############################################################
    map_species2transcripts = {}

    for identifier in mali.getIdentifiers():

        data = identifier.split(options.separator)
        species = data[0]

        if exons:
            length = exons[identifier][-1].mGenomeTo - \
                exons[identifier][0].mGenomeFrom
        else:
            length = len(mali.getEntry(identifier).getSequence())

        map_species2transcripts.setdefault(species, []).append(
            (length, identifier))

    if options.mode == "joining":
        mapped_transcripts = removeJoiningTranscripts(
            mali, exons, map_species2transcripts, options)
    elif options.mode == "split":
        mapped_transcripts = removeSplitTranscripts(
            mali, exons, map_species2transcripts, options)

    ###############################################################
    # now build overlap graph of remaining sequences split multiple
    # alignment in components.
    ###############################################################
    graph = networkx.Graph()

    # each entry of mapped_transcripts is unpacked further below as
    # (removed transcript, representative, reason)
    removed_transcripts = set(x[0] for x in mapped_transcripts)

    for t in all_identifiers:
        if t not in removed_transcripts:
            graph.add_node(t)

    for t1 in range(len(all_identifiers) - 1):
        transcript1 = all_identifiers[t1]
        if transcript1 in removed_transcripts:
            continue
        for t2 in range(t1 + 1, len(all_identifiers)):
            transcript2 = all_identifiers[t2]
            if transcript2 in removed_transcripts:
                continue
            overlap = getPercentOverlap(mali[transcript1],
                                        mali[transcript2])
            # NOTE(review): hard-coded threshold; the option
            # --min-percent-overlap is not consulted here -- confirm.
            if overlap > 5:
                graph.add_edge(transcript1, transcript2)

    # compute components. Materialised as lists because the result is
    # iterated several times and passed to len() below, which fails if
    # connected_components returns a generator.
    components = [list(component) for component in
                  networkx.connected_components(graph)]

    ###############################################################
    # output
    ###############################################################
    if options.filename_components:
        n = 1
        outfile = open(options.filename_components, "w")
        outfile.write("id\tcomponent\n")
        for component in components:
            for c in component:
                outfile.write("%s\t%i\n" % (c, n))
            n += 1
        outfile.close()

    if options.filename_removed and len(removed_transcripts) > 0:
        outfile = open(options.filename_removed, "w")
        outfile.write("removed\trepresentative\treason\n")
        for removed_transcript, rep_transcript, reason in mapped_transcripts:
            outfile.write("%s\t%s\t%s\n" %
                          (removed_transcript, rep_transcript, reason))
        outfile.close()

    if options.filename_summary:
        n = 1
        outfile = open(options.filename_summary, "w")
        outfile.write("component\tsize\tnspecies\tnmaster\n")
        for component in components:
            species = [x.split(options.separator)[0] for x in component]
            # Four columns, matching the header. The format string used to
            # carry a fifth %i and raised TypeError. nspecies counts
            # distinct species; it used to repeat the component size.
            outfile.write("%i\t%i\t%i\t%i\n" %
                          (n,
                           len(component),
                           len(set(species)),
                           species.count(options.genome_master)))
            n += 1
        outfile.close()

    for transcript in removed_transcripts:
        mali.deleteEntry(transcript)

    new_identifiers = mali.getIdentifiers()

    mali.removeGaps(minimum_gaps=len(new_identifiers))

    mali.writeToFile(options.stdout, format=options.output_format)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# input=%i, output=%i, removed=%i, ncomponents=%i\n" %
            (len(all_identifiers), len(new_identifiers),
             len(removed_transcripts), len(components)))
        options.stdlog.write(
            "# final component sizes: %s\n" %
            ",".join(str(len(component)) for component in components))

    E.Stop()