if line[0] == "#": continue this_exon = Exons.Exon() this_exon.Read(line) if this_exon.mSbjctStrand == "-": this_exon.InvertGenomicCoordinates( contig_sizes[this_exon.mSbjctToken]) nexons += 1 if last_exon.mQueryToken != this_exon.mQueryToken: if last_exon.mQueryToken: f = alignlib_lite.AlignmentFormatEmissions( map_prediction2genome) print string.join( map(str, (last_exon.mQueryToken, last_exon.mSbjctToken, last_exon.mSbjctStrand, f)), "\t") npairs += 1 map_prediction2genome.clear() alignlib_lite.addDiagonal2Alignment( map_prediction2genome, this_exon.mPeptideFrom + 1, this_exon.mPeptideTo + 1, this_exon.mGenomeFrom - this_exon.mPeptideFrom) last_exon = this_exon f = alignlib_lite.AlignmentFormatEmissions(map_prediction2genome)
def IsParalogLink(link, cds1, cds2):
    """Decide whether a pairwise link should be treated as paralogous.

    The alignment stored in *link* is expanded and used to project the
    exon starts of *cds1* (all exons but the first) onto the subject.
    The projected starts are compared with the exon starts of *cds2*
    (again skipping the first exon); two boundaries agree if they differ
    by less than ``param_boundaries_max_slippage``.

    Arguments:
        link: object providing ``mQueryFrom``, ``mQueryAli``,
            ``mSbjctFrom``, ``mSbjctAli``, ``mQueryLength`` and
            ``mSbjctLength``.
        cds1: sequence of exons of the query; each needs ``mPeptideFrom``.
        cds2: sequence of exons of the subject; each needs ``mPeptideFrom``.

    Returns:
        ``(True, "different exon boundaries")`` if the boundary check
        fails, ``(True, "low coverage")`` if at least one side is a single
        exon and the smaller of the two coverages is below
        ``param_min_coverage``, otherwise ``(False, None)``.

    Raises:
        ValueError: if the aligned region is longer than the sequence
            length recorded in *link*.
    """
    map_a2b = alignlib_lite.makeAlignmentVector()
    alignlib_lite.AlignmentFormatEmissions(
        link.mQueryFrom, link.mQueryAli,
        link.mSbjctFrom, link.mSbjctAli).copy(map_a2b)

    row_span = map_a2b.getRowTo() - map_a2b.getRowFrom() + 1
    col_span = map_a2b.getColTo() - map_a2b.getColFrom() + 1

    if link.mQueryLength < row_span or link.mSbjctLength < col_span:
        print("ERRONEOUS LINK: %s" % str(link))
        # String exceptions cannot be raised on Python >= 2.6; use a real
        # exception class so the message actually reaches the caller.
        raise ValueError("length discrepancy")

    coverage_a = 100.0 * row_span / link.mQueryLength
    coverage_b = 100.0 * col_span / link.mSbjctLength

    def map_row(x):
        """Return the column of the first mapped row at or after *x* (0 if none)."""
        if x < map_a2b.getRowFrom():
            return 0
        while x <= map_a2b.getRowTo():
            col = map_a2b.mapRowToCol(x)
            if col:
                return col
            x += 1
        return 0

    # check exon boundaries, look at starts, skip first exon.
    # "//" keeps the integer division the coordinates rely on.
    mapped_boundaries = UniquifyList(
        [map_row(exon.mPeptideFrom // 3 + 1) for exon in cds1[1:]])
    reference_boundaries = UniquifyList(
        [exon.mPeptideFrom // 3 + 1 for exon in cds2[1:]])

    # Always test the shorter list of boundaries against the longer one.
    if len(mapped_boundaries) < len(reference_boundaries):
        mless, mmore = mapped_boundaries, reference_boundaries
    else:
        mless, mmore = reference_boundaries, mapped_boundaries
    nmin = len(mless)

    nfound = 0
    for boundary in mless:
        for candidate in mmore:
            if abs(boundary - candidate) < param_boundaries_max_slippage:
                nfound += 1
                break
    nmissed = nmin - nfound

    # In single exon cases there are no boundaries to compare, so the
    # decision is deferred to the coverage check.
    if len(cds1) == 1 or len(cds2) == 1:
        is_ok = True
        check_coverage = True
    else:
        check_coverage = False
        if nmin == 1:
            is_ok = nmissed == 0
        elif nmin == 2:
            is_ok = nmissed <= 1
        elif nmin > 2:
            is_ok = nfound >= 2
        else:
            is_ok = False

    cc = min(coverage_a, coverage_b)

    if param_loglevel >= 3:
        print(" ".join(map(str, (
            "# nquery=", len(cds1), "nsbjct=", len(cds2),
            "nmin=", nmin, "nmissed=", nmissed, "nfound=", nfound,
            "is_ok=", is_ok, "check_cov=", check_coverage,
            "min_cov=", cc, coverage_a, coverage_b,
            "mapped=", mapped_boundaries,
            "reference=", reference_boundaries))))

    if not is_ok:
        return True, "different exon boundaries"

    if check_coverage and cc < param_min_coverage:
        return True, "low coverage"

    return False, None
def Expand(self):
    """Rebuild ``self.mMapOld2New`` from the stored alignment strings.

    A fresh alignment vector is attached to the instance first and then
    filled from ``mOldFrom``/``mOldAli`` and ``mNewFrom``/``mNewAli``.
    """
    target = alignlib_lite.makeAlignmentVector()
    self.mMapOld2New = target
    emissions = alignlib_lite.AlignmentFormatEmissions(
        self.mOldFrom, self.mOldAli,
        self.mNewFrom, self.mNewAli)
    emissions.copy(target)
def GetOrthologTranscripts(transcripts1, peptides1, cds1,
                           transcripts2, peptides2, cds2):
    """Sort out ortholog relationships between transcripts of orthologous genes.

    Orthologs have:
        the same number of exons
        compatible intron/exon boundaries

    For the remaining transcript pairs, take reciprocal best hits.

    I see the following:
        0: 0(100%), 1: 0(94%), 2: 0,1(100%)
        0: 0(100%), 1: 0,1,2(100%)

    Selecting 1-0 first, would result in a suboptimal match, because one
    transcript is longer than the other, while matching up 0-0 and 2-1
    would be better.

    Objective function: it is the maximal matching/assignment problem.
    Use greedy implementation instead. Assign as much as possible
    according to descending weights.

    Arguments:
        transcripts1, transcripts2: sequences of 5-tuples
            ``(prediction_id, sbjct_token, sbjct_strand, sbjct_from,
            sbjct_to)``.
        peptides1, peptides2: mappings from prediction id to peptide
            sequence.
        cds1, cds2: mappings from prediction id to the list of exons.

    Returns:
        A list of ``(transcript1, transcript2, weight, alignment)`` tuples,
        where *weight* is the smaller of the two alignment coverages (in
        percent). Every transcript appears in at most one pair.
    """
    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0)

    # for long sequences: use dot alignment with tuple size of three
    dottor = alignlib_lite.makeAlignatorTuples(3)
    alignator_dots = alignlib_lite.makeAlignatorDotsSquared(
        param_gop, param_gep, dottor)

    seqs1 = [alignlib_lite.makeSequence(peptides1[t[0]])
             for t in transcripts1]
    seqs2 = [alignlib_lite.makeSequence(peptides2[t[0]])
             for t in transcripts2]

    if param_loglevel >= 4:
        print("# building sequence 1")
        for transcript in transcripts1:
            if transcript[0] not in cds1:
                print("# %s not found" % transcript[0])
        print("# building sequence 2")
        for transcript in transcripts2:
            if transcript[0] not in cds2:
                # the original reported transcripts1[i] here (copy-paste
                # slip), printing the wrong id or raising IndexError.
                print("# %s not found" % transcript[0])
        print("# all-vs-all alignment")

    # per-transcript lists of (coverage, partner index); diagnostics only
    alis1 = [[] for _ in seqs1]
    alis2 = [[] for _ in seqs2]

    if param_loglevel >= 3:
        print("#################################")
        for transcript in transcripts1:
            for cd in cds1[transcript[0]]:
                print("# " + str(cd))
        print("# versus")
        for transcript in transcripts2:
            for cd in cds2[transcript[0]]:
                print("# " + str(cd))
        sys.stdout.flush()

    def map_row(alignment, x):
        """Return the column of the first mapped row at or after *x* (0 if none)."""
        while x <= alignment.getRowTo():
            col = alignment.mapRowToCol(x)
            if col:
                return col
            x += 1
        return 0

    # candidate pairs keyed by their weight (minimum coverage)
    weights = {}

    for i, seq1 in enumerate(seqs1):
        (prediction_id1, sbjct_token1, sbjct_strand1,
         sbjct_from1, sbjct_to1) = transcripts1[i]
        len1 = seq1.getLength()

        for j, seq2 in enumerate(seqs2):
            (prediction_id2, sbjct_token2, sbjct_strand2,
             sbjct_from2, sbjct_to2) = transcripts2[j]
            len2 = seq2.getLength()

            map_a2b = alignlib_lite.makeAlignmentVector()

            if param_loglevel >= 3:
                print("# Starting alignment of pair (%i,%i) of lengths %s:%i and %s:%i" %
                      (i, j, prediction_id1, len1, prediction_id2, len2))
                sys.stdout.flush()

            if len1 * len2 > param_max_matrix_size:
                # switch to tuple alignment if sequences are too large
                if param_loglevel >= 2:
                    print("# WARNING: sequences are of length %i and %i: switching to dot alignment." % (
                        len1, len2))
                    sys.stdout.flush()
                alignator_dots.align(map_a2b, seq1, seq2)
            else:
                alignator.align(map_a2b, seq1, seq2)

            row_from, row_to = map_a2b.getRowFrom(), map_a2b.getRowTo()
            col_from, col_to = map_a2b.getColFrom(), map_a2b.getColTo()

            coverage_a = 100.0 * (row_to - row_from + 1) / len1
            coverage_b = 100.0 * (col_to - col_from + 1) / len2

            # get copy of cds, but only those overlapping with alignment
            c1 = Exons.GetExonsRange(
                cds1[prediction_id1],
                (row_from - 1) * 3,
                row_to * 3 + 1,
                full=False,
                min_overlap=param_min_alignment_exon_overlap,
                min_exon_size=param_min_exon_size)
            c2 = Exons.GetExonsRange(
                cds2[prediction_id2],
                (col_from - 1) * 3,
                col_to * 3 + 1,
                full=False,
                min_overlap=param_min_alignment_exon_overlap,
                min_exon_size=param_min_exon_size)

            # check exon boundaries, look at starts, skip first exon.
            # "//" keeps the integer division the coordinates rely on.
            mapped_boundaries = sorted(
                map_row(map_a2b, exon.mPeptideFrom // 3 + 1)
                for exon in c1[1:])
            reference_boundaries = sorted(
                exon.mPeptideFrom // 3 + 1 for exon in c2[1:])

            nmissed_cmp2ref = Exons.CountMissedBoundaries(
                mapped_boundaries, reference_boundaries,
                param_boundaries_max_slippage)
            nmissed_ref2cmp = Exons.CountMissedBoundaries(
                reference_boundaries, mapped_boundaries,
                param_boundaries_max_slippage)
            min_nmissed = min(nmissed_cmp2ref, nmissed_ref2cmp)

            # The flags below stay 0/1 integers because they are written
            # verbatim into the "pair" output lines.
            is_ok = 0
            if len(c1) == 1 and len(c2) == 1:
                # no intron: is ok
                is_ok = 1
            elif min_nmissed == 0:
                is_ok = 1
            elif param_boundaries_allow_missed and \
                    len(mapped_boundaries) >= param_boundaries_allow_missed and \
                    min_nmissed <= param_boundaries_max_missed:
                # allow for missed boundaries, if
                # param_boundaries_allow_missed > 0
                is_ok = 1

            cc = min(coverage_a, coverage_b)
            if cc >= param_min_coverage:
                is_ok_coverage = 1
            else:
                is_ok_coverage = 0

            # check for missing introns
            is_ok_exons = 1
            exon_difference = abs(len(c1) - len(c2))
            if exon_difference != 0:
                if param_missing_max_missing:
                    if exon_difference > param_missing_max_missing or \
                            min(len(c1), len(c2)) < param_missing_min_present:
                        is_ok_exons = 0
                else:
                    is_ok_exons = 0

            if param_loglevel >= 3:
                print(" ".join(map(str, (
                    "# i=", i, "li=", len(c1), "j=", j, "lj=", len(c2),
                    "boundaries_ok=", is_ok,
                    "nexons_ok=", is_ok_exons,
                    "missed_c2r=", nmissed_cmp2ref,
                    "missed_r2c=", nmissed_ref2cmp,
                    "min_cov=", cc,
                    "mapped=", mapped_boundaries,
                    "reference=", reference_boundaries))))
                print("# " + "\t".join(map(str, (
                    alignlib_lite.AlignmentFormatEmissions(map_a2b),
                    map_a2b.getNumGaps(),
                    coverage_a, coverage_b))))
                sys.stdout.flush()

            # dump out pairs
            location_fields = (
                sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1, len1,
                sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2, len2)
            row_ali = col_ali = None
            for method in param_write_pairs:
                if method in ("all", "alignment") and row_ali is None:
                    # NOTE(review): row_ali/col_ali were used here without
                    # ever being assigned (NameError). They are now taken
                    # from the emissions format object; confirm the
                    # attribute names against alignlib_lite.
                    emissions = alignlib_lite.AlignmentFormatEmissions(
                        map_a2b)
                    row_ali = emissions.mRowAlignment
                    col_ali = emissions.mColAlignment

                if method == "all":
                    print("\t".join(map(str, (
                        ("pair", method, prediction_id1, prediction_id2) +
                        location_fields +
                        (row_from, row_to, row_ali,
                         col_from, col_to, col_ali,
                         map_a2b.getNumGaps(),
                         coverage_a, coverage_b,
                         nmissed_cmp2ref, mapped_boundaries,
                         nmissed_ref2cmp, reference_boundaries,
                         i, j, len(c1), len(c2),
                         cc, is_ok, is_ok_exons, is_ok_coverage)))))
                elif method == "alignment":
                    print("\t".join(map(str, (
                        "pair", method, prediction_id1, prediction_id2,
                        row_from, row_to, row_ali,
                        col_from, col_to, col_ali,
                        map_a2b.getNumGaps(),
                        coverage_a, coverage_b))))
                elif method == "location":
                    print("\t".join(map(str, (
                        ("pair", method, prediction_id1, prediction_id2) +
                        location_fields))))

            if not is_ok_exons:
                if param_loglevel >= 4:
                    print("# rejected %i and %i: too many exons difference." % (
                        i, j))
                continue

            if param_check_exon_boundaries and not is_ok:
                continue

            if cc < param_min_coverage:
                continue

            alis1[i].append((coverage_a, j))
            alis2[j].append((coverage_b, i))
            weights.setdefault(cc, []).append((i, j, map_a2b))

    # Greedy assignment: walk candidates by descending weight and accept
    # a pair if neither transcript has been used yet.
    pairs = []
    assigned1 = set()
    assigned2 = set()

    if param_loglevel >= 3:
        print("# alis1= " + str(alis1))
        print("# alis2= " + str(alis2))
        print("# --------------------------------------")

    for weight in sorted(weights, reverse=True):
        for i, j, map_a2b in weights[weight]:
            if i not in assigned1 and j not in assigned2:
                pairs.append(
                    (transcripts1[i], transcripts2[j], weight, map_a2b))
                assigned1.add(i)
                assigned2.add(j)
        # nothing more can be paired once one side is exhausted
        if len(assigned1) == len(transcripts1) or \
                len(assigned2) == len(transcripts2):
            break

    return pairs
else: alignlib_lite.copyAlignment(tmp_map_row2col, map_row2col, e1.mPeptideFrom / 3 + 1, e1.mPeptideTo / 3 + 1, e2.mPeptideFrom / 3 + 1, e2.mPeptideTo / 3 + 1) # in case of split codons, there is an alignment of length # 1. Skip that. if tmp_map_row2col.getLength() > 1: print string.join( map(str, (link.mQueryToken, e1.mRank, link.mSbjctToken, e2.mRank, link.mEvalue, alignlib_lite.AlignmentFormatEmissions( tmp_map_row2col))), "\t") npairs += 1 else: if param_loglevel >= 2: print "# SKIPPED: %s" % str(link) nskipped += 1 if (ninput % param_report_step) == 0: if param_loglevel >= 1: print "# ninput=%i, noutput=%i, nskipped=%i" % (ninput, npairs, nskipped) sys.stdout.flush() if param_loglevel >= 1: print "# ninput=%i, noutput=%i, nskipped=%i" % (ninput, npairs,
def WriteExons(token1, peptide1, cds1, transcript1,
               token2, peptide2, cds2, transcript2,
               peptide_map_a2b):
    """Write unaligned pairs of equivalent exons and of the introns between them.

    Exon equivalences come from ``Exons.CompareGeneStructures``. For each
    equivalence whose coverage exceeds ``param_min_exon_coverage`` an
    "exon" record is printed (if ``param_write_exons`` is set). If
    ``param_write_introns`` is set and the previous accepted equivalence
    involved the directly preceding exon in both genes, an "intron" record
    for the sequence between the two exon pairs is printed as well.

    Alignment-based output of exons/introns (dialign, dba, clustal,
    Needleman-Wunsch/Smith-Waterman) used to live here as commented-out
    code; it was disabled and only unaligned fragments are written.

    Arguments:
        token1, token2: identifiers written into the output records.
        peptide1, peptide2: unused; kept for interface compatibility.
        cds1, cds2: lists of exons with ``mGenomeFrom``/``mGenomeTo``.
        transcript1, transcript2: sequences that are sliced with the
            exon coordinates.
        peptide_map_a2b: peptide alignment between the two genes.

    Returns:
        Tuple ``(nerrors, nskipped)``. ``nerrors`` is always 0 in the
        current implementation; ``nskipped`` counts intron pairs whose
        fragment lengths were out of the configured bounds.
    """
    if param_loglevel >= 3:
        for cd in cds1:
            print("# " + str(cd))
        for cd in cds2:
            print("# " + str(cd))
        print("# peptide_map_a2b " + str(
            alignlib_lite.AlignmentFormatExplicit(peptide_map_a2b)))
        sys.stdout.flush()

    dna_map_a2b = Genomics.AlignmentProtein2CDNA(peptide_map_a2b, cds1, cds2)

    if len(cds1) != len(cds2):
        if param_loglevel >= 4:
            # the warning text ("different number of exons") was disabled
            # in the original; only the empty line is still emitted.
            print("")

    nskipped, nerrors = 0, 0

    if param_loglevel >= 5:
        nmapped = 0
        for x in range(dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo() + 1):
            if dna_map_a2b.mapRowToCol(x) >= 0:
                nmapped += 1
        print("# nmapped= " + str(nmapped))
        print(str(alignlib_lite.AlignmentFormatEmissions(dna_map_a2b)))

    result = Exons.CompareGeneStructures(cds1, cds2,
                                         map_cmp2ref=peptide_map_a2b)

    if param_loglevel >= 2:
        print(result.Pretty("#"))

    nskipped_exons, nskipped_introns = 0, 0

    last_e1, last_e2 = None, None

    for link in result.mEquivalences:

        if link.mCoverage <= param_min_exon_coverage:
            nskipped_exons += 1
            continue

        e1, e2 = link.mId1, link.mId2

        c1 = cds1[e1]
        c2 = cds2[e2]
        exon_fragment1 = transcript1[c1.mGenomeFrom:c1.mGenomeTo]
        exon_fragment2 = transcript2[c2.mGenomeFrom:c2.mGenomeTo]

        ###################################################################
        # write unaligned exons
        if param_write_exons:
            pair = AlignedPairs.UnalignedPair()

            pair.mCategory = "exon"
            pair.mToken1 = token1
            pair.mId1 = e1 + 1
            pair.mNum1 = len(cds1)
            pair.mLen1 = len(exon_fragment1)
            pair.mSequence1 = exon_fragment1
            pair.mToken2 = token2
            pair.mId2 = e2 + 1
            pair.mNum2 = len(cds2)
            pair.mLen2 = len(exon_fragment2)
            pair.mSequence2 = exon_fragment2
            pair.mFrom1, pair.mTo1 = c1.mGenomeFrom, c1.mGenomeTo
            pair.mFrom2, pair.mTo2 = c2.mGenomeFrom, c2.mGenomeTo

            print(str(pair))
            sys.stdout.flush()

        ###################################################################
        # write introns for orthologous introns
        # orthologous introns are between orthologous exons
        if param_write_introns and last_e1 is not None:
            if e1 - last_e1 != 1 or e2 - last_e2 != 1:
                nskipped_introns += 1
            else:
                intron_from1 = cds1[e1 - 1].mGenomeTo
                intron_to1 = cds1[e1].mGenomeFrom
                intron_from2 = cds2[e2 - 1].mGenomeTo
                intron_to2 = cds2[e2].mGenomeFrom

                intron_fragment1 = transcript1[intron_from1:intron_to1]
                intron_fragment2 = transcript2[intron_from2:intron_to2]
                len1 = len(intron_fragment1)
                len2 = len(intron_fragment2)

                if len1 == 0 or len2 == 0:
                    print("## ERROR: empty intron fragments: %i-%i out of %i and %i-%i out of %i." %
                          (intron_from1, intron_to1, len(transcript1),
                           intron_from2, intron_to2, len(transcript2)))
                    # as in the original, last_e1/last_e2 are NOT advanced
                    # on this path.
                    continue

                pair = AlignedPairs.UnalignedPair()
                pair.mCategory = "intron"
                pair.mToken1 = token1
                pair.mId1 = e1 + 1
                pair.mNum1 = len(cds1) - 1
                pair.mLen1 = len1
                pair.mFrom1 = intron_from1
                pair.mTo1 = intron_to1
                pair.mSequence1 = intron_fragment1
                pair.mToken2 = token2
                pair.mId2 = e2 + 1
                # the original assigned mNum1 a second time here, which
                # clobbered the query count and left mNum2 unset.
                pair.mNum2 = len(cds2) - 1
                pair.mLen2 = len2
                pair.mFrom2 = intron_from2
                pair.mTo2 = intron_to2
                pair.mSequence2 = intron_fragment2

                too_short = param_min_intron_length and \
                    min(len1, len2) < param_min_intron_length
                too_long = param_max_intron_length and \
                    max(len1, len2) > param_max_intron_length

                if too_short or too_long:
                    if param_loglevel >= 1:
                        print("# skipped: fragment lengths out of bounds for: %s\t%s\t%s\t%s\t%i\t%i" %
                              (token1, e1, token2, e2, len1, len2))
                        sys.stdout.flush()
                    nskipped += 1

                # NOTE(review): the pair is written even when it was just
                # counted as skipped. The original nesting could not be
                # recovered unambiguously - confirm this is intended.
                print(str(pair))

        last_e1, last_e2 = e1, e2

    if param_loglevel >= 3:
        print("# skipped_exons=%i, skipped_introns=%i" %
              (nskipped_exons, nskipped_introns))

    return nerrors, nskipped
def WriteGeneStructureCorrespondence(mali, identifiers, exons,
                                     param_master_pattern,
                                     gap_char="-", prefix=""):
    """Compare the gene structures of all pairs of sequences in an alignment.

    Every sequence is compared with every other one: the exon boundaries
    of the first are mapped onto the second through the multiple
    alignment and the two structures are compared with
    ``Exons.CompareGeneStructures``. Per-pair lines ("genepair", at
    loglevel >= 2) and summary lines ("allstructure", "refstructure",
    "nexons") are printed.

    Arguments:
        mali: mapping from identifier to aligned sequence.
        identifiers: identifiers to process, in matrix order.
        exons: mapping from identifier to its list of exons.
        param_master_pattern: regular expression; identifiers matching it
            are counted separately as reference sequences.
        gap_char: gap character used in the alignment.
        prefix: first column of every summary line.

    Returns:
        A square numpy matrix of gene structure compatibility with one
        row/column per identifier:

        0   : perfect compatibility (no exon boundary missed)
        100 : no compatibility

        The value is the percentage of missed exon boundaries among all
        exon boundaries of the pair. The matrix is empty (0 x 0) if
        *identifiers* is empty.

    Raises:
        ValueError: if comparisons against reference sequences were made
            but no exons were counted for them.
    """
    wmali = len(identifiers)
    matrix_compatibility = numpy.zeros((wmali, wmali))

    # The original indexed identifiers[0] before this check and crashed
    # with IndexError on empty input.
    if wmali == 0:
        return matrix_compatibility

    def ratio(numerator, denominator):
        """Return numerator/denominator as float, 0.0 for a zero denominator."""
        if denominator == 0:
            return 0.0
        return float(numerator) / denominator

    nok = 0
    nperfect = 0
    ntotal_exons = 0
    nidentical_exons = 0
    nskipped_exons = 0

    ref_nok = 0
    ref_nperfect = 0
    ref_ntotal_exons = 0
    ref_nidentical_exons = 0
    ref_nskipped_exons = 0
    ref_ntotal = 0

    rx = re.compile(param_master_pattern)

    # list of number of exons
    anexons = []
    # exons in reference
    ref_nexons = 0

    for x, key1 in enumerate(identifiers):

        seq = mali[key1]

        anexons.append(len(exons[key1]))
        if rx.search(key1):
            ref_nexons = len(exons[key1])

        for y, key2 in enumerate(identifiers):

            if key2 == key1:
                continue

            if param_loglevel >= 3:
                print("#############################################")
                print("# comparing %s to %s" % (key1, key2))

            seq_master = mali[key2]
            ref_exons = exons[key2]

            map_cmp2ref = MaliIO.getMapFromMali(seq, seq_master, gap_char)

            if param_loglevel >= 5:
                print(str(alignlib_lite.AlignmentFormatEmissions(map_cmp2ref)))

            # map exon boundaries to reference sequence
            cmp_exons = []
            for exon in exons[key1]:
                mapped = exon.GetCopy()
                mapped.mPeptideFrom = MyMap(
                    map_cmp2ref, exon.mPeptideFrom + 1, 3, -1)
                mapped.mPeptideTo = MyMap(
                    map_cmp2ref, exon.mPeptideTo, 3, 0)
                cmp_exons.append(mapped)

            # massage boundaries for terminal exons:
            if cmp_exons[0].mPeptideFrom <= 0:
                cmp_exons[0].mPeptideFrom = ref_exons[0].mPeptideFrom
            if cmp_exons[-1].mPeptideTo <= 0:
                cmp_exons[-1].mPeptideTo = ref_exons[-1].mPeptideTo

            if param_loglevel >= 4:
                for exon in exons[key1]:
                    print("# exon " + str(exon))

            if param_loglevel >= 3:
                for exon in cmp_exons:
                    print("# exon " + str(exon))
                for exon in ref_exons:
                    print("# exon " + str(exon))

            # do exon comparison
            comparison = Exons.CompareGeneStructures(
                cmp_exons,
                ref_exons,
                threshold_min_pide=0,
                threshold_slipping_exon_boundary=param_threshold_splipping_exon_boundary,
                threshold_terminal_exon=param_threshold_terminal_exon)

            if param_loglevel >= 3:
                print(comparison.Pretty(prefix="# EVAL: "))

            # analyse results
            min_nexons = min(len(cmp_exons), len(ref_exons))
            max_nexons = max(len(cmp_exons), len(ref_exons))

            is_perfect = False
            is_ok = False
            status = []

            # non-equivalent exon pairs
            n_nonequivalent = len(cmp_exons) - \
                comparison.mNumIdenticalExons - comparison.mNumSkippedExons

            # first status letter: degree of conservation
            if comparison.mNumIdenticalExons == 0:
                # F: complete and utter failure, no excuses
                status.append("F")
            elif n_nonequivalent == 0:
                # =: perfect conservation
                status.append("=")
                is_ok = True
                is_perfect = True
            elif n_nonequivalent == min_nexons - comparison.mNumSkippedExons:
                # D: completely different predictions
                status.append("D")
            elif n_nonequivalent in (1, 2):
                # A: almost conserved
                status.append("A")
                is_ok = True
            elif n_nonequivalent > 2:
                # M: mostly conserved (in case of long proteins that is
                # good enough); S: spuriously conserved.
                # "//" keeps the original integer percentage.
                if (100 * comparison.mNumIdenticalExons) // max_nexons > \
                        param_evaluate_min_percent_exon_identity:
                    status.append("M")
                else:
                    status.append("S")
            else:
                # U: unconserved
                status.append("U")

            # second status letter: relative number of exons. The original
            # repeated the ">" condition in the elif, so "<" could never
            # be reported.
            if len(cmp_exons) > len(ref_exons):
                status.append(">")
            elif len(cmp_exons) < len(ref_exons):
                status.append("<")
            else:
                status.append("=")

            # third status letter: size class of the two gene structures
            if min_nexons == max_nexons and min_nexons == 1:
                status.append("S")
            elif min_nexons == 1 and max_nexons == 2:
                status.append("s")
            elif min_nexons == 2 and max_nexons == 2:
                status.append("D")
            elif min_nexons == 2 and max_nexons > 2:
                status.append("d")
            elif min_nexons == max_nexons:
                status.append("M")
            elif min_nexons > 2 and max_nexons > 2:
                status.append("m")
            else:
                status.append("U")

            status = "".join(status)

            structure_compatibility = 100

            if is_ok:
                nok += 1
                structure_compatibility = 100 - 100 * \
                    (comparison.mNumIdenticalExons +
                     comparison.mNumSkippedExons) // len(cmp_exons)
            if is_perfect:
                nperfect += 1
                structure_compatibility = 0

            # NOTE(review): the value chosen here is always overwritten by
            # the boundary based ratio below; kept as found - confirm
            # which of the two measures is intended.
            if abs(comparison.mNumDifferenceExons) > param_max_exons_difference:
                compatibility_value = 100
            else:
                compatibility_value = structure_compatibility

            nboundaries = comparison.mNumRefBoundaries + \
                comparison.mNumCmpBoundaries

            if nboundaries == 0:
                compatibility_value = 0
            else:
                compatibility_value = 100 * \
                    (comparison.mNumMissedRefBoundaries +
                     comparison.mNumMissedCmpBoundaries) // nboundaries

            matrix_compatibility[x][y] = compatibility_value

            nidentical_exons += comparison.mNumIdenticalExons
            nskipped_exons += comparison.mNumSkippedExons
            ntotal_exons += len(cmp_exons)

            if param_loglevel >= 2:
                print("%s\tgenepair\t%s\t%s\t%s\t%i\t%i\t%i\t%s" % (
                    prefix, key1, key2, status, compatibility_value,
                    len(cmp_exons), len(ref_exons), str(comparison)))

            # comparison to reference: count separately:
            if rx.search(key2):
                ref_nidentical_exons += comparison.mNumIdenticalExons
                ref_nskipped_exons += comparison.mNumSkippedExons
                ref_ntotal_exons += len(cmp_exons)
                if is_ok:
                    ref_nok += 1
                if is_perfect:
                    ref_nperfect += 1
                ref_ntotal += 1

    # number of ordered pairs; 0 for a single sequence, hence ratio()
    ntotal = wmali * (wmali - 1)

    print("%s\tallstructure\t%i\t%i\t%i\t%6.4f\t%6.4f\t%i\t%i\t%i\t%6.4f\t%6.4f" % (
        prefix,
        ntotal, nperfect, nok,
        ratio(nperfect, ntotal), ratio(nok, ntotal),
        ntotal_exons, nidentical_exons, nskipped_exons,
        ratio(nidentical_exons, ntotal_exons),
        ratio(nidentical_exons + nskipped_exons, ntotal_exons)))

    if ref_ntotal > 0:
        if ref_ntotal_exons == 0:
            # string exceptions cannot be raised on Python >= 2.6
            raise ValueError(
                "no exons in reference : ref_ntotal_exons = 0, ref_ntotal = %i" % (ref_ntotal))

        print("%s\trefstructure\t%i\t%i\t%i\t%6.4f\t%6.4f\t%i\t%i\t%i\t%6.4f\t%6.4f" % (
            prefix,
            ref_ntotal, ref_nperfect, ref_nok,
            float(ref_nperfect) / ref_ntotal,
            float(ref_nok) / ref_ntotal,
            ref_ntotal_exons, ref_nidentical_exons, ref_nskipped_exons,
            float(ref_nidentical_exons) / ref_ntotal_exons,
            float(ref_nidentical_exons + ref_nskipped_exons) / ref_ntotal_exons))

    print("%s\tnexons\t%i\t%i\t" % (prefix, len(anexons), ref_nexons) +
          "\t".join(["%.2f" % value for value in (
              min(anexons),
              max(anexons),
              numpy.mean(anexons),
              numpy.median(anexons),
              numpy.std(anexons))]))

    return matrix_compatibility
def PrintCluster(cluster, cluster_id, lengths,
                 peptide_sequences=None, regex_preferred=None):
    """Print one line per cluster member, prefixed by the representative.

    The representative is the member with the largest length in
    *lengths*. If *regex_preferred* is given and at least one matching
    member has a positive length, the longest matching member is used
    instead. If no member has a known positive length, the first member
    of the cluster is used.

    Arguments:
        cluster: sequence of member identifiers.
        cluster_id: passed through unchanged.
        lengths: mapping from identifier to length; missing members count
            as length 0.
        peptide_sequences: optional mapping from identifier to peptide
            sequence. If given (and non-empty), each line additionally
            carries the alignment of the representative to the member
            (a diagonal for the representative itself, a local alignment
            otherwise, empty if a sequence is missing).
        regex_preferred: optional regular expression selecting preferred
            representatives.

    Returns:
        *cluster_id*.
    """
    if regex_preferred:
        rx = re.compile(regex_preferred)
    else:
        rx = None

    max_al = 0
    max_pl = 0
    rep_a = None
    rep_p = None
    for member in cluster:
        length = lengths[member] if member in lengths else 0

        if length > max_al:
            max_al = length
            rep_a = member

        if rx and rx.search(member) and length > max_pl:
            max_pl = length
            rep_p = member

    rep = rep_p if max_pl > 0 else rep_a

    if rep is None:
        # No member had a positive length; previously the literal "None"
        # was written as representative. Use the first member instead.
        for rep in cluster:
            break

    # created once on first use instead of once per aligned member
    alignator = None

    for mem in cluster:
        length = lengths[mem] if mem in lengths else 0

        if peptide_sequences:
            map_rep2mem = alignlib_lite.makeAlignmentVector()

            if rep == mem and rep in lengths:
                alignlib_lite.addDiagonal2Alignment(
                    map_rep2mem, 1, lengths[rep], 0)
            elif mem in peptide_sequences and rep in peptide_sequences:
                if alignator is None:
                    alignator = alignlib_lite.makeAlignatorDPFull(
                        alignlib_lite.ALIGNMENT_LOCAL, -10.0, -1.0)
                alignator.align(
                    map_rep2mem,
                    alignlib_lite.makeSequence(peptide_sequences[rep]),
                    alignlib_lite.makeSequence(peptide_sequences[mem]))

            f = alignlib_lite.AlignmentFormatEmissions(map_rep2mem)
            print("\t".join(map(str, (rep, mem, length, f))))
        else:
            print("\t".join(map(str, (rep, mem, length))))

    sys.stdout.flush()

    return cluster_id
def main(argv=None):
    """script main.

    Reads predictions line by line from stdin, derives the exons of each
    prediction with ``Exons.Alignment2Exons``, attaches the genomic
    sequence taken from the indexed genome (``--genome-file``) and writes
    one of several output formats (``--format``).

    Lines starting with ``#`` or ``id`` are ignored.  Lines that fail to
    parse (``ValueError``) are reported on ``options.stdlog`` and counted
    as errors.  Exactly one output format is written per prediction.

    Exits with status 2 if positional arguments are given.

    NOTE(review): *argv* defaults to ``sys.argv`` but is not forwarded to
    ``E.Start`` in this function -- confirm whether that is intended.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/predictions2cds.py 1858 2008-05-13 15:07:05Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("-o", "--forward-coordinates", dest="forward_coordinates",
                      action="store_true",
                      help="input uses forward coordinates.")

    parser.add_option(
        "-f", "--format", dest="format", type="choice",
        # "gff-match" and "gff-exon" must be listed here, otherwise the
        # branches handling them below can never be selected.  "gff" has no
        # branch of its own and produces the default output.
        choices=("default", "cds", "cdnas", "map", "gff", "gff-match",
                 "gff-exon", "intron-fasta", "exons"),
        help="output format.")

    parser.add_option("-r", "--reset-to-start", dest="reset_to_start",
                      action="store_true",
                      help="move genomic coordinates to begin from 0.")

    parser.add_option("--reset-query", dest="reset_query", action="store_true",
                      help="move peptide coordinates to begin from 0.")

    parser.set_defaults(
        genome_file=None,
        forward_coordinates=False,
        format="default",
        reset_to_start=False,
        reset_query=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print("%s no arguments required." % (USAGE,))
        sys.exit(2)

    # running identifier for rows of the default output format
    cds_id = 1

    # a single parser entry is re-used for every input line
    entry = PredictionParser.PredictionParserEntry()

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    ninput, noutput, nskipped, nerrors = 0, 0, 0, 0

    for line in sys.stdin:

        if line[0] == "#":
            continue
        if line.startswith("id"):
            continue

        ninput += 1

        try:
            entry.Read(line)
        except ValueError as msg:
            options.stdlog.write(
                "# parsing failed with msg %s in line %s" % (msg, line))
            nerrors += 1
            continue

        cds = Exons.Alignment2Exons(entry.mMapPeptide2Genome,
                                    query_from=entry.mQueryFrom,
                                    sbjct_from=entry.mSbjctGenomeFrom,
                                    add_stop_codon=0)

        for cd in cds:
            cd.mSbjctToken = entry.mSbjctToken
            cd.mSbjctStrand = entry.mSbjctStrand

        # sanity check: the last exon should end where the prediction ends
        if cds[-1].mGenomeTo != entry.mSbjctGenomeTo:
            options.stdlog.write(
                "# WARNING: discrepancy in exon calculation!!!\n")
            for cd in cds:
                options.stdlog.write("# %s\n" % str(cd))
            options.stdlog.write("# %s\n" % entry)

        lsequence = fasta.getLength(entry.mSbjctToken)
        genomic_sequence = fasta.getSequence(entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo)

        # deal with forward coordinates: convert them to negative strand
        # coordinates
        if options.forward_coordinates and entry.mSbjctStrand == "-":
            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = \
                lsequence - entry.mSbjctGenomeTo, \
                lsequence - entry.mSbjctGenomeFrom
            for cd in cds:
                cd.InvertGenomicCoordinates(lsequence)

        # attach sequence to cds: exon coordinates are made relative to the
        # start of the retrieved genomic segment
        for cd in cds:
            start = cd.mGenomeFrom - entry.mSbjctGenomeFrom
            end = cd.mGenomeTo - entry.mSbjctGenomeFrom
            cd.mSequence = genomic_sequence[start:end]

        # reset coordinates for query: controlled by --reset-query, which is
        # independent of the genomic reset (--reset-to-start) below
        if options.reset_query:
            offset = entry.mPeptideFrom
            for cd in cds:
                cd.mPeptideFrom -= offset
                cd.mPeptideTo -= offset

        # play with coordinates: after this step ``offset`` always holds the
        # genomic shift that was applied to the exons (0 if none)
        if options.reset_to_start:
            offset = entry.mSbjctGenomeFrom
            for cd in cds:
                cd.mGenomeFrom -= offset
                cd.mGenomeTo -= offset
        else:
            offset = 0

        # the output formats are mutually exclusive, hence a single
        # if/elif chain: exactly one branch runs per prediction
        if options.format == "cds":
            rank = 0
            for cd in cds:
                rank += 1
                cd.mQueryToken = entry.mQueryToken
                cd.mSbjctToken = entry.mSbjctToken
                cd.mSbjctStrand = entry.mSbjctStrand
                cd.mRank = rank
                print(str(cd))

        elif options.format == "exons":
            rank = 0
            for cd in cds:
                rank += 1
                options.stdout.write("\t".join(
                    map(str, (entry.mPredictionId,
                              cd.mSbjctToken,
                              cd.mSbjctStrand,
                              rank,
                              cd.frame,
                              cd.mPeptideFrom,
                              cd.mPeptideTo,
                              cd.mGenomeFrom,
                              cd.mGenomeTo))) + "\n")

        elif options.format == "cdnas":
            print("\t".join(
                map(str, (entry.mPredictionId,
                          entry.mQueryToken,
                          entry.mSbjctToken,
                          entry.mSbjctStrand,
                          entry.mSbjctGenomeFrom - offset,
                          entry.mSbjctGenomeTo - offset,
                          genomic_sequence))))

        elif options.format == "map":
            map_prediction2genome = alignlib_lite.makeAlignmentSet()

            # one diagonal per exon
            for cd in cds:
                alignlib_lite.addDiagonal2Alignment(
                    map_prediction2genome,
                    cd.mPeptideFrom + 1,
                    cd.mPeptideTo,
                    (cd.mGenomeFrom - offset) - cd.mPeptideFrom)

            print("\t".join(
                map(str, (entry.mPredictionId,
                          entry.mSbjctToken,
                          entry.mSbjctStrand,
                          alignlib_lite.AlignmentFormatEmissions(
                              map_prediction2genome)))))

        elif options.format == "intron-fasta":
            rank = 0
            # single exon predictions have no introns: count as skipped;
            # note that they are not counted in noutput
            if len(cds) == 1:
                nskipped += 1
                continue

            last = cds[0].mGenomeTo
            for cd in cds[1:]:
                rank += 1
                key = "%s %i %s:%s:%i:%i" % (
                    entry.mPredictionId,
                    rank,
                    entry.mSbjctToken,
                    entry.mSbjctStrand,
                    last,
                    entry.mSbjctGenomeFrom)
                # the intron lies between the end of the previous exon and
                # the start of the current one
                sequence = genomic_sequence[
                    last - entry.mSbjctGenomeFrom:
                    cd.mGenomeFrom - entry.mSbjctGenomeFrom]
                options.stdout.write(">%s\n%s\n" % (key, sequence))
                last = cd.mGenomeTo

        elif options.format == "gff-match":
            print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tTarget \"%s\" %i %i; Score %i; Introns %i; Frameshifts %i; Stops %i" %
                  (entry.mSbjctToken,
                   "gpipe", "similarity",
                   entry.mSbjctGenomeFrom,
                   entry.mSbjctGenomeTo,
                   entry.mPercentIdentity,
                   entry.mSbjctStrand,
                   ".",
                   entry.mQueryToken,
                   entry.mQueryFrom,
                   entry.mQueryTo,
                   entry.score,
                   entry.mNIntrons,
                   entry.mNFrameShifts,
                   entry.mNStopCodons))

        elif options.format == "gff-exon":
            rank = 0
            for cd in cds:
                rank += 1
                print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tTarget \"%s\" %i %i; Score %i; Rank %i/%i; Prediction %i" %
                      (entry.mSbjctToken,
                       "gpipe", "similarity",
                       cd.mGenomeFrom,
                       cd.mGenomeTo,
                       entry.mPercentIdentity,
                       entry.mSbjctStrand,
                       ".",
                       entry.mQueryToken,
                       cd.mPeptideFrom / 3 + 1,
                       cd.mPeptideTo / 3 + 1,
                       entry.score,
                       rank,
                       len(cds),
                       entry.mPredictionId))

        else:
            # default output: peptide coordinates are replaced by running
            # positions accumulated from the genomic exon lengths
            exon_from = 0
            for cd in cds:
                cd.mPeptideFrom = exon_from
                exon_from += cd.mGenomeTo - cd.mGenomeFrom
                cd.mPeptideTo = exon_from
                print("\t".join(
                    map(str, (cds_id,
                              entry.mPredictionId,
                              cd.mPeptideFrom,
                              cd.mPeptideTo,
                              cd.frame,
                              cd.mGenomeFrom,
                              cd.mGenomeTo,
                              cd.mSequence))))
                cds_id += 1

        noutput += 1
tt += 1 continue overlap += (min(r.mGenomeTo, t.mGenomeTo) - max(r.mGenomeFrom, t.mGenomeFrom)) rr += 1 tt += 1 if overlap == 0: continue map_reference2target.clear() row = alignlib_lite.makeSequence(reference.mTranslation) col = alignlib_lite.makeSequence(target.mTranslation) alignator.align(map_reference2target, row, col) f = alignlib_lite.AlignmentFormatEmissions(map_reference2target) row_ali, col_ali = f.mRowAlignment, f.mColAlignment pidentity = 100.0 * alignlib_lite.calculatePercentIdentity( map_reference2target, row, col) psimilarity = 100.0 * alignlib_lite.calculatePercentSimilarity( map_reference2target) union = max( reference.mSbjctGenomeTo, target.mSbjctGenomeTo) - \ min( reference.mSbjctGenomeFrom, target.mSbjctGenomeFrom ) inter = min( reference.mSbjctGenomeTo, target.mSbjctGenomeTo) - \ max( reference.mSbjctGenomeFrom, target.mSbjctGenomeFrom ) assignment_id += 1 print string.join( map(str,