def pep_seqs(cursor, gene_id, exons):
    """Translate the usable exons of one gene and store the peptides.

    An exon is processed only if it is coding, has no covering exon
    (``covering_exon`` not positive), has an ``exon_seq`` entry, and its
    stored DNA is at least 4 nt long.  For such an exon the DNA is cropped
    to the translation bounds and translated with ``translate``; the
    peptide and its start/end coordinates within the exon DNA are then
    written to the ``exon_seq`` table.

    As a consistency check, ``dna_seq[start:end]`` is translated a second
    time with ``Seq.translate``; when the two peptides differ, both stored
    coordinates are set to -10.

    Args:
        cursor: database cursor, handed to every helper that queries the db.
        gene_id: id of the gene the exons belong to; used only to decide
            whether the vertebrate mitochondrial codon table applies.
        exons: iterable of exon objects exposing ``exon_id``, ``is_coding``,
            ``covering_exon``, ``is_known`` and ``phase``.

    Returns:
        None.

    Diagnostics are printed when the name ``verbose`` (resolved outside
    this function) is true.
    """
    # The codon table depends on the gene only, so it is looked up at most
    # once, and only if at least one exon passes the filters below.
    mito_known = False
    mitochondrial = None

    for exon in exons:
        exon_id = exon.exon_id

        # ---- filters: skip exons we cannot or need not translate
        if not exon.is_coding:
            if verbose:
                print("%s is not coding " % (exon_id,))
            continue
        if exon.covering_exon > 0:
            if verbose:
                print("%s has covering exon" % (exon_id,))
            continue
        exon_seqs = get_exon_seqs(cursor, exon_id, exon.is_known)
        if not exon_seqs:
            if verbose:
                print("%s no exon_seqs" % (exon_id,))
            continue
        # only exon_seq_id and dna_seq are used below; the stored peptide
        # is replaced by the fresh translation
        [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end,
         left_flank, right_flank, dna_seq] = exon_seqs
        if len(dna_seq) < 4:
            if verbose:
                print("%s short dna" % (exon_id,))
            continue

        if not mito_known:
            mitochondrial = is_mitochondrial(cursor, gene_id)
            mito_known = True

        # ---- translate the coding part of the exon
        [seq_start, seq_end] = translation_bounds(cursor, exon_id, verbose)
        if verbose:
            print(" **  %s %s" % (seq_start, seq_end))
        dna_cropped = crop_dna(seq_start, seq_end, dna_seq)
        if verbose:
            print(" **  %s" % (dna_cropped,))

        [offset, length_translated, pepseq, phase_corrected] = translate(
            dna_cropped, exon.phase, mitochondrial, verbose)
        if offset < 0:
            # translation failure; usually some short pieces
            # (end in pos 4 and such)
            if verbose:
                print("%s translation failure" % (exon_id,))
                print("mitochondrial: %s" % (mitochondrial,))
                print("%s %s" % (seq_start, seq_end))
            continue

        # a missing or zero translation start is treated as position 1
        if seq_start is None or seq_start == 0:
            seq_start = 1
        start = seq_start + offset - 1
        end = start + length_translated

        # ---- cross-check against an independent translation
        dnaseq = Seq(dna_seq[start:end], generic_dna)
        if mitochondrial:
            pepseq2 = dnaseq.translate(
                table="Vertebrate Mitochondrial").tostring()
        else:
            pepseq2 = dnaseq.translate().tostring()

        if pepseq != pepseq2:
            # flag the disagreement in the database with sentinel coords
            start = -10
            end = -10
            if verbose:
                print("%s" % (exon_id,))
                print("pep from translate: %s" % (pepseq,))
                print("dna transl: %s" % (pepseq2,))
                print("start: %s" % (start,))
                print("end: %s" % (end,))
                print("")

        # ---- store the result
        qry = "update exon_seq "
        qry += " set protein_seq = '%s', " % pepseq
        qry += " pepseq_transl_start = %d, " % start
        qry += " pepseq_transl_end = %d " % end
        qry += " where exon_seq_id = %d " % exon_seq_id
        rows = search_db(cursor, qry)
        if rows:
            # the update returned something; repeat it verbosely so that
            # search_db reports what went wrong
            rows = search_db(cursor, qry, verbose=True)
def _insert_boundary_markers(seq, boundaries, local_boundaries):
    """Return *seq* with a 3-character marker inserted at each boundary.

    *boundaries* must be sorted.  The marker is "-Z-" when the boundary is
    also in *local_boundaries* and "---" otherwise.  Boundaries at or past
    the end of *seq* get no marker, and whatever follows the last boundary
    is not copied.
    """
    pieces = []
    prev_bdry = 0
    for bdry in boundaries:
        pieces.append(seq[prev_bdry:bdry])
        if bdry >= len(seq):
            continue
        pieces.append("-Z-" if bdry in local_boundaries else "---")
        prev_bdry = bdry
    return "".join(pieces)


def alt_splice_almt(cursor, cfg, acg, species, ensembl_db_name):
    """Write one alignment of the CCDS transcripts per protein coding gene.

    For every known protein coding gene of *species*, each transcript that
    has a CCDS entry and no "bad" exon is laid out on the gene sequence:
    exon DNA in its original case, ``flank_length`` nt of flank on each
    side in lower case, and "-" everywhere else.  A second sequence, named
    ``<transcript>_pep``, holds the translation with each residue padded
    to "-X-".  Flank boundaries are marked with three-character markers,
    the gaps are stripped with ``strip_gaps``, and the result is written to
    ``<afs_dumps>/alt/<HSA|MMU>/<stable gene id>.afa``.

    Args:
        cursor: database cursor, handed to every helper that queries the db.
        cfg: configuration object; ``cfg.dir_path['afs_dumps']`` is the
            output root.
        acg: passed through to ``get_gene_seq``.
        species: species name; 'homo_sapiens' selects the 'HSA' output
            directory, anything else 'MMU'.
        ensembl_db_name: mapping from species name to database name.

    Returns:
        True.

    Diagnostics are printed when the name ``verbose`` (resolved outside
    this function) is true.
    """
    flank_length = 10
    print("############################")
    print("checking alt splicing in  %s" % (species,))

    search_db(cursor, "use " + ensembl_db_name[species])
    gene_ids = get_gene_ids(cursor, biotype='protein_coding', is_known=1)

    spec_short = 'HSA' if species == 'homo_sapiens' else 'MMU'
    outdir = "{0}/alt/{1}".format(cfg.dir_path['afs_dumps'], spec_short)
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    for gene_id in gene_ids:
        stable_gene_id = gene2stable(cursor, gene_id)
        if verbose:
            print("%s %s %s" % (gene_id, stable_gene_id,
                                get_description(cursor, gene_id)))

        # ---- transcripts with a CCDS entry
        tr_w_ccds = []
        for [tr_id, tr_stable] in get_transcript_ids(cursor, gene_id):
            if check_ccds(cursor, tr_stable):
                tr_w_ccds.append([tr_id, tr_stable])
        if not tr_w_ccds:
            continue

        # get all exons for this gene
        all_exons = gene2exon_list(cursor, gene_id)
        all_exon_ids = set(exon.exon_id for exon in all_exons)

        # exon ids of each CCDS transcript, queried once and reused below
        exon_ids_of = {}
        exons_w_ccds = set()
        for [tr_id, tr_stable] in tr_w_ccds:
            exon_ids_of[tr_id] = set(transcript_id2exon_ids(cursor, tr_id))
            exons_w_ccds.update(exon_ids_of[tr_id])

        # ---- for these exons check the sequence
        is_known = 1
        bad_exon = set()
        for exon_id in exons_w_ccds:
            exon = get_exon(cursor, exon_id, is_known)
            seq = get_exon_seqs(cursor, exon_id, is_known)
            if not seq:
                bad_exon.add(exon_id)
                continue
            [exon_seq_id, protein_seq, pepseq_transl_start,
             pepseq_transl_end, left_flank, right_flank, dna_seq] = seq
            if exon.covering_exon < 0:
                # no covering exon: the exon needs its own DNA
                if not dna_seq:
                    bad_exon.add(exon_id)
            elif (exon.covering_exon_known
                  and exon.covering_exon in exons_w_ccds):
                pass
            elif exon.covering_exon not in all_exon_ids:
                # covered by an exon that this gene does not have
                bad_exon.add(exon_id)

        if verbose:
            print("reconstructing alt splice almts for ")
            print("%s %s %s" % (gene_id, stable_gene_id,
                                get_description(cursor, gene_id)))
            print("there are  %s  transscripts with ccds"
                  % (len(tr_w_ccds),))

        # ---- get the gene sequence
        ret = get_gene_seq(acg, cursor, gene_id, species)
        [gene_seq, canonical_exon_pepseq, file_name, seq_name,
         seq_region_start, seq_region_end] = ret
        gene_length = len(gene_seq)

        # sort exons by the order in which they appear in the gene
        all_exons.sort(key=lambda exon: exon.start_in_gene)
        # a bit of a cleanup
        for exon in all_exons:
            cleanup_endphase(cursor, exon)
        exon_by_id = dict((exon.exon_id, exon) for exon in all_exons)

        # ---- which transcripts seem to be completely ok?
        ok_transcripts = []
        for [tr_id, tr_stable] in tr_w_ccds:
            if bad_exon & exon_ids_of[tr_id]:
                continue
            if verbose:
                print("%s  ok " % (tr_stable,))
            ok_transcripts.append([tr_id, tr_stable])
        if not ok_transcripts:
            if verbose:
                print(" no ok transcripts found")
            continue

        # the codon table depends on the gene only
        mitochondrial = is_mitochondrial(cursor, gene_id)

        output_seq = {}
        global_boundaries = set()  # flank boundaries over all transcripts
        local_boundaries = {}      # flank boundaries per transcript
        reconstructed = []         # transcripts that made it into output_seq

        # ---- main loop
        for [tr_id, tr_stable] in ok_transcripts:
            tr_exon_ids = exon_ids_of[tr_id]

            # translation is from where to where?
            ret = get_translation_coords(cursor, tr_id)
            if not ret:
                continue
            [seq_start, start_exon_id, seq_end, end_exon_id] = ret
            start_exon = exon_by_id.get(start_exon_id)
            end_exon = exon_by_id.get(end_exon_id)
            if start_exon is None or end_exon is None:
                # the translation start/end exon is not among the exons of
                # this gene, so the coordinates cannot be placed
                if verbose:
                    print("%s translation start/end exon not found"
                          % (tr_stable,))
                continue
            transl_start_in_gene = start_exon.start_in_gene + seq_start
            transl_end_in_gene = end_exon.start_in_gene + seq_end

            pepseq_name = tr_stable + "_pep"
            tr_boundaries = []
            local_boundaries[tr_stable] = tr_boundaries
            output_seq[tr_stable] = "-" * gene_length
            output_seq[pepseq_name] = "-" * gene_length
            reconstructed.append(tr_stable)

            transl_end = None
            for exon in all_exons:
                if exon.exon_id not in tr_exon_ids:
                    continue
                start = exon.start_in_gene
                end = exon.end_in_gene

                # NOTE(review): the global list gets start_flank-1 while
                # the local one gets start_flank, and start_flank == 0
                # stores -1 -- kept as is, confirm the intent.
                start_flank = start - flank_length
                if start_flank < 0:
                    start_flank = 0
                else:
                    global_boundaries.add(start_flank - 1)
                    tr_boundaries.append(start_flank)

                end_flank = end + flank_length
                if end_flank > gene_length:
                    end_flank = gene_length
                else:
                    global_boundaries.add(end_flank)
                    tr_boundaries.append(end_flank)

                # paste flank (lower case) + exon + flank into the dna row
                dna_row = output_seq[tr_stable]
                output_seq[tr_stable] = (
                    dna_row[:start_flank]
                    + gene_seq[start_flank:start].lower()
                    + gene_seq[start:end]
                    + gene_seq[end:end_flank].lower()
                    + dna_row[end_flank:]
                )

                #################################################
                # now try and handle translation to protein
                prev_transl_end = transl_end

                # where does translation start:
                if exon.end_in_gene < transl_start_in_gene:
                    transl_start = -1
                elif exon.exon_id == start_exon_id:
                    # in the first translated exon, the start given above
                    transl_start = exon.start_in_gene + seq_start - 1
                else:
                    # otherwise the exon start; a codon split by the
                    # intron is patched up with `cary` below
                    transl_start = exon.start_in_gene

                # where does translation end:
                if exon.start_in_gene > transl_end_in_gene:
                    transl_end = -1
                elif exon.exon_id == end_exon_id:
                    # in the last translated exon, the end given above
                    transl_end = exon.start_in_gene + seq_end
                else:
                    # otherwise the exon end, minus the incomplete codon
                    transl_end = exon.end_in_gene - exon.end_phase + 1

                if transl_start < 0 or transl_end < 0:
                    continue

                # nucleotides carried over from the previous exon, for
                # patching up codons split by the intron
                if exon.phase > 0 and prev_transl_end:
                    cary = gene_seq[prev_transl_end:
                                    prev_transl_end + exon.phase]
                else:
                    cary = ""

                # NOTE(review): pep_seqs unpacks four values from
                # translate(); confirm that this call shape returns two.
                [phase, pepseq] = translate(
                    cary + gene_seq[transl_start:transl_end], 0,
                    mitochondrial, strip_stop=False)
                pepseq_padded = "".join("-" + aa + "-" for aa in pepseq)

                pep_row = output_seq[pepseq_name]
                output_seq[pepseq_name] = (
                    pep_row[:transl_start - len(cary)]
                    + pepseq_padded
                    + pep_row[transl_end:]
                )

        if not reconstructed:
            # nothing to write for this gene
            continue

        # ---- mark the flank boundaries; only transcripts that were
        # actually reconstructed have entries in output_seq
        sorted_boundaries = sorted(global_boundaries)
        name_order = []  # the order in which we want the sequences output
        for tr_stable in reconstructed:
            pepseq_name = tr_stable + "_pep"
            tr_boundaries = set(local_boundaries[tr_stable])
            for name in (tr_stable, pepseq_name):
                output_seq[name] = _insert_boundary_markers(
                    output_seq[name], sorted_boundaries, tr_boundaries)
            name_order.append(pepseq_name)
            name_order.append(tr_stable)

        output_seq = strip_gaps(output_seq)

        afa_fnm = "{0}/{1}.afa".format(outdir, stable_gene_id)
        output_fasta(afa_fnm, name_order, output_seq)
        print("%s" % (afa_fnm,))

    return True