def translate_alignment_exons_for_protein(protein_id, exon_number):
    '''
    Assemble a translated protein for every species returned by
    get_species_list, using the "sw_gene" exon alignments of that species.

    Species whose exons are missing from the exon container (KeyError) are
    logged on the "translator" logger and collected; the collected list is
    handed to write_failed_species_to_status and also returned.

    @param protein_id: id of the reference protein
    @param exon_number: forwarded unchanged to the UTR chopping helpers
    @return: list of species for which no exons were available
    '''
    alignment_type = "sw_gene"

    # utilities
    log_source = Logger.Instance()
    crawler = DirectoryCrawler()
    translator_log = log_source.get_logger("translator")

    # containers
    reference_exons = EnsemblExonContainer.Instance()
    aligned_exons = ExonContainer.Instance()
    proteins = ProteinContainer.Instance()

    species_without_exons = []
    assembled_dir = crawler.get_assembled_protein_path(protein_id)

    for species in get_species_list(protein_id, assembled_dir):
        # everything needed to process this species
        fasta_path = "%s/%s.fa" % (crawler.get_assembled_protein_path(protein_id), species)
        lookup_key = (protein_id, species, alignment_type)
        reference_seq = proteins.get(protein_id).get_sequence_record().seq

        try:
            species_exons = aligned_exons.get(lookup_key)
        except KeyError:
            translator_log.error("%s,%s,%s" % (protein_id, species, "No exons available"))
            species_without_exons.append(species)
            continue

        # Temporary translation logic (flagged in the original as due for removal).
        translated_exons = []
        stop_flag = False
        for aligned in species_exons.get_ordered_exons():
            candidate = Exon_translation(reference_exons.get(aligned.ref_exon_id), aligned)

            # Once the flag returned by chop_off_start_utr is set, every
            # following exon is marked as not viable.
            if stop_flag:
                candidate.viability = False

            if candidate.viability:
                candidate, stop_flag = chop_off_start_utr(
                    aligned.ref_exon_id, candidate, reference_seq, exon_number)
                candidate = chop_off_end_utr(
                    aligned.ref_exon_id, candidate, reference_seq, exon_number, protein_id)

            # NOTE(review): source indentation was lost; non-viable exons are
            # assumed to be appended as well - confirm against the original file.
            translated_exons.append(candidate)
        # end of the temporary translation logic

        assemble_and_store_protein(protein_id, species, translated_exons,
                                   reference_seq, fasta_path)
        create_protein_alignment(protein_id, species)

    write_failed_species_to_status(species_without_exons, assembled_dir)
    return species_without_exons
def set_frames_to_coding_exons_batch(protein_list):
    '''
    Call set_coding_exon_frames() on the "ensembl" exons of every
    (protein, species) pair.

    The operation is best-effort: a failure for one pair never stops the
    batch. Unlike a bare "except: pass", failures are now reported through
    the standard logging module so that they can be diagnosed.

    @param protein_list: iterable of protein ids
    '''
    # Imported locally so the block does not depend on the file's import header.
    import logging
    log = logging.getLogger(__name__)

    exon_container = ExonContainer.Instance()
    for protein_id in protein_list:
        for species in get_species_list(protein_id, None):
            try:
                exons = exon_container.get((protein_id, species, "ensembl"))
                exons.set_coding_exon_frames()
            except KeyError:
                # Other code in this file treats KeyError from the container
                # lookup as "no exons stored"; keep it quiet.
                log.debug("Skipping %s, %s: KeyError while setting coding exon frames",
                          protein_id, species)
            except Exception:
                # Still best-effort, but leave a trace of unexpected failures.
                log.warning("Could not set coding exon frames for %s, %s",
                            protein_id, species, exc_info=True)
def create_msa_alignments ():
    '''
    For every protein whose status file check passes, write two multi-FASTA
    files into the protein's mafft directory and run the generated mafft
    command on each of them:

      msa_exoloc.fa  - reference protein + every record found in the
                       assembled protein directory (except Homo_sapiens.fa)
      msa_ensembl.fa - reference protein + the protein mapped to each
                       non-human species through the data map container

    The aligned output is written to msa_exoloc.afa / msa_ensembl.afa.
    '''
    crawler = DirectoryCrawler()
    proteins = ProteinContainer.Instance()
    data_maps = DataMapContainer.Instance()
    command_generator = AlignmentCommandGenerator()

    fill_all_containers(False)

    for (prot_id, _exon_num) in get_protein_list():
        if not check_status_file(prot_id):
            continue

        reference_record = proteins.get(prot_id).get_sequence_record()
        exoloc_records = [reference_record]
        ensembl_records = [reference_record]

        # proteins assembled from the alignments, one fasta file per species
        assembled_dir = crawler.get_assembled_protein_path(prot_id)
        for file_name in sorted(os.listdir(assembled_dir)):
            if file_name != "Homo_sapiens.fa":
                fasta_path = "%s/%s" % (assembled_dir, file_name)
                exoloc_records.append(load_fasta_single_record(fasta_path, IUPAC.protein))

        # proteins looked up through the (protein, species) data map
        for species in get_species_list(prot_id, None):
            if species != "Homo_sapiens":
                mapped = data_maps.get((prot_id, species))
                species_record = proteins.get(mapped.protein_id).get_sequence_record()
                species_record.id = species
                ensembl_records.append(species_record)

        exoloc_input = "%s/msa_exoloc.fa" % crawler.get_mafft_path(prot_id)
        ensembl_input = "%s/msa_ensembl.fa" % crawler.get_mafft_path(prot_id)
        write_seq_records_to_file(exoloc_input, exoloc_records)
        write_seq_records_to_file(ensembl_input, ensembl_records)

        # run mafft on both inputs, exoloc first
        jobs = ((exoloc_input, "%s/msa_exoloc.afa"),
                (ensembl_input, "%s/msa_ensembl.afa"))
        for input_path, output_template in jobs:
            mafft_cmd = command_generator.generate_mafft_command(
                input_path, output_template % crawler.get_mafft_path(prot_id))
            print(mafft_cmd)
            os.system(mafft_cmd)
def create_species_msa_alignments ():
    '''
    For every protein whose status file check passes, run one mafft
    alignment per non-human species. Each alignment contains:

      1. the reference protein (its record id is set to "Homo_sapiens"),
      2. the protein mapped to the species through the data map container
         (its record id is set to the species name),
      3. the assembled protein "<species>.fa" from the assembled protein
         directory, when that file exists.

    The temporary input "<mafft dir>/<species>.fa" is removed after the
    command has run; the output goes to "<mafft dir>/<species>.afa".

    Changes compared to the previous version:
      - the "len(protein_recs) == 1" check was dropped: the list always
        held at least two records at that point, so it could never fire;
      - the assembled fasta is tested with os.path.exists instead of
        listing the whole directory for every species. A missing assembled
        directory therefore means "no assembled protein" instead of an
        OSError from os.listdir;
      - the mafft directory is looked up once per species instead of twice.
    '''
    dc = DirectoryCrawler()
    pc = ProteinContainer.Instance()
    dmc = DataMapContainer.Instance()
    acg = AlignmentCommandGenerator()

    fill_all_containers(False)

    for (prot_id, _exon_num) in get_protein_list():
        if not check_status_file(prot_id):
            continue

        ref_prot_rec = pc.get(prot_id).get_sequence_record()
        ref_prot_rec.id = "Homo_sapiens"

        assembled_dir = dc.get_assembled_protein_path(prot_id)

        for species in get_species_list(prot_id, None):
            # the reference species is never aligned against itself
            if species == "Homo_sapiens":
                continue

            data_map = dmc.get((prot_id, species))
            prot_rec = pc.get(data_map.protein_id).get_sequence_record()
            prot_rec.id = species
            protein_recs = [ref_prot_rec, prot_rec]

            # add the assembled protein only when it has been produced
            exoloc_fasta = "%s/%s.fa" % (assembled_dir, species)
            if os.path.exists(exoloc_fasta):
                protein_recs.append(load_fasta_single_record(exoloc_fasta, IUPAC.protein))

            mafft_dir = dc.get_mafft_path(prot_id)
            msa_species_path = "%s/%s.fa" % (mafft_dir, species)
            write_seq_records_to_file(msa_species_path, protein_recs)

            cmd = acg.generate_mafft_command(msa_species_path,
                                             "%s/%s.afa" % (mafft_dir, species))
            print(cmd)
            os.system(cmd)
            # the unaligned input is only a temporary file
            os.remove(msa_species_path)
def get_SW_exon_targets(self, protein_id):
    '''
    Look up the SW exon directory of the protein through self.crawler and
    return what get_species_list reports for it. According to the original
    documentation this is the list of species not yet aligned with SW_exon.

    @param protein_id: id of the protein whose species list is requested
    @return: result of get_species_list for the protein's SW exon path
    '''
    return get_species_list(protein_id, self.crawler.get_SW_exon_path(protein_id))
def main ():
    '''
    Decide on the best exon alignments for every (protein, species) pair,
    register them in the BestExonAlignmentContainer, print a report of the
    alignment pieces and write the assembled species protein as a fasta file
    into the protein's assembled protein directory.

    Proteins with more than 15 exons, and proteins without "ensembl" exons
    for Homo_sapiens, are skipped. Any exception raised while processing a
    species is caught; the protein/species pair is printed and written to
    the error file, and processing continues with the next species.
    '''
    # Error file with a hard-coded absolute path.
    # NOTE(review): err_f is not closed anywhere in the code shown here.
    err_f = open('/home/marioot/err_status_monday.txt', 'w')

    fill_all_containers(True)

    protein_tuples = get_protein_list()
    ec = ExonContainer.Instance()
    beac = BestExonAlignmentContainer.Instance()
    dc = DirectoryCrawler()

    for (protein_id, exon_num) in protein_tuples:

        # proteins with more than 15 exons are not processed
        if int(exon_num) > 15:
            print "too big"
            continue

        species_list = get_species_list(protein_id, None)

        # the reference (human) exons are required for every species below
        try:
            ref_exons = ec.get((protein_id, "Homo_sapiens", "ensembl"))
        except KeyError:
            print "ERROR: No protein %s" % protein_id
            continue

        for species in species_list:
            try:
                print "\nBest_exon_al: %s, %s" % (protein_id, species)
                # written without a trailing newline; the except branch below
                # writes the pair again, followed by a newline
                err_f.write("%s, %s" % (protein_id, species))

                bpp = BestProteinProduct (protein_id, species, "Homo_sapiens")
                bpp.load_alignments()
                bpp.decide_on_best_exons()
                #bpp.patch_interexon_AAS()

                for ref_exon in ref_exons.get_coding_exons():
                    best_exon_alignment = bpp.best_exons[ref_exon.exon_id]
                    if best_exon_alignment:
                        beac.add(ref_exon.exon_id, species, best_exon_alignment)
                        print "%d. Exon status: %s (%s)" % (ref_exon.ordinal, best_exon_alignment.status, ref_exon.exon_id)

                        if best_exon_alignment.sw_gene_alignment:
                            # translation of the reference exon, starting at its frame offset
                            print ref_exon.sequence[ref_exon.frame:].translate()
                            best_exon_alignment.sw_gene_alignment.create_cDNA()
                            print "\tAdded %2d alignment pieces" % (len(best_exon_alignment.sw_gene_alignment.alignment_pieces))

                            for al_piece in best_exon_alignment.sw_gene_alignment.alignment_pieces:
                                # trailing comma: the piece type and its details share one output line
                                print "\t\t%s:" % (al_piece.type),
                                if al_piece.type in ["coding", "insertion"]:
                                    print "PROT: %d-%d, GENOME: %d-%d, %s" % (al_piece.ref_protein_start, al_piece.ref_protein_stop, al_piece.genomic_start, al_piece.genomic_stop, al_piece.sequence_id)
                                    print "\t\t\tHUMAN:", al_piece.ref_protein_seq
                                    print "\t\t\tSPEC :", al_piece.spec_protein_seq
                                else:
                                    # other piece types: just terminate the output line
                                    print

                # store the assembled protein of the species as <assembled dir>/<species>.fa
                whole_prot = bpp.get_spec_protein_translation()
                whole_prot_rec = SeqRecord(whole_prot, id = species, description = "assembled_protein")
                file_name = "%s/%s.fa" % (dc.get_assembled_protein_path(protein_id), species)
                SeqIO.write(whole_prot_rec, file_name, "fasta")

                # NOTE(review): hard-coded exon id - looks like leftover debug output; confirm
                print beac.get("ENSE00002199725", species)

            except Exception, e:
                # NOTE(review): the caught exception e is never reported; only the pair is recorded
                print '{0} {1} \n'.format(protein_id, species)
                err_f.write('{0} {1} \n'.format(protein_id, species))