def main():
    """Populate alignment databases for every protein in the protein list.

    The mode is taken from the first command-line argument and must be one
    of ``blastn``, ``tblastn``, ``SW_gene``, ``SW_exon`` or ``all``.  ``all``
    runs the four population steps in exactly that order.  The referenced
    species databases are always populated before the selected step(s).

    A missing or unknown mode prints the usage line and terminates the
    process with exit status 2 *before* any container or database work is
    started, so a typo on the command line does not cost a full load.

    Raises:
        SystemExit: when the mode argument is missing or not recognised.
    """
    usage = "Usage: {0} <blastn | tblastn | SW_gene | SW_exon | all> \n".format(sys.argv[0])
    valid_modes = ("blastn", "tblastn", "SW_gene", "SW_exon", "all")

    # sys.argv[0] is the script name, so a mode is present only when there
    # are at least two entries.
    if len(sys.argv) < 2 or sys.argv[1] not in valid_modes:
        print(usage)
        sys.exit(2)
    mode = sys.argv[1]

    referenced_species = "Homo_sapiens"
    # 'ENSP00000253108', 'Ailuropoda_melanoleuca', 'ensembl'

    # Only the first element of each tuple in the protein list is used here.
    protein_list = [protein_tuple[0]
                    for protein_tuple in FileUtilities.get_protein_list()]

    fill_all_containers(False)
    populate_referenced_species_databases(protein_list, referenced_species)

    # Sequential checks keep the blastn -> tblastn -> SW_gene -> SW_exon
    # order when the mode is "all".
    if mode in ("blastn", "all"):
        populate_blastn_alignments(protein_list)
    if mode in ("tblastn", "all"):
        populate_tblastn_alignments(protein_list)
    if mode in ("SW_gene", "all"):
        populate_SW_gene_alignments(protein_list)
    if mode in ("SW_exon", "all"):
        populate_SW_exon_alignments(protein_list)
def create_msa_alignments():
    """Build the exoloc and ensembl multiple sequence alignments per protein.

    For every protein returned by get_protein_list() that passes
    check_status_file(), two FASTA files are written into the protein's
    MAFFT directory:

    * msa_exoloc.fa  -- the reference protein record followed by one record
      per file in the assembled-protein directory (Homo_sapiens.fa is
      skipped), taken in sorted file name order;
    * msa_ensembl.fa -- the reference protein record followed by the protein
      record of every species from get_species_list() other than
      Homo_sapiens, each relabelled with the species name as its id.

    A MAFFT command is then generated for each file (with the matching
    .afa path as second argument), printed and executed via os.system.
    """
    crawler = DirectoryCrawler()
    protein_container = ProteinContainer.Instance()
    data_map_container = DataMapContainer.Instance()
    command_generator = AlignmentCommandGenerator()

    fill_all_containers(False)

    for prot_id, _exon_num in get_protein_list():
        if not check_status_file(prot_id):
            continue

        reference_record = protein_container.get(prot_id).get_sequence_record()
        exoloc_records = [reference_record]
        ensembl_records = [reference_record]

        # Assembled (exoloc) proteins: one FASTA file per species.
        assembled_dir = crawler.get_assembled_protein_path(prot_id)
        for fasta_name in sorted(os.listdir(assembled_dir)):
            if fasta_name != "Homo_sapiens.fa":
                fasta_path = "%s/%s" % (assembled_dir, fasta_name)
                exoloc_records.append(
                    load_fasta_single_record(fasta_path, IUPAC.protein))

        # Ensembl proteins: looked up through the (protein, species) data map.
        for species in get_species_list(prot_id, None):
            if species == "Homo_sapiens":
                continue
            species_map = data_map_container.get((prot_id, species))
            species_record = protein_container.get(
                species_map.protein_id).get_sequence_record()
            species_record.id = species
            ensembl_records.append(species_record)

        exoloc_input = "%s/msa_exoloc.fa" % crawler.get_mafft_path(prot_id)
        ensembl_input = "%s/msa_ensembl.fa" % crawler.get_mafft_path(prot_id)
        write_seq_records_to_file(exoloc_input, exoloc_records)
        write_seq_records_to_file(ensembl_input, ensembl_records)

        # Run MAFFT on the exoloc input first, then on the ensembl input.
        mafft_jobs = (
            (exoloc_input, "%s/msa_exoloc.afa"),
            (ensembl_input, "%s/msa_ensembl.afa"),
        )
        for msa_input, afa_template in mafft_jobs:
            mafft_cmd = command_generator.generate_mafft_command(
                msa_input, afa_template % crawler.get_mafft_path(prot_id))
            print(mafft_cmd)
            os.system(mafft_cmd)
def create_species_msa_alignments():
    """Build one small MAFFT alignment per (protein, species) pair.

    For every protein from get_protein_list() that passes
    check_status_file(), and for every species from get_species_list()
    other than Homo_sapiens, a temporary FASTA file <species>.fa is written
    into the protein's MAFFT directory.  It holds the reference protein
    record (id set to "Homo_sapiens"), the species' protein record from the
    protein container (id set to the species name) and, when
    <species>.fa exists in the assembled-protein directory, the record
    loaded from that file.

    A MAFFT command is generated with <species>.afa as second argument,
    printed and executed via os.system; the temporary FASTA input is removed
    afterwards.
    """
    crawler = DirectoryCrawler()
    protein_container = ProteinContainer.Instance()
    data_map_container = DataMapContainer.Instance()
    command_generator = AlignmentCommandGenerator()

    fill_all_containers(False)

    for prot_id, _exon_num in get_protein_list():
        if not check_status_file(prot_id):
            continue

        reference_record = protein_container.get(prot_id).get_sequence_record()
        reference_record.id = "Homo_sapiens"
        assembled_dir = crawler.get_assembled_protein_path(prot_id)

        for species in get_species_list(prot_id, None):
            if species == "Homo_sapiens":
                continue

            species_map = data_map_container.get((prot_id, species))
            ensembl_record = protein_container.get(
                species_map.protein_id).get_sequence_record()
            ensembl_record.id = species
            records = [reference_record, ensembl_record]

            # The assembled protein is optional: add it only when its file exists.
            if "%s.fa" % species in os.listdir(assembled_dir):
                assembled_path = "%s/%s.fa" % (assembled_dir, species)
                records.append(
                    load_fasta_single_record(assembled_path, IUPAC.protein))

            species_input = "%s/%s.fa" % (crawler.get_mafft_path(prot_id), species)

            # NOTE(review): the list always holds at least the reference and
            # the ensembl record at this point, so this guard cannot trigger
            # as written; kept unchanged pending confirmation of the intent.
            if len(records) == 1:
                continue

            write_seq_records_to_file(species_input, records)
            mafft_cmd = command_generator.generate_mafft_command(
                species_input,
                "%s/%s.afa" % (crawler.get_mafft_path(prot_id), species))
            print(mafft_cmd)
            os.system(mafft_cmd)
            os.remove(species_input)
def main():
    """Load the containers, then generate SW exon alignments for one protein.

    The protein id is hard-coded; fill_all_containers() is called with False
    before the alignment generation starts.
    """
    target_protein_id = "ENSP00000341765"
    fill_all_containers(False)
    generate_SW_exon_alignments2(target_protein_id)
# NOTE(review): the statements down to the `return` are the tail of a method
# whose `def` line (and the initialisation of `whole_protein_cdna`) lies
# outside this chunk; the indentation below is reconstructed.
        # Concatenate one cDNA piece per coding exon of the reference protein.
        for ref_exon in self.ref_exons.get_coding_exons():
            bea = self.best_exons[ref_exon.exon_id]
            if not bea:
                # No best alignment for this exon: pad with N's, one per
                # base of the reference exon sequence.
                whole_protein_cdna += "N" * len(ref_exon.sequence)
            elif bea.status in ["both", "ensembl"]:
                # The current length of the assembled cDNA is passed along
                # (presumably so the piece can be placed in frame -- TODO confirm).
                whole_protein_cdna += bea.ensembl_alignment.get_cDNA(len(whole_protein_cdna))
            else:
                # Any other status falls back to the SW gene alignment.
                whole_protein_cdna += bea.sw_gene_alignment.create_cDNA()
        # The assembled cDNA is translated and the translation is returned.
        return whole_protein_cdna.translate()

# Script entry point: assemble and print the best protein product for one
# hard-coded protein / species / reference species triple.
if __name__ == '__main__':
    fill_all_containers(True)
    bpp = BestProteinProduct("ENSP00000341765", "Ailuropoda_melanoleuca", "Homo_sapiens")
    bpp.decide_on_best_exon_alignments()
    bpp.patch_interexon_AAS()
    print bpp.export_spec_protein_translation()
    # NOTE(review): `be` is not used in the visible code, and the bare
    # `print` only emits an empty line.
    be = bpp.best_exons
    print
def main():
    """Load the containers and create statistics for the full protein list."""
    fill_all_containers(True)

    # The ExonContainer instance is fetched here but not used any further
    # inside this function.
    exon_container = ExonContainer.Instance()

    # translate_alignment_exons()
    protein_list = get_protein_list()
    create_statistics(protein_list)
def main ():
    """Pick the best exon alignments per (protein, species) and export proteins.

    For every protein from get_protein_list() whose exon number is at most
    15 and whose human Ensembl exons are present in the exon container, a
    BestProteinProduct is built for each species from get_species_list().
    Each non-empty best exon alignment is added to the
    BestExonAlignmentContainer and reported on stdout, and the translated
    species protein is written as FASTA into the assembled-protein directory.

    NOTE(review): the indentation of this function was reconstructed from a
    collapsed source, and the function may continue past the visible chunk.
    """
    # Error/status file with a hard-coded absolute path.
    # NOTE(review): err_f is never closed in the visible code.
    err_f = open('/home/marioot/err_status_monday.txt', 'w')

    fill_all_containers(True)
    protein_tuples = get_protein_list()

    ec = ExonContainer.Instance()
    beac = BestExonAlignmentContainer.Instance()
    dc = DirectoryCrawler()

    for (protein_id, exon_num) in protein_tuples:
        # Proteins whose exon number exceeds 15 are skipped.
        if int(exon_num) > 15:
            print "too big"
            continue

        species_list = get_species_list(protein_id, None)
        # The human Ensembl exons are the reference; without them the
        # protein is skipped.
        try:
            ref_exons = ec.get((protein_id, "Homo_sapiens", "ensembl"))
        except KeyError:
            print "ERROR: No protein %s" % protein_id
            continue

        for species in species_list:
            try:
                print "\nBest_exon_al: %s, %s" % (protein_id, species)
                # The pair is written to the status file before processing
                # starts (no trailing newline here).
                err_f.write("%s, %s" % (protein_id, species))

                bpp = BestProteinProduct (protein_id, species, "Homo_sapiens")
                bpp.load_alignments()
                bpp.decide_on_best_exons()
                # bpp.patch_interexon_AAS()  (disabled in the original)

                for ref_exon in ref_exons.get_coding_exons():
                    best_exon_alignment = bpp.best_exons[ref_exon.exon_id]
                    if best_exon_alignment:
                        beac.add(ref_exon.exon_id, species, best_exon_alignment)
                        print "%d. Exon status: %s (%s)" % (ref_exon.ordinal, best_exon_alignment.status, ref_exon.exon_id)

                        if best_exon_alignment.sw_gene_alignment:
                            # Print the reference exon translated from its frame offset.
                            print ref_exon.sequence[ref_exon.frame:].translate()
                            # The return value is discarded; the call is
                            # presumably needed to populate alignment_pieces
                            # -- TODO confirm.
                            best_exon_alignment.sw_gene_alignment.create_cDNA()
                            print "\tAdded %2d alignment pieces" % (len(best_exon_alignment.sw_gene_alignment.alignment_pieces))

                            for al_piece in best_exon_alignment.sw_gene_alignment.alignment_pieces:
                                # The trailing comma keeps the cursor on the
                                # same line; the next print completes it.
                                print "\t\t%s:" % (al_piece.type),
                                if al_piece.type in ["coding", "insertion"]:
                                    print "PROT: %d-%d, GENOME: %d-%d, %s" % (al_piece.ref_protein_start, al_piece.ref_protein_stop, al_piece.genomic_start, al_piece.genomic_stop, al_piece.sequence_id)
                                    print "\t\t\tHUMAN:", al_piece.ref_protein_seq
                                    print "\t\t\tSPEC :", al_piece.spec_protein_seq
                                else:
                                    # Other piece types only get the line terminated.
                                    print

                # Export the translated species protein as <species>.fa.
                whole_prot = bpp.get_spec_protein_translation()
                whole_prot_rec = SeqRecord(whole_prot, id = species, description = "assembled_protein")
                file_name = "%s/%s.fa" % (dc.get_assembled_protein_path(protein_id), species)
                SeqIO.write(whole_prot_rec, file_name, "fasta")

                # NOTE(review): hard-coded exon id; looks like leftover debug
                # output -- confirm before relying on it.
                print beac.get("ENSE00002199725", species)

            # NOTE(review): every exception is swallowed here and `e` is never
            # used, so the cause of a failure is recorded nowhere; only the
            # failing pair is printed and written to the status file.
            except Exception, e:
                print '{0} {1} \n'.format(protein_id, species)
                err_f.write('{0} {1} \n'.format(protein_id, species))