def generate_referenced_species_database(protein_id, referenced_species):
    '''
    Builds the formatdb database for the coding exons of the referenced species.

    The exons stored under (protein_id, referenced_species, "ensembl") are
    exported to "<database path>/<referenced_species>.fa" and the formatdb
    command produced by CommandGenerator is run on that file.
    @param protein_id: id of the reference protein
    @param referenced_species: species whose exons form the database
    @return: True if the command produced no output, False otherwise
             (the output is logged as a warning)
    '''
    alignment_logger = Logger.Instance().get_logger('alignment')
    command_generator = CommandGenerator()
    crawler = DirectoryCrawler()

    input_exons = ExonContainer.Instance().get((protein_id, referenced_species, "ensembl"))
    database_dir = crawler.get_database_path(protein_id)
    input_db_file = "{0}/{1}.fa".format(database_dir, referenced_species)
    input_exons.export_coding_exons_to_fasta(input_db_file)

    command = command_generator.generate_formatdb_command(input_db_file, "Nucleotide")
    process = Popen(command, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
    output = process.stdout.read()

    # any output at all (stderr is merged into stdout) is treated as a failure
    if output == "":
        return True
    alignment_logger.warning("{0}, {1}, REF SPECIES DB, {2}".format(protein_id,
                                                                   referenced_species.strip(),
                                                                   output.strip()))
    return False
def generate_SW_exon_alignments2 (protein_id, species_list = None, referenced_species = "Homo_sapiens"):
    '''
    Prepares the SW exon alignment run for a protein.

    @param protein_id: id of the reference protein
    @param species_list: species to process; when empty or None the targets
                         are taken from AlignmentTargetGenerator.get_SW_exon_targets
    @param referenced_species: species whose exon fasta is used as reference
    @return: False if the description file of the protein cannot be read
             (the error is logged), None otherwise
    '''
    # utilities
    alignment_generator = AlignmentTargetGenerator()
    crawler = DirectoryCrawler()
    command_generator = CommandGenerator()
    logger = Logger.Instance()
    alignment_logger = logger.get_logger('alignment')
    exon_container = ExonContainer.Instance()

    tmp_fasta_target_path = "tmp_target.fa"
    ref_exons_fasta = "%s/%s.fa" % (crawler.get_database_path(protein_id), referenced_species)
    failed_species_list = []

    if not species_list:
        species_list = alignment_generator.get_SW_exon_targets(protein_id)

    try:
        (proteins_known, proteins_abinitio) = DescriptionParser().parse_descr_file(protein_id)
    except IOError as e:
        # The message has only two arguments; the former "{2}" placeholder
        # raised IndexError here and hid the original IOError.
        alignment_logger.error("{0}, , SW cDNA_EXONS, {1}".format(protein_id, e))
        return False
def create_protein_alignment(protein_id, species):
    '''
    Generates the mafft alignment of three protein sequences:
    reference species protein, the assembled protein and
    the ensembl species protein.

    The three records are written to "<mafft path>/<species>.fa" and mafft is
    run with "<mafft path>/<species>.afa" as its output.
    @param protein_id: referent protein id
    @param species: species (latin)
    '''
    dc = DirectoryCrawler()
    pc = ProteinContainer.Instance()
    dmc = DataMapContainer.Instance()
    acg = AlignmentCommandGenerator()
    tpc = TranslatedProteinContainer.Instance()

    data_map = dmc.get((protein_id, species))

    # get all the proteins
    ref_protein = pc.get(protein_id)
    species_protein = pc.get(data_map.protein_id)
    assembled_protein = tpc.get(protein_id, species)

    sequences_for_fasta = [ref_protein.get_sequence_record(),
                           assembled_protein.get_sequence_record(),
                           species_protein.get_sequence_record()]

    mafft_path = dc.get_mafft_path(protein_id)
    msa_fasta = "%s/%s.fa" % (mafft_path, species)
    msa_afa = "%s/%s.afa" % (mafft_path, species)

    # "with" guarantees the handle is flushed and closed before mafft reads
    # the file, even if writing the records fails
    with open(msa_fasta, "w") as msa_fasta_file:
        SeqIO.write(sequences_for_fasta, msa_fasta_file, "fasta")

    mafft_cmd = acg.generate_mafft_command(msa_fasta, msa_afa)
    os.system(mafft_cmd)
def update_entry_in_status_file (protein_id, status_entry, status_entry_value):
    '''
    Updates the status entry to new value.

    If there is no .status file yet, it is created by this update. If the
    status file exists, it is read first. If the entry is already present
    with the same value, nothing is done. If it is present with a different
    value, the whole file is rewritten with the updated value. If it is not
    present, a new line is appended.
    @param protein_id: protein whose status file is updated
    @param status_entry: name of the status entry
    @param status_entry_value: new value of the entry
    '''
    # NOTE(review): the source layout was flattened; the final "else" is read
    # as belonging to the "entry present" test, which is the only reading
    # that matches the documented behaviour - confirm against the original.
    dc = DirectoryCrawler()
    status_file_path = dc.get_mutual_best_status_file_path(protein_id)

    status_dict = {}
    if os.path.isfile(status_file_path):
        status_dict = read_status_file(protein_id)

    if status_entry not in status_dict:
        # new entry: append, creating the file if necessary
        with open(status_file_path, 'a') as status_file:
            status_file.write("%s %s\n" % (status_entry, status_entry_value))
        return

    if status_dict[status_entry] == status_entry_value:
        return

    status_dict[status_entry] = status_entry_value
    with open(status_file_path, 'w') as status_file:
        # distinct loop names, so the function arguments are not shadowed
        for entry, value in status_dict.items():
            status_file.write("%s %s\n" % (entry, value))
def load_exons(self):
    '''
    Loads the genewise exons of (self.ref_protein_id, self.species) from
    "<genewise exon path>/<species>.fa" into self.exons, keyed by the exon
    number found in the fasta description.

    Each description is expected to hold four whitespace separated fields,
    the first being the exon number and the last "length|start|stop".
    @return: self.exons on success, False if the fasta file does not exist,
             None if it cannot be opened
    '''
    dc = DirectoryCrawler()
    logger = Logger.Instance()
    # NOTE(review): logger name is spelled 'containters' here; left untouched
    # because the logger configuration is not visible - confirm.
    container_logger = logger.get_logger('containters')

    exon_file_path = dc.get_exon_genewise_path(self.ref_protein_id)
    exon_file_path += "/%s.fa" % self.species
    if not os.path.isfile(exon_file_path):
        container_logger.error ("{0},{1},genewise,no fasta file for genewise exons.".format(self.ref_protein_id, self.species))
        return False

    try:
        exon_file = open(exon_file_path, 'r')
    except IOError:
        container_logger.error("%s,%s,%s" % (self.ref_protein_id, self.species, "No genewise exon file."))
        return None

    # close the handle even when a record is malformed
    try:
        for seq_record in SeqIO.parse(exon_file, "fasta", unambiguous_dna):
            (num, ir1, ir2, data) = seq_record.description.split()
            num = int(num)
            (length, start, stop) = data.split('|')
            self.exons[num] = GenewiseExon((self.ref_protein_id, self.species),
                                           num, start, stop, seq_record.seq)
    finally:
        exon_file.close()

    return self.exons
def fill_all_containers (load_alignments):
    '''
    Fills all the containers with the corresponding data.

    Generates the directory tree for every protein, loads the protein
    configuration and, if that succeeded, the ensembl and genewise exons.
    With load_alignments set, the blastn, tblastn, sw_gene and sw_exon exons
    are loaded and post-processed as well.
    @param load_alignments: whether the alignment exons should be loaded
    '''
    # NOTE(review): the source layout was flattened; the three post-processing
    # calls are assumed to belong to the load_alignments branch - confirm.
    dc = DirectoryCrawler()
    protein_list_raw = FileUtilities.get_protein_list()

    # flatten the raw protein list and take every second element, which is a protein id
    flattened = list(chain.from_iterable(protein_list_raw))
    protein_list = flattened[0::2]

    algorithms = ["blastn", "tblastn", "sw_gene", "sw_exon"]

    for protein_id in protein_list:
        dc.generate_directory_tree(protein_id)

    ens_exon_container = load_protein_configuration_batch(protein_list)
    if not ens_exon_container:
        return

    for exon_source in ("ensembl", "genewise"):
        load_exon_configuration_batch(protein_list, exon_source)

    if not load_alignments:
        return

    for algorithm in algorithms:
        load_exon_configuration_batch(protein_list, algorithm)

    set_frames_to_coding_exons_batch(protein_list)
    remove_overlapping_alignments_batch(protein_list, ["blastn", "tblastn"])
    annotate_spurious_alignments_batch(protein_list, algorithms)
def translate_alignment_exons_for_protein(protein_id, exon_number):
    '''
    Translates all the proteins for which there is SW to gene alignment

    For every species returned by get_species_list for the assembled protein
    directory, the "sw_gene" exons are fetched, paired with their reference
    exons, trimmed by the UTR chopping helpers, assembled and stored, and
    finally aligned with create_protein_alignment.
    @param protein_id: id of the reference protein
    @param exon_number: passed through to the UTR chopping helpers
    @return: list of species for which no "sw_gene" exons were available;
             the same list is written with write_failed_species_to_status
    '''
    # NOTE(review): the source layout was flattened; the nesting below
    # (in particular whether the append belongs to the viability branch)
    # is the most plausible reading - confirm against the original.
    algorithm = "sw_gene"
    # instantiate all the utilities
    logger = Logger.Instance()
    dc = DirectoryCrawler()
    translation_logger = logger.get_logger("translator")
    # instantiate all the containers
    eec = EnsemblExonContainer.Instance()
    ec = ExonContainer.Instance()
    pc = ProteinContainer.Instance()
    failed_species = []
    assembled_protein_path = dc.get_assembled_protein_path(protein_id)
    # for all the species for which it is required to generate translated protein
    for species in get_species_list(protein_id, assembled_protein_path):
        # get all you need for the processing
        assembled_protein_fasta = "%s/%s.fa" % (dc.get_assembled_protein_path(protein_id), species)
        exon_key = (protein_id, species, algorithm)
        target_prot = pc.get(protein_id)
        target_prot_seq = target_prot.get_sequence_record().seq
        try:
            exons = ec.get(exon_key)
        except KeyError:
            # no alignment exons for this species: record the failure and move on
            translation_logger.error("%s,%s,%s" % (protein_id, species, "No exons available"))
            failed_species.append(species)
            continue
        exons_for_transcription = []
        # THIS PART WILL NOT EXIST IN THE NEAR FUTURE
        last_translated_exon = False
        for al_exon in exons.get_ordered_exons():
            ref_exon = eec.get(al_exon.ref_exon_id)
            trans_exon = Exon_translation(ref_exon, al_exon)
            # if we've already bumped into exon with UTR on its end, all the other exons are not viable
            if last_translated_exon:
                trans_exon.viability = False
            if trans_exon.viability:
                (trans_exon, last_translated_exon) = chop_off_start_utr(al_exon.ref_exon_id, trans_exon, target_prot_seq, exon_number)
                trans_exon = chop_off_end_utr (al_exon.ref_exon_id, trans_exon, target_prot_seq, exon_number, protein_id)
                exons_for_transcription.append(trans_exon)
        # up to here - this will get trashed
        assemble_and_store_protein (protein_id, species, exons_for_transcription, target_prot_seq, assembled_protein_fasta)
        create_protein_alignment (protein_id, species)
    write_failed_species_to_status(failed_species, assembled_protein_path)
    return failed_species
def create_statistics(protein_list):
    '''
    Creates the statistics file for every protein whose status check passes.

    @param protein_list: iterable of (protein_id, exon_num) pairs; only the
                         protein id is used
    '''
    dc = DirectoryCrawler()
    for protein_id, _exon_num in protein_list:
        root_path = dc.get_root_path(protein_id)
        stat_file = "%s/stats.csv" % root_path
        if check_status_file(protein_id):
            create_protein_statistics(protein_id, stat_file)
def create_species_msa_alignments ():
    '''
    Creates one mafft alignment per (protein, species) pair.

    For every protein passing check_status_file, and every species other than
    Homo_sapiens, the reference protein, the ensembl protein of the species
    and (if "<species>.fa" exists in the assembled protein directory) the
    assembled protein are written to "<mafft path>/<species>.fa", aligned into
    "<mafft path>/<species>.afa", and the temporary fasta is removed.
    '''
    dc = DirectoryCrawler()
    pc = ProteinContainer.Instance()
    dmc = DataMapContainer.Instance()
    acg = AlignmentCommandGenerator()

    fill_all_containers(False)

    for prot_id, _exon_num in get_protein_list():
        if not check_status_file(prot_id):
            continue

        ref_prot_rec = pc.get(prot_id).get_sequence_record()
        ref_prot_rec.id = "Homo_sapiens"
        assembled_dir = dc.get_assembled_protein_path(prot_id)

        for species in get_species_list(prot_id, None):
            if species == "Homo_sapiens":
                continue

            data_map = dmc.get((prot_id, species))
            ensembl_rec = pc.get(data_map.protein_id).get_sequence_record()
            ensembl_rec.id = species
            protein_recs = [ref_prot_rec, ensembl_rec]

            if "%s.fa" % species in os.listdir(assembled_dir):
                assembled_fasta = "%s/%s.fa" % (assembled_dir, species)
                protein_recs.append(load_fasta_single_record(assembled_fasta, IUPAC.protein))

            msa_species_path = "%s/%s.fa" % (dc.get_mafft_path(prot_id), species)
            # kept from the original: nothing to align with a single record
            if len(protein_recs) == 1:
                continue

            write_seq_records_to_file(msa_species_path, protein_recs)
            msa_species_afa = "%s/%s.afa" % (dc.get_mafft_path(prot_id), species)
            cmd = acg.generate_mafft_command(msa_species_path, msa_species_afa)
            print(cmd)
            os.system(cmd)
            os.remove(msa_species_path)
def populate_sequence_protein (protein_id):
    '''
    Populates the "/PROTEIN_ID/sequence/protein/<species>.fa" folder
    with fasta files containing protein sequence for
    all the species registered by the Reciprocal Best Search.

    @param protein_id: id of the reference protein
    @return: None; if the description file cannot be read the error is
             logged and the function returns early
    '''
    logger = Logger.Instance()
    alignment_logger = logger.get_logger('data_retrieval')
    alignment_command_generator = AlignmentCommandGenerator()
    directory_crawler = DirectoryCrawler()
    protein_path = directory_crawler.get_protein_path(protein_id)

    try:
        (proteins_known, proteins_abinitio) = DescriptionParser().get_protein_ids(protein_id)
    except IOError as e:
        # The message has only two arguments; the former "{2}" placeholder
        # raised IndexError here and hid the original IOError.
        alignment_logger.error("{0}, PROTEIN, , {1}".format(protein_id, e))
        return
def read_status_file (protein_id):
    '''
    Reads the .status file of the protein.

    Every non-blank line is expected to hold two whitespace separated
    tokens: the status entry and its value.
    Status entries may be:
        MUTUAL_BEST: OK/FAILED
        DATA_RETRIEVAL: OK/PARTIAL/FAILED
    @param protein_id: protein whose status file is read
    @return: status_dict dictionary of mapped status entries to their values
             (empty for an empty file)
    @raise IOError: if the status file cannot be opened
    @raise ValueError: if a non-blank line does not hold exactly two tokens
    '''
    dc = DirectoryCrawler()
    status_file_path = dc.get_mutual_best_status_file_path(protein_id)

    try:
        status_file = open(status_file_path, 'r')
    except IOError:
        raise IOError('No .status file for protein %s' % protein_id)

    # "with" closes the handle even when a line is malformed; blank lines
    # (and therefore empty files) are skipped instead of raising ValueError
    with status_file:
        return dict(line.split() for line in status_file if line.strip())
def populate_sequence_exon_genewise(protein_id):
    '''
    Populates the "/PROTEIN_ID/sequence/exon/genewise/<species>.fa" folder
    with fasta files containing a list of all the exons for a particular
    transcript. The data is acquired using the genewise program. This is used
    for the proteins found with an ab_initio method, that don't have a list
    of exons on ensembl.

    @param protein_id: id of the reference protein
    @return: None; if the description file cannot be read the error is
             logged and the function returns early
    '''
    alignment_logger = Logger.Instance().get_logger('data_retrieval')
    directory_crawler = DirectoryCrawler()
    command_generator = CommandGenerator()
    exon_genewise_path = directory_crawler.get_exon_genewise_path(protein_id)

    try:
        parsed_description = DescriptionParser().parse_descr_file(protein_id)
    except IOError as e:
        alignment_logger.error("{0}, {1}, , {2}".format(protein_id, 'GENEWISE', e))
        return
    (proteins_known, proteins_abinitio) = parsed_description
def generate_blastn_alignments(protein_id, species_list = None, referenced_species = "Homo_sapiens"):
    '''
    Runs the blastn program for a specified protein and list of species

    For every species the expanded gene fasta is searched against the
    database of the referenced species, writing "<blastn path>/<species>.blastout".
    Any output printed by the command marks the species as failed: its
    output file (if one was produced) is removed and a warning is logged.
    @param protein_id: id of the reference protein
    @param species_list: if provided, runs blastn for this list of species,
                         otherwise runs for the targets reported by
                         AlignmentTargetGenerator.get_blastn_targets
    @param referenced_species: species whose database is searched
    @return: True if no species failed, False otherwise (the failed species
             are recorded with set_failed_blastn_targets)
    '''
    logger = Logger.Instance()
    alignment_logger = logger.get_logger('alignment')
    crawler = DirectoryCrawler()
    command_generator = CommandGenerator()
    alignment_generator = AlignmentTargetGenerator()

    failed_species_list = []

    # retrieve the blastn targets
    if not species_list:
        species_list = alignment_generator.get_blastn_targets(protein_id)

    # the database does not depend on the target species
    database = "{0}/{1}.fa".format(crawler.get_database_path(protein_id), referenced_species)

    for species in species_list:
        species_name = species.strip()
        output_file = "{0}/{1}.blastout".format(crawler.get_blastn_path(protein_id), species_name)
        input_file = "{0}/{1}.fa".format(crawler.get_expanded_gene_path(protein_id), species_name)

        command = command_generator.generate_blastn_command(database, input_file, output_file)
        command_return = Popen(command, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
        output = command_return.stdout.read()
        # reap the child so it does not linger as a zombie
        command_return.wait()

        if output:
            # blastn may have failed before creating the output file
            if os.path.isfile(output_file):
                os.remove(output_file)
            alignment_logger.warning("{0}, {1}, BLASTN, {2}".format(protein_id, species_name, output.strip()))
            failed_species_list.append(species_name)

    if failed_species_list:
        alignment_generator.set_failed_blastn_targets(protein_id, failed_species_list)
        return False
    return True
class AlignmentTargetGenerator(object):
    '''
    Retrieves the species still to be processed by each alignment step and
    records the species for which a step has failed.

    Every getter passes the step's directory to get_species_list; every
    setter passes the failed species and that directory to
    write_failed_species_to_status.
    '''

    def __init__(self):
        self.crawler = DirectoryCrawler()
        self.description_parser = DescriptionParser()

    def get_blastn_targets(self, protein_id):
        '''
        @param protein_id: reference protein id
        @return: species list resolved against the blastn directory
        '''
        return get_species_list(protein_id, self.crawler.get_blastn_path(protein_id))

    def set_failed_blastn_targets(self, protein_id, failed_species_list):
        ''' Records the species that failed the blastn step. '''
        write_failed_species_to_status(failed_species_list, self.crawler.get_blastn_path(protein_id))

    def get_tblastn_targets(self, protein_id):
        '''
        @param protein_id: reference protein id
        @return: species list resolved against the tblastn directory
        '''
        return get_species_list(protein_id, self.crawler.get_tblastn_path(protein_id))

    def set_failed_tblastn_targets(self, protein_id, failed_species_list):
        ''' Records the species that failed the tblastn step. '''
        write_failed_species_to_status(failed_species_list, self.crawler.get_tblastn_path(protein_id))

    def get_SW_gene_targets(self, protein_id):
        '''
        @param protein_id: reference protein id
        @return: species list resolved against the SW gene directory
        '''
        return get_species_list(protein_id, self.crawler.get_SW_gene_path(protein_id))

    def set_failed_SW_gene_targets(self, protein_id, failed_species_list):
        ''' Records the species that failed the SW gene step. '''
        write_failed_species_to_status(failed_species_list, self.crawler.get_SW_gene_path(protein_id))

    def get_SW_exon_targets(self, protein_id):
        '''
        @param protein_id: reference protein id
        @return: species list resolved against the SW exon directory
        '''
        return get_species_list(protein_id, self.crawler.get_SW_exon_path(protein_id))

    def set_failed_SW_exon_targets(self, protein_id, failed_species_list):
        ''' Records the species that failed the SW exon step. '''
        write_failed_species_to_status(failed_species_list, self.crawler.get_SW_exon_path(protein_id))
def generate_SW_gene_alignments(protein_id, species_list = None, referenced_species = "Homo_sapiens"):
    '''
    Runs the SW program for a specified protein and list of species, using
    the expanded gene region as the query and the database of the referenced
    species as the target.

    @param protein_id: id of the reference protein
    @param species_list: if provided, runs SW for this list of species,
                         otherwise runs for the targets reported by
                         AlignmentTargetGenerator.get_SW_gene_targets
    @param referenced_species: species whose database is the target
    @return: True if no species failed, False otherwise (the failed species
             are recorded with set_failed_SW_gene_targets)
    '''
    # NOTE(review): the source layout was flattened; the removal of
    # ".sw_stdout_supressed" is assumed to happen once per species - confirm.
    alignment_logger = Logger.Instance().get_logger('alignment')
    alignment_generator = AlignmentTargetGenerator()
    crawler = DirectoryCrawler()
    command_generator = CommandGenerator()

    if not species_list:
        species_list = alignment_generator.get_SW_gene_targets(protein_id)

    failed_species_list = []
    for species in species_list:
        species_name = species.strip()
        output_file = "{0}/{1}.swout".format(crawler.get_SW_gene_path(protein_id), species_name)
        query_sequence_file = "{0}/{1}.fa".format(crawler.get_expanded_gene_path(protein_id), species_name)
        target_fasta_db_file = "{0}/{1}.fa".format(crawler.get_database_path(protein_id), referenced_species)

        command = command_generator.generate_SW_command(query_sequence_file, target_fasta_db_file, output_file)
        process = Popen(command, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
        output = process.stdout.read()

        # any output at all (stderr is merged into stdout) is treated as a failure
        if output != "":
            alignment_logger.warning("{0}, {1}, SW GENE, {2}".format(protein_id, species_name, output.strip()))
            failed_species_list.append(species_name)
        os.remove(".sw_stdout_supressed")

    if not failed_species_list:
        return True
    alignment_generator.set_failed_SW_gene_targets(protein_id, failed_species_list)
    return False
def create_msa_alignments ():
    '''
    Creates two mafft alignments for every protein passing check_status_file.

    "msa_exoloc" holds the reference protein plus every fasta found in the
    assembled protein directory (except Homo_sapiens.fa); "msa_ensembl" holds
    the reference protein plus the ensembl protein of every species other
    than Homo_sapiens. Both are written to the mafft directory as ".fa" and
    aligned into ".afa".
    '''
    dc = DirectoryCrawler()
    pc = ProteinContainer.Instance()
    dmc = DataMapContainer.Instance()
    acg = AlignmentCommandGenerator()

    fill_all_containers(False)

    for prot_id, _exon_num in get_protein_list():
        if not check_status_file(prot_id):
            continue

        ref_prot_rec = pc.get(prot_id).get_sequence_record()
        exoloc_proteins = [ref_prot_rec]
        ensembl_proteins = [ref_prot_rec]

        assembled_dir = dc.get_assembled_protein_path(prot_id)
        for fasta in sorted(os.listdir(assembled_dir)):
            if fasta != "Homo_sapiens.fa":
                abs_fasta = "%s/%s" % (assembled_dir, fasta)
                exoloc_proteins.append(load_fasta_single_record(abs_fasta, IUPAC.protein))

        for species in get_species_list(prot_id, None):
            if species != "Homo_sapiens":
                data_map = dmc.get((prot_id, species))
                ensembl_rec = pc.get(data_map.protein_id).get_sequence_record()
                ensembl_rec.id = species
                ensembl_proteins.append(ensembl_rec)

        msa_exoloc_path = "%s/msa_exoloc.fa" % dc.get_mafft_path(prot_id)
        msa_ensembl_path = "%s/msa_ensembl.fa" % dc.get_mafft_path(prot_id)
        write_seq_records_to_file(msa_exoloc_path, exoloc_proteins)
        write_seq_records_to_file(msa_ensembl_path, ensembl_proteins)

        exoloc_cmd = acg.generate_mafft_command(msa_exoloc_path, "%s/msa_exoloc.afa" % dc.get_mafft_path(prot_id))
        print(exoloc_cmd)
        os.system(exoloc_cmd)

        ensembl_cmd = acg.generate_mafft_command(msa_ensembl_path, "%s/msa_ensembl.afa" % dc.get_mafft_path(prot_id))
        print(ensembl_cmd)
        os.system(ensembl_cmd)
def __init__(self):
    ''' Creates the DirectoryCrawler and DescriptionParser helpers kept on the instance. '''
    self.crawler = DirectoryCrawler()
    self.description_parser = DescriptionParser()
def parse_blast_output (ref_protein_id, species, blast):
    '''
    Parses the "<species>.blastout" XML file of a blastn or tblastn run.

    Each alignment title is expected to be "<blast info> <exon info>", where
    the exon info matches "start|end|ENS...|ENS...|strand"; the second ENS id
    is used as the reference exon id. At most 5 hsps are kept per alignment;
    for blastn, hsps with a query or hit frame of -1 are skipped.
    @param ref_protein_id: id of the reference protein
    @param species: species whose blast output is parsed
    @param blast: "blastn" selects the blastn directory, any other value
                  the tblastn directory
    @return: Dictionary where key is reference species exon_id, and
             the value is list of corresponding alignments; None if the
             blastout file is missing or holds no hits
    '''
    # NOTE(review): the source layout was flattened; the hsp limit is assumed
    # to apply to both blastn and tblastn - confirm against the original.
    logger = Logger.Instance()
    containers_logger = logger.get_logger('containers')
    dc = DirectoryCrawler()

    if blast == "blastn":
        blast_file = "{0}/{1}.blastout".format(dc.get_blastn_path(ref_protein_id), species)
    else:
        blast_file = "{0}/{1}.blastout".format(dc.get_tblastn_path(ref_protein_id), species)

    if not os.path.isfile(blast_file):
        containers_logger.error ("{0}, {1}, {2}, no blastout file".format(ref_protein_id, species, blast))
        return None

    # parse blast output; the record is fully read here, so the handle can
    # be closed immediately, on the "no hits" path as well
    file_handle = open(blast_file, 'r')
    try:
        blastn_record = NCBIXML.read(file_handle)
    except ValueError:
        containers_logger.error("%s,%s,%s,No hits found" % (ref_protein_id, species, blast))
        return None
    finally:
        file_handle.close()

    exon_dict = {}
    exon_pattern = re.compile(r"(\d+)\|(\d+)\|(ENS\w+)\|(ENS\w+)\|([-]*1)")

    for alignment in blastn_record.alignments:
        (blast_info, exon_info) = alignment.title.split()
        pattern_match = re.match(exon_pattern, exon_info)
        ref_exon_id = pattern_match.groups()[3]
        exon_start = int (pattern_match.groups()[0])
        exon_end = int(pattern_match.groups()[1])

        # limit alignments to 5 hsps
        num_of_hsps = 0
        for hsp in alignment.hsps:
            if blast == "blastn":
                (query_frame, hit_frame) = hsp.frame
                if query_frame == -1 or hit_frame == -1:
                    continue

            if num_of_hsps == 5:
                break
            num_of_hsps += 1

            exon = Exon(blast, ref_exon_id, ref_protein_id, species)

            # hsp.gaps is either a number or a tuple whose first element is
            # the gap count (possibly None); always bind a value so a stale
            # count from a previous hsp is never reused
            if type(hsp.gaps) is tuple:
                gaps = hsp.gaps[0] or 0
            else:
                gaps = hsp.gaps or 0

            exon.set_alignment_info ( hsp.identities,
                                      hsp.positives,
                                      gaps,
                                      hsp.sbjct_start,
                                      hsp.sbjct_start + len(hsp.sbjct) -1,
                                      hsp.query_start,
                                      hsp.query_start + len(hsp.sbjct) -1,
                                      len(hsp.sbjct),
                                      hsp.sbjct,
                                      hsp.query,
                                      hsp.score)

            exon_dict.setdefault(ref_exon_id, []).append(exon)

            # means we covered the whole exon
            if len(hsp.sbjct) == abs(exon_end-exon_start)+1 and len(hsp.sbjct) == hsp.identities:
                break

    return exon_dict
def get_exon_file_path (self):
    '''
    Builds the path of the fasta file with the ensembl exons.

    @return: "<ensembl exon path of self.ref_protein_id>/<self.species>.fa"
    '''
    exon_dir = DirectoryCrawler().get_exon_ensembl_path(self.ref_protein_id)
    return "{0}/{1}.fa".format(exon_dir, self.species)
else: break i += 1 pattern = re.compile("lcl\|(.*)\spep::*") for title in best_alignments: prot_match = re.match(pattern, title) if prot_match.groups()[0] == protein_id: return title return best_alignments[0] if __name__ == '__main__': protein_id = "ENSP00000311134" acg = AlignmentCommandGenerator() dc = DirectoryCrawler() dc.generate_directory_tree(protein_id) descr_file_path = dc.get_protein_description_file_path(protein_id) descr_file = open(descr_file_path, 'w') output_file_path = dc.get_protein_path(protein_id) + "/" + "Homo_sapiens.fasta" fastacmd = acg.generate_fastacmd_protein_command(protein_id, "Homo_sapiens", "all", output_file_path) os.system(fastacmd) for species in get_default_species_list(): find_ortholog_by_RBH("Homo_sapiens", species, output_file_path, protein_id) descr_file.close()
def reset_action (protein_id, key):
    '''
    Marks a pipeline step as FAILED in the status file and clears the
    directories that hold its results.

    @param protein_id: id of the reference protein
    @param key: status entry of the step; unknown keys only update the
                status file
    '''
    update_entry_in_status_file(protein_id, key, 'FAILED')
    crawler = DirectoryCrawler()

    # status entry -> DirectoryCrawler getters of the directories to clear,
    # in the order in which they are cleared
    directories_by_key = {
        'GENE_RETRIEVAL':          ('get_gene_path',),
        'EXP_GENE_RETRIEVAL':      ('get_expanded_gene_path',),
        'PROTEIN_RETRIEVAL':       ('get_protein_path',),
        'ENSEMBL_EXON_RETRIEVAL':  ('get_exon_ensembl_path',),
        'GENEWISE_EXON_RETRIEVAL': ('get_exon_genewise_path', 'get_genewise_path'),
        'REF_SP_DB_FORMATTING':    ('get_database_path',),
        'BLASTN_ALIGNMENT':        ('get_blastn_path',),
        'TBLASTN_ALIGNMENT':       ('get_tblastn_path',),
        'SW_GENE_ALIGNMENT':       ('get_SW_gene_path',),
        'SW_EXON_ALIGNMENT':       ('get_SW_exon_path',),
    }

    for getter_name in directories_by_key.get(key, ()):
        path_getter = getattr(crawler, getter_name)
        clear_directory(path_getter(protein_id))
class DescriptionParser:
    """
    Parses the "<protein_id>.descr" description file found in the root
    directory of a protein (as reported by DirectoryCrawler.get_root_path).
    """

    def __init__(self):
        self.crawler = DirectoryCrawler()

    def get_gene_regions(self, protein_id):
        """
        Parses the description file for the protein_id, and retrieves the
        information about protein locations for every species.
        Locations are stored as lists
        [location_type, assembly, location_id, seq_begin, seq_end, strand]
        @param protein_id: protein_id whose description file is parsed
        @return: (genes_known, genes_abinitio) - two dictionaries
                 (key is species name, value is gene location data as described)
        """
        (proteins_known, proteins_abinitio) = self.parse_descr_file(protein_id)
        # known entries carry three ids before the location, abinitio only one
        genes_known = dict((species, list(data)[3:]) for species, data in proteins_known.items())
        genes_abinitio = dict((species, list(data)[1:]) for species, data in proteins_abinitio.items())
        return genes_known, genes_abinitio

    def get_protein_ids(self, protein_id):
        """
        Parses the description file for the protein_id, and retrieves only
        the protein ids for every species
        @param protein_id: protein_id whose description file is parsed
        @return: (prot_ids_known, prot_ids_abinitio) - two dictionaries
                 (key is species name, value is orthologous protein id)
        """
        (proteins_known, proteins_abinitio) = self.parse_descr_file(protein_id)
        prot_ids_known = dict((species, data[0]) for species, data in proteins_known.items())
        prot_ids_abinitio = dict((species, data[0]) for species, data in proteins_abinitio.items())
        return prot_ids_known, prot_ids_abinitio

    def get_species(self, protein_id):
        """
        Parses the description file for the protein_id, and retrieves the
        list of species present in it (known and abinitio).
        @param protein_id: protein_id whose description file is parsed
        @return: sorted list of stripped species names
        """
        (proteins_known, proteins_abinitio) = self.parse_descr_file(protein_id)
        species_list = list(proteins_known) + list(proteins_abinitio)
        # the former loop rebound its own variable, so the strip had no effect
        return sorted(species.strip() for species in species_list)

    def get_separated_species(self, protein_id):
        """
        @param protein_id: protein_id whose description file is parsed
        @return: (species_known, species_abinitio) - two lists of species names
        """
        (proteins_known, proteins_abinitio) = self.parse_descr_file(protein_id)
        return (list(proteins_known.keys()), list(proteins_abinitio.keys()))

    def get_strand_information(self, protein_id):
        """
        Parses the description file for the protein_id and retrieves
        the strand on which the protein of every species is found.
        @param protein_id: protein_id whose description file is parsed
        @return: strands: dictionary (species:strand as int); an abinitio
                 entry overrides a known entry of the same species
        """
        strands = {}
        (proteins_known, proteins_abinitio) = self.parse_descr_file(protein_id)
        # the strand is the last field of both tuple layouts
        for species, data in proteins_known.items():
            strands[species] = int(data[-1])
        for species, data in proteins_abinitio.items():
            strands[species] = int(data[-1])
        return strands

    def parse_descr_file(self, protein_id):
        """
        Function for parsing the description file associated with the protein_id.
        Description file contains two different types of entries: for known
        protein and for abinitio. Lines are split on any whitespace; a line
        with five fields is a known entry, a line with three fields an
        abinitio entry, any other line is ignored.

        known_format:    species protein_id gene_id transcript_id location_type:assembly:location_id:seq_begin:seq_end:strand
        abinitio_format: species protein_id location_type:assembly:location_id:seq_begin:seq_end:strand

        @param protein_id: protein for which the description file will be parsed.
        @return: (proteins_known_data, proteins_abinitio_data) - dictionaries keyed by species.
                 known value:    (spec_protein_id, gene_id, transcript_id, location_type, assembly, location_id, seq_begin, seq_end, strand)
                 abinitio value: (spec_protein_id, location_type, assembly, location_id, seq_begin, seq_end, strand)
        @raise IOError: in case there is no description file present for the protein_id
        @raise ValueError: if the location field does not hold six ':' separated values
        """
        proteins_known_data = {}
        proteins_abinitio_data = {}

        descr_file_path = "{0}/{1}.descr".format(self.crawler.get_root_path(protein_id), protein_id)
        try:
            descr_file = open(descr_file_path, "r")
        except IOError:
            raise IOError("There is no description file present for protein: %s" % protein_id)

        # "with" closes the handle even when a line is malformed
        with descr_file:
            for line in descr_file:
                species_data = line.split()
                if len(species_data) == 5:
                    species_name = species_data[0]
                    spec_protein_id, gene_id, transcript_id = species_data[1:4]
                    (location_type, assembly, location_id,
                     seq_begin, seq_end, strand) = species_data[-1].split(":")
                    proteins_known_data[species_name] = (
                        spec_protein_id, gene_id, transcript_id,
                        location_type, assembly, location_id, seq_begin, seq_end, strand,
                    )
                elif len(species_data) == 3:
                    species_name = species_data[0]
                    (location_type, assembly, location_id,
                     seq_begin, seq_end, strand) = species_data[-1].split(":")
                    spec_protein_id = species_data[1]
                    proteins_abinitio_data[species_name] = (
                        spec_protein_id,
                        location_type, assembly, location_id, seq_begin, seq_end, strand,
                    )

        return proteins_known_data, proteins_abinitio_data

    def parse_description_file_general_info(self, protein_id):
        """
        Similar to parse_descr_file, but lines are matched with tab delimited
        patterns and every entry is returned as a tuple that includes the
        species name. Required for generating a model structure.

        known:    (species_name, spec_protein_id, gene_id, transcript_id, location_type, assembly, location_id, seq_begin, seq_end, strand)
        abinitio: (species_name, spec_protein_id, location_type, assembly, location_id, seq_begin, seq_end, strand)
        @param protein_id: protein for which the description file will be parsed.
        @return: (proteins_known_data, proteins_abinitio_data) - two lists of tuples
        @raise IOError: in case there is no description file present for the protein_id
        """
        proteins_known_data = []
        proteins_abinitio_data = []

        descr_file_path = "{0}/{1}.descr".format(self.crawler.get_root_path(protein_id), protein_id)
        pattern_known = re.compile("(.*)\t(ENS.*)\t(.*)\t(.*)\t(.*):(.*):(.*):(.*):(.*):(.*)")
        pattern_abinitio = re.compile("(.*)\t(GEN.*)\t(.*):(.*):(.*):(.*):(.*):(.*)")

        try:
            descr_file = open(descr_file_path, "r")
        except IOError:
            raise IOError("There is no description file present for protein: %s" % protein_id)

        with descr_file:
            for line in descr_file:
                line = line.strip()
                # both patterns are tried on every line, as before
                match = pattern_known.match(line)
                if match:
                    proteins_known_data.append(match.groups())
                match = pattern_abinitio.match(line)
                if match:
                    proteins_abinitio_data.append(match.groups())

        return proteins_known_data, proteins_abinitio_data
def main ():
    '''
    Decides on the best exon alignments for every (protein, species) pair,
    prints a report of the chosen alignments and writes the assembled protein
    of each species to "<assembled protein path>/<species>.fa".

    Proteins with more than 15 exons, and proteins without Homo_sapiens
    ensembl exons in the exon container, are skipped. Any exception raised
    while processing a species is caught; the pair is printed and written to
    the error file and processing continues with the next species.
    '''
    # NOTE(review): the source layout was flattened; the nesting below is the
    # most plausible reading - confirm against the original.
    # NOTE(review): the error file path is hard-coded and the handle is
    # never closed in this function.
    #ERROR FILE:::
    err_f = open('/home/marioot/err_status_monday.txt', 'w')
    fill_all_containers(True)
    protein_tuples = get_protein_list()
    ec = ExonContainer.Instance()
    beac = BestExonAlignmentContainer.Instance()
    dc = DirectoryCrawler()
    for (protein_id, exon_num) in protein_tuples:
        if int(exon_num) > 15:
            print "too big"
            continue
        species_list = get_species_list(protein_id, None)
        try:
            ref_exons = ec.get((protein_id, "Homo_sapiens", "ensembl"))
        except KeyError:
            print "ERROR: No protein %s" % protein_id
            continue
        for species in species_list:
            try:
                print "\nBest_exon_al: %s, %s" % (protein_id, species)
                err_f.write("%s, %s" % (protein_id, species))
                bpp = BestProteinProduct (protein_id, species, "Homo_sapiens")
                bpp.load_alignments()
                bpp.decide_on_best_exons()
                #bpp.patch_interexon_AAS()
                for ref_exon in ref_exons.get_coding_exons():
                    best_exon_alignment = bpp.best_exons[ref_exon.exon_id]
                    if best_exon_alignment:
                        beac.add(ref_exon.exon_id, species, best_exon_alignment)
                        print "%d. Exon status: %s (%s)" % (ref_exon.ordinal, best_exon_alignment.status, ref_exon.exon_id)
                        if best_exon_alignment.sw_gene_alignment:
                            print ref_exon.sequence[ref_exon.frame:].translate()
                            best_exon_alignment.sw_gene_alignment.create_cDNA()
                            print "\tAdded %2d alignment pieces" % (len(best_exon_alignment.sw_gene_alignment.alignment_pieces))
                            for al_piece in best_exon_alignment.sw_gene_alignment.alignment_pieces:
                                # trailing comma: the piece details (or the bare
                                # newline of the else branch) continue this line
                                print "\t\t%s:" % (al_piece.type),
                                if al_piece.type in ["coding", "insertion"]:
                                    print "PROT: %d-%d, GENOME: %d-%d, %s" % (al_piece.ref_protein_start, al_piece.ref_protein_stop, al_piece.genomic_start, al_piece.genomic_stop, al_piece.sequence_id)
                                    print "\t\t\tHUMAN:", al_piece.ref_protein_seq
                                    print "\t\t\tSPEC :", al_piece.spec_protein_seq
                                else:
                                    print
                whole_prot = bpp.get_spec_protein_translation()
                whole_prot_rec = SeqRecord(whole_prot, id = species, description = "assembled_protein")
                file_name = "%s/%s.fa" % (dc.get_assembled_protein_path(protein_id), species)
                SeqIO.write(whole_prot_rec, file_name, "fasta")
                # debugging output for one hard-coded exon id
                print beac.get("ENSE00002199725", species)
            except Exception, e:
                # the exception itself is not reported, only the failing pair
                print '{0} {1} \n'.format(protein_id, species)
                err_f.write('{0} {1} \n'.format(protein_id, species))
def __init__(self):
    ''' Creates a DirectoryCrawler and keeps it in self.crawler. '''
    directory_crawler = DirectoryCrawler()
    self.crawler = directory_crawler
def main():
    '''
    Retrieves the list of all the proteins from reference species.
    For each ref species protein, it tries to find orthologues for
    all the species (from the species list) and generates the
    description file accordingly. If the status file already holds a
    MUTUAL_BEST entry (OK or anything else, which is treated as failed),
    the protein is not processed again.
    At the end, the IDs of all the failed proteins are printed.
    '''
    reference_species = "Homo_sapiens"

    dc = DirectoryCrawler()
    acg = AlignmentCommandGenerator()

    logger = Logger.Instance()
    mutual_best_logger = logger.get_logger('mutual_best')

    protein_list = get_protein_list()
    species_list = get_default_species_list()
    failed_proteins = []

    for (protein_id, num_of_exons) in protein_list:

        print(protein_id)

        # generate all the directories for the protein
        dc.generate_directory_tree(protein_id)

        descr_file_path = dc.get_protein_description_file_path(protein_id)
        status_file_path = dc.get_mutual_best_status_file_path(protein_id)

        # a non-empty status file means the protein has been visited before
        if os.path.isfile(status_file_path) and os.path.getsize(status_file_path):
            print(DescriptionParser().get_protein_ids(protein_id))

            status_dict = read_status_file(protein_id)
            if 'MUTUAL_BEST' in status_dict:
                if status_dict['MUTUAL_BEST'] == 'OK':
                    mutual_best_logger.info('-,%s,mutual_best already exists for this protein - moving to the next one' % protein_id)
                else:
                    mutual_best_logger.error('-,%s,mutual_best has failed for this protein (no orthologs found) - moving on the next one' % protein_id)
                    failed_proteins.append(protein_id)
                continue

        # create the description file; the with statement closes it even
        # if the ortholog search raises
        with open(descr_file_path, 'w') as descr_file:

            # reference protein file
            ref_species_pep = dc.get_protein_path(protein_id) + "/" + reference_species + ".fasta"
            fastacmd = acg.generate_fastacmd_protein_command(protein_id, reference_species, "all", ref_species_pep)

            p = Popen(fastacmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
            # communicate() closes stdin, drains stdout (stderr is merged
            # into it) and waits for the child, so no zombie process or
            # open pipe is left behind
            output = p.communicate()[0]
            if output:
                mutual_best_logger.error("%s,fastacmd error" % protein_id)

            # find orthologues for all species
            for species in species_list:
                find_ortholog_by_RBH (reference_species, species, ref_species_pep, protein_id, descr_file, mutual_best_logger)

        mutual_best_logger.info("\n\n")

        # check what we've found out, whether this protein has any orthologs
        (known_dict, abinitio_dict) = DescriptionParser().get_protein_ids(protein_id)
        only_reference_known = list(known_dict.keys()) == [reference_species] if known_dict else False
        if not abinitio_dict and (not known_dict or only_reference_known):
            mutual_best_logger.info ("-,%s, mutual best failed for this protein." % protein_id)
            update_entry_in_status_file(protein_id, "MUTUAL_BEST", "FAILED")
            failed_proteins.append(protein_id)
        else:
            update_entry_in_status_file(protein_id, "MUTUAL_BEST", "OK")

    print("Failed proteins: ")
    for failed_protein_id in failed_proteins:
        print(failed_protein_id)
def parse_SW_output (ref_protein_id, species, sw_type):
    '''
    Parses the output from the SW# command line application.
    (suitable for version as it was distributed on May 1st, 2012)
    @param ref_protein_id: reference protein ID, used to locate the swout directory
    @param species: species name, the parsed file is <swout directory>/<species>.swout
    @param sw_type: sw_exon/sw_gene (case insensitive)
    @return: dictionary of alignment exons. The keys are referent exon IDs,
             and values are lists of all the alignment exons which
             correspond to the certain reference exon.
             False if the swout file does not exist.
    @raise KeyError: if sw_type is neither sw_gene nor sw_exon
    '''
    logger = Logger.Instance()
    containers_logger = logger.get_logger('containers')
    dc = DirectoryCrawler()

    # determine the swout file path
    sw_type_lower = sw_type.lower()
    if sw_type_lower == "sw_gene":
        swout_file_path = dc.get_SW_gene_path(ref_protein_id)
    elif sw_type_lower == "sw_exon":
        swout_file_path = dc.get_SW_exon_path(ref_protein_id)
    else:
        raise KeyError ("There is no known swout path for type %s" % sw_type)
    swout_file_path += "/%s.swout" % species

    if not os.path.isfile(swout_file_path):
        containers_logger.error ("{0}, {1}, {2}, no swout file".format(ref_protein_id, species, sw_type))
        return False

    # patterns for matching (raw strings, so that the backslashes reach
    # the regex engine without relying on unrecognized string escapes)
    header_pattern = re.compile (r"Name: >(\d+)\|(\d+)\|(ENS\w+)\|(ENS\w+)\|([-]*1)")
    #Intervals: 1207047 1207087 30 69 (+) strand
    intervals_pattern = re.compile (r"Intervals:\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+\([+-]\)\s+strand")
    #Identity: 31/41 (75.6%)
    identity_pattern = re.compile (r"Identity:\s+(\d+)/(\d+).*")
    #Similarity: 40/41 (97.6%)
    similarity_pattern = re.compile (r"Similarity:\s+(\d+)/.*")
    #Gaps: 1/41 (2.4%)
    gaps_pattern = re.compile (r"Gaps:\s+(\d+)/\d+.*")
    #Score: 2828.000
    score_pattern = re.compile (r"Score:.*")
    # sequence pattern
    sequence_pattern = re.compile (r"\s*(\d+)\s+([ATCGN-]+)\s+(\d+).*")

    exon_dict = {}

    # state of the alignment record which is currently being parsed; the
    # numeric fields hold the matched text until they are converted in
    # set_alignment_info calls
    ref_exon_id = ""
    identities = 0
    positives = 0
    gaps = 0
    score = 0.
    sbjct_start = 0
    sbjct_end = 0
    query_start = 0
    query_end = 0
    length = 0
    query_sequence = ""
    sbjct_sequence = ""
    # sequence lines alternate: the first one of a pair is appended to the
    # query sequence, the second one to the subject sequence
    parsing_query_seq = True

    exon = Exon(sw_type, "", ref_protein_id, species)

    # the with statement guarantees that the file handle is released
    with open(swout_file_path, 'r') as swout_file:
        for line in swout_file:
            line = line.strip()

            header_match = header_pattern.match(line)
            if header_match:
                # add the current exon and start a new one
                if ref_exon_id:
                    exon.set_alignment_info(int(identities), int(positives), int(gaps),
                                            int(sbjct_start), int(sbjct_end),
                                            int(query_start), int(query_end),
                                            int(length),
                                            sbjct_sequence, query_sequence,
                                            float(score))
                    exon_dict.setdefault(ref_exon_id, []).append(exon)

                # the fourth header field is the reference exon ID
                ref_exon_id = header_match.group(4)
                exon = Exon(sw_type, ref_exon_id, ref_protein_id, species)
                parsing_query_seq = True
                query_sequence = ""
                sbjct_sequence = ""

            # intervals
            intervals_match = intervals_pattern.match(line)
            if intervals_match:
                (query_start, query_end, sbjct_start, sbjct_end) = intervals_match.groups()

            # identities
            identity_match = identity_pattern.match(line)
            if identity_match:
                (identities, length) = identity_match.groups()

            # similarities
            similarity_match = similarity_pattern.match(line)
            if similarity_match:
                positives = similarity_match.group(1)

            # gaps
            gaps_match = gaps_pattern.match(line)
            if gaps_match:
                gaps = gaps_match.group(1)

            # score - the value is the last whitespace separated token
            if score_pattern.match(line):
                score = line.split()[-1]

            # sequence
            sequence_match = sequence_pattern.match(line)
            if sequence_match:
                sequence_to_append = sequence_match.group(2).strip()
                if parsing_query_seq:
                    query_sequence += sequence_to_append
                    parsing_query_seq = False
                else:
                    sbjct_sequence += sequence_to_append
                    parsing_query_seq = True

    # the last record has no following header, so it is finished here;
    # it is stored only if a query sequence was parsed for it
    exon.set_alignment_info(int(identities), int(positives), int(gaps),
                            int(sbjct_start), int(sbjct_end),
                            int(query_start), int(query_end),
                            int(length),
                            sbjct_sequence, query_sequence,
                            float(score))
    if query_sequence:
        exon_dict.setdefault(ref_exon_id, []).append(exon)

    return exon_dict