class AlignmentTargetGenerator(object):
    '''
    Looks up, per alignment type, the species that are still alignment
    targets for a protein, and records the species whose alignment failed.

    Every getter resolves the alignment directory through the
    DirectoryCrawler and hands it to get_species_list; every setter hands
    the same directory to write_failed_species_to_status.
    '''

    def __init__(self):
        self.crawler = DirectoryCrawler()
        self.description_parser = DescriptionParser()

    # ---- blastn -----------------------------------------------------------

    def get_blastn_targets(self, protein_id):
        '''
        @param protein_id: protein whose blastn directory is inspected
        @return: the result of get_species_list for that directory
                 (the species (RBS) not aligned with blastn for the protein)
        '''
        blastn_dir = self.crawler.get_blastn_path(protein_id)
        return get_species_list(protein_id, blastn_dir)

    def set_failed_blastn_targets(self, protein_id, failed_species_list):
        '''
        @param protein_id: protein whose blastn directory is updated
        @param failed_species_list: species to write to the status as failed
        '''
        blastn_dir = self.crawler.get_blastn_path(protein_id)
        write_failed_species_to_status(failed_species_list, blastn_dir)

    # ---- tblastn ----------------------------------------------------------

    def get_tblastn_targets(self, protein_id):
        '''
        @param protein_id: protein whose tblastn directory is inspected
        @return: the result of get_species_list for that directory
                 (the species not aligned with tblastn for the protein)
        '''
        tblastn_dir = self.crawler.get_tblastn_path(protein_id)
        return get_species_list(protein_id, tblastn_dir)

    def set_failed_tblastn_targets(self, protein_id, failed_species_list):
        '''
        @param protein_id: protein whose tblastn directory is updated
        @param failed_species_list: species to write to the status as failed
        '''
        tblastn_dir = self.crawler.get_tblastn_path(protein_id)
        write_failed_species_to_status(failed_species_list, tblastn_dir)

    # ---- SW gene ----------------------------------------------------------

    def get_SW_gene_targets(self, protein_id):
        '''
        @param protein_id: protein whose SW_gene directory is inspected
        @return: the result of get_species_list for that directory
                 (the species not aligned with SW_gene for the protein)
        '''
        sw_gene_dir = self.crawler.get_SW_gene_path(protein_id)
        return get_species_list(protein_id, sw_gene_dir)

    def set_failed_SW_gene_targets(self, protein_id, failed_species_list):
        '''
        @param protein_id: protein whose SW_gene directory is updated
        @param failed_species_list: species to write to the status as failed
        '''
        sw_gene_dir = self.crawler.get_SW_gene_path(protein_id)
        write_failed_species_to_status(failed_species_list, sw_gene_dir)

    # ---- SW exon ----------------------------------------------------------

    def get_SW_exon_targets(self, protein_id):
        '''
        @param protein_id: protein whose SW_exon directory is inspected
        @return: the result of get_species_list for that directory
                 (the species not aligned with SW_exon for the protein)
        '''
        sw_exon_dir = self.crawler.get_SW_exon_path(protein_id)
        return get_species_list(protein_id, sw_exon_dir)

    def set_failed_SW_exon_targets(self, protein_id, failed_species_list):
        '''
        @param protein_id: protein whose SW_exon directory is updated
        @param failed_species_list: species to write to the status as failed
        '''
        sw_exon_dir = self.crawler.get_SW_exon_path(protein_id)
        write_failed_species_to_status(failed_species_list, sw_exon_dir)
def reset_action(protein_id, key):
    '''
    Marks the pipeline step `key` as FAILED in the protein's status file and
    passes the directory (or directories) belonging to that step to
    clear_directory.

    @param protein_id: protein whose status entry and directories are reset
    @param key: name of the step, e.g. 'GENE_RETRIEVAL' or 'BLASTN_ALIGNMENT'.
                A key that is not listed below only gets its status entry
                updated; no directory is touched.
    '''
    update_entry_in_status_file(protein_id, key, 'FAILED')
    crawler = DirectoryCrawler()

    # Step name -> names of the crawler getters whose directories are
    # cleared, in that order. Getters are looked up by name so that only the
    # ones belonging to the requested step are ever called.
    getters_by_step = {
        'GENE_RETRIEVAL':          ('get_gene_path',),
        'EXP_GENE_RETRIEVAL':      ('get_expanded_gene_path',),
        'PROTEIN_RETRIEVAL':       ('get_protein_path',),
        'ENSEMBL_EXON_RETRIEVAL':  ('get_exon_ensembl_path',),
        'GENEWISE_EXON_RETRIEVAL': ('get_exon_genewise_path',
                                    'get_genewise_path'),
        'REF_SP_DB_FORMATTING':    ('get_database_path',),
        'BLASTN_ALIGNMENT':        ('get_blastn_path',),
        'TBLASTN_ALIGNMENT':       ('get_tblastn_path',),
        'SW_GENE_ALIGNMENT':       ('get_SW_gene_path',),
        'SW_EXON_ALIGNMENT':       ('get_SW_exon_path',),
    }

    for getter_name in getters_by_step.get(key, ()):
        directory = getattr(crawler, getter_name)(protein_id)
        clear_directory(directory)
def parse_SW_output(ref_protein_id, species, sw_type):
    '''
    Parses the output from the SW# command line application.
    (suitable for version as it was distributed on May 1st, 2012)

    The file that is read is "<SW path>/<species>.swout", where the SW path
    is obtained from the DirectoryCrawler for the requested sw_type.

    @param ref_protein_id: ID of the reference protein
    @param species: species name; also the base name of the .swout file
    @param sw_type: sw_exon/sw_gene (compared case-insensitively)
    @return: dictionary of alignment exons. The keys are referent exon IDs,
             and values are lists of all the alignment exons which
             correspond to the certain reference exon.
             False if the swout file does not exist (an error is logged).
    @raise KeyError: if sw_type is neither sw_gene nor sw_exon
    '''
    logger = Logger.Instance()
    containers_logger = logger.get_logger('containers')
    dc = DirectoryCrawler()

    # determine the swout file path
    sw_kind = sw_type.lower()
    if sw_kind == "sw_gene":
        swout_file_path = dc.get_SW_gene_path(ref_protein_id)
    elif sw_kind == "sw_exon":
        swout_file_path = dc.get_SW_exon_path(ref_protein_id)
    else:
        raise KeyError("There is no known swout path for type %s" % sw_type)
    swout_file_path += "/%s.swout" % species

    if not os.path.isfile(swout_file_path):
        containers_logger.error("{0}, {1}, {2}, no swout file".format(ref_protein_id, species, sw_type))
        return False

    # patterns for matching (raw strings, so the backslashes reach the regex
    # engine without relying on Python passing unknown escapes through)
    header_pattern = re.compile(r"Name: >(\d+)\|(\d+)\|(ENS\w+)\|(ENS\w+)\|([-]*1)")
    # Intervals: 1207047 1207087 30 69 (+) strand
    intervals_pattern = re.compile(r"Intervals:\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+\([+-]\)\s+strand")
    # Identity: 31/41 (75.6%)
    identity_pattern = re.compile(r"Identity:\s+(\d+)/(\d+).*")
    # Similarity: 40/41 (97.6%)
    similarity_pattern = re.compile(r"Similarity:\s+(\d+)/.*")
    # Gaps: 1/41 (2.4%)
    gaps_pattern = re.compile(r"Gaps:\s+(\d+)/\d+.*")
    # Score: 2828.000
    score_pattern = re.compile(r"Score:.*")
    # sequence line: <start> <bases> <end>
    sequence_pattern = re.compile(r"\s*(\d+)\s+([ATCGN-]+)\s+(\d+).*")

    exon_dict = {}
    ref_exon_id = ""

    # Alignment fields of the record currently being read.
    # NOTE(review): only the two sequences are reset when a new header is
    # seen; the numeric fields keep the previous record's values until the
    # corresponding line of the new record overwrites them - confirm that
    # every record always carries all of these lines.
    identities = 0
    positives = 0
    gaps = 0
    score = 0.
    sbjct_start = 0
    sbjct_end = 0
    query_start = 0
    query_end = 0
    length = 0
    query_sequence = ""
    sbjct_sequence = ""

    # Sequence lines alternate: the first one after a header is appended to
    # the query sequence, the next one to the subject sequence, and so on.
    parsing_query_seq = True

    exon = Exon(sw_type, "", ref_protein_id, species)

    # 'with' guarantees the file is closed, also when a line fails to parse
    with open(swout_file_path, 'r') as swout_file:
        for line in swout_file:
            line = line.strip()

            header_match = header_pattern.match(line)
            if header_match:
                # add the current exon and start a new one
                if ref_exon_id:
                    exon.set_alignment_info(int(identities), int(positives), int(gaps),
                                            int(sbjct_start), int(sbjct_end),
                                            int(query_start), int(query_end),
                                            int(length),
                                            sbjct_sequence, query_sequence,
                                            float(score))
                    exon_dict.setdefault(ref_exon_id, []).append(exon)

                # the fourth header field is the reference exon ID
                ref_exon_id = header_match.group(4)
                exon = Exon(sw_type, ref_exon_id, ref_protein_id, species)
                parsing_query_seq = True
                query_sequence = ""
                sbjct_sequence = ""

            # intervals
            intervals_match = intervals_pattern.match(line)
            if intervals_match:
                (query_start, query_end, sbjct_start, sbjct_end) = intervals_match.groups()

            # identities
            identity_match = identity_pattern.match(line)
            if identity_match:
                (identities, length) = identity_match.groups()

            # similarities
            similarity_match = similarity_pattern.match(line)
            if similarity_match:
                positives = similarity_match.group(1)

            # gaps
            gaps_match = gaps_pattern.match(line)
            if gaps_match:
                gaps = gaps_match.group(1)

            # score: the value is the last whitespace-separated token
            if score_pattern.match(line):
                score = line.split()[-1]

            # sequence
            sequence_match = sequence_pattern.match(line)
            if sequence_match:
                sequence_to_append = sequence_match.group(2).strip()
                if parsing_query_seq:
                    query_sequence += sequence_to_append
                else:
                    sbjct_sequence += sequence_to_append
                parsing_query_seq = not parsing_query_seq

    # flush the last record; it is stored only if it has a query sequence
    exon.set_alignment_info(int(identities), int(positives), int(gaps),
                            int(sbjct_start), int(sbjct_end),
                            int(query_start), int(query_end),
                            int(length),
                            sbjct_sequence, query_sequence,
                            float(score))
    if query_sequence:
        exon_dict.setdefault(ref_exon_id, []).append(exon)

    return exon_dict