class AlignmentTargetGenerator(object):
    '''
    Per alignment type (blastn, tblastn, SW_gene, SW_exon), looks up the species
    list associated with a protein and records the species whose alignment failed.
    Every method resolves the alignment directory through the DirectoryCrawler
    and delegates the actual work to get_species_list /
    write_failed_species_to_status.
    '''
    def __init__(self):
        self.crawler = DirectoryCrawler()
        self.description_parser = DescriptionParser()

    # ---------------------------------------------------------------- blastn

    def get_blastn_targets(self, protein_id):
        '''
        @param protein_id: protein whose blastn directory is consulted
        @return: the result of get_species_list for that directory
                 (documented as the species (RBS) not yet aligned with blastn)
        '''
        return get_species_list(protein_id,
                                self.crawler.get_blastn_path(protein_id))

    def set_failed_blastn_targets(self, protein_id, failed_species_list):
        '''
        @param protein_id: protein whose blastn directory is updated
        @param failed_species_list: species handed to write_failed_species_to_status
        '''
        write_failed_species_to_status(failed_species_list,
                                       self.crawler.get_blastn_path(protein_id))

    # --------------------------------------------------------------- tblastn

    def get_tblastn_targets(self, protein_id):
        '''
        @param protein_id: protein whose tblastn directory is consulted
        @return: the result of get_species_list for that directory
                 (documented as the species not yet aligned with tblastn)
        '''
        return get_species_list(protein_id,
                                self.crawler.get_tblastn_path(protein_id))

    def set_failed_tblastn_targets(self, protein_id, failed_species_list):
        '''
        @param protein_id: protein whose tblastn directory is updated
        @param failed_species_list: species handed to write_failed_species_to_status
        '''
        write_failed_species_to_status(failed_species_list,
                                       self.crawler.get_tblastn_path(protein_id))

    # --------------------------------------------------------------- SW gene

    def get_SW_gene_targets(self, protein_id):
        '''
        @param protein_id: protein whose SW_gene directory is consulted
        @return: the result of get_species_list for that directory
                 (documented as the species not yet aligned with SW_gene)
        '''
        return get_species_list(protein_id,
                                self.crawler.get_SW_gene_path(protein_id))

    def set_failed_SW_gene_targets(self, protein_id, failed_species_list):
        '''
        @param protein_id: protein whose SW_gene directory is updated
        @param failed_species_list: species handed to write_failed_species_to_status
        '''
        write_failed_species_to_status(failed_species_list,
                                       self.crawler.get_SW_gene_path(protein_id))

    # --------------------------------------------------------------- SW exon

    def get_SW_exon_targets(self, protein_id):
        '''
        @param protein_id: protein whose SW_exon directory is consulted
        @return: the result of get_species_list for that directory
                 (documented as the species not yet aligned with SW_exon)
        '''
        return get_species_list(protein_id,
                                self.crawler.get_SW_exon_path(protein_id))

    def set_failed_SW_exon_targets(self, protein_id, failed_species_list):
        '''
        @param protein_id: protein whose SW_exon directory is updated
        @param failed_species_list: species handed to write_failed_species_to_status
        '''
        write_failed_species_to_status(failed_species_list,
                                       self.crawler.get_SW_exon_path(protein_id))
def reset_action(protein_id, key):
    '''
    Marks the action `key` as FAILED in the protein's status file and then
    clears the directory (or directories) that belong to that action.

    @param protein_id: protein whose status entry and directories are reset
    @param key: action name; a key that is not listed below only gets its
                status entry updated, no directory is cleared
    '''
    update_entry_in_status_file(protein_id, key, 'FAILED')
    crawler = DirectoryCrawler()

    # (action key, names of the crawler path getters to clear, in order)
    directories_by_action = (
        ('GENE_RETRIEVAL',          ('get_gene_path',)),
        ('EXP_GENE_RETRIEVAL',      ('get_expanded_gene_path',)),
        ('PROTEIN_RETRIEVAL',       ('get_protein_path',)),
        ('ENSEMBL_EXON_RETRIEVAL',  ('get_exon_ensembl_path',)),
        ('GENEWISE_EXON_RETRIEVAL', ('get_exon_genewise_path',
                                     'get_genewise_path')),
        ('REF_SP_DB_FORMATTING',    ('get_database_path',)),
        ('BLASTN_ALIGNMENT',        ('get_blastn_path',)),
        ('TBLASTN_ALIGNMENT',       ('get_tblastn_path',)),
        ('SW_GENE_ALIGNMENT',       ('get_SW_gene_path',)),
        ('SW_EXON_ALIGNMENT',       ('get_SW_exon_path',)),
    )

    for action, getter_names in directories_by_action:
        if key == action:
            for getter_name in getter_names:
                path_getter = getattr(crawler, getter_name)
                clear_directory(path_getter(protein_id))
            # only the first matching action is handled
            break
Exemplo n.º 3
0
def generate_tblastn_alignments(protein_id, species_list = None, referenced_species = "Homo_sapiens"):
    '''
        Runs the tblastn program for a specified protein and list of species.

        Anything the command prints (stdout and stderr are merged) is treated as
        a failure for that species: its output file is removed if it exists, a
        warning is logged, and the species is recorded as failed through
        AlignmentTargetGenerator.set_failed_tblastn_targets.

        @param protein_id: protein for which the alignments are generated
        @param species_list: if provided, runs tblastn for this list of species,
                             otherwise runs for the species returned by
                             AlignmentTargetGenerator.get_tblastn_targets
                             (documented as those missing the tblastn output
                             according to the .status file in the tblastn folder)
        @param referenced_species: species whose .fa file in the database
                             directory is used as the tblastn database
        @return: True if no species failed, False otherwise
    '''
    logger              = Logger.Instance()
    alignment_logger    = logger.get_logger('alignment')
    
    alignment_generator = AlignmentTargetGenerator()
    crawler             = DirectoryCrawler()
    command_generator   = CommandGenerator()
    
    if (not species_list):
        species_list    = alignment_generator.get_tblastn_targets(protein_id)
    
    failed_species_list = []
    for species in species_list:
        # entries may carry surrounding whitespace/newlines; normalise once
        species_name    = species.strip()
        
        output_file     = "{0}/{1}.blastout".format(crawler.get_tblastn_path(protein_id), species_name)
        input_file      = "{0}/{1}.fa".format(crawler.get_protein_path(protein_id), species_name)
        database        = "{0}/{1}.fa".format(crawler.get_database_path(protein_id), referenced_species)
        
        command         = command_generator.generate_tblastn_command(database, input_file, output_file)
        # NOTE(review): the command string is executed through the shell; this
        # assumes protein_id / species names come from trusted sources - confirm.
        process         = Popen(command, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
        # communicate() closes stdin, drains the merged stdout/stderr and waits
        # for the child to exit, so no zombie process or open pipe is left behind
        output          = process.communicate()[0]
        # on Python 3 the pipe yields bytes; decode so that the emptiness check
        # and the log message behave the same as with Python 2 strings
        if not isinstance(output, str):
            output      = output.decode("utf-8", "replace")
        
        if output != "":
            #LOGGING
            # the failing command may not have created the output file at all
            if os.path.isfile(output_file):
                os.remove(output_file)
            alignment_logger.warning("{0}, {1}, TBLASTN, {2}".format(protein_id, species_name, output.strip()))
            failed_species_list.append(species_name)
      
    if failed_species_list:        
        alignment_generator.set_failed_tblastn_targets(protein_id, failed_species_list)
        return False
    return True
Exemplo n.º 4
0
def parse_blast_output (ref_protein_id, species, blast):
    '''
    Parses the <species>.blastout file of a protein into Exon objects.

    @param ref_protein_id: protein whose blastn / tblastn directory holds the file
    @param species: species name, used as the base name of the .blastout file
    @param blast: "blastn" selects the blastn directory, any other value the
                  tblastn directory
    @return: Dictionary where key is reference species exon_id, and 
    the value is list of corresponding alignments (Exon objects); None if the
    file does not exist or NCBIXML.read raises ValueError (logged as "No hits found")
    '''
    
    logger              = Logger.Instance()
    containers_logger   = logger.get_logger('containers')
    dc                  = DirectoryCrawler()
    
    if blast == "blastn":
        blast_file = "{0}/{1}.blastout".format(dc.get_blastn_path(ref_protein_id), species)
    else:
        blast_file = "{0}/{1}.blastout".format(dc.get_tblastn_path(ref_protein_id), species)
        
    if not os.path.isfile(blast_file):
        containers_logger.error ("{0}, {1}, {2}, no blastout file".format(ref_protein_id, species, blast))
        return None
        
    # parse blast output; the with-block closes the file on every exit path,
    # including the early return below
    with open(blast_file, 'r') as file_handle:
        try:
            blastn_record = NCBIXML.read(file_handle)
        except ValueError:
            containers_logger.error("%s,%s,%s,No hits found" % (ref_protein_id, species, blast))
            return None
    
    exon_dict = {}
    # exon description: start|end|ENS...|ENS...|strand (1 or -1)
    exon_pattern = re.compile(r"(\d+)\|(\d+)\|(ENS\w+)\|(ENS\w+)\|([-]*1)")
    
    # at most this many hsps are kept per alignment
    max_hsps = 5
    
    for alignment in blastn_record.alignments:
        # the title is expected to consist of exactly two whitespace-separated
        # tokens; the second one carries the exon description
        (blast_info, exon_info) = alignment.title.split()
        # NOTE(review): a description that does not match the pattern raises
        # AttributeError on the next lines - confirm titles always match
        pattern_match = exon_pattern.match(exon_info)
        ref_exon_id = pattern_match.group(4)
        exon_start = int(pattern_match.group(1))
        exon_end = int(pattern_match.group(2))
        
        num_of_hsps = 0
        
        for hsp in alignment.hsps:
            if blast == "blastn":
                # skip blastn hits where query or hit is on the reverse frame
                (query_frame, hit_frame) = hsp.frame
                if query_frame == -1 or hit_frame == -1:
                    continue 
            if num_of_hsps == max_hsps:
                break
            num_of_hsps += 1
            
            exon = Exon(blast, ref_exon_id, ref_protein_id, species)
            
            # hsp.gaps is either a plain number or a tuple whose first element
            # is the gap count (None when not reported); always bind `gaps` so
            # a value from a previous hsp can never leak into this one
            gaps = hsp.gaps
            if isinstance(gaps, tuple):
                gaps = gaps[0]
            if not gaps:
                gaps = 0
            
            aligned_length = len(hsp.sbjct)
            exon.set_alignment_info ( hsp.identities, 
                                      hsp.positives, 
                                      gaps, 
                                      hsp.sbjct_start, 
                                      hsp.sbjct_start + aligned_length -1,
                                      hsp.query_start,
                                      hsp.query_start + aligned_length -1,
                                      aligned_length,
                                      hsp.sbjct,
                                      hsp.query,
                                      hsp.score)
            exon_dict.setdefault(ref_exon_id, []).append(exon)
                
            # means we covered the whole exon
            if aligned_length == abs(exon_end-exon_start)+1 and aligned_length == hsp.identities:
                break
        
    return exon_dict