def single_design(run_no, seed, seq_no):
    """Design one sequence for the module-level target and log how it compares.

    The designed sequence is folded, and both its 'MFE' and 'centroid'
    structures are converted to trees and aligned against the module-level
    ``target_tree``. A tab-separated summary row is written to the general
    log and to ``design_logger``.

    :param run_no: run number, used for logging and in the summary row
    :param seed: seed value forwarded to ``rna_designer.run_rnafbinv``
    :param seq_no: sequence number, used for logging and in the summary row
    :return: tuple ``(seq_no, designed_sequence)``
    """
    logging.info("Design - start - run {}, seq_no {}".format(run_no, seq_no))
    # send sequences to RNAfbinv
    # TODO: use new version of RNAfbinv
    random_seed = random_gen.getrandbits(64)
    designed_sequence = rna_designer.run_rnafbinv(target_structure,
                                                  target_sequence,
                                                  iter_no=1000,
                                                  seed=seed,
                                                  random_seed=random_seed)
    fold_results = vienna.fold(designed_sequence)
    logging.info("Design - compare - run {}, seq_no {}".format(run_no, seq_no))

    def tree_and_alignment(fold_kind):
        # Build the tree for one predicted structure and align it to the target.
        folded_tree = shapiro_tree_aligner.get_tree(fold_results.get(fold_kind),
                                                    designed_sequence)
        aligned_tree, align_score = shapiro_tree_aligner.align_trees(
            folded_tree, target_tree)
        return folded_tree, align_score, aligned_tree

    # generate information from MFE, then from centroid
    mfe_fields = tree_and_alignment('MFE')
    centroid_fields = tree_and_alignment('centroid')

    # print info to file
    row_fields = (run_no, seq_no, seed, designed_sequence) + mfe_fields + centroid_fields
    out_text = "\t".join("{}".format(field) for field in row_fields)
    logging.info("Design - printing results:\n{}".format(out_text))
    design_logger.info(out_text)
    return seq_no, designed_sequence
def analyze_res(sequence, target_tree):
    """Fold ``sequence`` and align its MFE and centroid trees to ``target_tree``.

    :param sequence: sequence handed to ``vienna.fold``
    :param target_tree: tree each folded structure is aligned against
    :return: 6-tuple ``(tree_mfe, score_mfe, aligned_mfe,
             tree_centroid, score_centroid, aligned_centroid)``
    """
    fold_results = vienna.fold(sequence)
    collected = []
    # Same processing for both structure kinds; the order fixes the tuple layout.
    for fold_kind in ('MFE', 'centroid'):
        folded_tree = shapiro_tree_aligner.get_tree(fold_results.get(fold_kind),
                                                    sequence)
        aligned_tree, align_score = shapiro_tree_aligner.align_trees(
            folded_tree, target_tree)
        collected.extend((folded_tree, align_score, aligned_tree))
    return tuple(collected)
def __init__(self, sequence: str, options: Dict[str, Any], calc_robusntess: bool = True):
    """Fold ``sequence`` and score the result against the target held in ``options``.

    Attributes set: ``sequence``, ``fold_type``, ``energy``, ``structure``,
    ``mutational_robustness``, ``result_tree``, ``align_tree``, ``score``,
    ``tree_edit_distance`` and ``bp_dist``.

    :param sequence: sequence to fold and evaluate
    :param options: run options; this method reads the keys 'RNAfold', 'fold',
        'target_structure', 'target_sequence', 'alignment_rules',
        'target_energy' and 'target_neutrality'
    :param calc_robusntess: when True, ``mutational_robustness`` is computed
        with ``calculate_neutrality``; otherwise it is set to None
    """
    fold_type = options.get('fold')
    fold_map = options.get('RNAfold').fold(sequence)
    self.sequence = sequence
    self.fold_type = fold_type
    self.energy = fold_map.get("{}_energy".format(fold_type))
    self.structure = fold_map.get(fold_type)
    if calc_robusntess:
        self.mutational_robustness = calculate_neutrality(
            self.sequence, self.structure, options)
    else:
        self.mutational_robustness = None
    target_tree = shapiro_tree_aligner.get_tree(options['target_structure'],
                                                options['target_sequence'])
    self.result_tree = shapiro_tree_aligner.get_tree(self.structure,
                                                     self.sequence)
    self.align_tree, self.score = shapiro_tree_aligner.align_trees(
        self.result_tree, target_tree, options['alignment_rules'])
    # Add energy diff
    target_energy = options.get('target_energy')
    if target_energy is not None:
        self.score += abs(fold_map['{}_energy'.format(fold_type)] - target_energy)
    # Add mutation robustness diff
    target_neutrality = options.get('target_neutrality')
    if target_neutrality is not None:
        # Reuse the value computed above (same sequence, structure and options)
        # instead of evaluating calculate_neutrality a second time.
        neutrality = self.mutational_robustness
        if neutrality is None:
            neutrality = calculate_neutrality(sequence, self.structure, options)
        self.score += abs(neutrality - target_neutrality) * 100
    self.tree_edit_distance = tree_aligner.get_align_tree_distance(
        self.align_tree)
    self.bp_dist = bp_distance(self.structure, options['target_structure'])
def get_align_score(identifier: str, sequence: str, cm_path: str, target_tree):
    """Return the alignment score of a sequence's CM-derived tree vs ``target_tree``.

    A FASTA file is generated for the single sequence, the structure and
    sequence are obtained from ``dive_statistics.get_cm_struct`` for
    ``cm_path``, and the resulting tree is aligned to ``target_tree``.
    The generated FASTA file is removed afterwards, also on failure.

    :param identifier: name under which the sequence is written to the FASTA file
    :param sequence: sequence to evaluate
    :param cm_path: path handed to ``dive_statistics.get_cm_struct``
    :param target_tree: tree to align against
    :return: score returned by ``shapiro_tree_aligner.align_trees``
    """
    temp_fasta = None
    try:
        temp_fasta = infernal.generate_fasta({identifier: sequence})
        structure_from_cm, sequence_from_cm = dive_statistics.get_cm_struct(
            cm_path, temp_fasta.name)
        tree_from_cm = shapiro_tree_aligner.get_tree(structure_from_cm,
                                                     sequence_from_cm)
        _, align_score = shapiro_tree_aligner.align_trees(tree_from_cm,
                                                          target_tree)
    finally:
        # Only clean up when the FASTA file was actually created.
        if temp_fasta is not None:
            os.remove(temp_fasta.name)
    return align_score
def score_sequence(sequence: str, target_tree: tree_aligner.Tree, options: Dict[str, Any]):
    """Fold ``sequence`` and score it against ``target_tree``.

    The base score comes from the tree alignment. When 'target_energy' is
    set in ``options`` the absolute energy difference is added; when
    'target_neutrality' is set, the absolute neutrality difference times 100
    is added.

    :param sequence: sequence to fold and score
    :param target_tree: tree to align the folded structure against
    :param options: run options; reads 'RNAfold', 'fold', 'alignment_rules',
        'target_energy' and 'target_neutrality'
    :return: tuple ``(aligned_tree, score)``
    """
    # Align score tree alignment + sequence alignment
    fold_results = options.get('RNAfold').fold(sequence)
    folded_structure = fold_results[options.get('fold')]
    folded_tree = shapiro_tree_aligner.get_tree(folded_structure, sequence)
    aligned_tree, total = shapiro_tree_aligner.align_trees(
        folded_tree, target_tree, options['alignment_rules'])

    # Add energy diff
    wanted_energy = options.get('target_energy')
    if wanted_energy is not None:
        energy_key = '{}_energy'.format(options.get('fold'))
        total += abs(fold_results[energy_key] - wanted_energy)

    # Add mutation robustness diff
    wanted_neutrality = options.get('target_neutrality')
    if wanted_neutrality is not None:
        neutrality = calculate_neutrality(sequence, folded_structure, options)
        total += abs(neutrality - wanted_neutrality) * 100
    return aligned_tree, total
def run_search(run_code: str, designed_object):
    """Build a covariance model for a designed sequence, search with it and log hits.

    The model is written to ``<output_dir>/<run_code>.cm`` and searched
    against ``NT_PATH``. Every hit is folded, converted to a tree and aligned
    to the module-level ``target_tree``; hits passing the score filter are
    written to ``result_logger`` as a tab-separated row, the others are
    reported as warnings. A failure while processing one hit is logged and
    does not stop the remaining hits from being processed.

    :param run_code: code identifying this run; used for the model file name
        and in every log message
    :param designed_object: object providing ``sequence`` and ``structure``
    :return: None
    """
    general_run_logger.info('Starting search {}'.format(run_code))
    cm_path = os.path.join(output_dir, '{}.cm'.format(run_code))
    if not infernal.generate_single_seq_cm(designed_object.sequence, cm_path,
                                           designed_object.structure):
        general_run_logger.error(
            'Failed to build covariance model. run code: {}\n{}\n{}'.format(
                run_code, designed_object.sequence, designed_object.structure))
        return
    results = infernal.search_cm(cm_path, NT_PATH)
    if results is None:
        general_run_logger.error('Search failed {} {}\n{}'.format(
            run_code, cm_path, designed_object.sequence))
        return
    general_run_logger.info('Finished search {}, {} results'.format(
        run_code, len(results)))
    for res_no, res in enumerate(results):
        try:
            sequence = res.get('sequence')
            structure = vienna.fold(sequence)['MFE']
            res_tree = shapiro_tree_aligner.get_tree(structure, sequence)
            _, score = shapiro_tree_aligner.align_trees(res_tree, target_tree)
            if score < 300 and score % 100 < 30:
                general_run_logger.info(
                    'Adding result {}, score {} sequence {}'.format(
                        run_code, score, sequence))
                # Row layout:
                # seq code, match no, sequence, structure, score, target id
                result_logger.info('{}\t{}\t{}\t{}\t{}\t{}'.format(
                    run_code, res_no, sequence, structure, score,
                    res.get('identifier')))
            else:
                general_run_logger.warning(
                    'Score too low {} result no {}, score {}, sequence: {}'.
                    format(run_code, res_no, score, sequence))
        except Exception:
            # Per-result boundary: record the failure (with traceback) and
            # continue. The original called the non-existent `fatel`, which
            # raised AttributeError from inside this handler.
            general_run_logger.critical(
                'Exception in search {}, res no {}, {}'.format(
                    run_code, res_no, res),
                exc_info=True)
if __name__ == "__main__":
    # Script entry point: reads the target sequence/structure from the command
    # line, builds the target tree and prepares logging and input indexes.
    # Base folder; the "Output" and "fasta_db" paths below are built from it.
    folder = "/DB/"
    # At least three command-line arguments are required, otherwise print usage.
    if len(sys.argv) < 4:
        print(
            "Usage: search_runner.py <input sequence> <input structure> <<<amount of runs> <seeds per run> <designed sequence per seed>> | <sequence list>>"
        )
        sys.exit(-1)
    # gather parameters
    target_sequence = sys.argv[1]
    target_structure = sys.argv[2]
    # init random number generator
    random_gen = random.Random()
    # Build the target tree and align it against itself; the resulting score
    # is logged below as the target score.
    target_tree = shapiro_tree_aligner.get_tree(target_structure,
                                                target_sequence)
    _, target_score = shapiro_tree_aligner.align_trees(target_tree,
                                                       target_tree)
    logging.basicConfig(
        level=logging.INFO,
        format=
        '%(levelname)s:%(asctime)s:Name[%(name)s]:Thread[%(thread)d] - %(message)s'
    )
    logging.info("Target score: {}. Target tree: {}".format(
        target_score, target_tree))
    output_dir = os.path.join(folder, "Output")
    # Index the FASTA database folder.
    # NOTE(review): the original comment suggests infernal keeps its own
    # index in the new method - confirm whether this map is still needed.
    indexed_fasta_map = index_all(os.path.join(folder, "fasta_db"))
    # Start runs
    if len(sys.argv) == 6:
        # init log (result) files; the timestamp is the current local time
        # formatted as YYYY_MM_DD_HH_MM_SS
        run_time_stamp = datetime.datetime.fromtimestamp(
            time.time()).strftime("%Y_%m_%d_%H_%M_%S")
def calc_score(source_sequence: str, source_structure: str) -> float:
    """Return the alignment score of the given sequence/structure vs the target.

    The tree built from the arguments is aligned against the module-level
    ``target_tree``.

    :param source_sequence: sequence of the structure to score
    :param source_structure: structure to score
    :return: score returned by ``shapiro_tree_aligner.align_trees``
    """
    candidate_tree = shapiro_tree_aligner.get_tree(source_structure,
                                                   source_sequence)
    _, alignment_score = shapiro_tree_aligner.align_trees(candidate_tree,
                                                          target_tree)
    return alignment_score
def generate_clusters(match_file_path: str,
                      design_file_path: str,
                      target_tree,
                      is_filter_bacteria: bool = False,
                      mode: MODE = MODE.OLD) -> List[DesignGroup]:
    """Build one DesignGroup per design from a design file and a match file.

    Both files are read as tab-separated text; the first line of each is
    skipped and blank lines are ignored.

    :param match_file_path: path of the match file; column 0 is the design id
    :param design_file_path: path of the design file
    :param target_tree: tree that designs are aligned against in new mode
    :param is_filter_bacteria: in old mode, skip matches for which
        ``check_ancestor('Bacteria', ...)`` is true
    :param mode: MODE.OLD reads sequence/structure straight from the design
        file; any other mode folds each design and keeps it only when its
        alignment score is below FILTER_THRESHOLD
    :return: list of the DesignGroup objects that were created
    """
    vienna_folder = None
    try:
        # A live folding process is only needed for the new mode.
        if mode == MODE.NEW:
            vienna_folder = vienna.LiveRNAfold()
            vienna_folder.start()
        design_group_map = {}
        with open(match_file_path, 'r') as match_file, open(design_file_path, 'r') as design_file:
            # design code -> {'sequence': ..., 'structure': ...}
            seq_code_map = {}
            # skip the first line of the design file
            design_file.readline()
            for line in design_file:
                if line.strip() == '':
                    continue
                items = line.strip().split('\t')
                if mode == MODE.OLD:
                    # old layout: code in column 0, sequence in 2, structure in 4
                    seq_code_map[items[0]] = {
                        'sequence': items[2].strip(),
                        'structure': items[4].strip()
                    }
                else:
                    # new doesnt go through a filter so we will filter it now
                    # code is "<column 0>_<column 1>", sequence is column 3
                    code = '{}_{}'.format(items[0].strip(), items[1].strip())
                    sequence = items[3].strip()
                    structure = vienna_folder.fold(sequence)['MFE']
                    source_tree = shapiro_tree_aligner.get_tree(
                        structure, sequence)
                    _, score = shapiro_tree_aligner.align_trees(
                        source_tree, target_tree)
                    if score < FILTER_THRESHOLD:
                        seq_code_map[code] = {
                            'sequence': sequence,
                            'structure': structure
                        }
            # skip the first line of the match file
            match_file.readline()
            for line in match_file:
                if line.strip() == '':
                    continue
                items = line.strip().split('\t')
                design_id = items[0].strip()
                # ignore matches whose design was not kept above
                if seq_code_map.get(design_id) is None:
                    continue
                design_group = design_group_map.get(
                    design_id,
                    DesignGroup(design_id, seq_code_map.get(design_id)))
                if mode == MODE.OLD:
                    # the tax id is looked up from the part of column 5
                    # before the first '/'
                    if not is_filter_bacteria or not check_ancestor(
                            'Bacteria',
                            get_tax_id(items[5].strip().split('/', 1)[0])):
                        design_group.add_match(
                            items[5].strip(), {
                                'identifier': items[5].strip(),
                                'sequence': items[2].strip(),
                                'round': 0
                            })
                else:
                    # new mode didnt save identifier, add just and replace on first search
                    # the current number of matches serves as the identifier
                    design_group.add_match(
                        str(len(design_group.matches)), {
                            'identifier': len(design_group.matches),
                            'sequence': items[1].strip(),
                            'round': 0
                        })
                design_group_map[design_id] = design_group
    finally:
        # close the folding process if one was created
        if vienna_folder is not None:
            vienna_folder.close()
    return list(design_group_map.values())
def recreate_cm(folder_path: str,
                design_code: str,
                desc_keywords: List[str],
                test_name: str,
                sequence: str,
                structure: str,
                identifier_keywords: List[str] = None,
                new_seq_map: Optional[Dict[str, str]] = None,
                top_taxonomy: str = None,
                score_cutoff: int = 250):
    """Rebuild a covariance model alignment for one design from filtered hits.

    Rows of ``<folder_path>/FINAL_all_ext`` belonging to ``design_code`` are
    selected by description keywords, identifier keywords and taxonomy. The
    selected sequences are aligned to ``<folder_path>/<test_name>.cm``
    (generated from ``sequence``/``structure`` when it does not exist yet).
    Aligned entries scoring above ``score_cutoff`` against the built-in
    reference tree are removed from the alignment; the others are written to
    ``<folder_path>/<test_name>.txt``. The cleaned alignment is copied to
    ``<folder_path>/<test_name>.sto`` and passed to
    ``dive_statistics.generate_r2r``.

    The process exits (via ``exit``) when no row matches, when the model
    cannot be generated, or when the alignment fails.

    :param folder_path: folder holding the input table and receiving outputs
    :param design_code: value matched against column 0 of the input table
    :param desc_keywords: keywords matched (case-insensitively, as substrings)
        against column 15; None accepts every row
    :param test_name: base name of the generated .cm / .txt / .sto files
    :param sequence: sequence used to generate the model when it is missing
    :param structure: structure used to generate the model when it is missing
    :param identifier_keywords: keywords matched against column 1; None
        accepts every row
    :param new_seq_map: optional replacement sequences keyed by column 1;
        column 4 is used for identifiers not present in the map
    :param top_taxonomy: when given, only rows whose tax id passes
        ``result_dive.check_ancestor`` are kept
    :param score_cutoff: entries with an alignment score above this value are
        removed from the alignment
    """
    # A shared mutable default would be visible across calls; build a fresh one.
    if new_seq_map is None:
        new_seq_map = {}

    def keyword_relevant(check_item: str, keymap: List[str]) -> bool:
        # True when no keywords are given or any keyword occurs in the item.
        if keymap is None:
            return True
        lowered_item = check_item.lower()
        return any(key.lower() in lowered_item for key in keymap)

    def check_taxonomy(tax_id: Optional[int]) -> bool:
        # True when no taxonomy filter is set or the tax id passes it.
        if top_taxonomy is None:
            return True
        elif tax_id is None:
            return False
        return result_dive.check_ancestor(top_taxonomy, tax_id)

    def check_c_or_u(named_tree) -> Tuple[bool, bool]:
        # Align against the C and U variants of the reference tree.
        _, c_score = shapiro_tree_aligner.align_trees(named_tree, c_tree)
        _, u_score = shapiro_tree_aligner.align_trees(named_tree, u_tree)
        return c_score < 1000, u_score < 1000

    # Reference trees share one structure; their sequences differ in a single
    # position (Y, C or U).
    reference_structure = ".....((((((((...(.(((((.......))))).)........((((((.......))))))..)))))))).....",
    reference_structure = reference_structure[0]
    tree = shapiro_tree_aligner.get_tree(
        reference_structure,
        "NNNNNNNNNNNNNUNNNNNNNNNNNNNNNNNNNNNNNNUNNNUNNNNNNNNNNNNNNNNNNNNNNYNNNNNNNNNNNNN"
    )
    c_tree = shapiro_tree_aligner.get_tree(
        reference_structure,
        "NNNNNNNNNNNNNUNNNNNNNNNNNNNNNNNNNNNNNNUNNNUNNNNNNNNNNNNNNNNNNNNNNCNNNNNNNNNNNNN"
    )
    u_tree = shapiro_tree_aligner.get_tree(
        reference_structure,
        "NNNNNNNNNNNNNUNNNNNNNNNNNNNNNNNNNNNNNNUNNNUNNNNNNNNNNNNNNNNNNNNNNUNNNNNNNNNNNNN"
    )
    gather_results = {}
    full_results = {}
    with open(os.path.join(folder_path, "FINAL_all_ext"), 'r') as input_all:
        # skip the first line of the table
        input_all.readline()
        for line in input_all:
            if line.strip() == '':
                continue
            parts = line.strip().split('\t')
            try:
                tax_id = int(parts[11])
            except ValueError:
                tax_id = None
            if parts[0].strip() == design_code and keyword_relevant(parts[15], desc_keywords) and\
                    keyword_relevant(parts[1], identifier_keywords) and check_taxonomy(tax_id):
                read_sequence = new_seq_map.get(parts[1], parts[4])
                gather_results[parts[1]] = read_sequence
                full_results[parts[1]] = {
                    'sequence': read_sequence,
                    'rfam_eval': parts[7],
                    'tax_id': parts[11],
                    'gene_id': parts[13],
                    'gene_loc': parts[14],
                    'gene_desc': parts[15]
                }
    if len(gather_results) == 0:
        print("No results from design {} match keywords {}".format(
            design_code, desc_keywords))
        exit(0)
    # Bound before the try so the cleanup below never sees an unbound name.
    cm_file_name = None
    sto_file_name = None
    try:
        # The temporary files are closed right away; only their names are
        # used as output paths for the tools called below.
        cm = NamedTemporaryFile('w', suffix=".cm")
        cm_file_name = cm.name
        cm.close()
        sto = NamedTemporaryFile('w', suffix=".sto")
        sto_file_name = sto.name
        sto.close()
        real_cm = os.path.join(folder_path, "{}.cm".format(test_name))
        if not os.path.exists(real_cm):
            if not infernal.generate_single_seq_cm(sequence, cm_file_name,
                                                   structure):
                print(
                    "Failed to generate single sequence-structure cm file for {}"
                    .format(design_code))
                exit(-1)
            shutil.copy(cm_file_name, real_cm)
        # real_cm already contains folder_path; joining it again produced a
        # wrong path whenever folder_path was relative.
        if not infernal.align_sequences(gather_results, real_cm,
                                        sto_file_name):
            print("Failed to generate alignment for {}".format(design_code))
            exit(-2)
        removed_names = set()
        cm_map = read_sto(sto_file_name)
        with open(os.path.join(folder_path, "{}.txt".format(test_name)),
                  'w') as tbl_out:
            tbl_out.write(
                'name\tscore\ttax id\trfam evalue\tgene id\tgene location\tgene description\tcm sequence\t'
                'cm structure\toriginal sequence\tguanine_bind\tadenine_bind\n'
            )
            for name, (cmsequence, cmstructure) in cm_map.items():
                named_tree = shapiro_tree_aligner.get_tree(
                    cmstructure, cmsequence)
                _, score = shapiro_tree_aligner.align_trees(named_tree, tree)
                if score > score_cutoff:
                    removed_names.add(name)
                    print("Removing {} score {}\n{}\n{}".format(
                        name, score, cmsequence, cmstructure))
                else:
                    # The C/U check is only needed for entries that are kept.
                    is_c, is_u = check_c_or_u(named_tree)
                    res_item = full_results.get(name)
                    tbl_out.write(
                        "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".
                        format(name, score, res_item.get('tax_id'),
                               res_item.get('rfam_eval'),
                               res_item.get('gene_id'),
                               res_item.get('gene_loc'),
                               res_item.get('gene_desc'), cmsequence,
                               cmstructure, res_item.get('sequence'), is_c,
                               is_u))
        clean_sto(removed_names, sto_file_name)
        real_sto = os.path.join(folder_path, "{}.sto".format(test_name))
        shutil.copy(sto_file_name, real_sto)
        dive_statistics.generate_r2r(folder_path, real_sto, force=True)
    finally:
        # Best-effort cleanup: a temporary file may never have been created.
        for temp_name in (cm_file_name, sto_file_name):
            if temp_name is None:
                continue
            try:
                os.remove(temp_name)
            except OSError:
                pass
def check_c_or_u(named_tree) -> Tuple[bool, bool]:
    """Report whether ``named_tree`` aligns to ``c_tree`` / ``u_tree`` below 1000.

    :param named_tree: tree to align against both reference trees
    :return: ``(c_ok, u_ok)``; each flag is True when the corresponding
             alignment score is lower than 1000
    """
    _, c_score = shapiro_tree_aligner.align_trees(named_tree, c_tree)
    _, u_score = shapiro_tree_aligner.align_trees(named_tree, u_tree)
    return c_score < 1000, u_score < 1000
#!/usr/bin/env python3
'''
Gather good indexes from sequence / structure
Usage: gather_index.py source_sequence source_structure target_sequence target_structure
'''
import sys

from rnafbinv import shapiro_tree_aligner


def _unquote(argument):
    # Drop surrounding single quotes, then surrounding double quotes.
    return argument.strip("'").strip('"')


if len(sys.argv) < 5:
    print(
        "Usage: gather_index.py <source_sequence> <source_structure> <target_sequence> <target_structure>"
    )
    # sys.exit is always available; the bare exit() helper is only injected
    # by the site module.
    sys.exit(-1)

source_sequence = _unquote(sys.argv[1])
source_structure = _unquote(sys.argv[2])
target_sequence = _unquote(sys.argv[3])
target_structure = _unquote(sys.argv[4])

# Build both trees, align source to target and print the index lists
# returned for the aligned tree.
source_tree = shapiro_tree_aligner.get_tree(source_structure, source_sequence)
target_tree = shapiro_tree_aligner.get_tree(target_structure, target_sequence)
aligned_tree, score = shapiro_tree_aligner.align_trees(source_tree,
                                                       target_tree)
matched, unmatched = shapiro_tree_aligner.get_matching_indexes(aligned_tree)
print(matched)
print(unmatched)
def get_max_score(seq, struct):
    """Build the tree for ``struct``/``seq`` and score it against itself.

    :param seq: sequence of the structure
    :param struct: structure to build the tree from
    :return: tuple ``(tree, score)`` where ``score`` is the result of
             aligning the tree with itself
    """
    own_tree = shapiro_tree_aligner.get_tree(struct, seq)
    _, self_score = shapiro_tree_aligner.align_trees(own_tree, own_tree)
    return own_tree, self_score
def simulated_annealing(options: Dict[str, Any]):
    """Search for a sequence matching the target described in ``options``.

    Starting from a sequence seeded through ``vienna.inverse``, each
    iteration proposes up to ``options['look_ahead']`` mutated sequences and
    moves to the first one accepted by ``acceptance_probability``. The
    sequence with the lowest score seen is returned; the search stops early
    once a score of 0 is reached.

    Keys read from ``options``: 'logger', 'reduced_bi', 'rng', 'iter',
    'look_ahead', 'starting_sequence', 'random', 'target_structure',
    'target_sequence', 'motifs', 'stop' and 'updater'. The key
    'alignment_rules' is written by this function.

    :param options: run options (see above)
    :return: the best sequence found, or None when ``options`` is empty, the
             motif list does not match the target structure, or the 'stop'
             option is set during the search
    """
    if len(options) == 0:
        # An empty dict cannot hold a 'logger' entry, so the original
        # options.get('logger').fatal(...) raised AttributeError here.
        logging.critical("Options object was not properly initiated. ")
        return None
    logger = options.get('logger')
    options['alignment_rules'] = tree_aligner.AlignmentRules(
        delete_func=lambda tree_value, is_target: shapiro_tree_aligner.
        delete_shapiro_func(tree_value, is_target, options['reduced_bi']),
        cmp_func=shapiro_tree_aligner.cmp_shapiro_tree_values,
        merge_func=shapiro_tree_aligner.merge_shapiro_tree_values,
        minmax_func=min)
    # init rng
    rng_seed = options.get('rng')
    if rng_seed is not None:
        random.seed(rng_seed)
    # init loop variables
    no_iterations = options.get('iter')
    no_lookahead = options.get('look_ahead')
    # init initial sequence
    current_sequence = options.get('starting_sequence')
    if current_sequence is None:
        if options.get('random'):
            current_sequence = generate_random_start(
                len(options['target_structure']),
                options['target_sequence'].replace('T', 'U'))
    else:
        current_sequence = current_sequence.replace('T', 'U')
    # Vienna starts the process
    vienna_sequence = vienna.inverse(
        options['target_structure'],
        vienna.inverse_seq_ready(options['target_sequence'],
                                 current_sequence))
    # vienna might fail initiation + removing any wildcard left
    if vienna_sequence is None or vienna_sequence == '':
        if options.get('starting_sequence') is None:
            current_sequence = generate_random_start(
                len(options['target_structure']),
                options['target_sequence'].replace('T', 'U'))
        else:
            current_sequence = generate_random_start(
                len(options['target_structure']),
                options.get('starting_sequence').replace('T', 'U'))
    else:
        current_sequence = generate_random_start(
            len(options['target_structure']),
            vienna_sequence.upper().replace('T', 'U'))
    final_result = current_sequence
    # setup target tree and get initial sequence score (and max score)
    target_tree = shapiro_tree_aligner.get_tree(options['target_structure'],
                                                options['target_sequence'])
    if not merge_motifs(target_tree, options.get('motifs')):
        shapiro_str = shapiro_generator.get_shapiro(
            options['target_structure']).shapiro
        logging.error(
            'Motif list does not match target structure {}\nTarget Shapiro:{}'.
            format(options.get('motifs'), shapiro_str))
        return None
    # Self-alignment of the target; the score is not used further below.
    _, optimal_score = shapiro_tree_aligner.align_trees(
        target_tree, target_tree, options['alignment_rules'])
    match_tree, current_score = score_sequence(current_sequence, target_tree,
                                               options)
    best_score = current_score
    logger.info('Initial sequence ({}): {}\nAlign tree: {}'.format(
        current_score, current_sequence, match_tree))
    updater = options.get('updater')
    # main loop
    for iteration in range(0, no_iterations):
        # 'stop' is re-read on every pass so the search can be aborted.
        if options.get('stop') is not None:
            return None
        if best_score == 0:
            break
        progress = False
        for _ in range(0, no_lookahead):
            if options.get('stop') is not None:
                return None
            new_sequence = mutator.perturbate(current_sequence, match_tree,
                                              options)
            new_tree, new_score = score_sequence(new_sequence, target_tree,
                                                 options)
            temperature = calc_temp(iteration, no_iterations)
            probability = acceptance_probability(current_score, new_score,
                                                 temperature,
                                                 len(current_sequence))
            logger.debug("iteration {} - TEMP: {} PROBABILITY: {}".format(
                iteration + 1, temperature, probability))
            # Accept the candidate with the computed probability.
            if random.random() < probability:
                progress = True
                break
        if progress:
            current_sequence = new_sequence
            current_score = new_score
            match_tree = new_tree
            # Ties replace the stored result with the most recent sequence.
            if current_score <= best_score:
                best_score = current_score
                final_result = current_sequence
        logger.debug(
            'Iteration {} current sequence ({}): {}\nAlign tree: {}'.format(
                iteration + 1, current_score, current_sequence, match_tree))
        if updater is not None:
            updater.update(iteration + 1)
    # final print
    return final_result