def search_cm(target_seq: str, target_ident: str) -> Tuple[bool, Optional[str]]:
    """Search one sequence against the covariance model at ``cm_purine_path``.

    The sequence is handed to ``infernal.generate_fasta`` (keyed by
    ``target_ident``) to obtain a temporary FASTA file, which is searched with
    ``infernal.search_cm`` using ``inc_e=10``. The FASTA file is removed again
    whether or not the search raised.

    :param target_seq: sequence to search with
    :param target_ident: key under which the sequence is written to the FASTA
    :return: ``(True, E-value of the first hit)`` when the search produced at
        least one hit, ``(False, None)`` otherwise
    """
    fasta_handle = None
    try:
        fasta_handle = infernal.generate_fasta({target_ident: target_seq})
        hits = infernal.search_cm(cm_purine_path, fasta_handle.name, inc_e=10)
    finally:
        # Clean up the temporary FASTA database on every exit path.
        if fasta_handle is not None:
            if os.path.exists(fasta_handle.name):
                os.remove(fasta_handle.name)

    # A None result and an empty result both count as "nothing found".
    if not hits:
        return False, None
    return True, hits[0]['E-value']
def is_novel(sequence, cm_file):
    """Report whether ``sequence`` produces no hit against a covariance model.

    The sequence is written to a temporary FASTA file in the current
    directory (record header ``> seq``, 80 characters per line), searched with
    ``infernal.search_cm`` in ``TBLOUT`` result mode, and the file is removed
    afterwards on every exit path.

    :param sequence: sequence to test (iterated character by character)
    :param cm_file: path of the covariance model to search with
    :return: ``False`` when the search returned at least one hit; ``True``
        when it returned no hits or ``None``
    """
    fasta_file = None
    try:
        # NTF is presumably tempfile.NamedTemporaryFile - confirm at the import.
        fasta_file = NTF(dir='.', mode='w', delete=False)
        try:
            fasta_file.write('> seq\n')
            # Wrap the sequence at 80 characters per FASTA line.
            for chunk in map(
                    ''.join,
                    itertools.zip_longest(*[iter(sequence)] * 80,
                                          fillvalue='')):
                fasta_file.write('{}\n'.format(chunk))
        finally:
            # Close the handle even when a write fails, so the cleanup below
            # never tries to delete a file that is still open.
            fasta_file.close()
        cm_res = infernal.search_cm(cm_file, fasta_file.name,
                                    res_type=infernal.ResType.TBLOUT)
        return cm_res is None or len(cm_res) == 0
    finally:
        if fasta_file is not None and os.path.exists(fasta_file.name):
            os.remove(fasta_file.name)
def cm_search(sequence, seq_code, fasta_list=None):
    """Build a single-sequence covariance model and search FASTA databases.

    A model is generated at ``<output_dir>/<seq_code>.cm`` and searched against
    every database in turn. Each hit is tagged with the database it came from
    under the ``'file'`` key and registered through ``add_search_run``.

    :param sequence: sequence the covariance model is built from
    :param seq_code: code used for the model file name and the recorded runs
    :param fasta_list: databases to search; when ``None`` the list comes from
        ``gather_fasta_dbs()``
    :return: list of all hits from all databases, in search order
    """
    # get fasta files
    fasta_dbs = gather_fasta_dbs() if fasta_list is None else fasta_list

    # cm build \ calibrate
    # NOTE(review): the result of generate_single_seq_cm is not checked here,
    # although other callers in this file treat a falsy result as a failed
    # build - confirm whether a failure should abort the search.
    cm_path = os.path.join(output_dir, "{}.cm".format(seq_code))
    infernal.generate_single_seq_cm(sequence, cm_path)

    # search fasta files
    results = []
    for fasta_file in fasta_dbs:
        single_fasta_res = infernal.search_cm(cm_path, fasta_file)
        if single_fasta_res is None:
            # Other callers in this file treat None as a failed search; skip
            # this database instead of raising TypeError and losing the hits
            # already collected.
            continue
        for res in single_fasta_res:
            res['file'] = fasta_file
        results += single_fasta_res

    for res in results:
        add_search_run(sequence, res['sequence'], seq_code, 'cm', res['file'])
    return results
def run_search(run_code: str, designed_object):
    """Build a covariance model for a design, search ``NT_PATH`` and log hits.

    The model is written to ``<output_dir>/<run_code>.cm``. Every search hit
    is folded with ``vienna.fold``, converted to a tree and aligned against
    the module-level ``target_tree``. Hits whose score passes the filter are
    written to ``result_logger`` as one tab-separated row; all other hits are
    reported as warnings on ``general_run_logger``.

    A failure on one hit is logged and does not stop processing of the
    remaining hits.

    :param run_code: code identifying this run (used for the model file name
        and in every log line)
    :param designed_object: object exposing ``sequence`` and ``structure``
    :return: ``None``; the function returns early (after logging an error)
        when the model cannot be built or the search returns ``None``
    """
    general_run_logger.info('Starting search {}'.format(run_code))
    cm_path = os.path.join(output_dir, '{}.cm'.format(run_code))
    if not infernal.generate_single_seq_cm(designed_object.sequence, cm_path,
                                           designed_object.structure):
        general_run_logger.error(
            'Failed to build covariance model. run code: {}\n{}\n{}'.format(
                run_code, designed_object.sequence,
                designed_object.structure))
        return

    results = infernal.search_cm(cm_path, NT_PATH)
    if results is None:
        general_run_logger.error('Search failed {} {}\n{}'.format(
            run_code, cm_path, designed_object.sequence))
        return
    general_run_logger.info('Finished search {}, {} results'.format(
        run_code, len(results)))

    for res_no, res in enumerate(results):
        try:
            sequence = res.get('sequence')
            structure = vienna.fold(sequence)['MFE']
            res_tree = shapiro_tree_aligner.get_tree(structure, sequence)
            tree, score = shapiro_tree_aligner.align_trees(
                res_tree, target_tree)
            # NOTE(review): the meaning of the 300 / 100 / 30 thresholds is
            # not visible here - confirm against shapiro_tree_aligner.
            if score < 300 and score % 100 < 30:
                general_run_logger.info(
                    'Adding result {}, score {} sequence {}'.format(
                        run_code, score, sequence))
                # Columns:
                # seq code, match no, sequence, structure, score, target id
                result_logger.info('{}\t{}\t{}\t{}\t{}\t{}'.format(
                    run_code, res_no, sequence, structure, score,
                    res.get('identifier')))
            else:
                general_run_logger.warning(
                    'Score too low {} result no {}, score {}, sequence: {}'.
                    format(run_code, res_no, score, sequence))
        except Exception:
            # Boundary handler: one bad hit must not abort the whole run.
            # exc_info keeps the traceback that would otherwise be lost.
            general_run_logger.critical(
                'Exception in search {}, res no {}, {}'.format(
                    run_code, res_no, res),
                exc_info=True)
def _iter_table_rows(table_path):
    """Yield the tab-separated fields of every non-blank data row.

    The first line of the file is treated as a header and skipped.
    """
    with open(table_path, 'r') as table:
        table.readline()
        for line in table:
            stripped = line.strip()
            if stripped == '':
                continue
            yield stripped.split('\t')


def recreate_cm(folder_path: str):
    """Rebuild covariance models for every design listed in ``folder_path``.

    Inputs read from ``folder_path``:

    * ``FINAL_summary`` - column 0 is the design code, column 5 its sequence.
    * ``FINAL_all`` - column 0 is the design code, column 1 a result key and
      column 4 the value stored for it (passed to ``infernal.generate_fasta``).

    For each design without an existing ``<code>.cm`` the function builds a
    single-sequence model and a single-sequence ``<code>.sto`` alignment. It
    then repeatedly searches the design's results with the current model,
    aligns hits that are not yet in the alignment and regenerates the model
    from the grown alignment. The loop ends when the number of hits reaches
    the number of results, when no new hit is found, or when the alignment
    stops changing.

    Any failed Infernal step prints a message and terminates the process with
    ``exit(-1)``.

    :param folder_path: directory holding the input tables; the ``.cm`` and
        ``.sto`` files are written there too
    """
    # read inputs
    gather_designs = {}
    for parts in _iter_table_rows(os.path.join(folder_path, "FINAL_summary")):
        gather_designs[parts[0]] = parts[5]
    gather_results = {}
    for parts in _iter_table_rows(os.path.join(folder_path, "FINAL_all")):
        gather_results.setdefault(parts[0], {})[parts[1]] = parts[4]

    # start calculations
    # NOTE(review): the LiveRNAfold instance is started but never shut down
    # in this function - confirm whether it needs an explicit close.
    folder = vienna.LiveRNAfold()
    folder.start()
    for design_code, sequence in gather_designs.items():
        cm_path = os.path.join(folder_path, "{}.cm".format(design_code))
        sto_path = os.path.join(folder_path, "{}.sto".format(design_code))
        if os.path.exists(cm_path):
            continue
        # Fold only designs that actually need a model.
        structure = folder.fold(sequence)['MFE']
        temp_cm_path = "{}_tmp".format(cm_path)
        temp_sto_path = "{}_tmp".format(sto_path)
        if not infernal.generate_single_seq_cm(sequence, cm_path, structure):
            print("Could not generate single cm for {}".format(design_code))
            exit(-1)
        if not infernal.align_sequences({'{}'.format(design_code): sequence},
                                        cm_path, sto_path):
            print("Could not generate single sto for {}".format(design_code))
            exit(-1)

        design_results = gather_results.get(design_code)
        if not design_results:
            # No rows for this design in FINAL_all: nothing to refine with,
            # keep the single-sequence model.
            continue

        no_found = 0
        temp_fasta = infernal.generate_fasta(design_results)
        try:
            while no_found < len(design_results):
                results = infernal.search_cm(cm_path, temp_fasta.name,
                                             inc_e=10.0)
                sto_target = get_sto_targets(sto_path)
                # Hits that are not part of the current alignment yet.
                sto_parts = {}
                for item in results:
                    if item['target name'] not in sto_target:
                        sto_parts[item['target name']] = item['sequence']
                if len(sto_parts) == 0:
                    print(
                        "ERROR: no new sequences found for {} maxed at {} sequences out of {} original\nListing: {}"
                        .format(design_code, len(sto_target),
                                len(design_results), [
                                    res for res in design_results
                                    if res not in sto_target
                                ]))
                    break
                if not infernal.align_sequences(
                        sto_parts, cm_path, temp_sto_path,
                        in_align_path=sto_path):
                    print("Could not generate sto for {}".format(design_code))
                    exit(-1)
                if filecmp.cmp(sto_path, temp_sto_path, shallow=False):
                    # Alignment did not change: another round cannot help.
                    print("ERROR: {} missing codes: {}".format(
                        design_code, [
                            res for res in design_results
                            if res not in sto_target
                        ]))
                    shutil.move(temp_sto_path, sto_path)
                    break
                shutil.move(temp_sto_path, sto_path)
                if not infernal.generate_cm(sto_path, temp_cm_path):
                    print("Could not generate cm for {}".format(design_code))
                    exit(-1)
                shutil.move(temp_cm_path, cm_path)
                no_found = len(results)
        finally:
            # Also runs on the exit(-1) paths, so the temp FASTA never leaks.
            if os.path.exists(temp_fasta.name):
                os.remove(temp_fasta.name)
def dive_single(group_id: str,
                single_design_group: DesignGroup,
                cm_dir: str,
                seq_db_path: str,
                target_tree,
                filter_align_score: float = 250,
                filter_evalue: float = 10.0,
                cpus: int = 12) -> Tuple[DesignGroup, int, Dict[int, List[str]]]:
    """Iteratively rebuild a covariance model and search until no new match.

    A working copy ``TEMP_<group_id>.cm`` of the base model in ``cm_dir`` is
    used; the base model is generated first when it does not exist. Each round
    aligns the group's sequences to the working model, regenerates the model
    from that alignment, searches ``seq_db_path`` and keeps hits whose E-value
    is below ``filter_evalue`` and whose alignment score is below
    ``filter_align_score``. Rounds repeat while a round yields a match code
    that was absent from the previous round's group.

    When the loop ends the working model and the alignment are renamed to
    ``FINAL_<group_id>.cm`` and ``FINAL_<group_id>.sto``.

    :param group_id: name stem for the model and alignment files
    :param single_design_group: starting design group
    :param cm_dir: directory holding the model and alignment files
    :param seq_db_path: sequence database to search
    :param target_tree: forwarded to ``get_align_score``
    :param filter_align_score: exclusive upper bound on the alignment score
    :param filter_evalue: exclusive upper bound on the hit E-value
    :param cpus: forwarded to the Infernal calls
    :return: ``(last round's design group, number of rounds, mapping of round
        number to that round's ``matches.keys()``)``; round 0 holds the keys
        of the starting group
    """
    base_cm_name = '{}.cm'.format(group_id)
    base_cm_path = os.path.join(cm_dir, base_cm_name)
    if not os.path.exists(base_cm_path):
        infernal.generate_single_seq_cm(
            single_design_group.sequence,
            base_cm_path,
            structure=single_design_group.structure,
            cpus=cpus)

    # Work on a TEMP copy so the base model is left untouched.
    cm_name = 'TEMP_{}'.format(base_cm_name)
    cm_path = os.path.join(cm_dir, cm_name)
    shutil.copyfile(base_cm_path, cm_path)
    stockholm_file = os.path.join(cm_dir, '{}.sto'.format(group_id))

    design_group_identifies = {
        'sequence': single_design_group.sequence,
        'structure': single_design_group.structure
    }
    design_copy = copy(single_design_group)
    items_in_round = {0: design_copy.matches.keys()}

    count = 0
    found_new = True
    while found_new:
        count += 1
        found_new = False

        # rebuild cm (align to old, delete and create new)
        # NOTE(review): the sequence list is taken from the input group's
        # matches every round, not from the matches found in earlier rounds -
        # confirm this is intended.
        full_list = {
            identifier: match.get('sequence')
            for identifier, match in single_design_group.matches.items()
        }
        full_list[single_design_group.identifier] = \
            single_design_group.sequence
        # NOTE(review): the results of align_sequences / generate_cm are not
        # checked; other callers in this file treat a falsy result as failure.
        infernal.align_sequences(full_list, cm_path, stockholm_file)
        os.remove(cm_path)
        infernal.generate_cm(stockholm_file, cm_path, cpus=cpus)

        # search on cm
        search_res = infernal.search_cm(cm_path, seq_db_path, cpus=cpus)

        # identify items (see different matches) and compare size of match
        # group
        new_design_group = DesignGroup(single_design_group.identifier,
                                       design_group_identifies)
        for single_match in search_res:
            code = single_match.get('identifier')
            seq = single_match.get('sequence')
            align_score = get_align_score(code, seq, cm_path, target_tree)
            accepted = (float(single_match.get('E-value')) < filter_evalue
                        and align_score < filter_align_score)
            if not accepted:
                continue
            old_res = design_copy.matches.get(code)
            if old_res is not None:
                # Seen in the previous round: keep its original round number.
                single_match['round'] = old_res['round']
            else:
                single_match['round'] = count
                found_new = True
            new_design_group.add_match(code, single_match)

        design_copy = new_design_group
        items_in_round[count] = design_copy.matches.keys()

    # organize cm
    shutil.move(cm_path,
                os.path.join(cm_dir, 'FINAL_{}'.format(base_cm_name)))
    shutil.move(stockholm_file,
                os.path.join(cm_dir, 'FINAL_{}.sto'.format(group_id)))
    return design_copy, count, items_in_round