def refold_stockholm(stockholm_alig, consensus_structure):
    """
    Run refold (refold.pl from the Vienna RNA package) for a stockholm alignment.

    The alignment is written in clustal format and a mock alifold output
    carrying ``consensus_structure`` is written next to it; both are temporary
    files created in ``CONFIG.tmpdir``. The two files are handed to
    ``compute_refold`` and its output is parsed with
    ``BA_support.parse_seq_str``. All three files are then passed to
    ``BA_support.remove_files_with_try``.

    :param stockholm_alig: alignment object providing ``write_clustal(handle)``
    :param consensus_structure: consensus structure, one character per alignment column
    :return: list of records parsed from the refold output
    """
    ml.debug(fname())

    # alignment in clustal format
    clustal_fd, clustal_path = mkstemp(prefix='rba_', suffix='_23', dir=CONFIG.tmpdir)
    with os.fdopen(clustal_fd, 'w') as clustal_handle:
        stockholm_alig.write_clustal(clustal_handle)

    # mock alifold output: only the structure line matters, the consensus
    # sequence line is a placeholder of matching length
    alifold_fd, alifold_path = mkstemp(prefix='rba_', suffix='_24', dir=CONFIG.tmpdir)
    with os.fdopen(alifold_fd, 'w') as alifold_handle:
        placeholder_sequence = 'A' * len(consensus_structure)
        alifold_handle.write(placeholder_sequence + '\n')
        alifold_handle.write(consensus_structure + '\n')

    refold_output_path = compute_refold(clustal_path, alifold_path)

    # parse while the handle is still open, the parser reads from it lazily or eagerly
    with open(refold_output_path, 'r') as refold_handle:
        records = list(BA_support.parse_seq_str(refold_handle))

    BA_support.remove_files_with_try([clustal_path, alifold_path, refold_output_path])

    return records
def alifold_refold_prediction(nr_homologs_hits_fasta, all_hits_fasta, refold='refold', threads=None,
                              params=None, msa_alg='clustalo'):
    """
    Return predicted structures for all hits based on provided sequence homologs.

    Workflow:
      1. align the (non-redundant) homologs and compute their alifold consensus
      2. profile-align all hits to that alignment
      3. map the consensus sequence/structure onto the columns of the new alignment
      4. derive per-sequence structures with the selected ``refold`` procedure

    ! beware, clustal mixes order of sequences in profile alignment, correct for it

    Keys of ``params`` read directly by this function: "alifold",
    "clustalo_profile", "RNAfold", "repred_unpaired_tr", "conseq_conserved".
    The whole dict is also handed to ``_aligner_block`` (the original notes
    list "clustal" as a possible key).

    :param nr_homologs_hits_fasta: fasta input with the homologous sequences used to build the consensus
    :param all_hits_fasta: fasta input with all hits for which a structure is predicted
    :param refold: one of 'refold', 'refold_rnafoldc', 'conserved_ss_rnafoldc'
    :param threads: if truthy, passed to the profile alignment as ``--threads`` and to the aligner
    :param params: dict with additional parameters for the called tools (see above), or None
    :param msa_alg: alignment algorithm selector passed to ``_aligner_block``
    :return: predicted structures with the original (de-sanitized) sequence names
    :raises ValueError: when ``refold`` is not a recognized procedure
    """
    # Validate the requested procedure before anything else happens, so that an
    # invalid call does not leave sanitized fasta files behind.
    ref_pred = ['refold', 'refold_rnafoldc', 'conserved_ss_rnafoldc']
    if refold not in ref_pred:
        raise ValueError(
            'refold procedure not recognized: {}, possible values are {}'.format(
                refold, ' '.join(ref_pred)))

    ml.debug(fname())

    if params is None:
        params = dict()

    # both inputs share one sanitization dictionary, it is needed to restore names at the end
    nr_path, san_dict = sanitize_fasta_file(nr_homologs_hits_fasta)
    all_path, san_dict = sanitize_fasta_file(all_hits_fasta, used_dict=san_dict)

    cl_file = _aligner_block(nr_path, params, msa_alg, threads)

    # cannot rely on that, the order of a cl_file would be the same as the order of the nr_homolog_hits_file
    ali_file = compute_alifold(cl_file, alifold_params=params.get('alifold', ''))

    consensus_record = read_seq_str(ali_file)[0]

    clustalo_profile_params = '--outfmt clustal '
    clustalo_profile_params += params.get('clustalo_profile', '')
    if threads:
        clustalo_profile_params += ' --threads {}'.format(threads)

    realign_file = run_clustal_profile2seqs_align(
        cl_file, all_path, clustalo_params=clustalo_profile_params)
    realign_alig = AlignIO.read(realign_file, format='clustal')

    # slice alignment ( get seqname from nr_homolog_hits_file, find it in the realign and slice the whole segment off
    #  take care that the id may be the same and it must be checked for multiple occurence
    first_nr_record = _parse_first_record_only(nr_path)

    realign_allseq_possition = [
        i for i, seq in enumerate(realign_alig) if seq.id == first_nr_record.id
    ]

    # the last occurrence of the id marks the border between the newly aligned
    # sequences and the original profile
    profile_start = realign_allseq_possition[-1]
    new_alig_for_refold = realign_alig[:profile_start]
    old_alig_in_new = realign_alig[profile_start:]

    orig_alignment = AlignIO.read(cl_file, format='clustal')
    first_original_alignment_record = orig_alignment[0]

    match_original_seq_in_new_alig = [
        i for i in old_alig_in_new if i.id == first_original_alignment_record.id
    ][0]

    mapping = _map_alignment_columns_from_profile_match(
        first_original_alignment_record, match_original_seq_in_new_alig)

    new_alig_length = len(match_original_seq_in_new_alig.seq)

    # map and repair structure when mapping is unambiguous
    cs_encode = encode_structure_unicode(
        consensus_record.letter_annotations['ss0'])
    # NOTE(review): gap_char=49 is the code used for gap columns in the encoded
    # structure; presumably tied to the alphabet of encode_structure_unicode - confirm
    new_consensus_structure_encoded = _repair_consensus_structure_by_maping(
        cs_encode, mapping, new_alig_length, gap_char=49)
    new_consensus_structure_repaired = repair_structure_any_variant(
        new_consensus_structure_encoded)
    new_consensus_structure = decode_structure_unicode(
        new_consensus_structure_repaired)

    new_consensus_sequence = _repair_consensus_structure_by_maping(
        str(consensus_record.seq), mapping, new_alig_length, gap_char=ord('_'))

    # write new consensus to a file
    a_fd, new_alifold_consensus_file = mkstemp(prefix='rba_', suffix='_33', dir=CONFIG.tmpdir)
    with os.fdopen(a_fd, 'w') as f:
        f.write(new_consensus_sequence + '\n')
        f.write(new_consensus_structure + '\n')

    # write sliced alignment to a file
    sa_fd, sliced_alignment_file = mkstemp(prefix='rba_', suffix='_34', dir=CONFIG.tmpdir)
    with os.fdopen(sa_fd, 'w') as f:
        AlignIO.write(new_alig_for_refold, f, 'clustal')

    # now process the file, and map alignment to consensus structure
    if refold in ['refold', 'refold_rnafoldc']:
        refold_file = compute_refold(sliced_alignment_file, new_alifold_consensus_file)

        if refold == 'refold_rnafoldc':
            # refold output is used as constraints, so RNAfold must run with -C
            rnafold_parameters = params.get('RNAfold', '')
            if '-C' not in rnafold_parameters:
                rnafold_parameters += ' -C'
            seq_str = rnafold_prediction(refold_file, params=rnafold_parameters)
        else:
            seq_str = read_seq_str(refold_file)

        # the refold output has been consumed by either branch
        remove_one_file_with_try(refold_file)
    else:
        st_alig_file = build_stockholm_from_clustal_alig(
            sliced_alignment_file, new_alifold_consensus_file)
        repred_tr = str(params.get('repred_unpaired_tr', '9'))
        conseq_conserved = params.get('conseq_conserved', 1)
        seq_str = _refold_with_unpaired_conservation(
            st_alig_file, repred_tr=repred_tr, conseq_conserved=conseq_conserved)
        remove_one_file_with_try(st_alig_file)

    structures_out = desanitize_fasta_names_in_seqrec_list(seq_str, san_dict)

    remove_files_with_try([
        nr_path, all_path, sliced_alignment_file, new_alifold_consensus_file,
        cl_file, ali_file, realign_file
    ])

    return structures_out
def cmmodel_rnafold_c(allhits_fasta, cmmodel_file, threads=None, params=None, timeout=None):
    """
    Predict structures for all hits by aligning them to a covariance model.

    The hits are aligned with cmalign, the alignment together with its 'RF'
    and 'SS_cons' column annotations is passed through refold, and the refold
    output is given to RNAfold, which is always run with ``-C``.

    :param allhits_fasta: fasta input with the sequences to predict
    :param cmmodel_file: covariance model file passed to cmalign
    :param threads: if truthy, passed to cmalign as ``--cpu``
    :param params: dict of extra parameters; keys 'cmalign' and 'RNAfold' are read here, or None
    :param timeout: forwarded to cmalign, refold and RNAfold calls
    :return: predicted structures with the original (de-sanitized) sequence names
    """
    ml.debug(fname())
    if params is None:
        params = dict()

    sanitized_fasta, san_dict = sanitize_fasta_file(allhits_fasta)

    # cmalign command line options; --notrunc is always enforced
    cmalign_params = '--cpu {}'.format(threads) if threads else ''
    extra_cmalign = params.get('cmalign')
    if extra_cmalign:
        cmalign_params += ' ' + extra_cmalign
    if '--notrunc' not in cmalign_params:
        cmalign_params += ' --notrunc'

    # RNAfold options; -C is appended when the user supplied options without it
    rnafold_params = params.get('RNAfold', '-C')
    assert isinstance(rnafold_params, str), "Incorrect parameters for RNAfold -C"
    if '-C' not in rnafold_params:
        rnafold_params += ' -C'

    # multiple sequence alignment to the covariance model
    cmalign_output = run_cmalign_on_fasta(
        sanitized_fasta,
        cmmodel_file,
        cmalign_params=cmalign_params,
        timeout=timeout,
    )
    cm_alig = read_st(cmalign_output)

    # inputs of this stage are not needed once the alignment is loaded
    remove_files_with_try([sanitized_fasta, cmalign_output])

    # ===== use refold.pl directly ====
    upper_alig = cm_alig.to_upper()
    consensus_fd, temp_mock_consensus = mkstemp(prefix='rba_', suffix='_41', dir=CONFIG.tmpdir)
    clustal_fd, temp_clustal_aln = mkstemp(prefix='rba_', suffix='_42', dir=CONFIG.tmpdir)
    with os.fdopen(clustal_fd, 'w') as clustal_out, os.fdopen(consensus_fd, 'w') as consensus_out:
        upper_alig.write_clustal(clustal_out)

        # mock consensus: reference line with every non-nucleotide character
        # replaced by '_', followed by the converted consensus structure
        reference_line = re.sub(
            '[^ACTGU]', '_', upper_alig.column_annotations['RF'], flags=re.IGNORECASE)
        structure_line = cm_strucutre2br(upper_alig.column_annotations['SS_cons'])
        consensus_out.write('{}\n{}\n'.format(reference_line, structure_line))

    temp_constraint_file = compute_refold(temp_clustal_aln, temp_mock_consensus, timeout=timeout)

    predicted = rnafold_prediction(temp_constraint_file, params=rnafold_params, timeout=timeout)

    str_out = desanitize_fasta_names_in_seqrec_list(predicted, san_dict)

    remove_files_with_try(
        [temp_constraint_file, temp_clustal_aln, temp_mock_consensus])

    return str_out