def compute_alifold(msa_file, alifold_params=''):
    """
    Run RNAalifold on an alignment file.

    The content of ``msa_file`` is sent to RNAalifold on stdin; the program's
    stdout is written to a newly created temporary file.

    :param msa_file: path to the alignment file passed to RNAalifold
    :param alifold_params: extra command line arguments (tokenized with shlex.split)
    :return: path to the file with RNAalifold output
    :raises exceptions.RNAalifoldException: if RNAalifold exits with non-zero code
    """
    ml.info('Running RNAalifold.')
    ml.debug(fname())

    result_fd, out_path = mkstemp(prefix='rba_', suffix='_02', dir=CONFIG.tmpdir)

    with TemporaryFile(mode='w+', encoding='utf-8') as err_log, \
            os.fdopen(result_fd, 'w') as result_handle, \
            open(msa_file, 'r') as alignment_handle:
        cmd = ['{}RNAalifold'.format(CONFIG.viennarna_path), '--noPS', '-f', 'C']
        cmd = cmd + shlex.split(alifold_params)
        ml.debug(cmd)

        proc = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=result_handle,
            stderr=err_log,
            universal_newlines=True,
        )
        proc.communicate(input=alignment_handle.read())

        if not proc.returncode:
            return out_path

        # stderr was captured to the temporary log; attach it to the exception
        msgfail = 'RNAalifold failed.'
        ml.error(msgfail)
        err_log.seek(0)
        raise exceptions.RNAalifoldException(msgfail, err_log.read())
def build_stockholm_from_clustal_alig(clustal_file, alif_file):
    """
    Build a Stockholm alignment file from a Clustal alignment and alifold output.

    The Clustal alignment is converted to Stockholm format through an in-memory
    buffer and re-read with ``stockholm_read``. The 'ss0' letter annotation of
    the first record parsed from ``alif_file`` is stored as the 'SS_cons'
    column annotation.

    :param clustal_file: path to the alignment in clustal format
    :param alif_file: path to the alifold output file
    :return: path to the written Stockholm file
    """
    ml.debug(fname())

    with open(clustal_file, 'r') as clustal_handle, open(alif_file, 'r') as alifold_handle:
        # convert clustal -> stockholm in memory, then parse with the project reader
        buffer = StringIO()
        AlignIO.write(AlignIO.read(clustal_handle, format='clustal'), buffer, format='stockholm')
        buffer.seek(0)
        stockholm = stockholm_read(buffer)

        # only the first record of the alifold output is used
        for record in parse_seq_str(alifold_handle):
            stockholm.column_annotations['SS_cons'] = record.letter_annotations['ss0']
            break

    out_fd, out_file = mkstemp(prefix='rba_', suffix='_15', dir=CONFIG.tmpdir)
    with os.fdopen(out_fd, 'w') as out_handle:
        stockholm.write_stockholm(out_handle)
    return out_file
def select_sequences_from_similarity_rec(dist_mat: np.ndarray, sim_threshold_percent=90) -> list:
    """
    Pick indices of sequences to keep based on a pairwise similarity table.

    The table is transposed and walked row by row; NaN entries are ignored.
    A row index is kept unless it was excluded by an earlier row. Indices
    whose value in the current row reaches the threshold are excluded unless
    they are already kept.

    :param dist_mat: distmat table, by default obtained from
        read_clustal_distmat_file, values in percent
    :param sim_threshold_percent: threshold for similarity in percent
    :return: sorted list of kept indices; ``[0]`` when ``dist_mat`` is None
    """
    ml.debug(fname())
    if dist_mat is None:
        return [0]

    rows = dist_mat.transpose()
    positions = np.array(range(len(rows)))
    include = set()
    exclude = set()

    for idx, row in enumerate(rows):
        valid = ~np.isnan(row)
        similar = row[valid] >= sim_threshold_percent
        was_excluded = idx in exclude

        if not was_excluded and not any(similar):
            # nothing similar and not excluded before -> keep
            include.add(idx)
            continue

        if not was_excluded:
            include.add(idx)

        # NOTE(review): exclusion of similar partners also runs for rows that
        # were excluded earlier - confirm this nesting matches the intent.
        partners = positions[valid][np.where(similar)]
        exclude |= set(partners.tolist()) - include

    return sorted(include)
def run_cmbuild(cmbuild_input_file, cmbuild_params=''):
    """
    Run cmbuild on an alignment and return the path to the built model file.

    The input must be an MSA in Stockholm format with secondary structure
    prediction. cmbuild is called with ``-F`` and writes the model to a new
    temporary file.
    note: consider what to do if only one sequence is available

    :param cmbuild_input_file: Stockholm or selex alignment file
    :param cmbuild_params: additional params to cmbuild (whitespace separated)
    :return: path to the cm model file
    :raises exceptions.CmbuildException: if cmbuild exits with non-zero code
    """
    ml.info('Runing cmbuild.')
    ml.debug(fname())

    model_fd, cm_file = mkstemp(prefix='rba_', suffix='_13', dir=CONFIG.tmpdir)
    os.close(model_fd)

    with TemporaryFile(mode='w+', encoding='utf-8') as log:
        cmd = ['{}cmbuild'.format(CONFIG.infernal_path), '-F']
        if cmbuild_params != '':
            cmd.extend(cmbuild_params.split())
        cmd.extend([cm_file, cmbuild_input_file])
        ml.debug(cmd)

        exit_code = call(cmd, stdout=log, stderr=log)
        if exit_code:
            # stdout and stderr share the log; hand its content to the caller
            msgfail = 'Call to cmbuild failed.'
            ml.error(msgfail)
            log.seek(0)
            raise exceptions.CmbuildException(msgfail, log.read())

    return cm_file
def run_cmalign_on_fasta(fasta_file, model_file, cmalign_params='--notrunc', alig_format='stockholm'):
    """
    Run cmalign with the provided CM model file on a fasta file.

    :param fasta_file: input fasta to be aligned to cm model
    :param model_file: file containing one or more cm models
    :param cmalign_params: additional cmalign parameters (whitespace separated)
    :param alig_format: value passed to cmalign ``--outformat``
    :return: path to the alignment file written by cmalign
    :raises exceptions.CmalignException: if cmalign exits with non-zero code
    """
    ml.info('Runing cmaling.')
    ml.debug(fname())

    out_fd, cma_file = mkstemp(prefix='rba_', suffix='_14', dir=CONFIG.tmpdir)
    os.close(out_fd)

    with TemporaryFile(mode='w+', encoding='utf-8') as log:
        cmd = [
            '{}cmalign'.format(CONFIG.infernal_path),
            '--informat', 'fasta',
            '--outformat', alig_format,
        ]
        if cmalign_params != '':
            cmd.extend(cmalign_params.split())
        cmd.extend(['-o', cma_file, model_file, fasta_file])
        ml.debug(cmd)

        exit_code = call(cmd, stdout=log, stderr=log)
        if exit_code:
            msgfail = 'Call to cmalign failed.'
            ml.error(msgfail)
            log.seek(0)
            raise exceptions.CmalignException(msgfail, log.read())

    return cma_file
def rebuild_structures_output_from_pred(reference_sequences_list, predicted_structures_list, method=None):
    """
    Build one output record per reference sequence, merging in predictions.

    For every reference sequence a new SeqRecord is created. When a prediction
    with the same ``id`` exists, its letter annotations and annotations are
    merged into the new record and ``annotations['predicted']`` is set to
    True; otherwise it is set to False (and a warning is logged when
    ``method`` is given).

    Predictions are matched by ``id`` through a dictionary, so entries of
    ``predicted_structures_list`` that have no ``id`` attribute are skipped
    without shifting the positions of the remaining entries. For duplicated
    ids the first prediction is used.

    :param reference_sequences_list: iterable of reference sequence records
    :param predicted_structures_list: iterable of predicted records
    :param method: name of the prediction method used in the warning message
    :return: list of SeqRecord objects in the order of the reference sequences
    """
    ml.debug(fname())

    predicted_by_id = {}
    for prediction in predicted_structures_list:
        if hasattr(prediction, 'id'):
            # setdefault keeps the first record seen for a given id
            predicted_by_id.setdefault(prediction.id, prediction)

    structures_list = []
    for seq in reference_sequences_list:
        # NOTE(review): seq.annotations is handed over without copying, so the
        # updates below may also be visible on the reference record - confirm
        # this is intended.
        nr = SeqRecord(seq.seq,
                       id=seq.id,
                       name=seq.name,
                       description=seq.description,
                       annotations=seq.annotations,
                       letter_annotations=seq.letter_annotations)

        if seq.id in predicted_by_id:
            prediction = predicted_by_id[seq.id]
            nr.letter_annotations.update(prediction.letter_annotations)
            nr.annotations.update(prediction.annotations)
            nr.annotations['predicted'] = True
        else:
            if method:
                wmsg = '{} failed to predict structure for seq {}.'.format(method, nr.id)
                ml.warning(wmsg)
            nr.annotations['predicted'] = False

        structures_list.append(nr)

    return structures_list
def build_cm_model_rsearch(query_seq, path2selected_sim_array):
    """
    Build a covariance model from a single query sequence with cmbuild --rsearch.

    The query structure is predicted with RNAfold, a one-sequence Stockholm
    file carrying that structure as 'SS_cons' is written to a temporary file
    and passed to ``run_cmbuild``. The temporary Stockholm file is removed
    even when cmbuild fails.

    :param query_seq: query record; its ``seq`` is folded, annotations are not used
    :param path2selected_sim_array: value appended after ``--rsearch`` for cmbuild
    :return: path to the cm model file returned by ``run_cmbuild``
    """
    ml.debug(fname())
    query_structure = rna_blast_analyze.BR_core.viennaRNA.RNAfold(str(query_seq.seq))[0]

    # remove any annotations from query (work on a copy, keep the input intact)
    qs_clean = deepcopy(query_seq)
    qs_clean.annotations = dict()
    qs_clean.letter_annotations = dict()

    # build stockholm like file for use in cm model build
    st_like = StockholmAlig()
    st_like.append(qs_clean)
    st_like.column_annotations['SS_cons'] = query_structure

    fds, stock_file = mkstemp(prefix='rba_', suffix='_30', dir=CONFIG.tmpdir)
    try:
        with os.fdopen(fds, 'w') as f:
            st_like.write_stockholm(f)

        # run actual cmbuild
        cm_model_file = run_cmbuild(
            stock_file,
            cmbuild_params='--rsearch {}'.format(path2selected_sim_array))
    finally:
        # cleanup also on failure so the temporary alignment does not leak
        BA_support.remove_one_file_with_try(stock_file)

    return cm_model_file
def RNAfold(sequence):
    """
    Run RNAfold on one sequence and return the predicted structure and energy.

    The sequence is sent to RNAfold on stdin. The first output line is taken
    as the sequence and the second as structure followed by the energy in
    parentheses.

    :param sequence: sequence string passed to RNAfold
    :return: tuple (structure, energy) where energy is float
    :raises exceptions.RNAfoldException: if the RNAfold call fails
    """
    # local import: only the failure path needs it
    from subprocess import CalledProcessError

    ml.debug(fname())
    with TemporaryFile(mode='w+', encoding='utf-8') as tmp:
        try:
            r = check_output([
                '{}RNAfold'.format(CONFIG.viennarna_path),
                '--noPS',
            ], input=sequence.encode(), stderr=tmp)
        except CalledProcessError as e:
            # check_output signals a non-zero exit code by raising, not by
            # returning; route it to the common failure handling below
            r = e

        if isinstance(r, Exception):
            msgfail = 'RNAfold failed.'
            ml.error(msgfail)
            tmp.seek(0)
            raise exceptions.RNAfoldException(msgfail, tmp.read()) from r

        out_str = r.decode()
        spl = out_str.split('\n')
        seq = spl[0]
        # second line: structure (same length as sequence), space, "(energy)"
        structure = spl[1][:len(seq)]
        energy = float(spl[1][len(seq) + 2:-1])
        return structure, energy
def run_cmemit(model, params='', out_file=None):
    """
    Run cmemit on a model file.

    :param model: path to the cm model file
    :param params: additional cmemit parameters (whitespace separated)
    :param out_file: output path; a temporary file is created when not given
    :return: path to the file written by cmemit
    :raises exceptions.CmemitException: if cmemit exits with non-zero code;
        the captured program output is attached to the exception
    """
    ml.info('Run cmemit.')
    ml.debug(fname())

    if out_file:
        out = out_file
    else:
        fd, out = mkstemp(prefix='rba_', suffix='_12', dir=CONFIG.tmpdir)
        os.close(fd)

    with TemporaryFile(mode='w+', encoding='utf-8') as tmp:
        # build commandline
        cmd = ['{}cmemit'.format(CONFIG.infernal_path)]
        if params != '':
            cmd += params.split()
        cmd += ['-o', out, model]
        ml.debug(cmd)

        r = call(cmd, stdout=tmp, stderr=tmp)
        if r:
            msgfail = 'Call to cmemit failed.'
            ml.error(msgfail)
            tmp.seek(0)
            # read the whole log (read(0) would return an empty string)
            raise exceptions.CmemitException(msgfail, tmp.read())

    return out
def get_cm_model_table(query_file, params=None, threads=None, rfam=None, timeout=None):
    """
    Run cmscan for the query and return the parsed result table.

    cmscan is always run with ``-g``; the value of ``params['cmscan']`` (if
    any) is appended to the parameters. The table file produced by cmscan is
    closed and removed even when parsing fails.

    :param query_file: path to the query file passed to ``run_cmscan``
    :param params: dict; only the 'cmscan' key is read here
    :param threads: passed to ``run_cmscan``
    :param rfam: passed to ``run_cmscan``
    :param timeout: passed to ``run_cmscan``
    :return: parsed cmscan table, or None when a CmscanException is raised
    """
    ml.debug(fname())
    if params is None:
        params = dict()

    cmscan_params = '-g '
    if params and ('cmscan' in params) and params['cmscan']:
        cmscan_params += params['cmscan']

    try:
        out_table = run_cmscan(query_file, params=cmscan_params, threads=threads, rfam=rfam, timeout=timeout)
        try:
            with open(out_table, 'r') as f:
                cmscan_data = parse_cmalign_infernal_table(f)
        finally:
            remove_one_file_with_try(out_table)
        return cmscan_data
    except exceptions.CmscanException as e:
        # callers get None on cmscan failure; keep a trace of the reason
        ml.debug('cmscan failed: {}'.format(e))
        return None
def _aligner_block(nr_homolog_hits_file, params, msa_alg, threads=None):
    """
    Align the given sequences with the chosen aligner.

    Returns alignment file in clustal format.

    :param nr_homolog_hits_file: path to the sequences to align
    :param params: dict with optional 'clustalo' / 'muscle' extra parameter
        strings; None is treated as an empty dict
    :param msa_alg: 'clustalo' or 'muscle'
    :param threads: int, forwarded to clustalo as ``--threads`` when truthy
    :return: path to the alignment file
    :raises AttributeError: when ``msa_alg`` is not a supported aligner
    """
    ml.debug(fname())
    if params is None:
        params = dict()

    if msa_alg == 'clustalo':
        clustal_params = '--outfmt=clustal --force'
        extra = params.get('clustalo', '')
        if extra:
            # separate user parameters from the fixed ones
            clustal_params += ' ' + extra
        if threads:
            clustal_params += ' --threads={}'.format(threads)
        alig_file = compute_clustalo_clasic(nr_homolog_hits_file, clustalo_params=clustal_params)

    elif msa_alg == 'muscle':
        if params.get('muscle'):
            alig_file = run_muscle(nr_homolog_hits_file, muscle_params=params['muscle'], reorder=True)
        else:
            alig_file = run_muscle(nr_homolog_hits_file, reorder=True)

    else:
        msgfail = 'invalig MSA alg chosen {}, valid are "clustalo" and "muscle"'.format(msa_alg)
        ml.error(msgfail)
        raise AttributeError(msgfail)

    return alig_file
def check_rfam_present():
    """
    Check that the Rfam file exists and is processed by cmpress.

    If the file is present but ``check_if_cmpress_processed`` reports it is
    not converted, conversion with ``run_cmpress`` is attempted.

    :return: bool - True when the file is present and converted (or the
        conversion succeeded), False when the file is missing or cmpress failed
    """
    ml.debug(fname())
    rfam = RfamInfo()

    if not os.path.isfile(rfam.file_path):
        return False

    if check_if_cmpress_processed():
        return True

    try:
        run_cmpress(rfam.file_path)
    except exceptions.CmpressException as err:
        ml.error(str(err))
        ml.error(
            'The Rfam file might be corrupt. Please check following output to get more information.\n'
        )
        print(err.errors)
        return False
    return True
def rfam_subopt_pred(all_sequence_fasta, cm_ref_str, params=None, threads=1):
    """
    Compute suboptimal structures and process each against a reference structure.

    ``run_hybrid_ss_min`` is run on the fasta file (with ``params['mfold']``
    when given), then ``_helper_subopt`` is applied to every returned item
    together with ``cm_ref_str`` - serially for ``threads == 1``, otherwise in
    a multiprocessing pool.

    :param all_sequence_fasta: path to fasta file passed to ``run_hybrid_ss_min``
    :param cm_ref_str: reference structure passed to ``_helper_subopt``
    :param params: dict; only the 'mfold' key (tuple/list of 3 items) is read
    :param threads: number of worker processes
    :return: list of ``_helper_subopt`` results
    """
    ml.debug(fname())
    if params is None:
        params = dict()

    hybrid_kwargs = {'threads': threads}
    if params and ('mfold' in params) and params['mfold']:
        assert isinstance(params['mfold'], (tuple, list)) and 3 == len(params['mfold']), \
            "Incorrect parameters for hybrid_ss_min given. Need tuple of 3 numbers."
        hybrid_kwargs['mfold'] = params['mfold']
    subs = run_hybrid_ss_min(all_sequence_fasta, **hybrid_kwargs)

    # now compute rna distance score
    if threads == 1:
        return [_helper_subopt(seq, cm_ref_str) for seq in subs]

    with multiprocessing.Pool(processes=threads) as pool:
        jobs = [(seq, cm_ref_str) for seq in subs]
        new_structures = pool.starmap(_helper_subopt, jobs)
    return new_structures
def compute_refold(alig_file, cons_file, timeout=None):
    """
    Run the refold.pl program and store its stdout in a temporary file.

    :param alig_file: MSA alignment file in clustal format
    :param cons_file: file with consensus structure in alifold format
    :param timeout: seconds to wait for refold.pl; on expiry the process is
        killed and subprocess.TimeoutExpired propagates
    :return: path to the file with refold.pl output
    :raises exceptions.RefoldException: if refold.pl exits with non-zero code
    """
    ml.debug(fname())

    result_fd, out_path = mkstemp(prefix='rba_', suffix='_03', dir=CONFIG.tmpdir)
    cmd = ['{}refold.pl'.format(CONFIG.refold_path), alig_file, cons_file]
    ml.debug(cmd)

    with TemporaryFile(mode='w+', encoding='utf-8') as err_log, os.fdopen(result_fd, 'w') as result_handle:
        with subprocess.Popen(cmd, stdout=result_handle, stderr=err_log) as proc:
            try:
                proc.wait(timeout=timeout)
            except subprocess.TimeoutExpired:
                # make sure the child is gone before re-raising
                proc.kill()
                proc.wait()
                raise

        if proc.returncode:
            msgfail = 'Call to refold.pl failed.'
            ml.error(msgfail)
            err_log.seek(0)
            raise exceptions.RefoldException(msgfail, err_log.read())

    return out_path
def create_nr_homolog_hits_file_MSA_unsafe(sim_threshold_percent=None, all_hits=None, query=None, cmscore_tr=0.0,
                                           cm_threshold_percent=None, len_diff=0.1):
    """
    Create a fasta file with non redundant homologous hits.

    Trusted hits are obtained from ``_trusted_hits_selection_wrapper``. When
    the returned distance table is empty only the query is written; otherwise
    the sequences chosen by ``select_sequences_from_similarity_rec`` are
    written.

    :param sim_threshold_percent: similarity threshold passed to the selection
    :param all_hits: passed to ``_trusted_hits_selection_wrapper``
    :param query: query record; written alone when the distance table is empty
    :param cmscore_tr: passed to ``_trusted_hits_selection_wrapper``
    :param cm_threshold_percent: passed to ``_trusted_hits_selection_wrapper``
    :param len_diff: passed to ``_trusted_hits_selection_wrapper`` as ``len_diff_``
    :return: tuple (path to fasta file, homologous sequences, messages)
    """
    ml.debug(fname())

    dist_table, homologous_seqs, msgs = _trusted_hits_selection_wrapper(
        all_hits, query, cmscore_tr, cm_threshold_percent, len_diff_=len_diff)

    if dist_table.size != 0:
        # normal execution
        selected = rna_blast_analyze.BR_core.predict_structures.select_sequences_from_similarity_rec(
            dist_table, sim_threshold_percent=sim_threshold_percent)
        nr_homolog_hits = [homologous_seqs[idx] for idx in selected]
    else:
        nr_homolog_hits = [query]

    out_fd, nr_homo_hits_file = mkstemp(prefix='rba_', suffix='_59', dir=CONFIG.tmpdir)
    with os.fdopen(out_fd, 'w') as handle:
        SeqIO.write(nr_homolog_hits, handle, 'fasta')

    return nr_homo_hits_file, homologous_seqs, msgs
def extract_ref_from_cm(cm_file):
    """
    Extract the reference structure from a cm model.

    ``cmemit -a -N 1`` is run on the model, the emitted alignment is read and
    its 'SS_cons' annotation, with '.' and '~' characters dropped, is
    converted with ``cm_strucutre2br``. The emitted file is closed and removed
    even when reading it fails.

    :param cm_file: path to the cm model file
    :return: structure returned by ``cm_strucutre2br``
    :raises AssertionError: when the emitted alignment does not hold exactly
        one record
    """
    ml.debug(fname())
    single_alig_file = run_cmemit(cm_file, params='-a -N 1')
    try:
        with open(single_alig_file, 'r') as o:
            salig = stockholm_read(o)
    finally:
        remove_one_file_with_try(single_alig_file)

    if len(salig) != 1:
        raise AssertionError('File from cmemit does not have only one record in (not including reference).')

    # recode structure, return it
    ss = salig.column_annotations['SS_cons']
    inserts = str(salig.column_annotations['RF'])
    gapchars = '.~'

    # pairing with the RF line limits the walk to the shorter of the two
    structure_list = [i for i, _ in zip(ss, inserts) if i not in gapchars]

    # recode structure list
    structure = cm_strucutre2br(''.join(structure_list))
    return structure
def refold_stockholm(stockholm_alig, consensus_structure):
    """
    Run refold.pl (via ``compute_refold``) for an alignment and a consensus structure.

    The alignment is written in clustal format and a fake alifold output file
    carrying ``consensus_structure`` is created for refold.pl. All temporary
    files created here are removed, also when one of the steps fails.

    :param stockholm_alig: alignment object providing ``write_clustal``
    :param consensus_structure: consensus structure string
    :return: list of records parsed from the refold output
    """
    ml.debug(fname())

    temp_files = []
    try:
        # convert to clustal alignment
        fd, clust_tempfile = mkstemp(prefix='rba_', suffix='_23', dir=CONFIG.tmpdir)
        temp_files.append(clust_tempfile)
        with os.fdopen(fd, 'w') as f:
            stockholm_alig.write_clustal(f)

        # write fake alifold output with given consensus structure
        fd, alif_fake_file = mkstemp(prefix='rba_', suffix='_24', dir=CONFIG.tmpdir)
        temp_files.append(alif_fake_file)
        with os.fdopen(fd, 'w') as f:
            # the consensus sequence in alifold file is really not used for anything
            f.write('A' * len(consensus_structure) + '\n')
            f.write(consensus_structure + '\n')

        # compute refold
        refold_constrained_file = compute_refold(clust_tempfile, alif_fake_file)
        temp_files.append(refold_constrained_file)

        with open(refold_constrained_file, 'r') as f:
            parsed_seqs = list(BA_support.parse_seq_str(f))
    finally:
        # cleanup whatever was created so far
        if temp_files:
            BA_support.remove_files_with_try(temp_files)

    return parsed_seqs
def run_cmfetch(cmfile, modelid, outfile=None):
    """
    Run cmfetch to extract one model from a cm file.

    :param cmfile: path to the file with cm models
    :param modelid: identifier of the model passed to cmfetch
    :param outfile: output path; a temporary file is created when not given
    :return: path to the file written by cmfetch
    :raises exceptions.CmfetchException: if cmfetch exits with non-zero code
    """
    ml.info('Runing cmfetch.')
    ml.debug(fname())

    if not outfile:
        handle, out = mkstemp(prefix='rba_', suffix='_11', dir=CONFIG.tmpdir)
        os.close(handle)
    else:
        out = outfile

    with TemporaryFile(mode='w+', encoding='utf-8') as log:
        cmd = ['{}cmfetch'.format(CONFIG.infernal_path), '-o', out, cmfile, modelid]
        ml.debug(cmd)

        exit_code = call(cmd, stdout=log, stderr=log)
        if exit_code:
            msgfail = 'Call to cmfetch failed.'
            ml.error(msgfail)
            log.seek(0)
            raise exceptions.CmfetchException(msgfail, log.read())

    return out
def run_rnaplot(seq, structure=None, format='svg', outfile=None, timeout=None):
    """
    Run RNAplot for a sequence and structure in the desired format.

    When ``structure`` is None, ``seq`` is expected to be a record: its
    sequence and its 'ss0' letter annotation are used. Otherwise ``seq`` is
    used as the sequence string directly.

    RNAplot is started with the system temporary directory as the working
    directory; the original working directory is restored afterwards, also
    when the call times out or fails.

    :param seq: record (when structure is None) or sequence string
    :param structure: structure string of the same length as the sequence
    :param format: one of 'ps', 'svg', 'gml'
    :param outfile: when given, the plot is moved to this path
    :param timeout: seconds to wait for RNAplot; on expiry the process is
        killed and subprocess.TimeoutExpired propagates
    :return: path to the plot file
    :raises TypeError: for unsupported ``format``
    :raises exceptions.RNAplotException: if RNAplot exits with non-zero code
    """
    ml.debug(fname())
    if structure is None:
        sequence = str(seq.seq)
        structure = seq.letter_annotations['ss0']
    else:
        sequence = seq

    assert len(sequence) == len(structure)

    allowed_formats = {'ps', 'svg', 'gml'}
    if format not in allowed_formats:
        raise TypeError('Format can be only from {}.'.format(allowed_formats))

    cmd = ['{}RNAplot'.format(CONFIG.viennarna_path), '--output-format={}'.format(format)]
    ml.debug(cmd)

    rnaname = generate_random_name(10)
    tmpdir = gettempdir()
    currdirr = os.getcwd()

    os.chdir(tmpdir)
    try:
        with TemporaryFile(mode='w+', encoding='utf-8') as tmp:
            with subprocess.Popen(cmd, universal_newlines=True, stdin=subprocess.PIPE,
                                  stdout=tmp, stderr=tmp) as p:
                try:
                    p.communicate(
                        input='>{}\n{}\n{}\n'.format(rnaname, sequence, structure),
                        timeout=timeout)
                except subprocess.TimeoutExpired:
                    p.kill()
                    p.wait()
                    raise

            if p.returncode:
                msgfail = 'Call to RNAplot failed.'
                ml.error(msgfail)
                tmp.seek(0)
                details = tmp.read()
                ml.debug(details)
                raise exceptions.RNAplotException(msgfail, details)
    finally:
        # never leave the process in the temporary directory
        os.chdir(currdirr)

    # the plot file is expected in tmpdir, named after the fasta header
    plot_output_file = os.path.join(tmpdir, rnaname + '_ss.' + format)
    if outfile is None:
        return plot_output_file

    shutil.move(plot_output_file, outfile)
    return outfile
def subopt_fold_alifold(all_fasta_hits_file, homologs_file, aligner='muscle', params=None, threads=None):
    """
    Align homologs, get a consensus structure and process suboptimal structures.

    Steps: run clustalo/muscle on ``homologs_file``, run RNAalifold on the
    alignment, take the 'ss0' annotation of the first alifold record as the
    consensus structure, run ``run_hybrid_ss_min`` on ``all_fasta_hits_file``
    and apply ``_helper_subopt`` to every result together with the consensus
    structure. The alignment and alifold files are removed afterwards, also
    when a step fails.

    :param all_fasta_hits_file: path to fasta file with all hits
    :param homologs_file: path to fasta file with the selected homologs
    :param aligner: 'clustalo' or 'muscle'
    :param params: dict with optional 'clustalo', 'muscle', 'alifold' and
        'mfold' entries
    :param threads: number of worker processes / aligner threads
    :return: list of ``_helper_subopt`` results
    :raises KeyError: when ``aligner`` is not recognized
    """
    ml.debug(fname())
    if params is None:
        params = dict()

    # run aligner
    if 'clustalo' == aligner:
        clustal_params = ' --outfmt=clustal --force'
        extra = params.get('clustalo', '')
        if extra:
            # separate user parameters from the fixed ones
            clustal_params += ' ' + extra
        if threads:
            clustal_params += ' --threads={}'.format(threads)
        alig_file = compute_clustalo_clasic(homologs_file, clustalo_params=clustal_params)

    elif 'muscle' == aligner:
        alig_file = run_muscle(homologs_file, muscle_params=params.get('muscle', ''), reorder=False)

    else:
        raise KeyError(
            'provided key ({}) not recognized - avalible: "clustalo" "muscle"'.format(aligner))

    temp_files = [alig_file]
    try:
        # run consensus prediction
        alif_file = compute_alifold(alig_file, alifold_params=params.get('alifold', ''))
        temp_files.insert(0, alif_file)

        # possibly need to decode alifold structure
        alif_str = read_seq_str(alif_file)[0]
        consensus_structure = alif_str.letter_annotations['ss0']

        subs = run_hybrid_ss_min(all_fasta_hits_file, mfold=params.get('mfold', (10, 2, 20)), threads=threads)

        # now compute rna distance score
        if threads == 1:
            new_structures = [_helper_subopt(seq, consensus_structure) for seq in subs]
        else:
            with multiprocessing.Pool(processes=threads) as pool:
                tuples = [(seq, consensus_structure) for seq in subs]
                new_structures = pool.starmap(_helper_subopt, tuples)
    finally:
        remove_files_with_try(temp_files)

    return new_structures
def infer_hits_cm(bit_sc, tr=0):
    """
    Flag each bit score as above the threshold or not.

    :param bit_sc: iterable of scores
    :param tr: threshold; a score must be strictly greater to be flagged True
    :return: list of bool, one per score
    """
    ml.debug(fname())
    return [bool(score > tr) for score in bit_sc]
def run_rnaplot(seq, structure=None, format='svg', outfile=None):
    """
    Run RNAplot for a sequence and structure in the desired format.

    When ``structure`` is None, ``seq`` is expected to be a record: its
    sequence and its 'ss0' letter annotation are used. Otherwise ``seq`` is
    used as the sequence string directly.

    The input is written to a temporary fasta-like file which is connected to
    RNAplot's stdin (no shell is involved). RNAplot is run with the system
    temporary directory as the working directory; the original working
    directory is restored and the input file removed afterwards, also on
    failure.

    :param seq: record (when structure is None) or sequence string
    :param structure: structure string of the same length as the sequence
    :param format: one of 'ps', 'svg', 'gml', 'xrna'
    :param outfile: when given, the plot is moved to this path
    :return: path to the plot file
    :raises TypeError: for unsupported ``format``
    :raises exceptions.RNAplotException: if RNAplot exits with non-zero code
    """
    ml.debug(fname())
    if structure is None:
        sequence = str(seq.seq)
        structure = seq.letter_annotations['ss0']
    else:
        sequence = seq

    assert len(sequence) == len(structure)

    allowed_formats = {'ps', 'svg', 'gml', 'xrna'}
    if format not in allowed_formats:
        raise TypeError('Format can be only from {}.'.format(allowed_formats))

    fd, tmpfile = mkstemp(prefix='rba_', suffix='_08', dir=CONFIG.tmpdir)
    # the base name of the temp file serves as the sequence name
    rnaname = tmpfile.split('/')[-1].split('\\')[-1]

    tmpdir = gettempdir()
    currdirr = os.getcwd()

    try:
        with os.fdopen(fd, 'w') as fh:
            fh.write('>{}\n{}\n{}\n'.format(rnaname, sequence, structure))

        cmd = ['{}RNAplot'.format(CONFIG.viennarna_path), '--output-format={}'.format(format)]
        ml.debug(cmd)

        # open the input before changing directory so a relative path still resolves
        with open(tmpfile, 'r') as inp, TemporaryFile(mode='w+', encoding='utf-8') as tmp:
            os.chdir(tmpdir)
            try:
                r = call(cmd, stdin=inp, stdout=tmp, stderr=tmp)
            finally:
                os.chdir(currdirr)

            if r:
                msgfail = 'Call to RNAplot failed.'
                ml.error(msgfail)
                tmp.seek(0)
                details = tmp.read()
                ml.debug(details)
                raise exceptions.RNAplotException(msgfail, details)
    finally:
        remove_one_file_with_try(tmpfile)

    # output file is name of the sequence (in this case name of the file) + "_ss." + chosen format
    plot_output_file = os.path.join(tmpdir, rnaname + '_ss.' + format)
    if outfile is None:
        return plot_output_file

    shutil.move(plot_output_file, outfile)
    return outfile
def download_cmmodels_file(path=None, url=None):
    """
    Download the cm models file from the Rfam database with wget.

    default retrieve url is:
    'ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz'

    wget is run with ``-N``; when its output says the remote file is not
    newer, nothing else is done. Otherwise the downloaded archive is unpacked
    and processed with ``run_cmpress``. The process exits with status 1 when
    wget or cmpress fails.

    :param path: target directory; ``RfamInfo().rfam_dir`` when None
    :param url: download url; ``RfamInfo().url`` when None
    :return: path to the unpacked Rfam cm file
    """
    print('Running CM download from RFAM.')
    ml.debug(fname())

    rfam = RfamInfo()
    path = rfam.rfam_dir if path is None else path
    url = rfam.url if url is None else url

    if not os.path.exists(path):
        os.makedirs(path)

    cmd = ['wget', '-N', '-P', path, url]
    ml.debug(cmd)
    ml.info('Downloading RFAM database (aprox 300Mb). This may take a while...')

    with TemporaryFile(mode='w+', encoding='utf-8') as log:
        exit_code = call(cmd, stderr=log, stdout=log)
        if exit_code:
            msgfail = 'Call to wget failed. Please check the internet connection and/or availability of "wget".'
            ml.error(msgfail)
            ml.debug(cmd)
            sys.exit(1)

        log.seek(0)
        cmd_output = log.read()

    if 'Remote file no newer than local file' in cmd_output:
        # do not download
        msg = 'No new data. Nothing to do.'
        ml.info(msg)
        if ml.getEffectiveLevel() > 20:
            print(msg)
    else:
        # unzip using build in gzip
        archive = os.path.join(path, rfam.gzname)
        unpacked = os.path.join(path, rfam.rfam_file_name)
        with gzip.open(archive, 'rb') as fin, open(unpacked, 'wb') as fout:
            shutil.copyfileobj(fin, fout)

        # run cmpress to create binary files needed to run cmscan
        try:
            run_cmpress(os.path.join(path, rfam.rfam_file_name))
        except exceptions.CmpressException as err:
            ml.error(str(err))
            ml.error('The Rfam file might be corrupt. Please check following output to get more information.\n')
            print(err.errors)
            sys.exit(1)

    return os.path.join(path, rfam.rfam_file_name)
def run_muscle(fasta_file, out_file=None, muscle_params='', reorder=True):
    """
    Run muscle on a fasta file and write a clustal (strict) alignment.

    beware, muscle does not keep sequence order and the --stable switch is broken

    :param fasta_file: path to the input fasta file
    :param out_file: output path; a temporary file is created when not given
    :param muscle_params: additional muscle arguments as one string; it is
        tokenized with shlex.split and each token is passed as its own argument
    :param reorder: when True the alignment is rewritten in the order of the
        input fasta file
    :return: path to the alignment file
    :raises exceptions.MuscleException: if muscle exits with non-zero code
    """
    ml.info('Running muscle.')
    ml.debug(fname())

    if out_file:
        cl_file = out_file
    else:
        cl_fd, cl_file = mkstemp(prefix='rba_', suffix='_07', dir=CONFIG.tmpdir)
        os.close(cl_fd)

    cmd = [
        '{}muscle'.format(CONFIG.muscle_path),
        '-clwstrict',
        '-seqtype', 'rna',
        '-out', cl_file,
        '-in', fasta_file,
        '-quiet',
    ]
    if muscle_params != '':
        # no shell is used, so every token must be a separate argv element
        cmd += shlex.split(muscle_params)
    ml.debug(cmd)

    with TemporaryFile(mode='w+', encoding='utf-8') as tmp:
        r = call(cmd, stdout=tmp, stderr=tmp)
        if r:
            msgfail = 'Call to muscle failed.'
            ml.error(msgfail)
            tmp.seek(0)
            raise exceptions.MuscleException(msgfail, tmp.read())

    if reorder:
        # reorder sequences acording to input file
        with open(fasta_file, 'r') as ff, open(cl_file, 'r+') as oo:
            orig_seqs = [i.id for i in SeqIO.parse(ff, format='fasta')]
            muscle_align = {i.id: i for i in AlignIO.read(oo, format='clustal')}

            # muscle cuts names, so look records up by the first 32 characters
            reo_alig = [muscle_align[s_name[:32]] for s_name in orig_seqs]
            alig = AlignIO.MultipleSeqAlignment(reo_alig)

            # overwrite the file in place and drop any leftover tail
            oo.seek(0)
            AlignIO.write(alig, oo, format='clustal')
            oo.truncate()

    return cl_file
def run_turbofold(sequences, params, timeout=None):
    """
    Run ``_turbofold_worker`` and hand back its result or the caught error.

    A TurboFoldException or AssertionError raised by the worker is not
    propagated; the exception object itself is returned instead.

    :param sequences: passed to ``_turbofold_worker``
    :param params: passed to ``_turbofold_worker``
    :param timeout: passed to ``_turbofold_worker``
    :return: worker result, or the caught exception instance
    """
    ml.info('Running Turbofold.')
    ml.debug(fname())
    try:
        outcome = _turbofold_worker(sequences, params, timeout=timeout)
    except (exceptions.TurboFoldException, AssertionError) as err:
        outcome = err
    return outcome
def filter_by_bits(blast_hitlist, getter, filter_conditions):
    """
    Keep only hits whose ``bits`` value satisfies every filter condition.

    :param blast_hitlist: list of hits
    :param getter: callable returning, for a hit, an object with ``bits``
    :param filter_conditions: iterable of (relation, value) pairs; relation is
        a key into ``OPERATIONS``
    :return: list of hits passing all conditions (the input list itself when
        there are no conditions)
    """
    ml.debug(fname())

    remaining = blast_hitlist
    for relation, threshold in filter_conditions:
        kept = []
        for hit in remaining:
            if OPERATIONS[relation](getter(hit).bits, threshold):
                kept.append(hit)
        remaining = kept
    return remaining
def create_report_object_from_locarna(exp_hit, locarna_alig):
    """
    Create the Subsequences object for a hit from its Locarna alignment.

    The result is meant to be appended to the BlastSearchRecompute class.
    The alignment is trimmed to the span of the record with id 'query', the
    aligned part of the hit is selected, annotated (hit annotations, Locarna
    annotations, refolded structure as 'ss0') and located in the parent
    sequence.

    :param exp_hit: expanded hit record
    :param locarna_alig: Locarna alignment containing a record with id 'query'
    :return: BA_support.Subsequences with ``extension``, ``best_start`` and
        ``best_end`` set
    :raises exceptions.SubseqMatchError: when the alignment does not contain
        exactly one 'query' record, or the aligned subsequence is not found in
        the hit sequence
    """
    ml.debug(fname())

    # chop alignment by seq
    query_positions = [pos for pos, rec in enumerate(locarna_alig) if rec.id == 'query']
    if len(query_positions) != 1:
        raise exceptions.SubseqMatchError('Got multiple hits with id "query" in the Locarna alignment.')

    query_aligned_seq = str(locarna_alig[query_positions[0]].seq)
    trimmed = trim_alignment_by_sequence(
        locarna_alig,
        query_aligned_seq,
        structure_annotation='SS_cons'
    )

    extension = BA_support.select_analyzed_aligned_hit(trimmed, exp_hit.id)

    # add annotations from exp hit; the dict object is shared with exp_hit,
    # so the update with locarna annotations (mainly score) lands there too
    extension.annotations = exp_hit.annotations
    extension.name = exp_hit.name
    extension.annotations.update(locarna_alig.annotations)

    # get the structure by refold and pick the one for this hit
    consensus = trimmed.column_annotations['SS_cons']
    refolded = refold_stockholm(trimmed, consensus)
    hit_refold = _select_refold_structure(refolded, exp_hit.id)

    extension.letter_annotations['ss0'] = hit_refold.letter_annotations['ss0']
    extension.annotations['sss'] = ['ss0']

    # prepare seq_record for subsequences
    extension.description = ''

    hit = BA_support.Subsequences(exp_hit)
    hit.extension = extension

    # find the matching sequence
    pos_match = re.search(str(extension.seq), str(exp_hit.seq), flags=re.IGNORECASE)
    if pos_match is None:
        raise exceptions.SubseqMatchError(
            'Aligned portion of subject sequence in Locarna alignment was not found in parent sequence.'
        )

    hit.best_start, hit.best_end = compute_true_location_locarna(hit, pos_match)
    return hit
def get_cm_model(query_file, params=None, threads=None):
    """
    Return the name of the best matching cm model for the query.

    :param query_file: path to the query file passed to ``get_cm_model_table``
    :param params: passed to ``get_cm_model_table``
    :param threads: passed to ``get_cm_model_table``
    :return: value of 'target_name' of the best row, or None when
        ``select_best_matching_model_from_cmscan`` returns None
    """
    ml.debug(fname())

    table = get_cm_model_table(query_file, params, threads)
    best_row = select_best_matching_model_from_cmscan(table)
    if best_row is not None:
        best_model = best_row['target_name']
        ml.info('Best matching model: {}'.format(best_model))
        return best_model
    return None
def centroid_homfold_fast(all_seqs, query, all_seqs_fasta, n, centroid_homfold_params, len_diff):
    """
    Run centroid_homfold with homologs chosen by ``centroid_homfold_fast_prep``.

    The selected sequences are written to a temporary fasta file which is
    passed to ``me_centroid_homfold`` as the homologs file. The temporary file
    is removed afterwards, also when the prediction fails.

    :param all_seqs: candidate sequences passed to the selection
    :param query: query record passed to the selection
    :param all_seqs_fasta: path to fasta file with sequences to predict
    :param n: number of homologous sequences to use
    :param centroid_homfold_params: passed to ``me_centroid_homfold`` as ``params``
    :param len_diff: passed to the selection
    :return: first item of the ``me_centroid_homfold`` result
    """
    ml.debug(fname())

    selected_seqs = centroid_homfold_fast_prep(all_seqs, query, n, len_diff)

    ch, homologous_file = mkstemp(prefix='rba_', suffix='_74', dir=CONFIG.tmpdir)
    try:
        with os.fdopen(ch, 'w') as h:
            SeqIO.write(selected_seqs, h, 'fasta')

        structures, _ = me_centroid_homfold(all_seqs_fasta, homologous_file, params=centroid_homfold_params)
    finally:
        BA_support.remove_one_file_with_try(homologous_file)

    return structures
def centroid_homfold_fast_prep(all_seqs, query, n, len_diff):
    """
    Select the homologous sequences used by centroid-fast.

    :param all_seqs: candidate sequences passed to ``BA_support.sel_seq_simple``
    :param query: query record; ``annotations['ambiguous']`` must be falsy
    :param n: maximal number of sequences returned (must be >= 1)
    :param len_diff: passed to ``BA_support.sel_seq_simple``
    :return: the first ``int(n)`` items of the ``sel_seq_simple`` result
    :raises AmbiguousQuerySequenceException: when the query is flagged ambiguous
    """
    ml.debug(fname())

    assert n >= 1, "Number of sequences for centroid-fast must be greater then 0."

    if query.annotations['ambiguous']:
        msgfail = "Query sequence contains ambiguous characters. Can't use centroid-fast."
        ml.error(msgfail)
        raise AmbiguousQuerySequenceException(msgfail)

    candidates = BA_support.sel_seq_simple(all_seqs, query, len_diff)
    limit = int(n)
    return candidates[:limit]