def tab_output_equal(csvfile=None, jsonfile=None, pdfile=None, fastafile=None,
                     fastastructures=None, ref_seqs_file=None):
    """Check that every supplied output file lists the same sequences.

    Every argument is an optional path. Only the files that are given are
    loaded; their sequences are then compared position by position.

    Args:
        csvfile: CSV file read with pandas; the ``best_sequence`` column is used.
        jsonfile: JSON dump decoded with ``blastsearchrecomputefromdict``; the
            ``extension`` sequence of every hit is used.
        pdfile: pickled pandas object; the ``best_sequence`` column is used.
        fastafile: FASTA file parsed with ``SeqIO``.
        fastastructures: file parsed with ``parse_named_structure_file``.
        ref_seqs_file: reference JSON dump, loaded the same way as ``jsonfile``.

    Returns:
        True when all loaded outputs agree (also when no file is given).

    Raises:
        AssertionError: if the outputs differ in length or in any sequence.
    """

    def _hit_sequences(path):
        # Shared by ``jsonfile`` and ``ref_seqs_file``; the context manager
        # closes the handle even when decoding fails.
        with open(path, 'r') as handle:
            hits = blastsearchrecomputefromdict(json.load(handle)).hits
        return [str(hit.extension.seq) for hit in hits]

    c = None
    j = None
    p = None
    ff = None
    fs = None

    if csvfile is not None:
        c = pd.read_csv(csvfile)['best_sequence'].tolist()

    if jsonfile is not None:
        j = _hit_sequences(jsonfile)

    if pdfile is not None:
        p = pd.read_pickle(pdfile)['best_sequence'].tolist()

    if fastafile is not None:
        with open(fastafile, 'r') as f:
            ff = [str(record.seq) for record in SeqIO.parse(f, format='fasta')]

    if fastastructures is not None:
        fs = [str(record.seq)
              for record in parse_named_structure_file(fastastructures)]

    outputs = [i for i in (c, j, p, ff, fs) if i is not None]

    if ref_seqs_file is not None:
        outputs.append(_hit_sequences(ref_seqs_file))

    # check length
    if not all(len(i) == len(outputs[0]) for i in outputs):
        raise AssertionError(
            'All output files have not same length'
        )

    # check sequences (all outputs should have same output sequences)
    for ll in zip(*outputs):
        if not all(i == ll[0] for i in ll):
            raise AssertionError(
                'All output sequences should be same.'
            )
    return True
def test_output_with_sequence_fail(self):
    """Render HTML with one hit moved to the failed hits and check its md5.

    The second hit is removed from ``hits``, its ``extension`` is cleared and
    it is appended to ``hits_failed``. The rendered HTML is written to
    ``self.htmlo`` and its md5 digest is compared with the stored reference
    checksum. The temporary HTML file is removed afterwards.
    """
    with open(self.json_file, 'r') as f:
        mydata = json.load(f)
    bb = convert_classes.blastsearchrecomputefromdict(mydata)

    hit = bb.hits.pop(1)
    hit.extension = None
    bb.hits_failed.append(hit)

    with open(self.htmlo, 'wb') as h:
        h.write(write_html_output(bb))

    reference = os.path.abspath(
        os.path.dirname(__file__)
        + '/test_data/RF00001_reference_missing_hit.html.md5'
    )
    try:
        with open(self.htmlo, 'rb') as f, open(reference) as r:
            self.assertEqual(
                hashlib.md5(f.read()).hexdigest(),
                r.read()
            )
    finally:
        # Best-effort cleanup: only file-system errors are tolerated, so
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            os.remove(self.htmlo)
        except OSError:
            print('removing temporary test files failed')
def prepare_new_htmlout():
    """Regenerate the stored md5 checksums of the rendered HTML output.

    Two checksum files are written next to the parent directory of this
    module: one for the data as loaded from ``RF00001_output.json`` and one
    after the ``extension`` of the second hit has been set to ``None``.
    """
    base_dir = os.path.dirname(__file__)

    json_file = os.path.abspath(base_dir + '/../RF00001_output.json')
    # Context manager so the JSON handle is always closed.
    with open(json_file, 'r') as f:
        mydata = json.load(f)
    bb = convert_classes.blastsearchrecomputefromdict(mydata)

    fd, html_file = tempfile.mkstemp(prefix='rba_', suffix='_t30')
    os.close(fd)

    def _store_md5(relative_target):
        # Render the current state of ``bb``, hash it and store the digest.
        # The temporary HTML file is removed even if rendering fails.
        target = os.path.abspath(base_dir + relative_target)
        try:
            with open(html_file, 'wb') as h:
                h.write(write_html_output(bb))
            with open(html_file, 'rb') as f, open(target, 'w') as t:
                t.write(hashlib.md5(f.read()).hexdigest())
        finally:
            if os.path.exists(html_file):
                os.remove(html_file)

    _store_md5('/../RF00001_reference_output.html.md5')

    bb.hits[1].extension = None
    _store_md5('/../RF00001_reference_missing_hit.html.md5')
def setUp(self):
    """Load the RF00001 reference JSON and decode it into ``self.data``."""
    self.json_file = os.path.abspath(
        os.path.dirname(__file__) + '/test_data/RF00001_output.json')
    # Context manager closes the handle even if the JSON is malformed.
    with open(self.json_file, 'r') as f:
        mydata = json.load(f)
    self.data = blastsearchrecomputefromdict(mydata)
def test_blastrecompute_with_blast_data(self):
    """Round-trip a search object whose hits carry BLAST HSP annotations.

    The object is encoded to a dict, passed through JSON, decoded again and
    compared with the original.
    """
    # load blast data
    blast_txt = os.path.join(
        fwd, 'test_data', 'blast_parse_hits_txt_standalone.txt')
    with open(blast_txt, 'r') as handle:
        blast_outputs = list(blast_parse_txt(handle))

    search = BlastSearchRecompute(None, SeqRecord(Seq('ACGUTGU'), id='qq'), 0)
    hit_list = HitList()

    # Each hit is annotated with its index and the matching HSP of the
    # first alignment of the first parsed BLAST record.
    first_hit = Subsequences(
        SeqRecord(
            Seq('ACGUTGU'),
            id='aa',
            annotations={
                'blast': (0, blast_outputs[0].alignments[0].hsps[0])
            },
        )
    )
    second_hit = Subsequences(
        SeqRecord(
            Seq('ACGAUCGUGAC'),
            id='bb',
            annotations={
                'blast': (1, blast_outputs[0].alignments[0].hsps[1])
            },
        )
    )
    for hit in (first_hit, second_hit):
        hit_list.append(hit)

    search.hits = hit_list
    search.query = SeqRecord(Seq('ACGUGUGCA'), id='query')
    search.args = Namespace(aa='asdq', bb='acoi')

    # dict -> JSON text -> dict -> object
    as_json = json.dumps(convert_classes.blastsearchrecompute2dict(search))
    restored = convert_classes.blastsearchrecomputefromdict(
        json.loads(as_json))

    tc.recrusive_compare(search, restored)
def tab_output_equal_structures(csvfile=None, jsonfile=None, pdfile=None,
                                fastastructures=None):
    """Check that every supplied output file holds the same structures.

    Each given file is loaded into a dict mapping a structure key to the list
    of structures stored under it; the dicts are then compared key by key and
    position by position.

    Args:
        csvfile: CSV file read with pandas; columns named after a key of
            ``method_required_tools`` are used.
        jsonfile: JSON dump decoded with ``blastsearchrecomputefromdict``; the
            ``letter_annotations`` of every hit ``extension`` are used.
        pdfile: pickled pandas object; columns are selected as for ``csvfile``.
        fastastructures: file parsed with ``parse_named_structure_file``; the
            ``letter_annotations`` of every record are used.

    Returns:
        True when all loaded outputs agree (also when no file is given).

    Raises:
        AssertionError: if the outputs differ in the number of keys, in the
            number of structures stored under a key, or in any structure.
        KeyError: if a key of the first output is missing in another output.
    """
    names = method_required_tools.keys()

    def _table_columns(table):
        # Keep only the columns named after a known prediction method.
        return {n: table[n].tolist() for n in names if n in table.columns}

    def _annotations_by_key(records):
        # Group the per-letter annotations of all records by annotation key.
        grouped = {}
        for record in records:
            for key in record.letter_annotations.keys():
                grouped.setdefault(key, []).append(
                    record.letter_annotations[key])
        return grouped

    cc = None
    jj = None
    pp = None
    fsfs = None

    if csvfile is not None:
        cc = _table_columns(pd.read_csv(csvfile))

    if jsonfile is not None:
        with open(jsonfile, 'r') as f:
            hits = blastsearchrecomputefromdict(json.load(f)).hits
        jj = _annotations_by_key([hit.extension for hit in hits])

    if pdfile is not None:
        pp = _table_columns(pd.read_pickle(pdfile))

    if fastastructures is not None:
        fs = list(parse_named_structure_file(fastastructures))
        # Explicit raise instead of ``assert`` so the check survives ``-O``.
        counts = [len(record.letter_annotations) for record in fs]
        if any(count != counts[0] for count in counts):
            raise AssertionError(
                'Records in the structures file differ in the number of '
                'structures.'
            )
        fsfs = _annotations_by_key(fs)

    outputs = [i for i in (cc, jj, pp, fsfs) if i is not None]

    # Nothing to compare; previously this ended in an IndexError below.
    if not outputs:
        return True

    # check length (number of structure keys per output)
    if not all(len(i) == len(outputs[0]) for i in outputs):
        raise AssertionError(
            'All output files have not same length'
        )

    # check structures (all outputs should have same output structures)
    for k in outputs[0].keys():
        per_output = [ll[k] for ll in outputs]

        # zip() stops at the shortest list, so a differing number of
        # structures would otherwise go unnoticed.
        if not all(len(i) == len(per_output[0]) for i in per_output):
            raise AssertionError(
                'All output files have not same length'
            )

        for ss in zip(*per_output):
            if not all(i == ss[0] for i in ss):
                raise AssertionError(
                    'All output structures should be same.'
                )
    return True
def setUp(self):
    """Load the multi-hit BLAST XML and the RF00001 reference JSON fixtures.

    ``self.blast`` holds the HSP list of the first parsed BLAST record and
    ``self.res`` the decoded reference search result.
    """
    here = os.path.dirname(__file__)

    xml_path = os.path.abspath(here + '/test_data/web_multi_hit.xml')
    with open(xml_path, 'r') as xml_handle:
        # All records are parsed, only the first one is used.
        records = list(NCBIXML.parse(xml_handle))
        self.blast = blast_hsps2list(records[0])

    json_path = os.path.abspath(here + '/test_data/RF00001_output.json')
    with open(json_path, 'r') as json_handle:
        decoded = json.load(json_handle)
        self.res = blastsearchrecomputefromdict(decoded)
def setUp(self):
    """Prepare reference data, temporary output paths and call arguments.

    Sets ``self.data`` (decoded reference JSON), the temporary paths
    ``self.csv``, ``self.pandas_dump``, ``self.json``, ``self.fasta`` and
    ``self.fasta_structures``, the HTML path ``self.html``, the
    ``self.args`` pseudo-arguments and the ``self.func_args`` dict. Finally
    removes a leftover ``.tmp_rboAnalyzer`` file of the BLAST input.
    """
    # Context manager closes the handle even if decoding fails.
    with open(ref_json_file, 'r') as f:
        mydata = json.load(f)
    self.data = convert_classes.blastsearchrecomputefromdict(mydata)

    def _reserve_tmp(suffix):
        # Create an empty temporary file and return its path.
        fd, path = tempfile.mkstemp(prefix='rba_', suffix=suffix)
        os.close(fd)
        return path

    self.csv = _reserve_tmp('_t1')

    # The HTML output goes to a fixed path next to the query. The temporary
    # file formerly created here (suffix '_t2') was never used and its path
    # was discarded, so it could not be cleaned up; it is no longer created.
    self.html = blast_query + 'test_html.html'

    self.pandas_dump = _reserve_tmp('_t3')
    self.json = _reserve_tmp('_t4')
    self.fasta = _reserve_tmp('_t5')

    # FASTA file with the extended sequence of every hit.
    ff, allHits_fasta = tempfile.mkstemp(prefix='rba_', suffix='_t6')
    with os.fdopen(ff, 'w') as f:
        SeqIO.write([i.extension for i in self.data.hits], f, 'fasta')
    self.fasta_structures = allHits_fasta

    self.args = Pseudoargs(
        blast_query,
        blast_in,
        blast_db,
        b_type='plain',
        prediction_method=['rnafold'],
        blast_regexp=r'(?<=\|)[A-Z0-9]*\.?\d*$',
        enable_overwrite=True,
        html=self.html,
    )

    self.func_args = {
        'query': self.data.query,
        'seqs2predict_fasta': allHits_fasta,
        'pred_method_params': {},
        'all_hits_list': [i.extension for i in self.data.hits],
        'seqs2predict_list': [i.extension for i in self.data.hits],
        'use_cm_file': 'abc',
    }

    remove_files_with_try([blast_in + '.tmp_rboAnalyzer'])
def test_blastrecompute(self):
    """Round-trip a small search object through its dict/JSON encoding.

    The object is encoded to a dict, serialised to JSON, decoded again and
    compared with the original.
    """
    search = BlastSearchRecompute(None, SeqRecord(Seq('ACGUTGU'), id='qq'), 0)
    hit_list = HitList()

    first_hit = Subsequences(SeqRecord(Seq('ACGUTGU'), id='aa'))
    second_hit = Subsequences(SeqRecord(Seq('ACGAUCGUGAC'), id='bb'))
    for hit in (first_hit, second_hit):
        hit_list.append(hit)

    search.hits = hit_list
    search.query = SeqRecord(Seq('ACGUGUGCA'), id='query')
    search.args = Namespace(aa='asdq', bb='acoi')

    # dict -> JSON text -> dict -> object
    as_json = json.dumps(convert_classes.blastsearchrecompute2dict(search))
    restored = convert_classes.blastsearchrecomputefromdict(
        json.loads(as_json))

    tc.recrusive_compare(search, restored)
def test_continuation2(self):
    """Resume from a prepared backup file and compare the written outputs.

    A backup file is built from the stored BLAST output JSON with adjusted
    arguments and an extra prediction method. The pipeline is then run and
    the CSV, JSON and FASTA outputs are checked against the reference
    sequences in ``simple.json``.
    """
    with open(blast_output, 'r') as source, \
            open(self.test_backup_file, 'w') as backup:
        data = blastsearchrecomputefromdict(json.load(source))

        data.args.blast_in = blast_in
        data.args.json = None
        data.args.html = test_output_file
        data.args.sha1 = self.sha1
        data.args.prediction_method += ['centroid']

        new_structures = {'rnafold': []}
        for hit in data.hits:
            # NOTE(review): copy() is shallow, so ``letter_annotations`` is
            # presumably shared with ``hit.extension`` and the rename from
            # 'rnafold' to 'ss0' also lands in the dumped data - confirm.
            renamed = copy(hit.extension)
            renamed.letter_annotations['ss0'] = \
                renamed.letter_annotations['rnafold']
            del renamed.letter_annotations['rnafold']
            new_structures['rnafold'].append(renamed)

        json.dump([blastsearchrecompute2dict(data)], backup, indent=2)

    out = lunch_with_args(self.args)
    self.assertEqual(1, 1)

    # test_output
    # NOTE(review): every pass uses out[0]; confirm whether out[i] was meant.
    for _ in range(len(out)):
        first = out[0]
        first.to_csv(self.csv)

        encoded = json.dumps(blastsearchrecompute2dict(first), indent=2)
        with open(self.json, 'w') as json_handle:
            json_handle.write(encoded)

        first.write_results_fasta(self.fasta)
        first.write_results_structures(self.fasta_structures)

        reference = os.path.join(fwd, test_data_dir, 'simple.json')
        outcome = tab_output_equal(
            csvfile=self.csv,
            jsonfile=self.json,
            fastafile=self.fasta,
            fastastructures=self.fasta_structures,
            ref_seqs_file=reference,
        )
        self.assertEqual(outcome, True)
def _remove_suffix(text, suffix):
    """Return ``text`` without a trailing ``suffix`` (unchanged if absent)."""
    if suffix and text.endswith(suffix):
        return text[:-len(suffix)]
    return text


def _insert_flag_before_extension(path, flag):
    """Return ``path`` with ``flag`` inserted in front of its last '.' part."""
    parts = path.split('.')
    return '.'.join(parts[:-1]) + flag + '.' + parts[-1]


def lunch_computation(args_inner, shared_list=None):
    """Run the analysis for every query of the given BLAST output.

    For each query the BLAST hits are extended (or loaded from the backup
    file ``<blast_in>.r-<first 10 chars of sha1>``), the result is stored
    back into that backup file and the prediction/output step is run.

    Args:
        args_inner: parsed arguments. Attributes read here include
            ``config_file``, ``blast_in``, ``b_type``, ``blast_query``,
            ``sha1``, ``filter_by_eval``, ``filter_by_bitscore``, ``mode``,
            ``dev_pred``, ``prediction_method``, ``pred_params``,
            ``threads`` and ``cm_file``, plus the optional output paths
            ``dump``, ``pandas_dump``, ``pdf_out`` and ``json``.
        shared_list: list of random names already handed out. It is passed
            to ``generate_random_name`` and extended with every new name. A
            fresh list is used when ``None`` is given.

    Returns:
        Tuple ``(text, analyzed)``: the tab-delimited output lines joined by
        newlines, and the list of analyzed-hits objects (one per processed
        query).

    Raises:
        ValueError: if ``args_inner.mode`` is not one of 'simple', 'locarna'
            or 'meta'.
        SystemExit: via ``sys.exit(1)`` when the number of queries does not
            match the BLAST output, when the backup file belongs to
            different arguments, or when extension failed for all hits.
    """
    ml.debug(fname())
    # ``is None`` (not falsiness) so an empty list supplied by the caller is
    # kept and receives the generated names.
    if shared_list is None:
        shared_list = []

    # update params if different config is requested
    CONFIG.override(tools_paths(args_inner.config_file))

    p_blast = BA_support.blast_in(args_inner.blast_in, b=args_inner.b_type)
    query_seqs = list(SeqIO.parse(args_inner.blast_query, 'fasta'))

    if len(p_blast) != len(query_seqs):
        ml.error(
            'Number of query sequences in provided BLAST output file ({}) does not match number of query sequences'
            ' in query FASTA file ({}).'.format(len(p_blast), len(query_seqs)))
        sys.exit(1)

    # check if BLAST does not contain unexpected sequence characters
    validate_args.check_blast(p_blast)

    # create list of correct length if needed
    all_saved_data = [None] * len(query_seqs)
    saved_file = '{}.r-{}'.format(args_inner.blast_in, args_inner.sha1[:10])
    with open(saved_file, 'r+') as f:
        _saved = json.load(f)
        if _saved is None:
            # empty backup: initialise it with one slot per query
            f.seek(0)
            f.truncate()
            json.dump(all_saved_data, f)
        else:
            msg = "Loading backup data."
            print('STATUS: ' + msg)
            ml.info(msg + ' file: ' + saved_file)
            all_saved_data = _saved

    for saved_data in all_saved_data:
        # we can have partially computed data
        if saved_data is None:
            continue
        if saved_data['args']['sha1'] != args_inner.sha1:
            msg = "Input argument hash does not match the saved argument hash. "
            if saved_data['args']['sha1'][:10] == args_inner.sha1[:10]:
                msg += "This is because of truncating hashes to first 10 characters. "
            msg += "Please remove the '{}' file.".format(saved_file)
            # the message is logged in both cases before exiting
            ml.error(msg)
            sys.exit(1)

    multi_query = len(p_blast) > 1

    # this is done for each query
    ml_out_line = []
    all_analyzed = []
    for iteration, (bhp, query, saved_data) in enumerate(
            zip(p_blast, query_seqs, all_saved_data)):
        if saved_data is None:
            print('STATUS: processing query: {}'.format(query.id))
            validate_args.verify_query_blast(blast=bhp, query=query)

            analyzed_hits = BlastSearchRecompute(args_inner, query, iteration)
            analyzed_hits.multi_query = multi_query

            # run cm model build
            # allows to fail fast if rfam was selected and we dont find the model
            ih_model, analyzed_hits = find_and_extract_cm_model(
                args_inner, analyzed_hits)

            # select all
            all_blast_hits = BA_support.blast_hsps2list(bhp)

            if len(all_blast_hits) == 0:
                ml.error('No hits found in {} - {}. Nothing to do.'.format(
                    args_inner.blast_in, bhp.query))
                continue

            # filter if needed
            # NOTE(review): the filtered list ``tmp`` is only used to detect
            # an empty result; the unfiltered ``all_blast_hits`` is what is
            # passed on below. Confirm whether ``tmp`` was meant to be used.
            if args_inner.filter_by_eval is not None:
                tmp = filter_by_eval(all_blast_hits,
                                     BA_support.blast_hit_getter_from_hits,
                                     args_inner.filter_by_eval)
                if len(tmp) == 0 and len(all_blast_hits) != 0:
                    ml.error(
                        'The requested filter removed all BLAST hits {} - {}. Nothing to do.'
                        .format(args_inner.blast_in, bhp.query))
                    continue
            elif args_inner.filter_by_bitscore is not None:
                tmp = filter_by_bits(all_blast_hits,
                                     BA_support.blast_hit_getter_from_hits,
                                     args_inner.filter_by_bitscore)
                if len(tmp) == 0 and len(all_blast_hits) != 0:
                    ml.error(
                        'The requested filter removed all BLAST hits {} - {}. Nothing to do.'
                        .format(args_inner.blast_in, bhp.query))
                    continue

            all_short = all_blast_hits

            # now this is different for each mode
            if args_inner.mode == 'simple':
                analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user = extend_simple_core(
                    analyzed_hits, query, args_inner, all_short, multi_query,
                    iteration, ih_model)
            elif args_inner.mode == 'locarna':
                analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user = extend_locarna_core(
                    analyzed_hits, query, args_inner, all_short, multi_query,
                    iteration, ih_model)
            elif args_inner.mode == 'meta':
                analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user = extend_meta_core(
                    analyzed_hits, query, args_inner, all_short, multi_query,
                    iteration, ih_model)
            else:
                raise ValueError(
                    'Unknown option - should be cached by argparse.')

            if len(analyzed_hits.hits) == 0:
                ml.error(
                    "Extension failed for all sequences. Please see the error message. You can also try '--mode simple'."
                )
                sys.exit(1)

            analyzed_hits.copy_hits()

            # store the finished query in its slot of the backup file
            with open(saved_file, 'r+') as f:
                all_saved_data = json.load(f)
                all_saved_data[iteration] = blastsearchrecompute2dict(
                    analyzed_hits)
                f.seek(0)
                f.truncate()
                json.dump(all_saved_data, f, indent=2)
        else:
            print(
                'STATUS: extended sequences loaded from backup file for query {}'
                .format(query.id))
            analyzed_hits = blastsearchrecomputefromdict(saved_data)

            # overwrite the saved args with current
            # this will update used prediction methods and other non essential stuff
            analyzed_hits.args = args_inner

            if analyzed_hits.args.cm_file:
                cm_file_rfam_user = analyzed_hits.args.cm_file
            else:
                cm_file_rfam_user = None

        all_analyzed.append(analyzed_hits)

        # write all hits to fasta
        fda, all_hits_fasta = mkstemp(prefix='rba_', suffix='_22',
                                      dir=CONFIG.tmpdir)
        os.close(fda)
        analyzed_hits.write_results_fasta(all_hits_fasta)

        out_line = []
        # multiple prediction params
        if args_inner.dev_pred:
            dp_list = []
            # accommodate more dev pred outputs
            # The report path is the output path without its trailing
            # 'dump' / 'pandas_dump' / 'json'. A real suffix removal is used:
            # str.strip() would remove any of those characters from both
            # ends of the path. The last option that is set wins.
            dpfile = None
            for option in ('dump', 'pandas_dump', 'json'):
                option_value = getattr(args_inner, option, False)
                if option_value:
                    dpfile = _remove_suffix(option_value, option)

            # optimization so the rfam cm file is used only once
            if cm_file_rfam_user is None and 'rfam' in ''.join(
                    args_inner.prediction_method):
                best_model = get_cm_model(args_inner.blast_query,
                                          threads=args_inner.threads)
                rfam = RfamInfo()
                cm_file_rfam_user = run_cmfetch(rfam.file_path, best_model)

            for method in args_inner.prediction_method:
                # cycle the prediction method settings
                # get set of params for each prediction
                selected_pred_params = [
                    kk for kk in args_inner.pred_params if method in kk
                ]
                shuffle(selected_pred_params)
                for i, method_params in enumerate(selected_pred_params):
                    ah = deepcopy(analyzed_hits)

                    random_flag = BA_support.generate_random_name(
                        8, shared_list)
                    shared_list.append(random_flag)

                    pname = re.sub(' ', '', str(method))
                    flag = '|pred_params|' + random_flag

                    # rebuild the args only with actually used prediction settings
                    ah.args.prediction_method = method
                    ah.args.pred_params = method_params

                    # tag every requested output file with the run flag
                    for option in ('dump', 'pandas_dump', 'pdf_out', 'json'):
                        option_value = getattr(args_inner, option, False)
                        if option_value:
                            setattr(
                                ah.args, option,
                                _insert_flag_before_extension(
                                    option_value, flag))

                    wrapped_ending_with_prediction(
                        args_inner=ah.args,
                        analyzed_hits=ah,
                        pred_method=method,
                        method_params=method_params,
                        used_cm_file=cm_file_rfam_user,
                        multi_query=multi_query,
                        iteration=iteration,
                    )
                    success = True
                    out_line.append(to_tab_delim_line_simple(ah.args))

                    dp_list.append((i, method_params, success, flag, pname,
                                    random_flag, args_inner.pred_params))

            if dpfile is not None:
                with open(dpfile + 'devPredRep', 'wb') as devf:
                    pickle.dump(dp_list, devf)
        else:
            wrapped_ending_with_prediction(
                args_inner=args_inner,
                analyzed_hits=analyzed_hits,
                used_cm_file=cm_file_rfam_user,
                multi_query=multi_query,
                iteration=iteration,
            )
            out_line.append(to_tab_delim_line_simple(args_inner))

        ml_out_line.append('\n'.join(out_line))

        if cm_file_rfam_user is not None and os.path.exists(cm_file_rfam_user):
            BA_support.remove_one_file_with_try(cm_file_rfam_user)

        BA_support.remove_one_file_with_try(all_hits_fasta)
    return '\n'.join(ml_out_line), all_analyzed
def setUp(self):
    """Decode the RF00001 reference output and keep a working copy.

    ``self._bsdata`` holds the decoded object, ``self.data`` the result of
    its ``copy()`` method.
    """
    reference_path = os.path.join(fwd, test_dir, 'RF00001_output.json')
    with open(reference_path, 'r') as handle:
        raw = json.load(handle)

    decoded = convert_classes.blastsearchrecomputefromdict(raw)
    self._bsdata = decoded
    self.data = decoded.copy()