def testParse(self):
    """Round-trip the SCOP test files through ``Scop`` and check lookups.

    The cla/des/hie test files are read into memory, a ``Scop`` is built
    from in-memory copies, each table is written back out and compared
    with its input, and a handful of domain/node lookups (including two
    misses) are verified.
    """
    # Context managers close each handle even if a read fails part-way,
    # replacing the manual open/close sequence inside a single try/finally.
    with open("./SCOP/dir.cla.scop.txt_test") as handle:
        cla = handle.read()
    with open("./SCOP/dir.des.scop.txt_test") as handle:
        des = handle.read()
    with open("./SCOP/dir.hie.scop.txt_test") as handle:
        hie = handle.read()

    scop = Scop(StringIO(cla), StringIO(des), StringIO(hie))

    # cla output is compared line by line through the class helper rather
    # than as one string.
    cla_out = StringIO()
    scop.write_cla(cla_out)
    expected_lines = cla.rstrip().split("\n")
    written_lines = cla_out.getvalue().rstrip().split("\n")
    # NOTE(review): zip() stops at the shorter sequence, so a difference in
    # line count between input and output is not detected by this loop.
    for expected_line, line in zip(expected_lines, written_lines):
        self.assertTrue(self._compare_cla_lines(expected_line, line))

    # des and hie output must reproduce the input text exactly.
    des_out = StringIO()
    scop.write_des(des_out)
    self.assertEqual(des_out.getvalue(), des)

    hie_out = StringIO()
    scop.write_hie(hie_out)
    self.assertEqual(hie_out.getvalue(), hie)

    domain = scop.getDomainBySid("d1hbia_")
    self.assertEqual(domain.sunid, 14996)

    domains = scop.getDomains()
    self.assertEqual(len(domains), 14)
    self.assertEqual(domains[4].sunid, 14988)

    # Lookups that miss are expected to give None.
    self.assertIsNone(scop.getNodeBySunid(-111))
    self.assertIsNone(scop.getDomainBySid("no such domain"))
def test_get_descendents(self):
    """Check ``getDescendents`` on node 46457 for several node types."""
    scop = Scop(dir_path="SCOP", version="test")
    fold_node = scop.getNodeBySunid(46457)

    # (requested node type, expected number of hits, expected ``type`` value)
    expectations = (
        ("px", 14, "px"),
        ("superfamily", 1, "sf"),
    )
    for requested, expected_count, expected_type in expectations:
        found = fold_node.getDescendents(requested)
        self.assertEqual(len(found), expected_count)
        for node in found:
            self.assertEqual(node.type, expected_type)

    # Asking this node for "cl" descendants is expected to give an empty list.
    self.assertEqual(fold_node.getDescendents("cl"), [])
def testParse(self):
    """Round-trip the SCOP test files through ``Scop`` and check lookups.

    The cla/des/hie test files are read into memory, a ``Scop`` is built
    from in-memory copies, each table is written back out and compared
    with its input, and a handful of domain/node lookups (including two
    misses) are verified.
    """
    # Context managers close each handle even if a read fails part-way,
    # replacing the manual open/close sequence inside a single try/finally.
    with open("./SCOP/dir.cla.scop.txt_test") as handle:
        cla = handle.read()
    with open("./SCOP/dir.des.scop.txt_test") as handle:
        des = handle.read()
    with open("./SCOP/dir.hie.scop.txt_test") as handle:
        hie = handle.read()

    scop = Scop(StringIO(cla), StringIO(des), StringIO(hie))

    # cla output is compared line by line through the class helper rather
    # than as one string.
    cla_out = StringIO()
    scop.write_cla(cla_out)
    expected_lines = cla.rstrip().split('\n')
    written_lines = cla_out.getvalue().rstrip().split('\n')
    # NOTE(review): zip() stops at the shorter sequence, so a difference in
    # line count between input and output is not detected by this loop.
    for expected_line, line in zip(expected_lines, written_lines):
        self.assertTrue(self._compare_cla_lines(expected_line, line))

    # des and hie output must reproduce the input text exactly.
    des_out = StringIO()
    scop.write_des(des_out)
    self.assertEqual(des_out.getvalue(), des)

    hie_out = StringIO()
    scop.write_hie(hie_out)
    self.assertEqual(hie_out.getvalue(), hie)

    domain = scop.getDomainBySid("d1hbia_")
    self.assertEqual(domain.sunid, 14996)

    domains = scop.getDomains()
    self.assertEqual(len(domains), 14)
    self.assertEqual(domains[4].sunid, 14988)

    # Lookups that miss are expected to give None.
    self.assertIsNone(scop.getNodeBySunid(-111))
    self.assertIsNone(scop.getDomainBySid("no such domain"))
def test_get_descendents(self):
    """Check ``getDescendents`` on node 46457 for several node types."""
    scop = Scop(dir_path="SCOP", version="test")
    fold_node = scop.getNodeBySunid(46457)

    # (requested node type, expected number of hits, expected ``type`` value)
    expectations = (
        ('px', 14, 'px'),
        ('superfamily', 1, 'sf'),
    )
    for requested, expected_count, expected_type in expectations:
        found = fold_node.getDescendents(requested)
        self.assertEqual(len(found), expected_count)
        for node in found:
            self.assertEqual(node.type, expected_type)

    # Asking this node for 'cl' descendants is expected to give an empty list.
    self.assertEqual(fold_node.getDescendents('cl'), [])
def blast(algo):
    """Score PSI-BLAST or DELTA-BLAST superfamily retrieval with ROC AUC.

    For every key of the pickled hierarchy (``data/train/scop40_1fold_hie.pkl``)
    one member sid is drawn at random, written to a temporary FASTA file and
    searched against the ``astral-scopdom-seqres-gd-sel-gs-bib-40-1.75`` BLAST
    database. The first 500 hits of the last result block in the XML output
    (presumably the final search iteration) are labelled by whether their
    superfamily sccs matches the query's, and an AUC is computed from the
    negated e-values. All per-superfamily results are pickled to
    ``auc_result_{algo}_{unix_time}.pkl`` in the current directory.

    Args:
        algo: ``'psiblast'`` or ``'deltablast'``.

    Raises:
        ValueError: If ``algo`` is not one of the supported names. This is
            checked before any data is loaded or any file is written.
    """
    # Fail fast: previously a bad name was only noticed inside the loop,
    # after loading all inputs and writing a FASTA file.
    if algo not in ('psiblast', 'deltablast'):
        raise ValueError(f'Invalid algorithm ({algo})')
    command_classes = {
        'psiblast': NcbipsiblastCommandline,
        'deltablast': NcbideltablastCommandline,
    }
    command_cls = command_classes[algo]
    db_name = 'astral-scopdom-seqres-gd-sel-gs-bib-40-1.75'
    # os.cpu_count() may return None; fall back to a single thread.
    num_threads = os.cpu_count() or 1

    seqindex = SeqIO.index('data/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa',
                           'fasta', key_function=lambda x: x.split()[0])
    # NOTE: pickle must only be used on trusted files; this one is assumed to
    # be produced by this project's own preprocessing.
    with Path('data/train/scop40_1fold_hie.pkl').open('rb') as handle:
        hie = pickle.load(handle)
    scop = Scop(dir_path='data/scop', version='1.75')
    tmpdir = Path(f'.{algo}')
    tmpdir.mkdir(exist_ok=True)

    auc_result = {}
    for sf in tqdm(hie):
        px_list = hie[sf]
        if len(px_list) == 0:
            continue
        sid = random.sample(px_list, 1)[0]
        f_fasta = tmpdir / f'{sid}.fasta'
        f_xml = tmpdir / f'{sid}.xml'
        SeqIO.write(seqindex[sid], f_fasta.as_posix(), 'fasta')
        try:
            # Instantiating builds the command line; calling it runs the tool.
            command_cls(query=f_fasta.as_posix(), db=db_name,
                        num_threads=num_threads, num_iterations=3,
                        evalue=999999, outfmt=5, out=f_xml.as_posix())()
        except ApplicationError as e:
            logging.error(e)
            # A failed run may not have produced any output file at all.
            if f_xml.exists():
                f_xml.unlink()
            continue
        finally:
            f_fasta.unlink()

        try:
            # Materialise the parse so the XML file can be removed right away,
            # even when parsing raises.
            result_blocks = list(SearchIO.parse(f_xml.as_posix(), 'blast-xml'))
        finally:
            if f_xml.exists():
                f_xml.unlink()
        hits = list(result_blocks[-1])[:500]

        sf_sccs = scop.getNodeBySunid(sf).sccs
        # The first token of a hit description is taken to be its sccs (e.g.
        # 'a.1.1.12'); dropping the last dotted component gives the superfamily
        # part. A fixed [:-2] slice broke for family numbers of two or more
        # digits.
        roc_label = [
            1 if hit.description.split(' ')[0].rsplit('.', 1)[0] == sf_sccs else 0
            for hit in hits
        ]
        # Negated so that a smaller e-value means a higher score.
        roc_score = [-hit.hsps[0].evalue for hit in hits]

        labels = np.array(roc_label)
        # roc_auc_score needs both classes present, so the single-class cases
        # are assigned directly.
        # NOTE(review): an empty hit list also takes the first branch (np.all
        # of an empty array is True) and is recorded as AUC 1.0.
        if np.all(labels == 1):
            auc = 1.0
        elif np.all(labels == 0):
            auc = 0.0
        else:
            auc = metrics.roc_auc_score(roc_label, roc_score)
        auc_result[sf_sccs] = {'auc': auc, 'sample': sid, 'num': len(hits)}

    now = int(time.time())
    # Close the output handle explicitly so the pickle is flushed to disk.
    with Path(f'auc_result_{algo}_{now}.pkl').open('wb') as handle:
        pickle.dump(auc_result, handle)
('d3cw9a1', datetime.datetime(2008, 9, 2, 0, 0), 'e.23.1', 22), ('d2axtd1', datetime.datetime(2009, 2, 10, 0, 0), 'f.26.1', 174), ('d2axto1', datetime.datetime(2009, 2, 10, 0, 0), 'f.4.1', 15), ('d2vy4a1', datetime.datetime(2009, 2, 17, 0, 0), 'g.37.1', 182), ('d3d9ta1', datetime.datetime(2009, 2, 10, 0, 0), 'g.52.1', 81), ) test_data = [x[0] for x in test_data] scop40 = SeqIO.index('data/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta') scop100_hie = Scop(dir_path=Path('data/scop'), version='1.75') if n_splits > 1: fold1 = next(KFold(n_splits=n_splits, shuffle=True).split(scop40)) samples = np.array([v for i, v in enumerate(scop40) if i in fold1[1]]) # sf_sunid of scop100 (sid) px = np.array([x.sid for x in scop100_hie.getNodeBySunid(sf_sunid).getDescendents('px')]) # select only sf_sunid in scop40 from scop100 isect = np.intersect1d(samples, px) # select half of sf_sunid only in scop40 for test data fold1 = next(KFold(n_splits=test_n_splits, shuffle=True).split(isect)) tests = np.array([v for i, v in enumerate(isect) if i in fold1[1]]) np.save(Path(f'data/test/scop40_{n_splits}fold_sf{sf_sunid}_testdata_{test_n_splits}fold.npy'), tests) # select domain sids only in scop40 for training data train = np.setdiff1d(samples, tests) np.save(Path(f'data/train/scop40_{n_splits}fold_trainingdata.npy'), train) # {sf: [sid]} list for making alignment pairs in the same superfamily hie = {} for i in train: dom = scop100_hie.getDomainBySid(i) if dom: sf = dom.getAscendent('sf').sccs