def generate_structural_alignments(scop40_fasta: str, scop_dir: str, scop_version: str, pdb_dir: str, out_file: str):
    """Run pairwise TM-align jobs inside every SCOP superfamily and save the output.

    The SCOP tree is walked class -> fold -> superfamily. For each superfamily
    with at least two ``px`` descendants, every pair of domains whose sids are
    both present in the ``scop40_fasta`` index is submitted to
    ``run_tmalign_async`` on a thread pool. Jobs that return ``None`` are
    dropped; for the others the first two items of the result are collected.
    Everything collected is finally written to ``out_file`` in FASTA format.

    Args:
        scop40_fasta: FASTA file indexed to decide which domain sids are used.
        scop_dir: Directory handed to ``Scop`` as ``dir_path``.
        scop_version: Version string handed to ``Scop``.
        pdb_dir: Root directory of the ``.ent`` structure files, laid out as
            ``<pdb_dir>/<sid[2:4]>/<sid>.ent``.
        out_file: Destination passed to ``SeqIO.write``.
    """
    def _ent_path(sid):
        # Structure files are sharded by characters 2-3 of the sid.
        return f'{pdb_dir}/{sid[2:4]}/{sid}.ent'

    indexed_seqs = SeqIO.index(scop40_fasta, 'fasta')
    root = Scop(dir_path=scop_dir, version=scop_version).getRoot()
    records = []
    for scop_class in tqdm(root.getChildren()):
        for fold in tqdm(scop_class.getChildren()):
            for superfamily in tqdm(fold.getChildren()):
                domains = superfamily.getDescendents('px')
                if len(domains) < 2:
                    # No pair can be formed from fewer than two domains.
                    continue
                with ThreadPoolExecutor() as pool:
                    pending = [
                        pool.submit(
                            run_tmalign_async,
                            _ent_path(first.sid),
                            _ent_path(second.sid),
                        )
                        for first, second in itertools.combinations(domains, 2)
                        if first.sid in indexed_seqs and second.sid in indexed_seqs
                    ]
                    for finished in as_completed(pending):
                        outcome = finished.result()
                        if outcome is None:
                            continue
                        records.append(outcome[0])
                        records.append(outcome[1])
    SeqIO.write(records, out_file, 'fasta')
def testConstructWithCustomFile(self):
    """Build an Astral object from an explicit sequence file and look up two sids."""
    hierarchy = Scop(dir_path="SCOP", version="test")
    custom_astral = Astral(
        scop=hierarchy,
        astral_file="SCOP/scopseq-test/astral-scopdom-seqres-all-test.fa",
    )
    # (sid, expected sequence) pairs, checked in this order.
    expectations = (("d3sdha_", "AAAAA"), ("d4hbib_", "KKKKK"))
    for sid, sequence in expectations:
        self.assertEqual(str(custom_astral.getSeqBySid(sid)), sequence)
def testGetAscendent(self):
    """Check getAscendent for existing ancestor levels and for levels with no ancestor."""
    scop = Scop(dir_path="SCOP", version="test")
    domain = scop.getDomainBySid("d1hbia_")

    # The enclosing fold, requested by its short type code.
    fold = domain.getAscendent("cf")
    self.assertEqual(fold.sunid, 46457)

    # The enclosing superfamily, requested by its long name.
    sf = domain.getAscendent("superfamily")
    self.assertEqual(sf.sunid, 46458)

    # A px node is expected to have no px ascendent.
    self.assertIsNone(domain.getAscendent("px"))

    # A superfamily is expected to have no px ascendent either.
    self.assertIsNone(sf.getAscendent("px"))
def test_get_descendents(self):
    """Check getDescendents on a fold for px, superfamily and cl queries."""
    hierarchy = Scop(dir_path="SCOP", version="test")
    fold_node = hierarchy.getNodeBySunid(46457)

    # Every px node below the fold.
    px_nodes = fold_node.getDescendents('px')
    self.assertEqual(len(px_nodes), 14)
    for node in px_nodes:
        self.assertEqual(node.type, 'px')

    # Queried by long name; the returned nodes report the short code 'sf'.
    superfamilies = fold_node.getDescendents('superfamily')
    self.assertEqual(len(superfamilies), 1)
    for node in superfamilies:
        self.assertEqual(node.type, 'sf')

    # Asking the fold for 'cl' descendants is expected to give an empty list.
    class_level = fold_node.getDescendents('cl')
    self.assertEqual(class_level, [])
def testGetAscendent(self):
    """Check getAscendent for existing ancestor levels and for levels with no ancestor."""
    scop = Scop(dir_path="SCOP", version="test")
    domain = scop.getDomainBySid("d1hbia_")

    # The enclosing fold, requested by its short type code.
    fold = domain.getAscendent('cf')
    self.assertEqual(fold.sunid, 46457)

    # The enclosing superfamily, requested by its long name.
    sf = domain.getAscendent('superfamily')
    self.assertEqual(sf.sunid, 46458)

    # A px node is expected to have no px ascendent.
    px = domain.getAscendent('px')
    self.assertIsNone(px)

    # A superfamily is expected to have no px ascendent either.
    px2 = sf.getAscendent('px')
    self.assertIsNone(px2)
def test_get_descendents(self):
    """Exercise getDescendents on a fold node at three different levels."""
    tree = Scop(dir_path="SCOP", version="test")
    fold_node = tree.getNodeBySunid(46457)

    # Domain (px) level: 14 nodes, each typed "px".
    px_level = fold_node.getDescendents("px")
    self.assertEqual(len(px_level), 14)
    for entry in px_level:
        self.assertEqual(entry.type, "px")

    # Superfamily level, requested by long name; node type is the short "sf".
    sf_level = fold_node.getDescendents("superfamily")
    self.assertEqual(len(sf_level), 1)
    for entry in sf_level:
        self.assertEqual(entry.type, "sf")

    # A "cl" query on the fold is expected to return an empty list.
    cl_level = fold_node.getDescendents("cl")
    self.assertEqual(cl_level, [])
class AstralTests(unittest.TestCase):
    """Tests for Astral sequence lookup and domain selection on the test data set."""

    def setUp(self):
        """Create the Scop hierarchy and the Astral object shared by the tests."""
        hierarchy = Scop(dir_path="SCOP", version="test")
        self.scop = hierarchy
        self.astral = Astral(scop=hierarchy, dir_path="SCOP", version="test")

    def testGetSeq(self):
        """Sequences can be fetched by sid and by domain object."""
        for sid, sequence in (("d3sdha_", "AAAAA"), ("d4hbib_", "KKKKK")):
            self.assertEqual(str(self.astral.getSeqBySid(sid)), sequence)

        domain = self.scop.getDomainBySid("d3sdha_")
        self.assertEqual(str(self.astral.getSeq(domain)), "AAAAA")

    def testConstructWithCustomFile(self):
        """Astral can be built from an explicitly named sequence file."""
        hierarchy = Scop(dir_path="SCOP", version="test")
        custom_astral = Astral(
            scop=hierarchy,
            astral_file="SCOP/scopseq-test/astral-scopdom-seqres-all-test.fa",
        )
        for sid, sequence in (("d3sdha_", "AAAAA"), ("d4hbib_", "KKKKK")):
            self.assertEqual(str(custom_astral.getSeqBySid(sid)), sequence)

    def testGetDomainsFromFile(self):
        """Domains listed in an id file are returned in the expected order."""
        id_file = "SCOP/scopseq-test/astral-scopdom-seqres-sel-gs-bib-20-test.id"
        listed = self.astral.getAstralDomainsFromFile(id_file)
        self.assertEqual(len(listed), 3)
        for position, sid in enumerate(("d3sdha_", "d4hbib_", "d5hbia_")):
            self.assertEqual(listed[position].sid, sid)

    def testGetDomainsClustered(self):
        """Clustered selections by identity and by e-value have the expected contents."""
        by_identity = self.astral.domainsClusteredById(20)
        self.assertEqual(len(by_identity), 3)
        for position, sid in enumerate(("d3sdha_", "d4hbib_", "d5hbia_")):
            self.assertEqual(by_identity[position].sid, sid)

        by_evalue = self.astral.domainsClusteredByEv(1e-15)
        self.assertEqual(len(by_evalue), 1)
class AstralTests(unittest.TestCase):
    """Astral tests: sequence retrieval and domain selections on the test data."""

    def setUp(self):
        """Build the Scop tree and an Astral instance on top of it."""
        tree = Scop(dir_path="SCOP", version="test")
        self.scop = tree
        self.astral = Astral(scop=tree, dir_path="SCOP", version="test")

    def testGetSeq(self):
        """Look sequences up by sid string and by domain object."""
        astral = self.astral
        self.assertEqual(str(astral.getSeqBySid('d3sdha_')), "AAAAA")
        self.assertEqual(str(astral.getSeqBySid('d4hbib_')), "KKKKK")

        looked_up = self.scop.getDomainBySid('d3sdha_')
        self.assertEqual(str(astral.getSeq(looked_up)), "AAAAA")

    def testConstructWithCustomFile(self):
        """Construct Astral from a specific sequence file and query two sids."""
        tree = Scop(dir_path="SCOP", version="test")
        seq_file = "SCOP/scopseq-test/astral-scopdom-seqres-all-test.fa"
        from_file = Astral(scop=tree, astral_file=seq_file)
        self.assertEqual(str(from_file.getSeqBySid('d3sdha_')), "AAAAA")
        self.assertEqual(str(from_file.getSeqBySid('d4hbib_')), "KKKKK")

    def testGetDomainsFromFile(self):
        """Read a domain id file and check the number and order of the domains."""
        id_path = "SCOP/scopseq-test/astral-scopdom-seqres-sel-gs-bib-20-test.id"
        found = self.astral.getAstralDomainsFromFile(id_path)
        self.assertEqual(len(found), 3)
        wanted = ("d3sdha_", "d4hbib_", "d5hbia_")
        for idx, sid in enumerate(wanted):
            self.assertEqual(found[idx].sid, sid)

    def testGetDomainsClustered(self):
        """Check the identity-clustered and e-value-clustered selections."""
        id_clustered = self.astral.domainsClusteredById(20)
        self.assertEqual(len(id_clustered), 3)
        wanted = ("d3sdha_", "d4hbib_", "d5hbia_")
        for idx, sid in enumerate(wanted):
            self.assertEqual(id_clustered[idx].sid, sid)

        ev_clustered = self.astral.domainsClusteredByEv(1e-15)
        self.assertEqual(len(ev_clustered), 1)
def testConstructFromDirectory(self):
    """Build Scop from a directory and version, then look up one known domain."""
    scop = Scop(dir_path="SCOP", version="test")
    # assertIsInstance reports the actual type on failure, unlike assertTrue(isinstance(...)).
    self.assertIsInstance(scop, Scop)

    domain = scop.getDomainBySid("d1hbia_")
    self.assertEqual(domain.sunid, 14996)
def setUp(self):
    """Create the Scop hierarchy and the Astral object that the tests share."""
    hierarchy = Scop(dir_path="SCOP", version="test")
    self.scop = hierarchy
    self.astral = Astral(scop=hierarchy, dir_path="SCOP", version="test")
import numpy as np from sklearn import metrics from Bio.SCOP import Scop from Bio.Blast.Applications import NcbideltablastCommandline, NcbipsiblastCommandline from Bio import SeqIO, SearchIO from Bio.Seq import Seq from Bio.Alphabet import generic_protein import matplotlib.pyplot import matplotlib.figure import matplotlib.axes import pandas import seaborn as sns from tqdm import tqdm hie = pickle.load(Path('data/train/scop40_1fold_hie.pkl').open('rb')) scop100 = Scop(dir_path='data/scop', version='1.75') seqindex = SeqIO.index('data/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta') test_data = ( ('d1wlqc_', datetime.datetime(2009, 2, 17, 0, 0), 'a.4.5', 762), ('d2axtu1', datetime.datetime(2009, 2, 10, 0, 0), 'a.60.12', 159), ('d2zqna1', datetime.datetime(2009, 2, 10, 0, 0), 'b.42.2', 119), ('d1qg3a1', datetime.datetime(2009, 1, 20, 0, 0), 'b.1.2', 344), ('d1wzca1', datetime.datetime(2009, 1, 27, 0, 0), 'c.108.1', 296), ('d2dsta1', datetime.datetime(2009, 1, 27, 0, 0), 'c.69.1', 975), ('d1y5ha3', datetime.datetime(2009, 2, 10, 0, 0), 'd.37.1', 62), ('d2pzza1', datetime.datetime(2009, 1, 20, 0, 0), 'd.77.1', 92), ('d1ni9a_', datetime.datetime(2009, 2, 10, 0, 0), 'e.7.1', 151), ('d3cw9a1', datetime.datetime(2008, 9, 2, 0, 0), 'e.23.1', 22), ('d2axtd1', datetime.datetime(2009, 2, 10, 0, 0), 'f.26.1', 174), ('d2axto1', datetime.datetime(2009, 2, 10, 0, 0), 'f.4.1', 15),
def testParse(self):
    """Parse the cla/des/hie test files, write them back out and query the result.

    The cla output is compared line by line through ``_compare_cla_lines``;
    the des and hie output must equal the input text exactly. Afterwards a
    few lookups (by sid, by sunid, full domain list) are checked, including
    lookups that are expected to find nothing.
    """
    # Each file gets its own context manager so the handle is closed even
    # when a later open() or read() fails.
    with open("./SCOP/dir.cla.scop.txt_test") as handle:
        cla = handle.read()
    with open("./SCOP/dir.des.scop.txt_test") as handle:
        des = handle.read()
    with open("./SCOP/dir.hie.scop.txt_test") as handle:
        hie = handle.read()

    scop = Scop(StringIO(cla), StringIO(des), StringIO(hie))

    cla_out = StringIO()
    scop.write_cla(cla_out)
    lines = zip(cla.rstrip().split('\n'),
                cla_out.getvalue().rstrip().split('\n'))
    for expected_line, line in lines:
        self.assertTrue(self._compare_cla_lines(expected_line, line))

    des_out = StringIO()
    scop.write_des(des_out)
    self.assertEqual(des_out.getvalue(), des)

    hie_out = StringIO()
    scop.write_hie(hie_out)
    self.assertEqual(hie_out.getvalue(), hie)

    domain = scop.getDomainBySid("d1hbia_")
    self.assertEqual(domain.sunid, 14996)

    domains = scop.getDomains()
    self.assertEqual(len(domains), 14)
    self.assertEqual(domains[4].sunid, 14988)

    # Unknown identifiers are expected to give None rather than raise.
    self.assertIsNone(scop.getNodeBySunid(-111))
    self.assertIsNone(scop.getDomainBySid("no such domain"))
def blast(algo):
    """Run PSI-BLAST or DELTA-BLAST for one random domain per superfamily and record AUCs.

    For every superfamily key in the pickled hierarchy, one sid is sampled,
    its sequence is written to a temporary FASTA file under ``.<algo>/`` and
    searched against the ``astral-scopdom-seqres-gd-sel-gs-bib-40-1.75``
    database. Up to 500 hits of the last parsed query result are labelled 1
    when their superfamily sccs equals that of the query superfamily, else 0,
    and scored by the negated e-value of their first HSP. The per-superfamily
    results are pickled to ``auc_result_<algo>_<unix time>.pkl``.

    Args:
        algo: ``'psiblast'`` or ``'deltablast'``.

    Raises:
        ValueError: If ``algo`` is neither of the two supported names (raised
            when the first non-empty superfamily is processed).
    """
    seqindex = SeqIO.index('data/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta',
                           key_function=lambda x: x.split()[0])
    # NOTE: pickle must only be used on trusted files; this one is a local data artefact.
    with Path('data/train/scop40_1fold_hie.pkl').open('rb') as hie_file:
        hie = pickle.load(hie_file)
    scop = Scop(dir_path='data/scop', version='1.75')
    tmpdir = Path(f'.{algo}')
    tmpdir.mkdir(exist_ok=True)
    # os.cpu_count() may return None; fall back to a single thread in that case.
    num_threads = os.cpu_count() or 1
    # Both command-line wrappers are called with the same keyword arguments.
    commands = {
        'psiblast': NcbipsiblastCommandline,
        'deltablast': NcbideltablastCommandline,
    }
    auc_result = {}
    for sf in tqdm(hie):
        px_list = hie[sf]
        if len(px_list) < 1:
            continue
        sid = random.sample(px_list, 1)[0]
        record = seqindex[sid]
        f_fasta = tmpdir/f'{sid}.fasta'
        f_xml = tmpdir/f'{sid}.xml'
        SeqIO.write(record, f_fasta.as_posix(), 'fasta')
        try:
            if algo not in commands:
                raise ValueError(f'Invalid algorithm ({algo})')
            commands[algo](query=f_fasta.as_posix(),
                           db='astral-scopdom-seqres-gd-sel-gs-bib-40-1.75',
                           num_threads=num_threads,
                           num_iterations=3,
                           evalue=999999,
                           outfmt=5,
                           out=f_xml.as_posix())()
        except ApplicationError as e:
            logging.error(e)
            # The search may have failed before any output was written.
            if f_xml.exists():
                f_xml.unlink()
            continue
        finally:
            f_fasta.unlink()
        results = SearchIO.parse(f_xml.as_posix(), 'blast-xml')
        # Keep only the last parsed query result, capped at 500 hits.
        results = list(results)[-1]
        results = list(results)[:500]
        sf_sccs = scop.getNodeBySunid(sf).sccs
        roc_score = []
        roc_label = []
        for result in results:
            # The first token of the description is taken as the hit's sccs;
            # drop its last dotted component to get the superfamily sccs.
            # A fixed [:-2] slice breaks for family numbers of 10 or more.
            result_sf_sccs = result.description.split(' ')[0].rsplit('.', 1)[0]
            roc_score.append(-result.hsps[0].evalue)
            roc_label.append(1 if result_sf_sccs == sf_sccs else 0)
        # roc_auc_score needs both classes; single-class cases are set by hand.
        labels = np.array(roc_label)
        if np.all(labels == 1):
            auc = 1.0
        elif np.all(labels == 0):
            auc = 0.0
        else:
            auc = metrics.roc_auc_score(roc_label, roc_score)
        auc_result[sf_sccs] = {'auc': auc, 'sample': sid, 'num': len(results)}
        f_xml.unlink()
    now = int(time.time())
    with Path(f'auc_result_{algo}_{now}.pkl').open('wb') as out_file:
        pickle.dump(auc_result, out_file)
def testParse(self):
    """Parse the cla/des/hie test files, write them back out and query the result.

    The cla output is compared line by line through ``_compare_cla_lines``;
    the des and hie output must equal the input text exactly. Afterwards a
    few lookups (by sid, by sunid, full domain list) are checked, including
    lookups that are expected to find nothing.
    """
    # Each file gets its own context manager so the handle is closed even
    # when a later open() or read() fails.
    with open("./SCOP/dir.cla.scop.txt_test") as handle:
        cla = handle.read()
    with open("./SCOP/dir.des.scop.txt_test") as handle:
        des = handle.read()
    with open("./SCOP/dir.hie.scop.txt_test") as handle:
        hie = handle.read()

    scop = Scop(StringIO(cla), StringIO(des), StringIO(hie))

    cla_out = StringIO()
    scop.write_cla(cla_out)
    lines = zip(cla.rstrip().split("\n"),
                cla_out.getvalue().rstrip().split("\n"))
    for expected_line, line in lines:
        self.assertTrue(self._compare_cla_lines(expected_line, line))

    des_out = StringIO()
    scop.write_des(des_out)
    self.assertEqual(des_out.getvalue(), des)

    hie_out = StringIO()
    scop.write_hie(hie_out)
    self.assertEqual(hie_out.getvalue(), hie)

    domain = scop.getDomainBySid("d1hbia_")
    self.assertEqual(domain.sunid, 14996)

    domains = scop.getDomains()
    self.assertEqual(len(domains), 14)
    self.assertEqual(domains[4].sunid, 14988)

    # Unknown identifiers are expected to give None rather than raise.
    dom = scop.getNodeBySunid(-111)
    self.assertIsNone(dom)
    dom = scop.getDomainBySid("no such domain")
    self.assertIsNone(dom)
('d1qg3a1', datetime.datetime(2009, 1, 20, 0, 0), 'b.1.2', 344), ('d1wzca1', datetime.datetime(2009, 1, 27, 0, 0), 'c.108.1', 296), ('d2dsta1', datetime.datetime(2009, 1, 27, 0, 0), 'c.69.1', 975), ('d1y5ha3', datetime.datetime(2009, 2, 10, 0, 0), 'd.37.1', 62), ('d2pzza1', datetime.datetime(2009, 1, 20, 0, 0), 'd.77.1', 92), ('d1ni9a_', datetime.datetime(2009, 2, 10, 0, 0), 'e.7.1', 151), ('d3cw9a1', datetime.datetime(2008, 9, 2, 0, 0), 'e.23.1', 22), ('d2axtd1', datetime.datetime(2009, 2, 10, 0, 0), 'f.26.1', 174), ('d2axto1', datetime.datetime(2009, 2, 10, 0, 0), 'f.4.1', 15), ('d2vy4a1', datetime.datetime(2009, 2, 17, 0, 0), 'g.37.1', 182), ('d3d9ta1', datetime.datetime(2009, 2, 10, 0, 0), 'g.52.1', 81), ) test_data = [x[0] for x in test_data] scop40 = SeqIO.index('data/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta') scop100_hie = Scop(dir_path=Path('data/scop'), version='1.75') if n_splits > 1: fold1 = next(KFold(n_splits=n_splits, shuffle=True).split(scop40)) samples = np.array([v for i, v in enumerate(scop40) if i in fold1[1]]) # sf_sunid of scop100 (sid) px = np.array([x.sid for x in scop100_hie.getNodeBySunid(sf_sunid).getDescendents('px')]) # select only sf_sunid in scop40 from scop100 isect = np.intersect1d(samples, px) # select half of sf_sunid only in scop40 for test data fold1 = next(KFold(n_splits=test_n_splits, shuffle=True).split(isect)) tests = np.array([v for i, v in enumerate(isect) if i in fold1[1]]) np.save(Path(f'data/test/scop40_{n_splits}fold_sf{sf_sunid}_testdata_{test_n_splits}fold.npy'), tests) # select domain sids only in scop40 for training data train = np.setdiff1d(samples, tests) np.save(Path(f'data/train/scop40_{n_splits}fold_trainingdata.npy'), train)