Пример #1
0
def generate_structural_alignments(scop40_fasta: str, scop_dir: str,
                                   scop_version: str, pdb_dir: str,
                                   out_file: str):
    """Align all same-superfamily domain pairs and write the records as FASTA.

    Walks the SCOP tree three levels down from the root (the loop variables
    are named class, fold and superfamily).  Within each superfamily node,
    every unordered pair of ``px`` descendants whose sids are both keys of
    the ``scop40_fasta`` index is passed to ``run_tmalign_async`` on a
    thread pool.  For each result that is not ``None``, items 0 and 1 are
    collected, and all collected records are written to ``out_file``.

    Args:
        scop40_fasta: FASTA file indexed with ``SeqIO.index``; only domains
            whose sid is a key of this index are aligned.
        scop_dir: Passed to ``Scop`` as ``dir_path``.
        scop_version: Passed to ``Scop`` as ``version``.
        pdb_dir: Root of the structure files, laid out as
            ``<pdb_dir>/<sid[2:4]>/<sid>.ent``.
        out_file: Destination handed to ``SeqIO.write`` (format ``fasta``).
    """
    scop40 = SeqIO.index(scop40_fasta, 'fasta')
    scop_root = Scop(dir_path=scop_dir, version=scop_version).getRoot()

    def ent_path(sid):
        # Structure file location for one domain sid.
        return f'{pdb_dir}/{sid[2:4]}/{sid}.ent'

    results = []
    for cl in tqdm(scop_root.getChildren()):
        for cf in tqdm(cl.getChildren()):
            for sf in tqdm(cf.getChildren()):
                # Filter once per domain instead of once per pair; the pairs
                # produced below are the same ones, in the same order.
                sids = [px.sid for px in sf.getDescendents('px')
                        if px.sid in scop40]
                if len(sids) < 2:
                    # No pair to align, so do not start a thread pool.
                    continue
                with ThreadPoolExecutor() as executor:
                    futures = [
                        executor.submit(run_tmalign_async,
                                        ent_path(first), ent_path(second))
                        for first, second in itertools.combinations(sids, 2)
                    ]
                    # Collect in submission order so the output file is the
                    # same from run to run; the work still runs concurrently.
                    for future in futures:
                        result = future.result()
                        if result is not None:
                            results.append(result[0])
                            results.append(result[1])
    SeqIO.write(results, out_file, 'fasta')
Пример #2
0
 def testConstructWithCustomFile(self):
     """Build Astral from an explicit sequence file and look up two sids."""
     seq_file = "SCOP/scopseq-test/astral-scopdom-seqres-all-test.fa"
     astral = Astral(scop=Scop(dir_path="SCOP", version="test"),
                     astral_file=seq_file)
     # Each sid must map to its expected sequence string.
     for sid, expected in (("d3sdha_", "AAAAA"), ("d4hbib_", "KKKKK")):
         self.assertEqual(str(astral.getSeqBySid(sid)), expected)
Пример #3
0
    def testGetAscendent(self):
        """Check getAscendent for reachable and unreachable node types."""
        scop = Scop(dir_path="SCOP", version="test")
        domain = scop.getDomainBySid("d1hbia_")

        # The fold is requested by its short code.
        fold = domain.getAscendent("cf")
        self.assertEqual(fold.sunid, 46457)

        # The superfamily is requested by its long name.
        sf = domain.getAscendent("superfamily")
        self.assertEqual(sf.sunid, 46458)

        # A px domain has no px ascendent.
        self.assertIsNone(domain.getAscendent("px"))

        # A superfamily has no px ascendent either.
        self.assertIsNone(sf.getAscendent("px"))
Пример #4
0
    def test_get_descendents(self):
        """Check getDescendents on node 46457 for three node types."""
        fold_node = Scop(dir_path="SCOP", version="test").getNodeBySunid(46457)

        # (requested type, expected count, expected .type of each hit)
        expectations = (('px', 14, 'px'), ('superfamily', 1, 'sf'))
        for requested, count, node_type in expectations:
            found = fold_node.getDescendents(requested)
            self.assertEqual(len(found), count)
            for node in found:
                self.assertEqual(node.type, node_type)

        # Asking this node for 'cl' descendents gives an empty list.
        self.assertEqual(fold_node.getDescendents('cl'), [])
Пример #5
0
    def testGetAscendent(self):
        """Check getAscendent for reachable and unreachable node types."""
        scop = Scop(dir_path="SCOP", version="test")
        domain = scop.getDomainBySid("d1hbia_")

        # get the fold (short type code)
        fold = domain.getAscendent('cf')
        self.assertEqual(fold.sunid, 46457)

        # get the superfamily (long type name)
        sf = domain.getAscendent('superfamily')
        self.assertEqual(sf.sunid, 46458)

        # px has no px ascendent
        self.assertIsNone(domain.getAscendent('px'))

        # an sf has no px ascendent
        self.assertIsNone(sf.getAscendent('px'))
Пример #6
0
    def test_get_descendents(self):
        """Check what getDescendents returns for node 46457."""
        root = Scop(dir_path="SCOP", version="test")
        fold_node = root.getNodeBySunid(46457)

        # Fourteen px descendents, each reporting type "px".
        px_nodes = fold_node.getDescendents("px")
        self.assertEqual(len(px_nodes), 14)
        for node in px_nodes:
            self.assertEqual(node.type, "px")

        # One hit for the long name "superfamily", reporting type "sf".
        sf_nodes = fold_node.getDescendents("superfamily")
        self.assertEqual(len(sf_nodes), 1)
        for node in sf_nodes:
            self.assertEqual(node.type, "sf")

        # Requesting "cl" descendents of this node yields an empty list.
        self.assertEqual(fold_node.getDescendents("cl"), [])
Пример #7
0
class AstralTests(unittest.TestCase):
    """Tests for Astral run against the "test" release in the SCOP directory."""

    def setUp(self):
        """Create the Scop and Astral objects used by the tests."""
        release = {"dir_path": "SCOP", "version": "test"}
        self.scop = Scop(**release)
        self.astral = Astral(scop=self.scop, **release)

    def testGetSeq(self):
        """Look sequences up by sid and by domain object."""
        for sid, expected in (("d3sdha_", "AAAAA"), ("d4hbib_", "KKKKK")):
            self.assertEqual(str(self.astral.getSeqBySid(sid)), expected)

        domain = self.scop.getDomainBySid("d3sdha_")
        self.assertEqual(str(self.astral.getSeq(domain)), "AAAAA")

    def testConstructWithCustomFile(self):
        """Build Astral from an explicit sequence file instead of a directory."""
        seq_file = "SCOP/scopseq-test/astral-scopdom-seqres-all-test.fa"
        astral = Astral(scop=Scop(dir_path="SCOP", version="test"),
                        astral_file=seq_file)
        for sid, expected in (("d3sdha_", "AAAAA"), ("d4hbib_", "KKKKK")):
            self.assertEqual(str(astral.getSeqBySid(sid)), expected)

    def testGetDomainsFromFile(self):
        """Read a domain id file and check the three domains it lists."""
        id_file = "SCOP/scopseq-test/astral-scopdom-seqres-sel-gs-bib-20-test.id"
        listed = self.astral.getAstralDomainsFromFile(id_file)

        self.assertEqual(len(listed), 3)
        for position, sid in enumerate(("d3sdha_", "d4hbib_", "d5hbia_")):
            self.assertEqual(listed[position].sid, sid)

    def testGetDomainsClustered(self):
        """Check domainsClusteredById(20) and domainsClusteredByEv(1e-15)."""
        by_identity = self.astral.domainsClusteredById(20)
        self.assertEqual(len(by_identity), 3)
        for position, sid in enumerate(("d3sdha_", "d4hbib_", "d5hbia_")):
            self.assertEqual(by_identity[position].sid, sid)

        by_evalue = self.astral.domainsClusteredByEv(1e-15)
        self.assertEqual(len(by_evalue), 1)
Пример #8
0
class AstralTests(unittest.TestCase):
    """Astral tests that use the "test" version found under SCOP/."""

    # The three sids expected, in this order, from both the id file and
    # domainsClusteredById(20).
    _EXPECTED_SIDS = ("d3sdha_", "d4hbib_", "d5hbia_")

    def setUp(self):
        """Build a Scop and an Astral that share dir_path and version."""
        scop_dir, release = "SCOP", "test"
        self.scop = Scop(dir_path=scop_dir, version=release)
        self.astral = Astral(scop=self.scop, dir_path=scop_dir, version=release)

    def testGetSeq(self):
        """Fetch sequences by sid, then by domain object."""
        first = self.astral.getSeqBySid('d3sdha_')
        self.assertEqual(str(first), "AAAAA")
        second = self.astral.getSeqBySid('d4hbib_')
        self.assertEqual(str(second), "KKKKK")

        domain = self.scop.getDomainBySid('d3sdha_')
        self.assertEqual(str(self.astral.getSeq(domain)), "AAAAA")

    def testConstructWithCustomFile(self):
        """Construct Astral with astral_file and look up two sequences."""
        fresh_scop = Scop(dir_path="SCOP", version="test")
        custom = Astral(
            scop=fresh_scop,
            astral_file="SCOP/scopseq-test/astral-scopdom-seqres-all-test.fa",
        )
        first = custom.getSeqBySid('d3sdha_')
        self.assertEqual(str(first), "AAAAA")
        second = custom.getSeqBySid('d4hbib_')
        self.assertEqual(str(second), "KKKKK")

    def testGetDomainsFromFile(self):
        """Load domains from an id file and verify count and sids."""
        loaded = self.astral.getAstralDomainsFromFile(
            "SCOP/scopseq-test/astral-scopdom-seqres-sel-gs-bib-20-test.id")

        self.assertEqual(len(loaded), 3)
        for index, sid in enumerate(self._EXPECTED_SIDS):
            self.assertEqual(loaded[index].sid, sid)

    def testGetDomainsClustered(self):
        """Verify the results of clustering by id and by e-value."""
        id_clustered = self.astral.domainsClusteredById(20)
        self.assertEqual(len(id_clustered), 3)
        for index, sid in enumerate(self._EXPECTED_SIDS):
            self.assertEqual(id_clustered[index].sid, sid)

        ev_clustered = self.astral.domainsClusteredByEv(1e-15)
        self.assertEqual(len(ev_clustered), 1)
Пример #9
0
    def testConstructFromDirectory(self):
        """Build Scop from a directory and version, then look up one domain."""
        scop = Scop(dir_path="SCOP", version="test")
        # assertIsInstance reports the actual type when the check fails.
        self.assertIsInstance(scop, Scop)

        domain = scop.getDomainBySid("d1hbia_")
        self.assertEqual(domain.sunid, 14996)
Пример #10
0
 def setUp(self):
     """Create the Scop and Astral objects used by the tests."""
     release = {"dir_path": "SCOP", "version": "test"}
     self.scop = Scop(**release)
     self.astral = Astral(scop=self.scop, **release)
Пример #11
0
 def setUp(self):
     """Build a Scop and an Astral that share dir_path and version."""
     scop_dir, release = "SCOP", "test"
     self.scop = Scop(dir_path=scop_dir, version=release)
     self.astral = Astral(scop=self.scop, dir_path=scop_dir, version=release)
Пример #12
0
import numpy as np
from sklearn import metrics
from Bio.SCOP import Scop
from Bio.Blast.Applications import NcbideltablastCommandline, NcbipsiblastCommandline
from Bio import SeqIO, SearchIO
from Bio.Seq import Seq
from Bio.Alphabet import generic_protein
import matplotlib.pyplot
import matplotlib.figure
import matplotlib.axes
import pandas
import seaborn as sns
from tqdm import tqdm

# Read the pickle through read_bytes() so no open file handle is left behind
# (the previous Path(...).open('rb') was never closed).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
hie = pickle.loads(Path('data/train/scop40_1fold_hie.pkl').read_bytes())
# SCOP version 1.75 loaded from data/scop.
scop100 = Scop(dir_path='data/scop', version='1.75')
# Index over the FASTA file, built with SeqIO.index.
seqindex = SeqIO.index('data/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa',
                       'fasta')
test_data = (
    ('d1wlqc_', datetime.datetime(2009, 2, 17, 0, 0), 'a.4.5', 762),
    ('d2axtu1', datetime.datetime(2009, 2, 10, 0, 0), 'a.60.12', 159),
    ('d2zqna1', datetime.datetime(2009, 2, 10, 0, 0), 'b.42.2', 119),
    ('d1qg3a1', datetime.datetime(2009, 1, 20, 0, 0), 'b.1.2', 344),
    ('d1wzca1', datetime.datetime(2009, 1, 27, 0, 0), 'c.108.1', 296),
    ('d2dsta1', datetime.datetime(2009, 1, 27, 0, 0), 'c.69.1', 975),
    ('d1y5ha3', datetime.datetime(2009, 2, 10, 0, 0), 'd.37.1', 62),
    ('d2pzza1', datetime.datetime(2009, 1, 20, 0, 0), 'd.77.1', 92),
    ('d1ni9a_', datetime.datetime(2009, 2, 10, 0, 0), 'e.7.1', 151),
    ('d3cw9a1', datetime.datetime(2008, 9, 2, 0, 0), 'e.23.1', 22),
    ('d2axtd1', datetime.datetime(2009, 2, 10, 0, 0), 'f.26.1', 174),
    ('d2axto1', datetime.datetime(2009, 2, 10, 0, 0), 'f.4.1', 15),
Пример #13
0
    def testParse(self):
        """Build Scop from the cla/des/hie test files and write them back out.

        The cla output is compared line by line with
        ``self._compare_cla_lines``; the des and hie outputs must equal the
        input text exactly.  A few lookups on the parsed data follow.
        """
        # One context manager per file, so each handle is closed even if a
        # later open() or read() fails.
        with open("./SCOP/dir.cla.scop.txt_test") as handle:
            cla = handle.read()
        with open("./SCOP/dir.des.scop.txt_test") as handle:
            des = handle.read()
        with open("./SCOP/dir.hie.scop.txt_test") as handle:
            hie = handle.read()

        scop = Scop(StringIO(cla), StringIO(des), StringIO(hie))

        cla_out = StringIO()
        scop.write_cla(cla_out)
        # NOTE: zip stops at the shorter sequence, so surplus lines on either
        # side are not compared.
        lines = zip(cla.rstrip().split('\n'),
                    cla_out.getvalue().rstrip().split('\n'))
        for expected_line, line in lines:
            self.assertTrue(self._compare_cla_lines(expected_line, line))

        des_out = StringIO()
        scop.write_des(des_out)
        self.assertEqual(des_out.getvalue(), des)

        hie_out = StringIO()
        scop.write_hie(hie_out)
        self.assertEqual(hie_out.getvalue(), hie)

        domain = scop.getDomainBySid("d1hbia_")
        self.assertEqual(domain.sunid, 14996)

        domains = scop.getDomains()
        self.assertEqual(len(domains), 14)
        self.assertEqual(domains[4].sunid, 14988)

        # Unknown identifiers give None.
        self.assertIsNone(scop.getNodeBySunid(-111))
        self.assertIsNone(scop.getDomainBySid("no such domain"))
Пример #14
0
    def testConstructFromDirectory(self):
        """Construct Scop from dir_path and version and fetch a domain by sid."""
        scop = Scop(dir_path="SCOP", version="test")
        # assertIsInstance gives a more informative failure than
        # assertTrue(isinstance(...)).
        self.assertIsInstance(scop, Scop)

        domain = scop.getDomainBySid("d1hbia_")
        self.assertEqual(domain.sunid, 14996)
Пример #15
0
def blast(algo):
    """Score PSI-BLAST or DELTA-BLAST hits per superfamily with ROC AUC.

    For each key of the pickled hierarchy (used as a superfamily sunid), one
    member sid is drawn at random and searched against the
    ``astral-scopdom-seqres-gd-sel-gs-bib-40-1.75`` database.  The first 500
    hits of the last query result in the XML output are labelled 1 when
    their superfamily sccs equals the query's and 0 otherwise, and scored by
    the negated e-value of their first HSP.  The resulting dictionary, keyed
    by superfamily sccs, is pickled to ``auc_result_<algo>_<unix time>.pkl``.

    Args:
        algo: ``'psiblast'`` or ``'deltablast'``.

    Raises:
        ValueError: If ``algo`` is neither of the two names.  This is now
            checked up front, before any file is read or written.
    """
    if algo == 'psiblast':
        commandline = NcbipsiblastCommandline
    elif algo == 'deltablast':
        commandline = NcbideltablastCommandline
    else:
        raise ValueError(f'Invalid algorithm ({algo})')

    seqindex = SeqIO.index('data/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa',
                           'fasta', key_function=lambda x: x.split()[0])
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    with Path('data/train/scop40_1fold_hie.pkl').open('rb') as hie_file:
        hie = pickle.load(hie_file)
    scop = Scop(dir_path='data/scop', version='1.75')
    tmpdir = Path(f'.{algo}')
    tmpdir.mkdir(exist_ok=True)
    # os.cpu_count() may return None; fall back to a single thread.
    num_threads = os.cpu_count() or 1
    auc_result = {}
    for sf in tqdm(hie):
        px_list = hie[sf]
        if len(px_list) < 1:
            continue
        sid = random.sample(px_list, 1)[0]
        record = seqindex[sid]
        f_fasta = tmpdir/f'{sid}.fasta'
        f_xml = tmpdir/f'{sid}.xml'
        SeqIO.write(record, f_fasta.as_posix(), 'fasta')
        try:
            commandline(query=f_fasta.as_posix(),
                        db='astral-scopdom-seqres-gd-sel-gs-bib-40-1.75',
                        num_threads=num_threads,
                        num_iterations=3,
                        evalue=999999,
                        outfmt=5,
                        out=f_xml.as_posix())()
        except ApplicationError as e:
            logging.error(e)
            # The failed run may not have produced any output file.
            if f_xml.exists():
                f_xml.unlink()
            continue
        finally:
            f_fasta.unlink()
        # Keep only the last query result in the file, capped at 500 hits.
        iterations = list(SearchIO.parse(f_xml.as_posix(), 'blast-xml'))
        hits = list(iterations[-1])[:500]
        sf_sccs = scop.getNodeBySunid(sf).sccs
        roc_score = []
        roc_label = []
        for hit in hits:
            # Assumes the hit description starts with a family-level sccs
            # such as 'a.4.5.1' -- TODO confirm.  Dropping the last
            # dot-separated component also works when it has two or more
            # digits, which the previous fixed [:-2] slice did not.
            hit_sf_sccs = hit.description.split(' ')[0].rsplit('.', 1)[0]
            roc_score.append(-hit.hsps[0].evalue)
            roc_label.append(1 if hit_sf_sccs == sf_sccs else 0)
        # roc_auc_score needs both classes, so single-class label sets are
        # handled here.
        # NOTE(review): an empty hit list also yields auc = 1.0, as before.
        if all(roc_label):
            auc = 1.0
        elif not any(roc_label):
            auc = 0.0
        else:
            auc = metrics.roc_auc_score(roc_label, roc_score)
        auc_result[sf_sccs] = {'auc': auc, 'sample': sid, 'num': len(hits)}
        f_xml.unlink()
    now = int(time.time())
    # Close the output file explicitly so the pickle is fully written.
    with Path(f'auc_result_{algo}_{now}.pkl').open('wb') as out_file:
        pickle.dump(auc_result, out_file)
Пример #16
0
    def testParse(self):
        """Parse the cla/des/hie test files and check that they round-trip.

        The cla output is compared line by line with
        ``self._compare_cla_lines``; the des and hie outputs must equal the
        input text exactly.  A few lookups on the parsed data follow.
        """
        # Read each input inside its own ``with`` block so every handle is
        # closed, whatever happens afterwards.
        with open("./SCOP/dir.cla.scop.txt_test") as cla_file:
            cla = cla_file.read()
        with open("./SCOP/dir.des.scop.txt_test") as des_file:
            des = des_file.read()
        with open("./SCOP/dir.hie.scop.txt_test") as hie_file:
            hie = hie_file.read()

        scop = Scop(StringIO(cla), StringIO(des), StringIO(hie))

        cla_out = StringIO()
        scop.write_cla(cla_out)
        # NOTE: zip stops at the shorter sequence, so surplus lines on either
        # side are not compared.
        lines = zip(cla.rstrip().split("\n"),
                    cla_out.getvalue().rstrip().split("\n"))
        for expected_line, line in lines:
            self.assertTrue(self._compare_cla_lines(expected_line, line))

        des_out = StringIO()
        scop.write_des(des_out)
        self.assertEqual(des_out.getvalue(), des)

        hie_out = StringIO()
        scop.write_hie(hie_out)
        self.assertEqual(hie_out.getvalue(), hie)

        domain = scop.getDomainBySid("d1hbia_")
        self.assertEqual(domain.sunid, 14996)

        domains = scop.getDomains()
        self.assertEqual(len(domains), 14)
        self.assertEqual(domains[4].sunid, 14988)

        # Unknown identifiers give None.
        self.assertIsNone(scop.getNodeBySunid(-111))
        self.assertIsNone(scop.getDomainBySid("no such domain"))
Пример #17
0
    ('d1qg3a1', datetime.datetime(2009, 1, 20, 0, 0), 'b.1.2', 344),
    ('d1wzca1', datetime.datetime(2009, 1, 27, 0, 0), 'c.108.1', 296),
    ('d2dsta1', datetime.datetime(2009, 1, 27, 0, 0), 'c.69.1', 975),
    ('d1y5ha3', datetime.datetime(2009, 2, 10, 0, 0), 'd.37.1', 62),
    ('d2pzza1', datetime.datetime(2009, 1, 20, 0, 0), 'd.77.1', 92),
    ('d1ni9a_', datetime.datetime(2009, 2, 10, 0, 0), 'e.7.1', 151),
    ('d3cw9a1', datetime.datetime(2008, 9, 2, 0, 0), 'e.23.1', 22),
    ('d2axtd1', datetime.datetime(2009, 2, 10, 0, 0), 'f.26.1', 174),
    ('d2axto1', datetime.datetime(2009, 2, 10, 0, 0), 'f.4.1', 15),
    ('d2vy4a1', datetime.datetime(2009, 2, 17, 0, 0), 'g.37.1', 182),
    ('d3d9ta1', datetime.datetime(2009, 2, 10, 0, 0), 'g.52.1', 81),
)
# Keep only the first field of each test_data record.
# NOTE(review): that field looks like a SCOP domain sid -- confirm.
test_data = [x[0] for x in test_data]

# Index over the FASTA file, built with SeqIO.index.
scop40 = SeqIO.index('data/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta')
# SCOP version 1.75 loaded from data/scop.
scop100_hie = Scop(dir_path=Path('data/scop'), version='1.75')

# NOTE(review): n_splits, sf_sunid and test_n_splits are defined outside this
# excerpt, and KFold is presumably sklearn.model_selection.KFold -- confirm.
if n_splits > 1:
    # Use only the first shuffled split; fold1[1] is its second index array.
    fold1 = next(KFold(n_splits=n_splits, shuffle=True).split(scop40))
    # Entries of scop40 whose position appears in fold1[1]
    # (enumerate(scop40) presumably iterates the index keys -- confirm).
    samples = np.array([v for i, v in enumerate(scop40) if i in fold1[1]])
    # sids of every px descendant of node sf_sunid in scop100_hie
    px = np.array([x.sid for x in scop100_hie.getNodeBySunid(sf_sunid).getDescendents('px')])
    # values present both in samples and among those px sids
    isect = np.intersect1d(samples, px)
    # split the intersection again and keep the entries selected by the
    # second index array as test data
    # NOTE(review): the share held out depends on test_n_splits (one half only
    # when it equals 2) -- confirm the intended value.
    fold1 = next(KFold(n_splits=test_n_splits, shuffle=True).split(isect))
    tests = np.array([v for i, v in enumerate(isect) if i in fold1[1]])
    np.save(Path(f'data/test/scop40_{n_splits}fold_sf{sf_sunid}_testdata_{test_n_splits}fold.npy'), tests)
    # training data: the sampled values minus the test values
    train = np.setdiff1d(samples, tests)
    np.save(Path(f'data/train/scop40_{n_splits}fold_trainingdata.npy'), train)