Пример #1
0
    def testParse(self):
        """Round-trip the SCOP test files through Scop and check lookups.

        Reads the cla, des and hie test files, builds a Scop object from
        their contents, writes each file back out and compares the result
        with the original text, then checks a few domain and node lookups.
        """
        # Context managers close every handle, even when read() raises.
        with open("./SCOP/dir.cla.scop.txt_test") as f:
            cla = f.read()
        with open("./SCOP/dir.des.scop.txt_test") as f:
            des = f.read()
        with open("./SCOP/dir.hie.scop.txt_test") as f:
            hie = f.read()

        scop = Scop(StringIO(cla), StringIO(des), StringIO(hie))

        cla_out = StringIO()
        scop.write_cla(cla_out)
        expected_lines = cla.rstrip().split("\n")
        written_lines = cla_out.getvalue().rstrip().split("\n")
        # zip() stops at the shorter sequence, so compare the line counts
        # first; otherwise missing or extra output lines go unnoticed.
        self.assertEqual(len(written_lines), len(expected_lines))
        for expected_line, line in zip(expected_lines, written_lines):
            self.assertTrue(self._compare_cla_lines(expected_line, line))

        des_out = StringIO()
        scop.write_des(des_out)
        self.assertEqual(des_out.getvalue(), des)

        hie_out = StringIO()
        scop.write_hie(hie_out)
        self.assertEqual(hie_out.getvalue(), hie)

        domain = scop.getDomainBySid("d1hbia_")
        self.assertEqual(domain.sunid, 14996)

        domains = scop.getDomains()
        self.assertEqual(len(domains), 14)
        self.assertEqual(domains[4].sunid, 14988)

        # Unknown identifiers are expected to give None.
        dom = scop.getNodeBySunid(-111)
        self.assertIsNone(dom)
        dom = scop.getDomainBySid("no such domain")
        self.assertIsNone(dom)
Пример #2
0
    def test_get_descendents(self):
        """Check getDescendents for the px, superfamily and cl node types."""
        fold_node = Scop(dir_path="SCOP", version="test").getNodeBySunid(46457)

        # Every px descendant of the fold must be typed "px".
        px_nodes = fold_node.getDescendents("px")
        self.assertEqual(len(px_nodes), 14)
        for node in px_nodes:
            self.assertEqual(node.type, "px")

        # Requesting "superfamily" returns nodes whose type reads "sf".
        superfamilies = fold_node.getDescendents("superfamily")
        self.assertEqual(len(superfamilies), 1)
        for node in superfamilies:
            self.assertEqual(node.type, "sf")

        # Asking the fold for "cl" descendants yields an empty list.
        cl_nodes = fold_node.getDescendents("cl")
        self.assertEqual(cl_nodes, [])
Пример #3
0
    def testParse(self):
        """Round-trip the SCOP test files through Scop and check lookups.

        Reads the cla, des and hie test files, builds a Scop object from
        their contents, writes each file back out and compares the result
        with the original text, then checks a few domain and node lookups.
        """
        # Context managers close every handle, even when read() raises.
        with open("./SCOP/dir.cla.scop.txt_test") as f:
            cla = f.read()
        with open("./SCOP/dir.des.scop.txt_test") as f:
            des = f.read()
        with open("./SCOP/dir.hie.scop.txt_test") as f:
            hie = f.read()

        scop = Scop(StringIO(cla), StringIO(des), StringIO(hie))

        cla_out = StringIO()
        scop.write_cla(cla_out)
        expected_lines = cla.rstrip().split('\n')
        written_lines = cla_out.getvalue().rstrip().split('\n')
        # zip() stops at the shorter sequence, so compare the line counts
        # first; otherwise missing or extra output lines go unnoticed.
        self.assertEqual(len(written_lines), len(expected_lines))
        for expected_line, line in zip(expected_lines, written_lines):
            self.assertTrue(self._compare_cla_lines(expected_line, line))

        des_out = StringIO()
        scop.write_des(des_out)
        self.assertEqual(des_out.getvalue(), des)

        hie_out = StringIO()
        scop.write_hie(hie_out)
        self.assertEqual(hie_out.getvalue(), hie)

        domain = scop.getDomainBySid("d1hbia_")
        self.assertEqual(domain.sunid, 14996)

        domains = scop.getDomains()
        self.assertEqual(len(domains), 14)
        self.assertEqual(domains[4].sunid, 14988)

        # Unknown identifiers are expected to give None.
        dom = scop.getNodeBySunid(-111)
        self.assertIsNone(dom)
        dom = scop.getDomainBySid("no such domain")
        self.assertIsNone(dom)
Пример #4
0
    def test_get_descendents(self):
        """Check getDescendents for the px, superfamily and cl node types."""
        fold_node = Scop(dir_path="SCOP", version="test").getNodeBySunid(46457)

        # Every px descendant of the fold must be typed 'px'.
        px_nodes = fold_node.getDescendents('px')
        self.assertEqual(len(px_nodes), 14)
        for node in px_nodes:
            self.assertEqual(node.type, 'px')

        # Requesting 'superfamily' returns nodes whose type reads 'sf'.
        superfamilies = fold_node.getDescendents('superfamily')
        self.assertEqual(len(superfamilies), 1)
        for node in superfamilies:
            self.assertEqual(node.type, 'sf')

        # Asking the fold for 'cl' descendants yields an empty list.
        cl_nodes = fold_node.getDescendents('cl')
        self.assertEqual(cl_nodes, [])
Пример #5
0
def blast(algo):
    """Run PSI-BLAST or DELTA-BLAST per superfamily and pickle ROC AUC scores.

    For every key of the pickled superfamily hierarchy, one domain sid is
    sampled at random and searched against the
    ``astral-scopdom-seqres-gd-sel-gs-bib-40-1.75`` BLAST database. Up to
    500 hits of the last query result in the XML output are labelled by
    whether their superfamily sccs matches the query superfamily, and the
    resulting AUC is stored per superfamily. The collected results are
    written to ``auc_result_{algo}_{timestamp}.pkl`` in the working
    directory.

    Args:
        algo: Either ``'psiblast'`` or ``'deltablast'``.

    Raises:
        ValueError: If ``algo`` is not a supported algorithm. This is now
            raised before any file is read or written.
    """
    commandlines = {
        'psiblast': NcbipsiblastCommandline,
        'deltablast': NcbideltablastCommandline,
    }
    if algo not in commandlines:
        raise ValueError(f'Invalid algorithm ({algo})')
    commandline = commandlines[algo]

    seqindex = SeqIO.index('data/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa',
                           'fasta', key_function=lambda x: x.split()[0])
    # NOTE: pickle must only be used on trusted, locally generated files.
    with Path('data/train/scop40_1fold_hie.pkl').open('rb') as fh:
        hie = pickle.load(fh)
    scop = Scop(dir_path='data/scop', version='1.75')
    tmpdir = Path(f'.{algo}')
    tmpdir.mkdir(exist_ok=True)
    # os.cpu_count() may return None; fall back to a single thread.
    num_threads = os.cpu_count() or 1
    auc_result = {}
    for sf in tqdm(hie):
        px_list = hie[sf]
        if len(px_list) < 1:
            continue
        sid = random.sample(px_list, 1)[0]
        record = seqindex[sid]
        f_fasta = tmpdir/f'{sid}.fasta'
        f_xml = tmpdir/f'{sid}.xml'
        SeqIO.write(record, f_fasta.as_posix(), 'fasta')
        try:
            commandline(query=f_fasta.as_posix(),
                        db='astral-scopdom-seqres-gd-sel-gs-bib-40-1.75',
                        num_threads=num_threads,
                        num_iterations=3,
                        evalue=999999,
                        outfmt=5,
                        out=f_xml.as_posix())()
        except ApplicationError as e:
            logging.error(e)
            # The search may fail before any output exists; an
            # unconditional unlink() would abort the whole run here.
            if f_xml.exists():
                f_xml.unlink()
            continue
        finally:
            f_fasta.unlink()
        results = SearchIO.parse(f_xml.as_posix(), 'blast-xml')
        # Keep the last query result in the XML (presumably the final
        # search iteration -- TODO confirm) and at most 500 of its hits.
        results = list(results)[-1]
        results = list(results)[:500]
        sf_sccs = scop.getNodeBySunid(sf).sccs
        roc_score = []
        roc_label = []
        for result in results:
            # The first token of the hit description is read as an sccs;
            # dropping its last dot-separated field gives the superfamily.
            # A fixed [:-2] slice breaks for family numbers of two or more
            # digits (e.g. 'a.1.1.12').
            hit_sccs = result.description.split(' ')[0]
            result_sf_sccs = hit_sccs.rsplit('.', 1)[0]
            # Negated e-value, so a smaller e-value gives a higher score.
            roc_score.append(-result.hsps[0].evalue)
            roc_label.append(1 if result_sf_sccs == sf_sccs else 0)
        # roc_auc_score is only called when both labels are present; the
        # single-label cases are assigned fixed values instead.
        # NOTE(review): an empty hit list also takes the first branch and
        # is recorded as auc 1.0 -- confirm this is intended.
        if all(label == 1 for label in roc_label):
            auc = 1.0
        elif all(label == 0 for label in roc_label):
            auc = 0.0
        else:
            auc = metrics.roc_auc_score(roc_label, roc_score)
        auc_result[sf_sccs] = {'auc': auc, 'sample': sid, 'num': len(results)}
        f_xml.unlink()
    now = int(time.time())
    # Close the output file explicitly so the pickle is fully flushed.
    with Path(f'auc_result_{algo}_{now}.pkl').open('wb') as fh:
        pickle.dump(auc_result, fh)
Пример #6
0
    ('d3cw9a1', datetime.datetime(2008, 9, 2, 0, 0), 'e.23.1', 22),
    ('d2axtd1', datetime.datetime(2009, 2, 10, 0, 0), 'f.26.1', 174),
    ('d2axto1', datetime.datetime(2009, 2, 10, 0, 0), 'f.4.1', 15),
    ('d2vy4a1', datetime.datetime(2009, 2, 17, 0, 0), 'g.37.1', 182),
    ('d3d9ta1', datetime.datetime(2009, 2, 10, 0, 0), 'g.52.1', 81),
)
# Keep only the first field of each test_data entry; in the rows visible
# in this file that field is a string such as 'd3cw9a1'.
test_data = [x[0] for x in test_data]

# Index the FASTA file with SeqIO.index, using the 'fasta' format.
scop40 = SeqIO.index('data/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta')
# Scop object built from the directory data/scop, version 1.75
# (presumably the full SCOP hierarchy, going by the name -- TODO confirm).
scop100_hie = Scop(dir_path=Path('data/scop'), version='1.75')

if n_splits > 1:
    fold1 = next(KFold(n_splits=n_splits, shuffle=True).split(scop40))
    samples = np.array([v for i, v in enumerate(scop40) if i in fold1[1]])
    # sf_sunid of scop100 (sid)
    px = np.array([x.sid for x in scop100_hie.getNodeBySunid(sf_sunid).getDescendents('px')])
    # select only sf_sunid in scop40 from scop100
    isect = np.intersect1d(samples, px)
    # select half of sf_sunid only in scop40 for test data
    fold1 = next(KFold(n_splits=test_n_splits, shuffle=True).split(isect))
    tests = np.array([v for i, v in enumerate(isect) if i in fold1[1]])
    np.save(Path(f'data/test/scop40_{n_splits}fold_sf{sf_sunid}_testdata_{test_n_splits}fold.npy'), tests)
    # select domain sids only in scop40 for training data
    train = np.setdiff1d(samples, tests)
    np.save(Path(f'data/train/scop40_{n_splits}fold_trainingdata.npy'), train)
    # {sf: [sid]} list for making alignment pairs in the same superfamily
    hie = {}
    for i in train:
        dom = scop100_hie.getDomainBySid(i)
        if dom:
            sf = dom.getAscendent('sf').sccs