def testParseBiomolecule(self):
    """
    Parse biomolecule 2 from '3p1u.pdb' and check that it has exactly one
    chain with ID 'B1'. Asking for biomolecule 3 must raise KeyError.
    """
    path = self.config.getTestFile('3p1u.pdb')
    biomolecule_parser = LegacyStructureParser(path)

    second = biomolecule_parser.parse_biomolecule(2)
    self.assertEqual(len(second.chains), 1)
    self.assertEqual(second.first_chain.id, 'B1')

    # biomolecule number 3 is expected to be unknown to the parser
    self.assertRaises(KeyError, biomolecule_parser.parse_biomolecule, 3)
def testCommaSplitting(self):
    """
    Parse biomolecule 1 from '3shm_ca.pdb' (second positional argument
    set to True) and check that 60 chains come back, the first being 'A'.

    @see: [CSB 0000067]
    """
    path = self.config.getTestFile('3shm_ca.pdb')
    biomolecule = LegacyStructureParser(path).parse_biomolecule(1, True)

    self.assertEqual(len(biomolecule.chains), 60)
    self.assertEqual(biomolecule.first_chain.id, 'A')
def setUp(self):
    """
    Prepare the fixtures of the dump/load tests.

    Sets ``self.lists``, ``self.arrays``, ``self.strings``,
    ``self.big_graph`` (250 mutually connected ``Node`` objects) and
    ``self.protein`` (first model parsed from
    'ake-xray-ensemble-ca.pdb'). ``self.objs`` collects the lists,
    arrays, strings and the protein; the graph is kept separately.
    """
    super(TestDumpLoad, self).setUp()

    # The fragments are joined without separating spaces, exactly as
    # the explicit '+' concatenation did.
    sentence = ("Although that way may not be"
                "obvious at first"
                "unless you're Dutch.")

    self.lists = [[], list(range(1000)), list(sentence)]

    self.arrays = [
        numpy.array([]),
        numpy.random.random(1000),
        numpy.arange(1000),
    ]

    # Raw string: "\w" is not a valid escape sequence in a normal string
    # literal and triggers a warning on modern Python. The resulting
    # value (backslash followed by 'w') is unchanged.
    self.strings = ["", sentence, r"([0-9a-zA-Z]([-.\w]*[0-9a-zA-Z])"]

    # Completely connected graph: every node links to all other nodes
    self.big_graph = [Node() for _ in range(250)]
    for node in self.big_graph:
        node.connections = set(self.big_graph)
        node.connections.remove(node)

    # Protein
    pdbfile = self.config.getTestFile('ake-xray-ensemble-ca.pdb')
    self.protein = LegacyStructureParser(pdbfile).parse_models()[0]

    self.objs = [self.lists, self.arrays, self.strings, self.protein]
def main(self):
    """
    Fit every model of a multi-model PDB file onto the average structure
    and write the transformed ensemble to ``self.args.outfile``.

    Reads ``self.args.pdb`` (input file), ``self.args.chain`` (chain
    whose CA coordinates are used), ``self.args.niter`` (number of
    sampling cycles) and ``self.args.outfile``.

    Calls ``self.exit`` with ``ExitCodes.IO_ERROR`` when the PDB file
    cannot be read and with ``ExitCodes.USAGE_ERROR`` when the file
    holds fewer than two models.
    """
    try:
        parser = LegacyStructureParser(self.args.pdb)
        models = parser.models()
    except IOError as e:
        # IOError has no 'value' attribute; str(e) yields the message.
        self.exit('PDB file parsing failed\n' + str(e),
                  ExitCodes.IO_ERROR)

    if len(models) < 2:
        self.exit('PDB file contains only one model',
                  ExitCodes.USAGE_ERROR)

    ensemble = parser.parse_models(models)
    X = numpy.array([model[self.args.chain].get_coordinates(['CA'], True)
                     for model in ensemble])
    x_mu = average_structure(X)
    m = X.shape[0]

    # One 3x3 matrix and one 3-vector per model, filled in by fit/wfit
    R = numpy.zeros((m, 3, 3))
    t = numpy.ones((m, 3))

    prior = GammaPrior()
    mixture = ScaleMixture(scales=X.shape[1], prior=prior, d=3)

    # Initial (unweighted) fit of each model onto the average structure
    for i in range(m):
        R[i, :, :], t[i, :] = fit(x_mu, X[i])

    # gibbs sampling cycle
    for _ in range(self.args.niter):
        # apply rotation: per-position distances to the average structure
        data = numpy.array([
            numpy.sum((x_mu - numpy.dot(X[i], numpy.transpose(R[i]))
                       - t[i]) ** 2, -1) ** 0.5
            for i in range(m)]).T
        # sample scales
        mixture.estimate(data)
        # sample rotations
        for i in range(m):
            R[i, :, :], t[i, :] = wfit(x_mu, X[i], mixture.scales)

    out_ensemble = csb.bio.structure.Ensemble()
    for i, model in enumerate(ensemble):
        model.transform(R[i], t[i])
        out_ensemble.models.append(model)

    out_ensemble.to_pdb(self.args.outfile)
def main(self):
    """
    Parse the multi-model PDB file ``self.args.infile`` and dispatch on
    ``self.args.type`` to ``main_segments`` or ``main_conformers``.

    Both handlers receive the parsed ensemble and an array with the CA
    coordinates of every model.

    Calls ``self.exit`` with ``ExitCodes.IO_ERROR`` when parsing fails
    and with ``ExitCodes.USAGE_ERROR`` when the file holds fewer than
    two models.

    @raise ValueError: if ``self.args.type`` is neither 'segments' nor
                       'conformers'
    """
    try:
        parser = LegacyStructureParser(self.args.infile)
        models = parser.models()
    except Exception:
        # Deliberately broad, but unlike a bare 'except:' this lets
        # KeyboardInterrupt and SystemExit propagate instead of
        # reporting them as a parsing failure.
        self.exit('PDB file parsing failed', ExitCodes.IO_ERROR)

    if len(models) < 2:
        self.exit('PDB file contains only one model',
                  ExitCodes.USAGE_ERROR)

    ensemble = parser.parse_models(models)
    X = numpy.array([model.get_coordinates(['CA'], True)
                     for model in ensemble])

    if self.args.type == 'segments':
        self.main_segments(ensemble, X)
    elif self.args.type == 'conformers':
        self.main_conformers(ensemble, X)
    else:
        raise ValueError('type must be "segments" or "conformers"')
def testParseHetMolecules(self):
    """
    A file that consists of a single HETATM record must make
    parse_structure() raise HeaderFormatError.
    """
    record = 'HETATM 1 NA BLM A 1 -14.575 27.241 3.310 1.00 0.00 N '

    with self.config.getTempStream() as stream:
        stream.write(record)
        # make sure the record is on disk before the parser opens the file
        stream.flush()

        het_parser = LegacyStructureParser(stream.name)
        self.assertRaises(HeaderFormatError, het_parser.parse_structure)
        del het_parser
def _ake_ensemble_coords(self):
    """
    Return the CA coordinates of all models in
    'ake-xray-ensemble-ca.pdb' as an array of shape (16, 211, 3).

    After the first call this method is shadowed on the instance by a
    function that returns the same array, so the file is parsed only
    once per instance.
    """
    path = self.config.getTestFile('ake-xray-ensemble-ca.pdb')
    models = LegacyStructureParser(path).parse_models()

    coords = array([entry.get_coordinates(['CA'], True)
                    for entry in models])
    self.assertEqual(coords.shape, (16, 211, 3))

    def cached():
        return coords

    # replace this method on the instance with the cheap accessor
    self._ake_ensemble_coords = cached
    return coords
def main(self):
    """
    Read the ensemble stored in ``self.args.infile`` and run the
    analysis selected by ``self.args.type``.

    'segments' is handled by ``main_segments`` and 'conformers' by
    ``main_conformers``; each is called with the parsed ensemble and the
    array of per-model CA coordinates.

    ``self.exit`` is called with ``ExitCodes.IO_ERROR`` if the file
    cannot be parsed, and with ``ExitCodes.USAGE_ERROR`` if it contains
    fewer than two models.

    @raise ValueError: for any other value of ``self.args.type``
    """
    try:
        parser = LegacyStructureParser(self.args.infile)
        models = parser.models()
    except Exception:
        # A bare 'except:' would also swallow KeyboardInterrupt and
        # SystemExit and mislabel them as a parsing problem.
        self.exit('PDB file parsing failed', ExitCodes.IO_ERROR)

    if len(models) < 2:
        self.exit('PDB file contains only one model',
                  ExitCodes.USAGE_ERROR)

    ensemble = parser.parse_models(models)
    X = numpy.array(
        [model.get_coordinates(['CA'], True) for model in ensemble])

    if self.args.type == 'segments':
        self.main_segments(ensemble, X)
    elif self.args.type == 'conformers':
        self.main_conformers(ensemble, X)
    else:
        raise ValueError('type must be "segments" or "conformers"')
def main(self):
    """
    Fit the structure in ``self.args.pdb2`` onto the structure in
    ``self.args.pdb1`` and write the transformed second structure to
    ``self.args.outfile``.

    CA coordinates of chains ``self.args.chain1`` and
    ``self.args.chain2`` are used. When ``self.args.alignment`` is
    given, the file is parsed as a sequence alignment and only the
    columns without gaps in its first two rows are fitted.

    Calls ``self.exit`` with ``ExitCodes.IO_ERROR`` when a PDB file
    cannot be read and with ``ExitCodes.INPUT_ERROR`` when the two
    coordinate sets differ in length.
    """
    try:
        parser = LegacyStructureParser(self.args.pdb1)
        r = parser.parse()

        parser = LegacyStructureParser(self.args.pdb2)
        m = parser.parse()
    except IOError as e:
        # IOError has no 'value' attribute; str(e) yields the message.
        self.exit('PDB file parsing failed\n' + str(e),
                  ExitCodes.IO_ERROR)

    X = numpy.array(r[self.args.chain1].get_coordinates(['CA'], True))
    Y = numpy.array(m[self.args.chain2].get_coordinates(['CA'], True))

    if self.args.alignment is not None:
        # open() instead of the Python 2-only file() builtin; the
        # context manager also closes the handle.
        with open(self.args.alignment) as stream:
            align = SequenceAlignment.parse(stream.read())
        align = align[:2, :]

        matches = []
        # alignment columns are addressed from 1; ranks are turned into
        # 0-based row indices of X and Y
        for i in range(1, align.length + 1):
            if not align.gap_at(i):
                matches.append([align.columns[i][0].rank - 1,
                                align.columns[i][1].rank - 1])
        matches = numpy.array(matches)

        X = X[matches[:, 0], :]
        Y = Y[matches[:, 1], :]

    if len(X) != len(Y):
        self.exit('Structures are of different lengths,' +
                  ' please specify an alignment',
                  ExitCodes.INPUT_ERROR)

    R, t = csb.bio.utils.bfit(X, Y, self.args.niter,
                              self.args.scalemixture, self.args.em)

    m.transform(R, t)
    m.to_pdb(self.args.outfile)
def testEnsemble(self):
    """
    The posterior of a gaussian scale mixture with gamma prior is a
    Student's t distribution, with parameters alpha and beta. Given
    enough samples, we should be able to estimate these parameters.

    Runs 200 sampling cycles on the models of
    'ake-xray-ensemble-ca.pdb' and checks the shape of the estimated
    scales, that every translation component is within 2.0 of zero and
    that the absolute value of every matrix entry is within 0.15 of the
    identity matrix.
    """
    path = self.config.getTestFile('ake-xray-ensemble-ca.pdb')
    models = LegacyStructureParser(path).parse_models()

    coords = numpy.array([entry.get_coordinates(['CA'], True)
                          for entry in models])
    mean_structure = average_structure(coords)
    n_models, n_positions = coords.shape[0], coords.shape[1]

    rotations = numpy.zeros((n_models, 3, 3))
    translations = numpy.ones((n_models, 3))

    gamma_prior = GammaPrior()
    mixture = ScaleMixture(scales=n_positions, prior=gamma_prior, d=3)

    from csb.bio.utils import fit, wfit

    # initial unweighted fit of every model onto the average structure
    for k in range(n_models):
        rotations[k, :, :], translations[k, :] = fit(mean_structure,
                                                     coords[k])

    # gibbs sampling cycle
    for _cycle in range(200):
        # apply rotation
        distances = []
        for k in range(n_models):
            moved = numpy.dot(coords[k], numpy.transpose(rotations[k]))
            residual = mean_structure - moved - translations[k]
            distances.append(numpy.sum(residual ** 2, -1) ** 0.5)
        data = numpy.array(distances).T

        # sample scales
        mixture.estimate(data)

        # sample rotations
        for k in range(n_models):
            rotations[k, :, :], translations[k, :] = wfit(
                mean_structure, coords[k], mixture.scales)

    self.assertEqual(mixture.scales.shape, (211,))

    expected_rotation = numpy.eye(3)
    expected_translation = numpy.zeros((3,))

    for k in range(n_models):
        for row in range(3):
            self.assertAlmostEqual(translations[k, row],
                                   expected_translation[row], delta=2.)
            for col in range(3):
                self.assertAlmostEqual(abs(rotations[k, row, col]),
                                       expected_rotation[row, col],
                                       delta=0.15)
def testInvGammaMAP(self):
    """
    The posterior of a gaussian scale mixture with gamma prior is a
    Student's t distribution, with parameters alpha and beta. Given
    enough samples, we should be able to estimate these parameters.

    Fits models 0 and 13 of 'ake-xray-ensemble-ca.pdb' with an
    InvGammaPrior whose estimator is InvGammaPosteriorMAP, then checks
    the shape of the scales, that the translation is within 2.0 of zero
    and that the matrix is within 0.1 of the identity.
    """
    path = self.config.getTestFile('ake-xray-ensemble-ca.pdb')
    models = LegacyStructureParser(path).parse_models()

    reference = numpy.array(models[0].get_coordinates(['CA'], True))
    mobile = numpy.array(models[13].get_coordinates(['CA'], True))

    inv_gamma = InvGammaPrior()
    inv_gamma.estimator = InvGammaPosteriorMAP()
    mixture = ScaleMixture(scales=reference.shape[0], prior=inv_gamma, d=3)

    from csb.bio.utils import fit, wfit

    rotation, translation = fit(reference, mobile)

    #numpy.random.seed(100)
    # gibbs sampling cycle
    for _cycle in range(200):
        # apply rotation
        moved = numpy.dot(mobile, numpy.transpose(rotation))
        residual = reference - moved - translation
        data = numpy.sum(residual ** 2, axis=-1) ** (1. / 2)

        # sample scales
        mixture.estimate(data)

        # sample rotations
        rotation, translation = wfit(reference, mobile, mixture.scales)

    self.assertEqual(mixture.scales.shape, (211,))

    expected_rotation = numpy.eye(3)
    expected_translation = numpy.zeros((3,))

    for row in range(3):
        self.assertAlmostEqual(translation[row],
                               expected_translation[row], delta=2.)
        for col in range(3):
            self.assertAlmostEqual(expected_rotation[row, col],
                                   rotation[row, col], delta=1e-1)
def main(self):
    """
    Superimpose the structure read from ``self.args.pdb2`` onto the one
    read from ``self.args.pdb1`` and save the transformed second
    structure as ``self.args.outfile``.

    The fit uses the CA coordinates of ``self.args.chain1`` and
    ``self.args.chain2``. If ``self.args.alignment`` names a file, it is
    parsed as a sequence alignment and the coordinates are restricted to
    the columns that have no gap in its first two rows.

    Calls ``self.exit`` with ``ExitCodes.IO_ERROR`` if a PDB file cannot
    be read and with ``ExitCodes.INPUT_ERROR`` if the coordinate sets
    end up with different lengths.
    """
    try:
        parser = LegacyStructureParser(self.args.pdb1)
        r = parser.parse()

        parser = LegacyStructureParser(self.args.pdb2)
        m = parser.parse()
    except IOError as e:
        # 'e.value' does not exist on IOError and would itself raise
        # AttributeError, hiding the real problem.
        self.exit('PDB file parsing failed\n' + str(e),
                  ExitCodes.IO_ERROR)

    X = numpy.array(r[self.args.chain1].get_coordinates(['CA'], True))
    Y = numpy.array(m[self.args.chain2].get_coordinates(['CA'], True))

    if self.args.alignment is not None:
        # file() only exists on Python 2 and never closed the handle
        with open(self.args.alignment) as stream:
            align = SequenceAlignment.parse(stream.read())
        align = align[:2, :]

        # columns are numbered from 1; ranks become 0-based row indices
        matches = numpy.array([
            [
                align.columns[i][0].rank - 1,
                align.columns[i][1].rank - 1
            ]
            for i in range(1, align.length + 1)
            if not align.gap_at(i)
        ])

        X = X[matches[:, 0], :]
        Y = Y[matches[:, 1], :]

    if len(X) != len(Y):
        self.exit(
            'Structures are of different lengths,' +
            ' please specify an alignment', ExitCodes.INPUT_ERROR)

    R, t = csb.bio.utils.bfit(X, Y, self.args.niter,
                              self.args.scalemixture, self.args.em)

    m.transform(R, t)
    m.to_pdb(self.args.outfile)
def setUp(self):
    """
    Locate the '1d3z.legacy.pdb' test file and create a
    LegacyStructureParser for it; both are stored on the test case as
    ``self.pdb`` and ``self.parser``.
    """
    super(TestLegacyStructureParser, self).setUp()

    legacy_file = self.config.getTestFile('1d3z.legacy.pdb')
    self.pdb = legacy_file
    self.parser = LegacyStructureParser(legacy_file)
class TestLegacyStructureParser(test.Case):
    """
    Tests for LegacyStructureParser, mostly run against the
    '1d3z.legacy.pdb' test file.
    """

    def setUp(self):
        """
        Store the path of '1d3z.legacy.pdb' in ``self.pdb`` and a parser
        for that file in ``self.parser``.
        """
        super(TestLegacyStructureParser, self).setUp()

        legacy_file = self.config.getTestFile('1d3z.legacy.pdb')
        self.pdb = legacy_file
        self.parser = LegacyStructureParser(legacy_file)

    def testParseModels(self):
        """
        parse_models() must return 10 models; the first one has model ID 1.
        """
        parsed = self.parser.parse_models()

        self.assertEqual(parsed.models.length, 10)
        self.assertEqual(parsed[0].model_id, 1)
        self.assertEqual(parsed.models[1].model_id, 1)

    def testParseStructure(self):
        """
        Check model 1 of the test file at the structure, chain, residue
        and atom level.
        """
        structure = self.parser.parse(model=1)

        self.assertEqual(self.parser.parse_structure().model_id, 1)
        self.assertEqual(structure.accession, '1d3z')
        self.assertEqual(structure.model_id, 1)

        # Chain level
        expected_sequence = 'MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG'

        self.assertEqual(structure.chains.length, 1)
        self.assertEqual(len(structure.chains), 1)
        self.assertEqual(structure.first_chain.molecule_id, '1')
        self.assertEqual(structure.chains['A'].sequence, expected_sequence)
        self.assertEqual(len(structure.chains['A']), 76)
        self.assertEqual(len(structure['A']), 76)

        # Residue level
        self.assertEqual(len(structure['A'][1:10]), 9)
        self.assertEqual(structure['A'][0].type,
                         SequenceAlphabets.Protein.MET)
        self.assertEqual(structure['A'][0].label, 'MSE')
        self.assertEqual(structure['A'][1].label, 'GLN')
        self.assertTrue(structure['A'][0].is_modified)
        self.assertFalse(structure['A'][1].is_modified)

        # Atom level
        self.assertEqual(structure['A'][1].atoms['CA'].element, None)
        self.assertNotEqual(structure['A'][2].atoms['CA'].element, None)
        self.assertEqual(structure['A'][2].atoms['CA'].element,
                         ChemElements.C)

        expected_vector = [51.653, -89.304, 8.833]
        self.assertEqual(structure['A'][0]['CA'].vector.tolist(),
                         expected_vector)

    def testParseResidue(self):
        """
        'AGM' resolves to ARG by default and as a protein residue;
        resolving it as a nucleic acid raises UnknownPDBResidueError.
        """
        resolve = self.parser.parse_residue

        self.assertEqual(resolve('AGM'),
                         SequenceAlphabets.Protein.ARG.name)            #@UndefinedVariable
        self.assertEqual(resolve('AGM', as_type=SequenceTypes.Protein),
                         SequenceAlphabets.Protein.ARG.name)            #@UndefinedVariable
        self.assertRaises(UnknownPDBResidueError, resolve, 'AGM',
                          as_type=SequenceTypes.NucleicAcid)

    def testParseResidueSafe(self):
        """
        parse_residue_safe() returns a fallback residue name where the
        residue cannot be resolved for the requested sequence type.
        """
        resolve = self.parser.parse_residue_safe

        self.assertEqual(resolve('AGM', as_type=None),
                         SequenceAlphabets.Protein.ARG.name)            #@UndefinedVariable
        self.assertEqual(resolve('AGM', as_type=SequenceTypes.Protein),
                         SequenceAlphabets.Protein.ARG.name)            #@UndefinedVariable
        self.assertEqual(resolve('AGM', as_type=SequenceTypes.NucleicAcid),
                         SequenceAlphabets.Nucleic.Any.name)            #@UndefinedVariable
        self.assertEqual(resolve('junk', as_type=SequenceTypes.Protein),
                         SequenceAlphabets.Unknown.UNK.name)            #@UndefinedVariable

    def testGuessSequenceType(self):
        """
        'AGM' is guessed as protein, 'DOC' as nucleic acid; 'junk' raises
        UnknownPDBResidueError.
        """
        guess = self.parser.guess_sequence_type

        self.assertEqual(guess('AGM'), SequenceTypes.Protein)
        self.assertEqual(guess('DOC'), SequenceTypes.NucleicAcid)
        self.assertRaises(UnknownPDBResidueError, guess, 'junk')

    def testFileName(self):
        """
        The parser reports the file name it was created with.
        """
        self.assertEqual(self.parser.filename, self.pdb)

    def testModels(self):
        """
        models() lists the model IDs 1 to 10.
        """
        expected_ids = list(range(1, 11))
        self.assertEqual(self.parser.models(), expected_ids)

    def testParseBiomolecule(self):
        """
        Biomolecule 2 of '3p1u.pdb' has one chain with ID 'B1';
        biomolecule 3 raises KeyError.
        """
        path = self.config.getTestFile('3p1u.pdb')
        biomolecule_parser = LegacyStructureParser(path)

        second = biomolecule_parser.parse_biomolecule(2)
        self.assertEqual(len(second.chains), 1)
        self.assertEqual(second.first_chain.id, 'B1')

        self.assertRaises(KeyError, biomolecule_parser.parse_biomolecule, 3)

    def testParseHetMolecules(self):
        """
        A file that consists of a single HETATM record must make
        parse_structure() raise HeaderFormatError.
        """
        record = 'HETATM 1 NA BLM A 1 -14.575 27.241 3.310 1.00 0.00 N '

        with self.config.getTempStream() as stream:
            stream.write(record)
            # make sure the record is on disk before the file is parsed
            stream.flush()

            het_parser = LegacyStructureParser(stream.name)
            self.assertRaises(HeaderFormatError, het_parser.parse_structure)
            del het_parser