def main(outputPath="/home/marshuang80/PDB/data/demo.parquet"):
    '''Demo to create a feature vector for protein fold classification.

    In this demo we try to classify a protein chain as either an
    all-alpha or all-beta protein based on the protein sequence. We use
    n-grams and a Word2Vec representation of the protein sequence as a
    feature vector.

    Parameters
    ----------
    outputPath : str, optional
        Destination of the parquet dataset. Defaults to the previously
        hard-coded location for backward compatibility; pass your own
        path when running this demo elsewhere.
    '''
    start = time.time()

    conf = SparkConf() \
        .setMaster("local[*]") \
        .setAppName("featuresDemo")
    sc = SparkContext(conf=conf)

    path = "../../resources/mmtf_reduced_sample/"

    # Read MMTF Hadoop sequence file and create a non-redundant set
    # (<=40% seq. identity) of L-protein chains.
    # NOTE(review): pisces is applied twice — once at the structure level
    # (cheap pre-filter) and again after splitting into polymer chains.
    sequenceIdentity = 40
    resolution = 2.0

    pdb = MmtfReader \
        .readSequenceFile(path, sc) \
        .filter(pisces(sequenceIdentity, resolution)) \
        .flatMap(structureToPolymerChains()) \
        .filter(pisces(sequenceIdentity, resolution)) \
        .filter(containsLProteinChain())

    # Get secondary structure content
    data = secondaryStructureExtractor.getDataset(pdb)

    # Classify chains by secondary structure type
    minThreshold = 0.05
    maxThreshold = 0.15
    data = addProteinFoldType(data, minThreshold, maxThreshold)

    # Add Word2Vec encoded feature vector
    encoder = proteinSequenceEncoder(data)
    n = 2            # create 2-grams
    windowSize = 25  # 25-amino-residue window size for Word2Vec
    vectorSize = 50  # dimension of feature vector
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize).cache()

    data.printSchema()
    data.show(25)

    # Keep only a subset of relevant fields for further processing
    data = data.select("structureChainId", "alpha", "beta", "coil",
                       "foldType", "features")

    data.write.mode("overwrite").format("parquet").save(outputPath)

    end = time.time()
    print("Time: %f sec." % (end - start))
    sc.stop()
def main():
    '''Creates a dataset of sequence segments derived from a non-redundant set.

    The dataset contains the sequence segment, the DSSP Q8 and DSSP Q3
    code of the center residue in a sequence segment, and a one-hot
    encoding of the sequence segment. The dataset is saved in a file
    specified by the user.
    '''
    start = time.time()

    conf = SparkConf() \
        .setMaster("local[*]") \
        .setAppName("SecondaryStructureOneHotEncoderDemo")
    sc = SparkContext(conf=conf)

    # Read MMTF Hadoop sequence file and create a non-redundant set
    # (<=20% seq. identity) of L-protein chains
    path = "../../resources/mmtf_reduced_sample/"
    sequenceIdentity = 20
    resolution = 2.0
    fraction = 0.1  # sample a 10% subset of chains for the demo
    seed = 123      # fixed seed keeps the sample reproducible

    pdb = MmtfReader \
        .readSequenceFile(path, sc) \
        .flatMap(structureToPolymerChains()) \
        .filter(pisces(sequenceIdentity, resolution)) \
        .filter(containsLProteinChain()) \
        .sample(False, fraction, seed)

    segmentLength = 11
    data = secondaryStructureSegmentExtractor.getDataset(
        pdb, segmentLength).cache()
    print(f"original data   : {data.count()}")

    # Drop redundant segments: first identical (Q3 label, sequence)
    # pairs, then any remaining duplicate sequences.
    data = data.dropDuplicates(["labelQ3", "sequence"]).cache()
    print(f"- duplicate Q3/seq  : {data.count()}")

    data = data.dropDuplicates(["sequence"])
    print(f"- duplicate seq : {data.count()}")

    encoder = proteinSequenceEncoder(data)
    data = encoder.oneHotEncode()

    data.printSchema()
    data.show(25, False)

    end = time.time()
    print("Time: %f sec." % (end - start))
    sc.stop()
def test1(self):
    # Keep structures containing L-protein chains, split them into
    # individual polymer chains, then keep only the L-protein chains.
    chains = self.pdb \
        .filter(containsLProteinChain()) \
        .flatMap(structureToPolymerChains()) \
        .filter(containsLProteinChain())

    dataset = secondaryStructureExtractor.getDataset(chains)
    self.assertTrue(dataset.count() == 5)
def test6(self):
    # Split structures into chains, then filter by fractional
    # secondary-structure content (presumably helix/sheet/coil
    # min-max ranges — confirm against the filter's signature).
    chains = self.pdb.flatMap(structureToPolymerChains())
    filtered = chains.filter(
        secondaryStructure(0.70, 0.75, 0.00, 0.40, 0.25, 0.50))
    chainIds = filtered.keys().collect()

    self.assertTrue('2C7M.A' in chainIds)
    self.assertFalse('2C7M.B' in chainIds)
def test2(self):
    # BLAST clustering at 40% identity operates on whole structures;
    # after splitting into chains only chain-level IDs should remain.
    clustered = self.pdb.filter(blastCluster(40))
    chains = clustered.flatMap(structureToPolymerChains())
    ids = chains.keys().collect()

    self.assertFalse('1O06' in ids)
    self.assertTrue('1O06.A' in ids)
    self.assertFalse('2ONX' in ids)
def main():
    '''Creates a dataset of sequence segments derived from a non-redundant set.

    The dataset contains the sequence segment, the DSSP Q8 and DSSP Q3
    code of the center residue in a sequence segment, and a Word2Vec
    encoding of the sequence segment. The data is saved in a JSON file
    specified by the user.
    '''
    start = time.time()

    conf = SparkConf() \
        .setMaster("local[*]") \
        .setAppName("secondaryStructureWord2VecEncodeDemo")
    sc = SparkContext(conf=conf)

    # Read MMTF Hadoop sequence file and create a non-redundant set
    # (<=20% seq. identity) of L-protein chains
    path = "../../resources/mmtf_reduced_sample/"
    sequenceIdentity = 20
    resolution = 2.0
    fraction = 0.1  # demo runs on a 10% sample
    seed = 123      # fixed seed for reproducibility

    pdb = MmtfReader \
        .readSequenceFile(path, sc) \
        .flatMap(structureToPolymerChains()) \
        .filter(pisces(sequenceIdentity, resolution)) \
        .filter(containsLProteinChain()) \
        .sample(False, fraction, seed)

    segmentLength = 11
    data = secondaryStructureSegmentExtractor.getDataset(
        pdb, segmentLength).cache()

    # Add Word2Vec encoded feature vector
    encoder = proteinSequenceEncoder(data)
    n = 2                                  # 2-grams
    windowSize = (segmentLength - 1) // 2  # window spans half the segment
    vectorSize = 50                        # feature-vector dimension
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize)

    data.printSchema()
    data.show(25, False)

    end = time.time()
    print("Time: %f sec." % (end - start))
    sc.stop()
def test2(self):
    # This test runs a chain-level query and compares chain-level results.
    # Note: the original source read "H**o sapiens" — an artifact of a
    # word-censoring filter mangling the species name "Homo sapiens".
    pdb_2 = self.pdb.flatMap(structureToPolymerChains())

    whereClause = "WHERE ecNo='2.7.11.1' AND source='Homo sapiens'"
    fields = ["ecNo", "source"]
    pdb_2 = pdb_2.filter(customReportQuery(whereClause, fields))
    results_2 = pdb_2.keys().collect()

    self.assertTrue('5JDE.A' in results_2)
    self.assertTrue('5JDE.B' in results_2)
    self.assertTrue('5CU4.A' in results_2)
    self.assertTrue('5L6W.L' in results_2)
    self.assertFalse('5L6W.C' in results_2)
    self.assertFalse('5UFU.A' in results_2)
    self.assertFalse('5UFU.B' in results_2)
    self.assertFalse('5UFU.C' in results_2)
    self.assertFalse('5IHB.A' in results_2)
    self.assertFalse('5IHB.B' in results_2)
    self.assertFalse('5IHB.C' in results_2)
    self.assertFalse('5IHB.D' in results_2)
def main():
    start = time.time()

    conf = SparkConf() \
        .setMaster("local[*]") \
        .setAppName("secondaryStructureElementDemo")
    sc = SparkContext(conf=conf)

    # Download a single structure (1STP) and keep its L-protein chains.
    structures = MmtfReader.downloadMmtfFiles(["1STP"], sc).cache()
    chains = structures \
        .flatMap(structureToPolymerChains()) \
        .filter(containsLProteinChain())

    # Extract "E" (beta-strand) secondary-structure elements;
    # 6 presumably sets a minimum element length — confirm against
    # the extractor's signature.
    dataset = secondaryStructureElementExtractor.getDataset(chains, "E", 6)
    dataset.show(50, False)

    end = time.time()
    print("Time: %f sec." % (end - start))
    sc.stop()
def test4(self):
    # Advanced RCSB query: enzyme classification 2.7.11.1.
    query = "".join([
        "<orgPdbQuery>",
        "<queryType>org.pdb.query.simple.EnzymeClassificationQuery</queryType>",
        "<Enzyme_Classification>2.7.11.1</Enzyme_Classification>",
        "</orgPdbQuery>",
    ])

    matches = self.pdb \
        .flatMap(structureToPolymerChains()) \
        .filter(advancedQuery(query))
    chainIds = matches.keys().collect()

    self.assertFalse('1PEN.A' in chainIds)
    self.assertFalse('1OCZ.A' in chainIds)
    self.assertFalse('2ONX.A' in chainIds)
    self.assertTrue('5L6W.L' in chainIds)
    self.assertFalse('5L6W.C' in chainIds)
    self.assertFalse('5KHU.A' in chainIds)
    self.assertFalse('5KHU.B' in chainIds)
    self.assertTrue('5KHU.Q' in chainIds)
    self.assertTrue('1F3M.A' in chainIds)
    self.assertTrue('1F3M.B' in chainIds)
    self.assertTrue('1F3M.C' in chainIds)
    self.assertTrue('1F3M.D' in chainIds)
def test1(self):
    # Split structures into polymer chains and extract their sequences.
    chains = self.pdb.flatMap(structureToPolymerChains())
    sequences = polymerSequenceExtractor.getDataset(chains)
    self.assertTrue(sequences.count() == 5)
def test1(self):
    # Splitting the test structures into polymer chains should yield
    # exactly 10 chain-level entries.
    chainKeys = self.pdb.flatMap(structureToPolymerChains()).keys().collect()
    self.assertTrue(len(chainKeys) == 10)
def test1(self):
    # Extract 25-residue secondary-structure segments and check the
    # sequence of the first row (column 1 of the head row).
    chains = self.pdb.flatMap(structureToPolymerChains())
    segments = secondaryStructureSegmentExtractor.getDataset(chains, 25)
    firstRow = segments.head()
    self.assertTrue("DPSKDSKAQVSAAEAGITGTWYNQL" == firstRow[1])