def setUp(self):
    """Start a local SparkSession and load the full MMTF sample for the tests.

    Removed a large body of commented-out decoding/benchmarking experiments and
    the unused ``t0 = time.time()`` timer they referenced — dead code only.
    """
    self.spark = SparkSession.builder \
        .master("local[*]") \
        .appName("LigandInteractionFingerprintTest") \
        .getOrCreate()
    # NOTE(review): hard-coded absolute path only exists on the original
    # author's machine — consider a repository-relative resource path. TODO confirm.
    self.pdb = read_sequence_file(
        "/Users/peter/GitRespositories/mmtf-pyspark/resources/mmtf_full_sample")
def setUp(self):
    """Create a local SparkContext and read ten selected entries from the sample RDD."""
    entry_path = 'resources/sample_rdd'
    id_csv = "1FDK,1FDL,1FDM,1FDN,1FDO,1FDP,1FDQ,1FDR,1FDS,1FDT"
    self.pdbIds = id_csv.split(',')
    spark_conf = SparkConf().setMaster("local[*]").setAppName('read_sequence_file')
    self.sc = SparkContext(conf=spark_conf)
    self.pdb = read_sequence_file(entry_path, self.sc, pdbId=self.pdbIds)
def setUp(self):
    """Start a local SparkSession and read ten selected entries from the full sample."""
    sample_path = 'resources/mmtf_full_sample'  # TODO (carried over from original)
    id_csv = "1FDK,1FDL,1FDM,1FDN,1FDO,1FDP,1FDQ,1FDR,1FDS,1FDT"
    self.pdbIds = id_csv.split(',')
    builder = SparkSession.builder \
        .master("local[*]") \
        .appName("read_sequence_file")
    self.spark = builder.getOrCreate()
    self.pdb = read_sequence_file(sample_path, pdbId=self.pdbIds)
from mmtfPyspark.structureViewer import view_structure

# ## Configure Spark
# In[2]:
conf = SparkConf().setMaster("local[*]").setAppName("FilterByReleaseDate")
sc = SparkContext(conf=conf)

# ## Read in MMTF Files, filter and count
# In[3]:
path = "../../resources/mmtf_reduced_sample/"
# Keep only structures released within the given date window.
date_window = ReleaseDate("2000-01-28", "2017-02-28")
structures = mmtfReader.read_sequence_file(path, sc).filter(date_window)
print(
    f"Number of structure released between 2000-01-28 and 2017-02-28 is: {structures.count()}"
)

# ## Visualize Structures
# In[4]:
structure_names = structures.keys().collect()
view_structure(structure_names, style='line')

# ## Terminate Spark
# In[5]:
# ## Configure Spark Context
# In[2]:
conf = SparkConf().setMaster("local[*]").setAppName(
    "SequenceSimilaritySearchDemo")
sc = SparkContext(conf=conf)

# ## Read PDB in MMTF format, split into polymer chain, search by sequence similarity, and print sequence found
# In[6]:
path = "../../resources/mmtf_reduced_sample/"
# BLAST search against the query sequence; thresholds as in the original demo.
similarity = SequenceSimilarity(
    sequence="NLVQFGVMIEKMTGKSALQYNDYGCYCGIGGSHWPVDQ",
    searchTool=SequenceSimilarity.BLAST,
    eValueCutoff=0.001,
    sequenceIdentityCutoff=40,
    maskLowComplexity=True)
chains = mmtfReader.read_sequence_file(path, sc) \
    .flatMap(StructureToPolymerChains()) \
    .filter(similarity)
pdb = chains.collect()

for pdbId, structure in pdb:
    print(f"{pdbId} : {structure.entity_list[0]['sequence']}")

# ## Terminate Spark Context
# In[7]:
sc.stop()
# ## Configure Spark
# In[2]:
# Fixed typo in the Spark application name ("Cahin" -> "Chain").
conf = SparkConf().setMaster("local[*]").setAppName("polypeptideChainStats")
sc = SparkContext(conf=conf)

# ## Read in mmtf files, flatMap to polymer chains, filter by polymer composition, and get number of groups
# In[4]:
path = "../../resources/mmtf_full_sample/"
# Chain lengths (group counts) of chains built purely from the 20 standard
# amino acids; cached because five separate statistics below re-use the RDD.
chainLengths = mmtfReader.read_sequence_file(path, sc).flatMap(
    StructureToPolymerChains(False, True)).filter(
        PolymerComposition(PolymerComposition.AMINO_ACIDS_20)).map(
            lambda t: t[1].num_groups).cache()

# ## Print out poly-peptide chain statistics
# In[5]:
print(f"Total number of chains: {chainLengths.count()}")
print(f"Total number of groups: {chainLengths.sum()}")
print(f"Min chain length: {chainLengths.min()}")
print(f"Mean chain length: {chainLengths.mean()}")
print(f"Max chain length: {chainLengths.max()}")

# ## Terminate Spark
# In[6]:
# Create variables
APP_NAME = "MMTF_Spark"
path = "../../resources/mmtf_full_sample/"

# Configure Spark
conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")
sc = SparkContext(conf=conf)

# ## Read PDB and create PISCES non-redundant set
# In[14]:
pdb = mmtfReader.read_sequence_file(path, sc)
pdb = pdb.filter(Pisces(sequenceIdentity=30, resolution=2.5))

# ## Setup criteria for metal interactions
# In[15]:
# Chemical component codes of metals in different oxidation states
metals = {"V", "CR", "MN", "MN3", "FE", "FE2", "CO", "3CO", "NI", "3NI",
          "CU", "CU1", "CU3", "ZN", "MO", "4MO", "6MO"}

interactions_filter = InteractionFilter(distanceCutoff=3.0,
                                        minInteractions=4,
                                        maxInteractions=6)
interactions_filter.set_query_groups(True, metals)

# Exclude non-polar interactions
# ## Read MMTF Hadoop sequence file and
# # Create a non-redundant set(<=20% seq. identity) of L-protein chains
# In[3]:
path = "../../resources/mmtf_reduced_sample/"
sequenceIdentity = 20
resolution = 2.0
fraction = 0.1
seed = 123

# Non-redundant L-protein chains, down-sampled deterministically (fixed seed).
pdb = mmtfReader.read_sequence_file(path, sc) \
    .flatMap(StructureToPolymerChains()) \
    .filter(Pisces(sequenceIdentity, resolution)) \
    .filter(ContainsLProteinChain()) \
    .sample(False, fraction, seed)

# ## Get content
# In[4]:
segmentLength = 11
data = secondaryStructureSegmentExtractor.get_dataset(pdb, segmentLength).cache()
print(f"original data : {data.count()}")

# ## Drop Q3 and sequence duplicates
# In[5]:
from mmtfPyspark.io import mmtfReader

# ## Configure Spark Context
# In[2]:
conf = SparkConf().setMaster("local[*]").setAppName("AuthorSearchDemo")
sc = SparkContext(conf=conf)

# ## Query to find PDB structures for Doudna, J.A. as a deposition (audit) author
# or as an author in the primary PDB citation
# In[6]:
# Adjacent string literals are concatenated by the parser — the resulting query
# is byte-identical to the original runtime "+" concatenation.
sqlQuery = ("SELECT pdbid from audit_author "
            "WHERE name LIKE 'Doudna%J.A.%' "
            "UNION "
            "SELECT pdbid from citation_author "
            "WHERE citation_id = 'primary' AND name LIKE 'Doudna%J.A.%'")

# ## Read PDB and filter by author
# In[8]:
path = "../../resources/mmtf_reduced_sample/"
pdb = mmtfReader.read_sequence_file(path, sc).filter(PdbjMineSearch(sqlQuery))
print(f"Number of entries matching query: {pdb.count()}")

# ## Terminate Spark Context
# In[9]:
sc.stop()
# NOTE(review): this chunk begins mid-statement — the string below is the tail
# of a SparkConf().setAppName(...) call whose opening lies outside this view.
"SecondaryStructureElementsWord2VecEncoderDemo")
sc = SparkContext(conf=conf)

# ## Read MMTF Hadoop sequence file and
# # Create a non-redundant set(<=20% seq. identity) of L-protein chains
# In[3]:
path = "../../resources/mmtf_reduced_sample/"
fraction = 0.05
seed = 123

# Sample 5% of L-protein chains; the fixed seed makes the sample reproducible.
pdb = mmtfReader.read_sequence_file(path, sc).flatMap(
    StructureToPolymerChains(False, True)).filter(ContainsLProteinChain()).sample(
        False, fraction, seed)

# ## Extract Element "H" from Secondary Structure
# In[4]:
# "H" presumably selects helix elements — TODO confirm against the extractor's docs.
label = "H"
data = secondaryStructureElementExtractor.get_dataset(pdb, label).cache()
print(f"original data : {data.count()}")
data.show(10, False)

# ## Word2Vec encoded feature Vector
# In[6]:
# ## Configure Spark
# In[2]:
conf = SparkConf().setMaster("local[*]").setAppName("WriteMMTFCustomSubset")
sc = SparkContext(conf=conf)

# ## Read in a fractions of entries from a local Hadoop Sequence File
# In[3]:
path = "../../resources/mmtf_full_sample/"
fraction = 0.5
seed = 123

pdb = mmtfReader.read_sequence_file(path, sc, fraction=fraction, seed=seed)
count = pdb.count()
print(f'number of pdb entries read : {count}')

# ## Retain high resolution X-ray structures
# In[4]:
# NOTE(review): RFree(0, 2.0) keeps the original bounds — R-free is normally
# reported in [0, 1]; verify the intended upper cutoff.
xray_only = ExperimentalMethods(ExperimentalMethods.X_RAY_DIFFRACTION)
pdb = pdb.filter(xray_only).filter(Resolution(0, 2.0)).filter(RFree(0, 2.0))
print(f'number of pdb entries left : {pdb.count()}')
# ## Configure Spark
# In[6]:
conf = SparkConf().setMaster("local[*]").setAppName("wildTypeQuery")
sc = SparkContext(conf=conf)

# ## Read in Hadoop Sequence Files and filter by WildType
# In[7]:
path = "../../resources/mmtf_reduced_sample/"
wild_type = WildTypeQuery(
    includeExpressionTags=True,
    percentSequenceCoverage=WildTypeQuery.SEQUENCE_COVERAGE_95)
pdb = mmtfReader.read_sequence_file(path, sc).filter(wild_type)

# ## Count results and show top 5 structures
# In[8]:
count = pdb.count()
print(f"Number of structures after filtering : {count}")
pdb.top(5)

# ## Terminate Spark
# In[9]:
# ## Configure Spark Context # In[18]: conf = SparkConf().setMaster("local[*]").setAppName("MachineLearningDemo") sc = SparkContext(conf=conf) # ## Read MMTF File and create a non-redundant set (<=40% seq. identity) of L-protein clains # In[19]: pdb = mmtfReader.read_sequence_file('../../resources/mmtf_reduced_sample/', sc).flatMap( StructureToPolymerChains()).filter( Pisces(sequenceIdentity=40, resolution=3.0)) # ## Get secondary structure content # In[20]: data = secondaryStructureExtractor.get_dataset(pdb) # ## Define addProteinFoldType function # In[21]: def add_protein_fold_type(data, minThreshold, maxThreshold):
# ## Configure Spark
# In[3]:
conf = SparkConf().setMaster("local[*]").setAppName(
    "FilterExclusivelyByLProtein")
sc = SparkContext(conf=conf)

# ## Read in MMTF Files, filter by L protein, and count the entries
# In[4]:
path = "../../resources/mmtf_reduced_sample/"
# exclusive=True keeps only structures whose chains are all L-protein.
l_protein_only = ContainsLProteinChain(exclusive=True)
structures = mmtfReader.read_sequence_file(path, sc).filter(l_protein_only)
print(f"Number of L-Proteins: {structures.count()}")

# ## Visualize Structures
# In[5]:
structure_names = structures.keys().collect()
view_structure(structure_names, style='sphere')

# ## Terminate Spark
# In[6]:
sc.stop()
# ## Configure Spark Context
# In[3]:
conf = SparkConf().setMaster("local[*]").setAppName("KinaseDemo")
sc = SparkContext(conf=conf)

# ## Query for human protein-serine/threonine kinases using SIFTS data
# In[4]:
# Fixed the garbled scientific name ("H**o sapiens"): literal asterisks are not
# SQL wildcards, so the original equality comparison could never match a row.
# Adjacent string literals concatenate at parse time; no runtime "+" needed.
sql = ("SELECT t.pdbid, t.chain FROM sifts.pdb_chain_taxonomy AS t "
       "JOIN sifts.pdb_chain_enzyme AS e ON (t.pdbid = e.pdbid AND t.chain = e.chain) "
       "WHERE t.scientific_name = 'Homo sapiens' AND e.ec_number = '2.7.11.1'")

# ## Read PDB and filter by author
# In[6]:
path = "../../resources/mmtf_reduced_sample/"
pdb = mmtfReader.read_sequence_file(path, sc).flatMap(
    StructureToPolymerChains()).filter(PdbjMineSearch(sql))
print(f"Number of entries matching query: {pdb.count()}")

# ## Terminate Spark Context
# In[7]:
sc.stop()
# In[2]:
conf = SparkConf().setMaster("local[*]").setAppName("FilterByPolymerChainType")
sc = SparkContext(conf=conf)

# ## Read in MMTF Files, filter and count
# # #### * Not filter returns the opposite of a particular filter*
# In[3]:
path = "../../resources/mmtf_reduced_sample/"
# Keep entries containing DNA/RNA polymer chains while excluding any entry
# with an L-protein or D-saccharide chain (NotFilter inverts the wrapped filter).
structures = mmtfReader.read_sequence_file(path, sc).filter(
    ContainsPolymerChainType("DNA LINKING",
                             ContainsPolymerChainType.RNA_LINKING)).filter(
        NotFilter(ContainsLProteinChain())).filter(
            NotFilter(ContainsDSaccharideChain()))

# Fixed typo in the user-facing message ("entires" -> "entries").
print(f"Number of pure DNA and RNA entries: {structures.count()}")

# ## View Structures
# In[4]:
structure_names = structures.keys().collect()
view_structure(structure_names, style='sphere')

# ## Terminate Spark
# In[5]: