def setUp(self):
        self.spark = SparkSession.builder.master("local[*]") \
                                 .appName("LigandInteractionFingerprintTest") \
                                 .getOrCreate()

        t0 = time.time()
        #raw = read_raw_sequence_file("/Users/peter/MMTF_Files/full")  # 85 sec
        #gp = raw.mapValues(lambda t: default_api.ungzip_data(t))  # 91
        # gp = raw.mapValues(lambda t: default_api.ungzip_data(t).read())  # 133 sec
        #gp = raw.map(lambda t: (t[0], default_api.ungzip_data(t[1]).read()))  # 131 sec
        #gp = raw.map(lambda t: (t[0], msgpack.unpackb(default_api.ungzip_data(t[1]).read(), raw=False)))  # 553 sec
        #gp = raw.map(lambda t: (t[0], pd.read_msgpack(gzip.decompress(t[1]))))  # 169 sec [4]:144
        ## convert directly to a columnar structure? lazy decoding?

        #gc.disable()  # 643 with gc disabled
        #gp = raw.map(lambda t: (t[0], MmtfStructure(msgpack.unpackb(unzip_data(t[1]), raw=False))))  ## 664 sec
        #gp = raw.map(lambda t: (t[0], MmtfStructure(msgpack.unpackb(default_api.ungzip_data(t[1]).read(), raw=False))))  ## 664 sec
        # gp = raw.mapValues(lambda t: MmtfStructure(msgpack.unpackb(default_api.ungzip_data(t).read(), raw=False)))  # 653 sec
        #func1 = default_api.ungzip_data  # try local version
        #func2 = msgpack.unpackb
        #func3 = MmtfStructure
        #gp = raw.map(lambda t: (t[0], MmtfStructure(func2(func1(t[1]).read(), raw=False))))  # 640 sec
        #gp = raw.mapValues(lambda t: func3(func2(func1(t).read(), raw=False)))  # 615

        #print("partitions:", gp.getNumPartitions())
        #print(gp.count())
        #t1 = time.time()
        #print("raw:", t1-t0)
        self.pdb = read_sequence_file("/Users/peter/GitRespositories/mmtf-pyspark/resources/mmtf_full_sample")
    def setUp(self):
        path = 'resources/sample_rdd'
        stringIds = "1FDK,1FDL,1FDM,1FDN,1FDO,1FDP,1FDQ,1FDR,1FDS,1FDT"

        self.pdbIds = stringIds.split(',')
        conf = SparkConf().setMaster("local[*]").setAppName('read_sequence_file')
        self.sc = SparkContext(conf=conf)
        self.pdb = read_sequence_file(path, self.sc, pdbId=self.pdbIds)

# Example 3
    def setUp(self):
        path = 'resources/mmtf_full_sample'
        #TODO
        stringIds = "1FDK,1FDL,1FDM,1FDN,1FDO,1FDP,1FDQ,1FDR,1FDS,1FDT"

        self.pdbIds = stringIds.split(',')
        self.spark = SparkSession.builder.master("local[*]") \
                                 .appName("read_sequence_file") \
                                 .getOrCreate()

        self.pdb = read_sequence_file(path, pdbId=self.pdbIds)

# Example 4
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.filters import ReleaseDate
from mmtfPyspark.structureViewer import view_structure

# ## Configure Spark

# In[2]:

conf = SparkConf().setMaster("local[*]").setAppName("FilterByReleaseDate")
sc = SparkContext(conf=conf)

# ## Read in MMTF Files, filter and count

# In[3]:

path = "../../resources/mmtf_reduced_sample/"

structures = mmtfReader.read_sequence_file(path, sc).filter(
    ReleaseDate("2000-01-28", "2017-02-28"))

print(
    f"Number of structure released between 2000-01-28 and 2017-02-28 is: {structures.count()}"
)

# ## Visualize Structures

# In[4]:

structure_names = structures.keys().collect()
view_structure(structure_names, style='line')

# ## Terminate Spark

# In[5]:

sc.stop()

# Example 5
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.webfilters import SequenceSimilarity

# ## Configure Spark Context

# In[2]:

conf = SparkConf().setMaster("local[*]").setAppName(
    "SequenceSimilaritySearchDemo")
sc = SparkContext(conf=conf)

# ## Read PDB in MMTF format, split into polymer chains, search by sequence similarity, and print the sequences found

# In[6]:

path = "../../resources/mmtf_reduced_sample/"

pdb = mmtfReader.read_sequence_file(path, sc).flatMap(
    StructureToPolymerChains()).filter(
        SequenceSimilarity(sequence="NLVQFGVMIEKMTGKSALQYNDYGCYCGIGGSHWPVDQ",
                           searchTool=SequenceSimilarity.BLAST,
                           eValueCutoff=0.001,
                           sequenceIdentityCutoff=40,
                           maskLowComplexity=True)).collect()

for pdbId, structure in pdb:
    print(f"{pdbId} :     {structure.entity_list[0]['sequence']}")

# ## Terminate Spark Context

# In[7]:

sc.stop()

# Example 6
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.filters import PolymerComposition

# ## Configure Spark

# In[2]:

conf = SparkConf().setMaster("local[*]").setAppName("polypeptideChainStats")
sc = SparkContext(conf=conf)

# ## Read in MMTF files, flatMap to polymer chains, filter by polymer composition, and get the number of groups

# In[4]:

path = "../../resources/mmtf_full_sample/"

chainLengths = mmtfReader.read_sequence_file(path, sc).flatMap(
    StructureToPolymerChains(False, True)).filter(
        PolymerComposition(PolymerComposition.AMINO_ACIDS_20)).map(
            lambda t: t[1].num_groups).cache()

# ## Print out poly-peptide chain statistics

# In[5]:

print(f"Total number of chains: {chainLengths.count()}")
print(f"Total number of groups: {chainLengths.sum()}")
print(f"Min chain length: {chainLengths.min()}")
print(f"Mean chain length: {chainLengths.mean()}")
print(f"Max chain length: {chainLengths.max()}")

# ## Terminate Spark

# In[6]:

sc.stop()

# Example 7
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.interactions import InteractionFilter

# Create variables
APP_NAME = "MMTF_Spark"
path = "../../resources/mmtf_full_sample/"

# Configure Spark
conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")
sc = SparkContext(conf=conf)


# ## Read PDB and create PISCES non-redundant set

# In[14]:


pdb = mmtfReader.read_sequence_file(path, sc)
pdb = pdb.filter(Pisces(sequenceIdentity=30, resolution=2.5))


# ## Setup criteria for metal interactions

# In[15]:


# Chemical component codes of metals in different oxidation states
metals = {"V", "CR", "MN", "MN3", "FE", "FE2", "CO", "3CO", "NI", "3NI",
          "CU", "CU1", "CU3", "ZN", "MO", "4MO", "6MO"}

interactions_filter = InteractionFilter(distanceCutoff=3.0, minInteractions=4, maxInteractions=6)
interactions_filter.set_query_groups(True, metals)

# Exclude non-polar interactions
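# The cell is truncated here; a plausible completion (assumption), using
# InteractionFilter.set_target_elements to exclude contacts mediated by
# the listed elements so only polar target atoms are considered:
interactions_filter.set_target_elements(False, ['H', 'C', 'P'])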

# Example 8
#  ## Read MMTF Hadoop sequence file and
#
#  Create a non-redundant set (<=20% seq. identity) of L-protein chains

# In[3]:

path = "../../resources/mmtf_reduced_sample/"
sequenceIdentity = 20
resolution = 2.0
fraction = 0.1
seed = 123

pdb = mmtfReader.read_sequence_file(
    path, sc).flatMap(StructureToPolymerChains()).filter(
        Pisces(sequenceIdentity,
               resolution)).filter(ContainsLProteinChain()).sample(
                   False, fraction, seed)

# ## Get content

# In[4]:

segmentLength = 11
data = secondaryStructureSegmentExtractor.get_dataset(pdb,
                                                      segmentLength).cache()
print(f"original data   : {data.count()}")

# ## Drop Q3 and sequence duplicates

# In[5]:
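# The corresponding cell is missing here; a minimal sketch (assumption:
# the segment dataset carries "labelQ3" and "sequence" columns) that drops
# rows duplicating the Q3 label + sequence pair, per the heading above:
data = data.dropDuplicates(["labelQ3", "sequence"]).cache()
print(f"without duplicates: {data.count()}")
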
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.webfilters import PdbjMineSearch

# ## Configure Spark Context

# In[2]:

conf = SparkConf().setMaster("local[*]").setAppName("AuthorSearchDemo")
sc = SparkContext(conf=conf)

# ## Query to find PDB structures for Doudna, J.A. as a deposition (audit) author or as an author in the primary PDB citation

# In[6]:

sqlQuery = ("SELECT pdbid FROM audit_author "
            "WHERE name LIKE 'Doudna%J.A.%' "
            "UNION "
            "SELECT pdbid FROM citation_author "
            "WHERE citation_id = 'primary' AND name LIKE 'Doudna%J.A.%'")

# ## Read PDB and filter by author

# In[8]:

path = "../../resources/mmtf_reduced_sample/"

pdb = mmtfReader.read_sequence_file(path, sc).filter(PdbjMineSearch(sqlQuery))

print(f"Number of entries matching query: {pdb.count()}")

# ## Terminate Spark Context

# In[9]:

sc.stop()
    "SecondaryStructureElementsWord2VecEncoderDemo")

sc = SparkContext(conf=conf)

#  ## Read MMTF Hadoop sequence file and
#
#  Create a non-redundant set (<=20% seq. identity) of L-protein chains

# In[3]:

path = "../../resources/mmtf_reduced_sample/"
fraction = 0.05
seed = 123

pdb = mmtfReader.read_sequence_file(path, sc).flatMap(
    StructureToPolymerChains(False,
                             True)).filter(ContainsLProteinChain()).sample(
                                 False, fraction, seed)

# ## Extract Element "H" from Secondary Structure

# In[4]:

label = "H"
data = secondaryStructureElementExtractor.get_dataset(pdb, label).cache()
print(f"original data   : {data.count()}")
data.show(10, False)

# ## Word2Vec encoded feature Vector

# In[6]:
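# The encoding cell is missing here; a minimal sketch (assumption:
# mmtfPyspark.ml's ProteinSequenceEncoder, with illustrative values for
# n-gram size, window size, and vector size) that Word2Vec-encodes the
# extracted element sequences:
from mmtfPyspark.ml import ProteinSequenceEncoder

encoder = ProteinSequenceEncoder(data)
data = encoder.overlapping_ngram_word2vec_encode(2, 5, 50)  # n, windowSize, vectorSize
data.show(5, False)
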
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.filters import ExperimentalMethods, Resolution, RFree

# ## Configure Spark

# In[2]:

conf = SparkConf().setMaster("local[*]").setAppName("WriteMMTFCustomSubset")
sc = SparkContext(conf=conf)

# ## Read in a fraction of entries from a local Hadoop Sequence File

# In[3]:

path = "../../resources/mmtf_full_sample/"
fraction = 0.5
seed = 123

pdb = mmtfReader.read_sequence_file(path, sc, fraction=fraction, seed=seed)

count = pdb.count()

print(f'number of pdb entries read : {count}')

# ## Retain high resolution X-ray structures

# In[4]:

pdb = pdb.filter(ExperimentalMethods(
    ExperimentalMethods.X_RAY_DIFFRACTION)).filter(Resolution(0, 2.0)).filter(
        RFree(0, 2.0))

print(f'number of pdb entries left : {pdb.count()}')
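# The write step implied by the app name is missing here; a minimal sketch
# (assumption: "write_path" is a hypothetical output location) using
# mmtf-pyspark's writer to save the filtered subset:
from mmtfPyspark.io import mmtfWriter

write_path = "./mmtf_full_sample_filtered"  # hypothetical output path
mmtfWriter.write_sequence_file(write_path, sc, pdb)
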
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.webfilters import WildTypeQuery

# ## Configure Spark

# In[6]:

conf = SparkConf().setMaster("local[*]").setAppName("wildTypeQuery")
sc = SparkContext(conf=conf)

# ## Read in Hadoop Sequence Files and filter by WildType

# In[7]:

path = "../../resources/mmtf_reduced_sample/"

pdb = mmtfReader.read_sequence_file(path, sc).filter(
    WildTypeQuery(includeExpressionTags=True,
                  percentSequenceCoverage=WildTypeQuery.SEQUENCE_COVERAGE_95))

# ## Count results and show top 5 structures

# In[8]:

count = pdb.count()

print(f"Number of structures after filtering : {count}")

pdb.top(5)

# ## Terminate Spark

# In[9]:

sc.stop()
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.datasets import secondaryStructureExtractor

# ## Configure Spark Context

# In[18]:

conf = SparkConf().setMaster("local[*]").setAppName("MachineLearningDemo")

sc = SparkContext(conf=conf)

# ## Read MMTF File and create a non-redundant set (<=40% seq. identity) of L-protein chains

# In[19]:

pdb = mmtfReader.read_sequence_file('../../resources/mmtf_reduced_sample/',
                                    sc).flatMap(
                                        StructureToPolymerChains()).filter(
                                            Pisces(sequenceIdentity=40,
                                                   resolution=3.0))

# ## Get secondary structure content

# In[20]:

data = secondaryStructureExtractor.get_dataset(pdb)

# ## Define addProteinFoldType function

# In[21]:


def add_protein_fold_type(data, minThreshold, maxThreshold):
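    # The body was truncated here; a plausible completion (assumption: the
    # dataset from secondaryStructureExtractor carries fractional "alpha" and
    # "beta" content columns). Chains dominated by one secondary-structure
    # type get a fold label; everything else is classed as "other".
    from pyspark.sql.functions import when

    data = data.withColumn(
        "foldType",
        when((data.alpha > maxThreshold) & (data.beta < minThreshold), "alpha")
        .when((data.beta > maxThreshold) & (data.alpha < minThreshold), "beta")
        .when((data.alpha < minThreshold) & (data.beta < minThreshold), "coil")
        .otherwise("other"))
    return data
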
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.structureViewer import view_structure

# ## Configure Spark

# In[3]:

conf = SparkConf().setMaster("local[*]").setAppName(
    "FilterExclusivelyByLProtein")
sc = SparkContext(conf=conf)

# ## Read in MMTF Files, filter by L protein, and count the entries

# In[4]:

path = "../../resources/mmtf_reduced_sample/"

structures = mmtfReader.read_sequence_file(path, sc).filter(
    ContainsLProteinChain(exclusive=True))

print(f"Number of L-Proteins: {structures.count()}")

# ## Visualize Structures

# In[5]:

structure_names = structures.keys().collect()
view_structure(structure_names, style='sphere')

# ## Terminate Spark

# In[6]:

sc.stop()
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.webfilters import PdbjMineSearch

# ## Configure Spark Context

# In[3]:

conf = SparkConf().setMaster("local[*]").setAppName("KinaseDemo")
sc = SparkContext(conf=conf)

# ## Query for human protein-serine/threonine kinases using SIFTS data

# In[4]:

sql = "SELECT t.pdbid, t.chain FROM sifts.pdb_chain_taxonomy AS t  " + "JOIN sifts.pdb_chain_enzyme AS e ON (t.pdbid = e.pdbid AND t.chain = e.chain) " + "WHERE t.scientific_name = 'H**o sapiens' AND e.ec_number = '2.7.11.1'"

# ## Read PDB, split into polymer chains, and filter by the SIFTS query

# In[6]:

path = "../../resources/mmtf_reduced_sample/"

pdb = mmtfReader.read_sequence_file(path, sc).flatMap(
    StructureToPolymerChains()).filter(PdbjMineSearch(sql))

print(f"Number of entries matching query: {pdb.count()}")

# ## Terminate Spark Context

# In[7]:

sc.stop()
# ## Configure Spark

# In[2]:

from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.filters import (ContainsPolymerChainType, ContainsLProteinChain,
                                 ContainsDSaccharideChain, NotFilter)
from mmtfPyspark.structureViewer import view_structure

conf = SparkConf().setMaster("local[*]").setAppName("FilterByPolymerChainType")
sc = SparkContext(conf=conf)

# ## Read in MMTF Files, filter and count
#
# #### *NotFilter returns the opposite of the wrapped filter*

# In[3]:

path = "../../resources/mmtf_reduced_sample/"

structures = mmtfReader.read_sequence_file(path, sc).filter(
    ContainsPolymerChainType(ContainsPolymerChainType.DNA_LINKING,
                             ContainsPolymerChainType.RNA_LINKING)).filter(
                                 NotFilter(ContainsLProteinChain())).filter(
                                     NotFilter(ContainsDSaccharideChain()))

print(f"Number of pure DNA and RNA entires: {structures.count()}")

# ## View Structures

# In[4]:

structure_names = structures.keys().collect()
view_structure(structure_names, style='sphere')

# ## Terminate Spark

# In[5]:

sc.stop()