Example #1
    def test1(self):
        pdb = self.pdb.filter(ContainsLProteinChain()) \
                      .flatMap(StructureToPolymerChains()) \
                      .filter(ContainsLProteinChain())

        seq = secondaryStructureExtractor.get_dataset(pdb)

        self.assertTrue(seq.count() == 5)
Example #2
 def test1(self):
     pdb_1 = self.pdb.filter(ContainsLProteinChain())
     results_1 = pdb_1.keys().collect()
     self.assertTrue('2ONX' in results_1)
     self.assertFalse('1JLP' in results_1)
     self.assertTrue('5X6H' in results_1)
     self.assertFalse('5L2G' in results_1)
     self.assertFalse('2MK1' in results_1)
Example #3
    def test2(self):
        pdb_2 = self.pdb.filter(ContainsLProteinChain(exclusive=True))
        results_2 = pdb_2.keys().collect()

        self.assertTrue('2ONX' in results_2)
        self.assertFalse('1JLP' in results_2)
        self.assertFalse('5X6H' in results_2)
        self.assertFalse('5L2G' in results_2)
        self.assertFalse('2MK1' in results_2)
Example #4
    def test3(self):
        pdb_3 = self.pdb.flatMap(StructureToPolymerChains())
        pdb_3 = pdb_3.filter(ContainsLProteinChain())
        results_3 = pdb_3.keys().collect()

        self.assertTrue('2ONX.A' in results_3)
        self.assertFalse('1JLP.A' in results_3)
        self.assertTrue('5X6H.B' in results_3)
        self.assertFalse('5L2G.A' in results_3)
        self.assertFalse('5L2G.B' in results_3)
        self.assertFalse('2MK1.A' in results_3)
Example #5
# ## Read MMTF Hadoop sequence file
#
# Create a non-redundant set (<= 20% sequence identity) of L-protein chains

# In[3]:

path = "../../resources/mmtf_reduced_sample/"
sequenceIdentity = 20
resolution = 2.0
fraction = 0.1
seed = 123

pdb = mmtfReader \
    .read_sequence_file(path, sc) \
    .flatMap(StructureToPolymerChains()) \
    .filter(Pisces(sequenceIdentity, resolution)) \
    .filter(ContainsLProteinChain()) \
    .sample(False, fraction, seed)

# ## Get content

# In[4]:

segmentLength = 11
data = secondaryStructureSegmentExtractor.get_dataset(pdb,
                                                      segmentLength).cache()
print(f"original data   : {data.count()}")

# ## Drop Q3 and sequence duplicates

# In[5]:
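# The notebook cell for this step is not part of the excerpt. A minimal sketch of the
# deduplication it describes, assuming the segment dataset exposes "labelQ3" and
# "sequence" columns (the column names are an assumption, not confirmed by this excerpt):

data = data.dropDuplicates(["labelQ3", "sequence"]).cache()
print(f"- duplicate Q3/seq  : {data.count()}")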
Example #6
# ## Read in MMTF Files

# In[3]:

path = "../../resources/mmtf_reduced_sample/"

pdb = mmtfReader.read_sequence_file(path, sc)

# ## Filter structures that contain both a DNA chain and an L-protein chain
#
# 1) Retain PDB entries that contain L-protein chains
# 2) Retain PDB entries that contain DNA chains

# In[4]:

structures = pdb.filter(ContainsLProteinChain()).filter(ContainsDnaChain())

# ## Count the number of entries

# In[5]:

count = structures.count()

print(f"Number of entires that contain L-protein and L-DNA: {count}")

# ## Visualize Structures

# In[7]:

structure_names = structures.keys().collect()
view_structure(structure_names)
Example #7
sc = SparkContext(conf=conf)

# ## Read MMTF Hadoop sequence file
#
# Create a non-redundant set (<= 20% sequence identity) of L-protein chains

# In[3]:

path = "../../resources/mmtf_reduced_sample/"
fraction = 0.05
seed = 123

pdb = mmtfReader \
    .read_sequence_file(path, sc) \
    .flatMap(StructureToPolymerChains(False, True)) \
    .filter(ContainsLProteinChain()) \
    .sample(False, fraction, seed)

# ## Extract Element "H" from Secondary Structure

# In[4]:

label = "H"
data = secondaryStructureElementExtractor.get_dataset(pdb, label).cache()
print(f"original data   : {data.count()}")
data.show(10, False)

# ## Word2Vec encoded feature Vector

# In[6]:
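# The encoding cell is not included in the excerpt. A hedged sketch of one way to
# build the feature vectors with mmtfPyspark's ProteinSequenceEncoder; the import
# path, constructor, and overlapping_ngram_word2vec_encode signature are assumptions
# based on typical mmtf-pyspark usage, not taken from this excerpt:

from mmtfPyspark.ml import ProteinSequenceEncoder

n = 2            # n-gram size used to tokenize the sequences
windowSize = 25  # word2vec context window
vectorSize = 50  # dimension of the encoded feature vector

encoder = ProteinSequenceEncoder(data)
data = encoder.overlapping_ngram_word2vec_encode(n, windowSize, vectorSize).cache()
data.show(5, False)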

Example #8
# ## Read MMTF Hadoop sequence file
#
# Create a non-redundant set (<= 20% sequence identity) of L-protein chains

# In[3]:


path = "../../resources/mmtf_reduced_sample/"
sequenceIdentity = 20
resolution = 2.0
fraction = 0.1
seed = 123

pdb = mmtfReader \
    .read_sequence_file(path, sc) \
    .flatMap(StructureToPolymerChains()) \
    .filter(Pisces(sequenceIdentity, resolution)) \
    .filter(ContainsLProteinChain()) \
    .sample(False, fraction, seed)


# ## Get content

# In[4]:


segmentLength = 11
data = secondaryStructureSegmentExtractor.get_dataset(pdb, segmentLength).cache()
print(f"original data   : {data.count()}")


# ## Drop Q3 and sequence duplicates

# In[5]:
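# As in the earlier segment example, the deduplication cell is cut off here. Assuming
# the same "labelQ3" and "sequence" columns, the step would look like:

data = data.dropDuplicates(["labelQ3", "sequence"]).cache()
print(f"- duplicate Q3/seq  : {data.count()}")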
Example #9
    "secondaryStructureElementDemo")
sc = SparkContext(conf=conf)

# ## Download protein (1STP)
#
# ### Note: Need to use SparkContext as parameter to download Mmtf files

# In[12]:

pdb = mmtfReader.download_mmtf_files(['1STP'], sc).cache()

# ## Map protein to polymer chains and apply LProteinChain filter

# In[13]:

pdb = pdb.flatMap(StructureToPolymerChains()).filter(ContainsLProteinChain())

# ## Extract secondary structure element 'E'

# In[14]:

ds = secondaryStructureElementExtractor.get_dataset(pdb, 'E', 6)

ds.show(50, False)

# ## Terminate Spark

# In[15]:

sc.stop()
Example #10
# input parameters
resolution = 2.0
minInteractions = 2
maxInteractions = 4
distanceCutoff = 3.0
bFactorCutoff = 1.645
includeWaters = True

# ## Read PDB structures and filter by resolution, keeping only L-protein entries

# In[3]:

pdb = mmtfReader.read_sequence_file(path, sc)
pdb = pdb.filter(Resolution(minResolution=0.0, maxResolution=resolution)).filter(
    ContainsLProteinChain(exclusive=True))

# ## Set up criteria for water-mediated interactions

# In[4]:

interactions_filter = InteractionFilter()
interactions_filter.set_distance_cutoff(distanceCutoff)
interactions_filter.set_normalized_b_factor_cutoff(bFactorCutoff)
interactions_filter.set_min_interactions(minInteractions)
interactions_filter.set_max_interactions(maxInteractions)
interactions_filter.set_query_groups(True, ["HOH"])
interactions_filter.set_query_elements(True, "O")  # only use the water oxygen atom
interactions_filter.set_target_elements(True, ["O", "N", "S"])

# ## Exclude "uninteresting" ligands
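# The excerpt ends before this cell runs. A hedged sketch of how common
# crystallization additives and other "uninteresting" ligands could be excluded;
# the set_prohibited_target_groups call and the specific group list are
# assumptions, not taken from this excerpt:

prohibited_ligands = ["DOD", "EDO", "GOL", "PEG", "SO4", "PO4", "CL", "NA"]
interactions_filter.set_prohibited_target_groups(prohibited_ligands)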