Пример #1
0
    def test1(self):
        """Check which structure keys survive the PISCES filter.

        Filters ``self.pdb`` with ``Pisces(20, 2.0)`` and verifies, on the
        collected keys, that two structure IDs are kept and two are removed.
        """
        pdb_1 = self.pdb.filter(Pisces(20, 2.0))
        results_1 = pdb_1.keys().collect()

        # assertIn/assertNotIn show the collected keys when a check fails,
        # whereas assertTrue(x in y) only reports "False is not true".
        self.assertIn('5X42', results_1)
        self.assertIn('4R4X', results_1)
        self.assertNotIn('2ONX', results_1)
        self.assertNotIn('1JLP', results_1)
Пример #2
0
    def test2(self):
        """Check which chain keys survive the PISCES filter.

        Applies ``StructureToPolymerChains()`` via ``flatMap`` to
        ``self.pdb``, filters the result with ``Pisces(20, 2.0)`` and
        verifies the collected ``<id>.<chain>`` keys.
        """
        pdb_2 = self.pdb.flatMap(StructureToPolymerChains())
        pdb_2 = pdb_2.filter(Pisces(20, 2.0))
        results_2 = pdb_2.keys().collect()

        # assertIn/assertNotIn show the collected keys when a check fails,
        # whereas assertTrue(x in y) only reports "False is not true".
        self.assertIn('5X42.B', results_2)
        self.assertIn('4R4X.A', results_2)
        self.assertNotIn('5X42.A', results_2)
        self.assertNotIn('2ONX.A', results_2)
        self.assertNotIn('1JLP.A', results_2)
Пример #3
0
# Create variables
APP_NAME = "MMTF_Spark"
path = "../../resources/mmtf_full_sample/"

# Configure Spark with the "local[*]" master
conf = (
    SparkConf()
    .setAppName(APP_NAME)
    .setMaster("local[*]")
)
sc = SparkContext(conf=conf)


# ## Read PDB and create PISCES non-redundant set

# In[14]:


# Read the sequence file and keep only the entries that pass the PISCES
# filter (sequenceIdentity=30, resolution=2.5).
pdb = mmtfReader.read_sequence_file(path, sc).filter(
    Pisces(sequenceIdentity=30, resolution=2.5)
)


# ## Setup criteria for metal interactions

# In[15]:


# Chemical component codes of metals in different oxidation states
metals = {
    "V",
    "CR",
    "MN", "MN3",
    "FE", "FE2",
    "CO", "3CO",
    "NI", "3NI",
    "CU", "CU1", "CU3",
    "ZN",
    "MO", "4MO", "6MO",
}

interactions_filter = InteractionFilter(
    distanceCutoff=3.0,
    minInteractions=4,
    maxInteractions=6,
)
interactions_filter.set_query_groups(True, metals)

# Exclude non-polar interactions
interactions_filter.set_target_elements(False, ['H', 'C', 'P'])
Пример #4
0
# ## Read MMTF Hadoop sequence file and
#
# create a non-redundant set (<=20% seq. identity) of L-protein chains

# In[3]:

path = "../../resources/mmtf_reduced_sample/"
sequenceIdentity = 20
resolution = 2.0
fraction = 0.1
seed = 123

# One step per line: read -> flatMap over polymer chains -> PISCES filter
# -> L-protein filter -> sample (without replacement, fixed seed).
pdb = (
    mmtfReader.read_sequence_file(path, sc)
    .flatMap(StructureToPolymerChains())
    .filter(Pisces(sequenceIdentity, resolution))
    .filter(ContainsLProteinChain())
    .sample(False, fraction, seed)
)

# ## Get content

# In[4]:

segmentLength = 11
data = secondaryStructureSegmentExtractor.get_dataset(
    pdb, segmentLength
).cache()
print(f"original data   : {data.count()}")

# ## Drop Q3 and sequence duplicates

# In[5]:
Пример #5
0

path = "../../resources/mmtf_full_sample/"

pdb = mmtfReader.read_sequence_file(path, sc)


# ## Filter by representative protein chains at 40% sequence identity

# In[7]:


sequenceIdentity = 40
resolution = 2.0

# The PISCES filter is applied twice: once before and once after the
# flatMap over StructureToPolymerChains().
pdb = (
    pdb.filter(Pisces(sequenceIdentity, resolution))
    .flatMap(StructureToPolymerChains())
    .filter(Pisces(sequenceIdentity, resolution))
    .filter(PolymerComposition(PolymerComposition.AMINO_ACIDS_20))
)


# ## Show top 10 structures

# In[8]:


pdb.top(10)


# ## Save representative set

# In[9]:


# ## Read MMTF Hadoop sequence file and
#
# create a non-redundant set (<=20% seq. identity) of L-protein chains

# In[3]:


path = "../../resources/mmtf_reduced_sample/"
sequenceIdentity = 20
resolution = 2.0
fraction = 0.1
seed = 123

# Fluent chain, one transformation per line; sample() draws without
# replacement using the fixed seed above.
pdb = (
    mmtfReader
    .read_sequence_file(path, sc)
    .flatMap(StructureToPolymerChains())
    .filter(Pisces(sequenceIdentity, resolution))
    .filter(ContainsLProteinChain())
    .sample(False, fraction, seed)
)


# ## Get content

# In[4]:


segmentLength = 11
data = secondaryStructureSegmentExtractor.get_dataset(
    pdb,
    segmentLength,
).cache()
print(f"original data   : {data.count()}")


# ## Drop Q3 and sequence duplicates

# In[5]:
# ## Configure Spark Context

# In[18]:

conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("MachineLearningDemo")
)

sc = SparkContext(conf=conf)

# ## Read MMTF File and create a non-redundant set (<=40% seq. identity) of L-protein chains

# In[19]:

# NOTE(review): the heading mentions L-protein chains, but no
# ContainsLProteinChain() filter is applied in this cell - confirm intent.
pdb = (
    mmtfReader.read_sequence_file('../../resources/mmtf_reduced_sample/', sc)
    .flatMap(StructureToPolymerChains())
    .filter(Pisces(sequenceIdentity=40, resolution=3.0))
)

# ## Get secondary structure content

# In[20]:

data = secondaryStructureExtractor.get_dataset(pdb)

# ## Define addProteinFoldType function

# In[21]:


def add_protein_fold_type(data, minThreshold, maxThreshold):
    '''
    Adds a column "foldType" with three major secondary structure class:
    "ProteinFoldDatasetCreatorDemo")

sc = SparkContext(conf=conf)

# ## Read MMTF Hadoop sequence file
#
# Create non-redundant set (<=40% seq. identity) of L-protein chains

# In[15]:

path = "../../resources/mmtf_reduced_sample/"
sequenceIdentity = 40
resolution = 2.0

# The PISCES filter is applied twice: once before and once after the
# flatMap over StructureToPolymerChains().
pdb = (
    mmtfReader.read_sequence_file(path, sc)
    .filter(Pisces(sequenceIdentity, resolution))
    .flatMap(StructureToPolymerChains())
    .filter(Pisces(sequenceIdentity, resolution))
    .filter(ContainsLProteinChain())
)

# ## Get secondary structure content

# In[16]:

data = secondaryStructureExtractor.get_dataset(pdb)

# ## Classify chains by secondary structure type

# In[17]:

minThreshold = 0.05
maxThreshold = 0.15
resolution = 2.5
minInteractions = 4
maxInteractions = 6
distanceCutoff = 3.0

# Chemical component codes of metals in different oxidation states
metals = {
    "V",
    "CR",
    "MN", "MN3",
    "FE", "FE2",
    "CO", "3CO",
    "NI", "3NI",
    "CU", "CU1", "CU3",
    "ZN",
    "MO", "4MO", "6MO",
}


# ## Read PDB and create PISCES non-redundant set

# In[12]:


# NOTE(review): `path` and `sequenceIdentityCutoff` are not assigned in the
# lines above; presumably they are set earlier - confirm.
pdb = mmtfReader.read_sequence_file(path, sc).filter(
    Pisces(sequenceIdentity=sequenceIdentityCutoff, resolution=resolution)
)


# ## Setup criteria for metal interactions

# In[13]:


# Configure the filter through its setters, using the cutoffs defined above.
interactions_filter = InteractionFilter()
interactions_filter.set_distance_cutoff(distanceCutoff)
interactions_filter.set_min_interactions(minInteractions)
interactions_filter.set_max_interactions(maxInteractions)
interactions_filter.set_query_groups(True, metals)

# Exclude non-polar interactions
interactions_filter.set_target_elements(False, ['H', 'C', 'P'])
Пример #10
0
# ## Read PDB in MMTF format

# In[3]:

path = "../../resources/mmtf_full_sample/"

pdb = mmtfReader.read_sequence_file(path, sc)

# ## Use only representative structures

# In[4]:

# Cutoffs passed to the PISCES filter below.
seqId = 40
resolution = 2.0

pdb = pdb.filter(Pisces(seqId, resolution))

# ## Extract proteins with Zn interactions

# In[5]:

# NOTE(review): the second argument (3) is presumably a distance cutoff -
# confirm against the groupInteractionExtractor signature.
finder = groupInteractionExtractor("ZN", 3)

# cache() keeps the extracted dataset around for reuse.
interactions = finder.get_dataset(pdb).cache()

# ## List the top 10 residue types that interact with Zn

# In[6]:

# Print the schema of the interactions dataset.
interactions.printSchema()