    def test1(self):
        # Request entry-level fields: PubMed Central ID, PubMed ID, and deposition date
        ds = customReportService.get_dataset(
            ["pmc", "pubmedId", "depositionDate"])
        self.assertEqual(
            str(ds.schema),
            "StructType(List(StructField(structureId,StringType,true),StructField(pmc,StringType,true),StructField(pubmedId,IntegerType,true),StructField(depositionDate,TimestampType,true)))"
        )
        self.assertTrue(ds.count() > 130101)

    def test2(self):
        # Request a chain-level field; the dataset then also contains
        # structureChainId and chainId columns
        ds = customReportService.get_dataset(["ecNo"])
        self.assertEqual(
            str(ds.schema),
            "StructType(List(StructField(structureChainId,StringType,true),StructField(structureId,StringType,true),StructField(chainId,StringType,true),StructField(ecNo,StringType,true)))"
        )
        self.assertTrue(ds.count() > 130101)
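These methods assume a unittest.TestCase whose setUp creates a Spark session before get_dataset is called. A minimal sketch of that scaffolding, with the class name and import path as assumptions rather than part of the original fragment:

import unittest
from pyspark.sql import SparkSession
from mmtfPyspark.datasets import customReportService  # assumed import path


class CustomReportServiceTest(unittest.TestCase):  # hypothetical class name

    def setUp(self):
        # A local Spark session must exist before get_dataset() is called
        self.spark = SparkSession.builder.master("local[*]") \
            .appName("customReportServiceTest").getOrCreate()

    def tearDown(self):
        self.spark.stop()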
Example #3
    def __init__(self, whereClause, fields):

        # Check if fields are in a list or string
        if isinstance(fields, str):

            if ',' in fields:
                fields = fields.split(',')

            else:
                fields = [fields]

        # Get requested data columns
        dataset = customReportService.get_dataset(fields)

        # Check if the results contain chain level data
        self.chainLevel = "structureChainId" in dataset.columns

        # Create a temporary view of the dataset
        dataset.createOrReplaceTempView("table")

        # Create SparkSession
        spark = SparkSession.builder.getOrCreate()

        # Run SQL query
        if self.chainLevel:
            # For chain level data
            sql = "SELECT structureChainId, structureId, chainId FROM table " \
                  + whereClause
            results = spark.sql(sql)

            # Add both PDB entry and chain level data, so chain-based data can be filtered
            self.pdbIds = results.distinct().rdd.map(lambda x: x[0]).collect()
            self.pdbIds += results.distinct().rdd.map(lambda x: x[1]).collect()

        else:
            # For PDB entry level data
            sql = "SELECT structureId FROM table " + whereCaluse
            results = spark.sql(sql)
            self.pdbIds = results.distinct().rdd.map(lambda x: x[0]).collect()

        self.pdbIds = list(set(self.pdbIds))
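This constructor reads like part of a web-filter class: the collected pdbIds list is later used to keep matching structures. A hedged usage sketch follows, assuming the class is named CustomReportQuery (a hypothetical name here) and implements __call__ to test membership in self.pdbIds:

# Hypothetical usage; 'CustomReportQuery' and 'structures' are assumptions,
# not names confirmed by the original fragment.
query = CustomReportQuery("WHERE ecNo LIKE '2.7.11.%'", "ecNo")
kinases = structures.filter(query)  # keeps entries whose ID is in query.pdbIds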
Example #4
# ## Configure Spark

# In[2]:

# Imports assumed from an earlier cell (In[1]) not included in this fragment
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from mmtfPyspark.datasets import customReportService

conf = SparkConf().setMaster("local[*]").setAppName(
    "secondaryStructureSegmentDemo")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# ## Retrieve PDB annotation:
# Binding affinities (Ki, Kd), group name of the ligand (hetId), and the Enzyme Classification number (ecNo)

# In[3]:

ds = customReportService.get_dataset(["Ki", "Kd", "hetId", "ecNo"])

# ## Show the schema of this dataset

# In[4]:

ds.printSchema()

# ## Filtering
#
# ### Select structures that have Ki or Kd value(s) and are protein-serine/threonine kinases (EC 2.7.1.*)
#
#
# #### A. By using dataset operations

# In[5]:
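
# A minimal sketch of the dataset-operations filter described above. The
# original cell body is missing from this fragment, so this is an assumption
# based on the columns requested earlier (Ki, Kd, ecNo):

ds = ds.filter(ds.Ki.isNotNull() | ds.Kd.isNotNull()) \
       .filter(ds.ecNo.startswith("2.7.1."))
ds.toPandas().head(10)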
Example #5
# Imports assumed from earlier cells not included in this fragment
from mmtfPyspark.datasets import drugBankDataset, customReportService

drugBank = drugBankDataset.get_open_drug_links()
drugBank.toPandas().head(10)

# ## Filter out DrugBank entries without StandardInChIKey

# In[3]:

drugBank = drugBank.filter(drugBank.StandardInChIKey.isNotNull())
drugBank.toPandas().head(5)

# ## Get PDB ligand annotations

# In[4]:

ligands = customReportService.get_dataset([
    "ligandId", "ligandMolecularWeight", "ligandFormula", "ligandSmiles",
    "InChIKey"
])
ligands.toPandas().head(10)

# ## Filter out PDB ligand entries without InChIKey

# In[5]:

ligands = ligands.filter(ligands.InChIKey.isNotNull())
ligands.toPandas().head(5)

# ## Join ligand dataset with DrugBank info by InChIKey

# In[6]:

ligands = ligands.join(drugBank, ligands.InChIKey == drugBank.StandardInChIKey)
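
# ## Show a few rows of the joined dataset

# In[7]:

# A short follow-up sketch; the selected columns are illustrative choices,
# not taken from the original notebook.
ligands.select("ligandId", "ligandMolecularWeight", "StandardInChIKey") \
    .toPandas().head(10)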