예제 #1
0
    def testExtractEntities(self):
        """Test case - extract entities

        Constructs an ObjectExtractor for the "pdbx_core_polymer_entity"
        collection of the "pdbx_core" database (with useCache=False), checks
        the reported object count against the configured test limit when a
        limit is set, and then exercises path list generation and -- in
        verbose mode -- value extraction for every returned object.

        Any exception (including an assertion failure) is logged with its
        traceback and reported as a test failure.
        """
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName="pdbx_core",
                collectionName="pdbx_core_polymer_entity",
                cacheFilePath=os.path.join(self.__workPath,
                                           "entity-data-test-cache.json"),
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=self.__testEntryCacheKwargs,
                objectLimit=self.__objectLimitTest,
            )
            eCount = obEx.getCount()
            logger.info("Entity count is %d", eCount)
            # A limit of None gives no lower bound to verify, and comparing
            # an int with None would raise TypeError rather than produce a
            # meaningful assertion result.
            if self.__objectLimitTest is not None:
                self.assertGreaterEqual(eCount, self.__objectLimitTest)

            objD = obEx.getObjects()
            # Only the objects themselves are needed to accumulate paths.
            for obj in objD.values():
                obEx.genPathList(obj, path=None)
            #
            pL = obEx.getPathList(filterList=False)
            logger.debug("Path list (unfiltered) %r", pL)
            #
            # The default (filtered) path list is the one installed on the
            # extractor before values are generated below.
            pL = obEx.getPathList()
            logger.debug("Path list %r", pL)
            obEx.setPathList(pL)
            if self.__verbose:
                for ky, obj in objD.items():
                    obEx.genValueList(obj, path=None)
                    tD = obEx.getValues()
                    logger.info("Index object %r %s", ky,
                                pprint.pformat(tD, indent=3, width=120))

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
예제 #2
0
    def testExtractSelectedEntityContent(self):
        """Test case - extract selected entity content

        Requests only "rcsb_id" and the reference sequence identifiers for
        entities matching the selection query on
        "entity_poly.rcsb_entity_polymer_type" == "Protein", then tallies the
        UniProt accessions attributed to the "PDB" and "SIFTS" provenance
        sources and logs how often the two sets of assignments differ.

        Example of the selected content:

        "reference_sequence_identifiers": [
                    {
                        "database_name": "UniProt",
                        "database_accession": "Q5SHN1",
                        "provenance_source": "SIFTS"
                    },
                    {
                        "database_name": "UniProt",
                        "database_accession": "Q5SHN1",
                        "provenance_source": "PDB"
                    }
                    ]
        """
        # Literal selection arguments, named here to keep the constructor
        # call compact.
        proteinQuery = {"entity_poly.rcsb_entity_polymer_type": "Protein"}
        selectedFields = [
            "rcsb_id",
            "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers",
        ]
        try:
            # No object limit is passed to the extractor in this test.
            extractor = ObjectExtractor(
                self.__cfgOb,
                databaseName="pdbx_core",
                collectionName="pdbx_core_polymer_entity",
                cacheFilePath=os.path.join(
                    self.__workPath,
                    "entity-selected-content-test-cache.json"),
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=self.__testEntryCacheKwargs,
                objectLimit=None,
                selectionQuery=proteinQuery,
                selectionList=selectedFields,
            )
            entityCount = extractor.getCount()
            logger.info("Entity count is %d", entityCount)

            # The count check and path/value exercise only run when a test
            # limit has been configured.
            testLimit = self.__objectLimitTest
            if testLimit is not None:
                self.assertGreaterEqual(entityCount, testLimit)
                entityObjD = extractor.getObjects()
                for _, entityObj in entityObjD.items():
                    extractor.genPathList(entityObj, path=None)

                unfilteredPathL = extractor.getPathList(filterList=False)
                logger.debug("Path list (unfiltered) %r", unfilteredPathL)

                pathL = extractor.getPathList()
                logger.debug("Path list %r", pathL)
                extractor.setPathList(pathL)
                if self.__verbose:
                    for objKey, entityObj in entityObjD.items():
                        extractor.genValueList(entityObj, path=None)
                        valueD = extractor.getValues()
                        logger.info(
                            "Index object %r %s", objKey,
                            pprint.pformat(valueD, indent=3, width=120))

            entityObjD = extractor.getObjects()
            numCompared = 0
            numDiffering = 0
            pdbTallyD = defaultdict(int)
            siftsTallyD = defaultdict(int)
            pdbDiffTallyD = defaultdict(int)
            for entityKey, entityD in entityObjD.items():
                try:
                    siftsIdS = set()
                    pdbIdS = set()
                    containerD = entityD[
                        "rcsb_polymer_entity_container_identifiers"]
                    for refD in containerD["reference_sequence_identifiers"]:
                        if refD["database_name"] != "UniProt":
                            logger.debug("No UniProt for %r", containerD)
                            continue
                        # The accession is only read for the two provenance
                        # sources that are tallied.
                        source = refD["provenance_source"]
                        if source == "SIFTS":
                            siftsIdS.add(refD["database_accession"])
                            siftsTallyD[refD["database_accession"]] += 1
                        elif source == "PDB":
                            pdbIdS.add(refD["database_accession"])
                            pdbTallyD[refD["database_accession"]] += 1
                    logger.debug("PDB assigned sequence length %d",
                                 len(pdbIdS))
                    logger.debug("SIFTS assigned sequence length %d",
                                 len(siftsIdS))

                    # Only entities with assignments from both sources are
                    # compared.
                    if not (pdbIdS and siftsIdS):
                        continue
                    numCompared += 1
                    if pdbIdS != siftsIdS:
                        numDiffering += 1
                        for accession in pdbIdS:
                            pdbDiffTallyD[accession] += 1
                except Exception as err:
                    logger.warning("No identifiers for %s with %s", entityKey,
                                   str(err))

            logger.info("Total %d differences %d", numCompared, numDiffering)
            logger.info("Unique UniProt ids  PDB %d  SIFTS %d",
                        len(pdbTallyD), len(siftsTallyD))
            logger.info("Unique UniProt differences %d ", len(pdbDiffTallyD))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()