def __extractCitations(self):
    """Fetch the ``rcsb_id`` and ``citation`` attributes from the configured collection.

    The database and collection names are read from this object's
    ``__databaseName`` and ``__collectionName`` attributes.  No cache file is
    used, no object limit is applied and the selection query is empty.

    Returns:
        dict: the value of ``ObjectExtractor.getObjects()``, or an empty
        dictionary if any step of the extraction raises.
    """
    try:
        extractorOptions = {
            "databaseName": self.__databaseName,
            "collectionName": self.__collectionName,
            "cacheFilePath": None,
            "useCache": False,
            "keyAttribute": "entry",
            "uniqueAttributes": ["rcsb_id"],
            "cacheKwargs": None,
            "objectLimit": None,
            "selectionQuery": {},
            "selectionList": ["rcsb_id", "citation"],
        }
        extractor = ObjectExtractor(self.__cfgOb, **extractorOptions)
        logger.info("Entry count is %d", extractor.getCount())
        return extractor.getObjects()
    except Exception as err:
        # Any failure is logged with its traceback and reported as "no data".
        logger.exception("Failing with %s", str(err))
    return {}
def __updateEntryInfo(self, cfgOb):
    """Collect the ``rcsb_entry_info`` sub-document of each entry.

    Args:
        cfgOb: configuration object handed straight to ``ObjectExtractor``.

    Returns:
        dict: ``{rcsb_id: rcsb_entry_info}`` for every extracted entry that
        carries an ``rcsb_entry_info`` value.  Whatever was accumulated before
        an unexpected failure is still returned.
    """
    entryInfoD = {}
    try:
        extractor = ObjectExtractor(
            cfgOb,
            databaseName="pdbx_core",
            collectionName="pdbx_core_entry",
            useCache=False,
            keyAttribute="entry",
            uniqueAttributes=["rcsb_id"],
            selectionQuery={},
            selectionList=["rcsb_id", "rcsb_entry_info.polymer_entity_count"],
        )
        logger.info("Entry count is %d", extractor.getCount())
        for entryD in extractor.getObjects().values():
            entryId = entryD["rcsb_id"]
            try:
                entryInfoD[entryId] = entryD["rcsb_entry_info"]
            except Exception:
                # Entries without entry-info are skipped on purpose.
                continue
    except Exception as err:
        logger.exception("Failing with %s", str(err))
    return entryInfoD
def __getPolymerReferenceSequenceAssignments(self, databaseName, collectionName, polymerType, fetchLimit):
    """Get the reference sequence assignments for polymer entities of the input polymer type.

    Args:
        databaseName (str): database to query
        collectionName (str): collection to query
        polymerType (str): value matched against ``entity_poly.rcsb_entity_polymer_type``
        fetchLimit (int or None): passed to ``ObjectExtractor`` as ``objectLimit``

    Returns:
        dict: the value of ``ObjectExtractor.getObjects()``; an empty dictionary
        when the extraction fails.  The documented shape is::

            {"1abc_1": {"rcsb_polymer_entity_container_identifiers": {"reference_sequence_identifiers": []},
                        "rcsb_entity_source_organism": {"ncbi_taxonomy_id": []}}, ...}
    """
    # Bind the result before the try block so the failure path returns an
    # empty mapping instead of raising UnboundLocalError at the return.
    objD = {}
    try:
        obEx = ObjectExtractor(
            self.__cfgOb,
            databaseName=databaseName,
            collectionName=collectionName,
            cacheFilePath=None,
            useCache=False,
            keyAttribute="entity",
            uniqueAttributes=["rcsb_id"],
            cacheKwargs=None,
            objectLimit=fetchLimit,
            selectionQuery={"entity_poly.rcsb_entity_polymer_type": polymerType},
            selectionList=[
                "rcsb_id",
                "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers",
                "rcsb_polymer_entity_container_identifiers.auth_asym_ids",
                "rcsb_entity_source_organism.ncbi_taxonomy_id",
            ],
        )
        eCount = obEx.getCount()
        logger.info("Polymer entity count type %s is %d", polymerType, eCount)
        objD = obEx.getObjects()
        logger.info("Reading polymer entity count %d reference accession length %d ", eCount, len(objD))
    except Exception as e:
        logger.exception("Failing for %s (%s) with %s", databaseName, collectionName, str(e))
    return objD
def testExtractDrugbankMapping(self):
    """Test case - count chemical components that carry a DrugBank identifier.

    Selects documents in ``bird_chem_comp_core`` where
    ``rcsb_chem_comp_container_identifiers.drugbank_id`` exists and requires
    at least three matches.  Any exception, including a failed assertion, is
    logged and reported through ``self.fail()``.
    """
    try:
        cachePath = os.path.join(self.__workPath, "drugbank-mapping-cache.json")
        hasDrugbankId = {"rcsb_chem_comp_container_identifiers.drugbank_id": {"$exists": True}}
        wantedAttributes = ["rcsb_id", "rcsb_chem_comp_container_identifiers", "rcsb_chem_comp_related"]
        extractor = ObjectExtractor(
            self.__cfgOb,
            databaseName="bird_chem_comp_core",
            collectionName="bird_chem_comp_core",
            cacheFilePath=cachePath,
            useCache=False,
            cacheKwargs=self.__testEntryCacheKwargs,
            keyAttribute="chem_comp",
            uniqueAttributes=["rcsb_id"],
            selectionQuery=hasDrugbankId,
            selectionList=wantedAttributes,
        )
        componentCount = extractor.getCount()
        logger.info("Component count ifs %d", componentCount)
        self.assertGreaterEqual(componentCount, 3)
    except Exception as err:
        logger.exception("Failing with %s", str(err))
        self.fail()
def testExtractEntriesBefore(self):
    """Test case - extract entries subject to a release-date restriction.

    A cutoff is obtained from ``TimeUtil`` using ``before={"days": 365 * 5}``
    and entries whose ``rcsb_accession_info.initial_release_date`` is ``$gt``
    that cutoff are selected.  At least six entries are required.
    """
    try:
        timeUtil = TimeUtil()
        cutoffStamp = timeUtil.getTimestamp(useUtc=True, before={"days": 365 * 5})
        cutoffDate = timeUtil.getDateTimeObj(cutoffStamp)
        dateFilter = {"rcsb_accession_info.initial_release_date": {"$gt": cutoffDate}}
        extractor = ObjectExtractor(
            self.__cfgOb,
            databaseName="pdbx_core",
            collectionName="pdbx_core_entry",
            useCache=False,
            keyAttribute="entry",
            uniqueAttributes=["rcsb_id"],
            selectionQuery=dateFilter,
            selectionList=["rcsb_id", "rcsb_accession_info"],
        )
        # Objects are fetched before the count, as in the original call order.
        entryD = extractor.getObjects()
        entryCount = extractor.getCount()
        logger.info("Entry count is %d", entryCount)
        logger.info("Entries are %r", list(entryD.keys()))
        self.assertGreaterEqual(entryCount, 6)
    except Exception as err:
        logger.exception("Failing with %s", str(err))
        self.fail()
def __getPolymerReferenceSequenceAssignments(self, databaseName, collectionName, polymerType, **kwargs):
    """Get the reference sequence assignments for polymer entities of the input polymer type.

    Args:
        databaseName (str): database to query
        collectionName (str): collection to query
        polymerType (str): value matched against ``entity_poly.rcsb_entity_polymer_type``
        **kwargs: optional settings -
            cachePath (str): base directory for the cache file (default ".")
            cacheKwargs (dict): cache serialization options (default ``{"fmt": "json", "indent": 3}``)
            useCache (bool): passed to ``ObjectExtractor`` (default True)
            fetchLimit (int or None): passed to ``ObjectExtractor`` as ``objectLimit`` (default None)

    Returns:
        dict: the value of ``ObjectExtractor.getObjects()``; an empty dictionary
        when the extraction fails.  The documented shape is::

            {"1abc_1": {"rcsb_entity_container_identifiers": {"reference_sequence_identifiers": []},
                        "rcsb_polymer_entity_align": [],
                        "rcsb_entity_source_organism": {"ncbi_taxonomy_id": []}}, ...}
    """
    cachePath = kwargs.get("cachePath", ".")
    exDbDir = "exdb"
    cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "json", "indent": 3})
    useCache = kwargs.get("useCache", True)
    fetchLimit = kwargs.get("fetchLimit", None)
    cacheFilePath = os.path.join(cachePath, exDbDir, "entity-poly-ref-seq-assign-cache.json")
    # Bind the result before the try block so the failure path returns an
    # empty mapping instead of raising UnboundLocalError at the return.
    objD = {}
    try:
        obEx = ObjectExtractor(
            self.__cfgOb,
            databaseName=databaseName,
            collectionName=collectionName,
            cacheFilePath=cacheFilePath,
            useCache=useCache,
            keyAttribute="entity",
            uniqueAttributes=["rcsb_id"],
            cacheKwargs=cacheKwargs,
            objectLimit=fetchLimit,
            selectionQuery={"entity_poly.rcsb_entity_polymer_type": polymerType},
            selectionList=[
                "rcsb_id",
                "rcsb_entity_container_identifiers.reference_sequence_identifiers",
                "rcsb_entity_container_identifiers.auth_asym_ids",
                "rcsb_polymer_entity_align",
                "rcsb_entity_source_organism.ncbi_taxonomy_id",
            ],
        )
        eCount = obEx.getCount()
        logger.info("Entity count is %d", eCount)
        objD = obEx.getObjects()
        logger.info("Reading polymer entity entity count %d ref accession length %d ", eCount, len(objD))
    except Exception as e:
        logger.exception("Failing for %s (%s) with %s", databaseName, collectionName, str(e))
    return objD
def __getReferenceData(self, databaseName, collectionName, selectD=None):
    """Return the documents of a collection that match an optional selection query.

    Args:
        databaseName (str): database to query
        collectionName (str): collection to query
        selectD (dict, optional): selection query passed to ``ObjectExtractor``

    Returns:
        the value of ``ObjectExtractor.getObjects()`` for the query.
        Exceptions are not handled here and propagate to the caller.
    """
    logger.info("Searching %s %s with selection query %r", databaseName, collectionName, selectD)
    extractorOptions = {
        "databaseName": databaseName,
        "collectionName": collectionName,
        "keyAttribute": "rcsb_id",
        "uniqueAttributes": ["rcsb_id"],
        "selectionQuery": selectD,
    }
    extractor = ObjectExtractor(self.__cfgOb, **extractorOptions)
    logger.debug("Reference data match count %d", extractor.getCount())
    return extractor.getObjects()
def __extractLigandNeighbors(self):
    """Extract the unique ligand neighbors recorded for each polymer entity.

    Reads ``rcsb_ligand_neighbors`` from the ``pdbx_core_polymer_entity_instance``
    collection and groups the (ligand_comp_id, ligand_is_bound) pairs of all
    instances under the key ``<entry_id>_<entity_id>``.

    Returns:
        dict: ``{"<entry_id>_<entity_id>": [(ligand_comp_id, ligand_is_bound), ...], ...}``
        with duplicate pairs removed.  An empty dictionary is returned when the
        extraction itself fails, so callers always receive a dictionary.
    """
    try:
        databaseName = "pdbx_core"
        collectionName = "pdbx_core_polymer_entity_instance"
        obEx = ObjectExtractor(
            self.__cfgOb,
            databaseName=databaseName,
            collectionName=collectionName,
            cacheFilePath=None,
            useCache=False,
            keyAttribute="rcsb_id",
            uniqueAttributes=["rcsb_id"],
            cacheKwargs=None,
            objectLimit=None,
            selectionQuery=None,
            selectionList=[
                "rcsb_id",
                "rcsb_polymer_entity_instance_container_identifiers.entry_id",
                "rcsb_polymer_entity_instance_container_identifiers.entity_id",
                "rcsb_polymer_entity_instance_container_identifiers.asym_id",
                "rcsb_ligand_neighbors.ligand_comp_id",
                "rcsb_ligand_neighbors.ligand_is_bound",
            ],
        )
        eCount = obEx.getCount()
        logger.info("Total neighbor count (%d)", eCount)
        rD = {}
        objD = obEx.getObjects()
        for _, peiD in objD.items():
            # A malformed instance is logged and skipped; the others are still processed.
            try:
                entryId = peiD["rcsb_polymer_entity_instance_container_identifiers"]["entry_id"]
                entityId = peiD["rcsb_polymer_entity_instance_container_identifiers"]["entity_id"]
                ky = entryId + "_" + entityId
                for lnD in peiD["rcsb_ligand_neighbors"] if "rcsb_ligand_neighbors" in peiD else []:
                    if "ligand_comp_id" in lnD and "ligand_is_bound" in lnD:
                        # A set collapses pairs repeated across instances of one entity.
                        rD.setdefault(ky, set()).add((lnD["ligand_comp_id"], lnD["ligand_is_bound"]))
                    else:
                        logger.warning("%s %s missing details lnD %r", entryId, entityId, lnD)
            except Exception as e:
                logger.exception("Failing with %s", str(e))
        rD = {k: list(v) for k, v in rD.items()}
        logger.info("Unique instance %d", len(rD))
        return rD
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    # Previously this path fell off the end and returned None.
    return {}
def getReferenceSequenceDetails(self):
    """Get reference protein sequence essential details (sequence, taxonomy, name, gene, ...)

    Reads every document of ``uniprot_exdb.reference_entry`` and reduces it to
    the fields needed downstream.

    Returns:
        dict: ``{accession: {"accession": ..., "taxId": ..., "scientific_name": ...,
        "gene": ..., "name": ..., "sequence": ...}, ...}`` where "gene" is the
        name of the first gene of type "primary" (or None) and "name" is the
        first name of nameType "recommendedName" (or None).  Processing stops at
        the first record that raises; the records collected up to that point
        (possibly none) are returned.
    """
    # Kept in the enclosing scope so the failure log can show the record being processed.
    uD = None
    # Bound before the try block so a failure while creating or querying the
    # extractor returns an empty mapping instead of raising UnboundLocalError.
    rD = {}
    try:
        obEx = ObjectExtractor(
            self.__cfgOb,
            databaseName="uniprot_exdb",
            collectionName="reference_entry",
            useCache=False,
            keyAttribute="uniprot",
            uniqueAttributes=["rcsb_id"],
            selectionQuery={},
            selectionList=["source_scientific", "taxonomy_id", "rcsb_id", "gene", "names", "sequence"],
        )
        #
        eCount = obEx.getCount()
        logger.info("Reference entry count is %d", eCount)
        objD = obEx.getObjects()
        for rId, uD in objD.items():
            taxId = uD["taxonomy_id"]
            sn = uD["source_scientific"]
            sequence = uD["sequence"]
            gn = None
            pn = None
            if "gene" in uD:
                for tD in uD["gene"]:
                    if tD["type"] == "primary":
                        gn = tD["name"]
                        break
            for tD in uD["names"]:
                if tD["nameType"] == "recommendedName":
                    pn = tD["name"]
                    break
            rD[rId] = {"accession": rId, "taxId": taxId, "scientific_name": sn, "gene": gn, "name": pn, "sequence": sequence}
    except Exception as e:
        logger.exception("Failing uD %r with %s", uD, str(e))
    #
    return rD
def getBranchedDetails(self):
    """Get branched entity details (BIRD mapping and WURCS descriptors)

    Returns:
        dict: ``{rcsb_id: {"prdId": <prd_id or None>, "wurcs": <descriptor or None>}, ...}``
        holding only the branched entities for which at least one of the two
        values is truthy.  When several descriptors of type "WURCS" are present
        the last one is kept.  Entries collected before an unexpected failure
        are still returned.
    """
    detailsD = {}
    try:
        extractor = ObjectExtractor(
            self.__cfgOb,
            databaseName="pdbx_core",
            collectionName="pdbx_core_branched_entity",
            useCache=False,
            keyAttribute="entity",
            uniqueAttributes=["rcsb_id"],
            selectionQuery={},
            selectionList=["rcsb_id", "pdbx_entity_branch_descriptor", "rcsb_branched_entity_container_identifiers"],
        )
        for entityD in extractor.getObjects().values():
            entityId = entityD["rcsb_id"]
            #
            birdId = None
            try:
                birdId = entityD["rcsb_branched_entity_container_identifiers"]["prd_id"]
            except Exception:
                # No BIRD assignment for this entity.
                pass
            #
            wurcsDescriptor = None
            try:
                for descriptorD in entityD["pdbx_entity_branch_descriptor"]:
                    if descriptorD["type"] == "WURCS":
                        wurcsDescriptor = descriptorD["descriptor"]
            except Exception:
                # Missing or malformed descriptor list; keep what was found so far.
                pass
            if not (birdId or wurcsDescriptor):
                continue
            detailsD[entityId] = {"prdId": birdId, "wurcs": wurcsDescriptor}
    except Exception as err:
        logger.exception("Failing with %s", str(err))
    return detailsD
def getChemCompAccessionMapping(self, referenceResourceName):
    """Map accession codes of an external reference resource to chemical component identifiers.

    Args:
        referenceResourceName (str): resource name (e.g. DrugBank, ChEMBL, CCDC)

    Returns:
        dict: ``{resource_accession_code: [comp_id, comp_id, ...], ...}`` built
        from the ``rcsb_chem_comp_related`` records whose ``resource_name``
        equals the input name.  Mappings collected before an unexpected
        failure are still returned.
    """
    accessionD = {}
    try:
        databaseName = "bird_chem_comp_core"
        collectionName = "bird_chem_comp_core"
        selectD = {"rcsb_chem_comp_related.resource_name": referenceResourceName}
        logger.info("Searching %s %s with selection query %r", databaseName, collectionName, selectD)
        extractor = ObjectExtractor(
            self.__cfgOb,
            databaseName=databaseName,
            collectionName=collectionName,
            keyAttribute="rcsb_id",
            uniqueAttributes=["rcsb_id"],
            selectionQuery=selectD,
            selectionList=["rcsb_id", "rcsb_chem_comp_related"],
            stripObjectId=True,
        )
        logger.info("Reference data object count %d", extractor.getCount())
        for doc in extractor.getObjects().values():
            if "rcsb_chem_comp_related" not in doc:
                continue
            for relatedD in doc["rcsb_chem_comp_related"]:
                # A document may list several resources; keep only the requested one.
                if relatedD["resource_name"] != referenceResourceName:
                    continue
                if "resource_accession_code" not in relatedD:
                    continue
                accessionD.setdefault(relatedD["resource_accession_code"], []).append(relatedD["comp_id"])
    except Exception as err:
        logger.exception("Failing with %s", str(err))
    return accessionD
def testExtractEntityTaxonomyContent(self):
    """Test case - extract unique entity source and host taxonomies

    Gathers the ``ncbi_taxonomy_id`` values of the source and host organism
    records of every polymer entity into one set and logs its size.  The test
    only fails when the extraction itself raises.
    """
    organismKeys = ("rcsb_entity_source_organism", "rcsb_entity_host_organism")
    try:
        extractor = ObjectExtractor(
            self.__cfgOb,
            databaseName="pdbx_core",
            collectionName="pdbx_core_polymer_entity",
            cacheFilePath=os.path.join(self.__workPath, "entity-taxonomy-test-cache.json"),
            useCache=False,
            keyAttribute="entity",
            uniqueAttributes=["rcsb_id"],
            cacheKwargs=self.__testEntryCacheKwargs,
            objectLimit=None,
            selectionQuery=None,
            selectionList=["rcsb_id", "rcsb_entity_source_organism.ncbi_taxonomy_id", "rcsb_entity_host_organism.ncbi_taxonomy_id"],
        )
        logger.info("Polymer entity count is %d", extractor.getCount())
        uniqueTaxIds = set()
        for entityD in extractor.getObjects().values():
            for organismKey in organismKeys:
                try:
                    for organismD in entityD[organismKey]:
                        uniqueTaxIds.add(organismD["ncbi_taxonomy_id"])
                except Exception:
                    # Organism records or taxonomy ids are optional.
                    pass
        logger.info("Unique taxons %d", len(uniqueTaxIds))
    except Exception as err:
        logger.exception("Failing with %s", str(err))
        self.fail()
def testExtractEntityTaxonomyContent(self):
    """Test case - export the unique source taxonomy ids of each polymer entity.

    For every polymer entity the ``ncbi_taxonomy_id`` values of its source
    organism records are de-duplicated, sorted as strings and joined with "|".
    One ``<rcsb_id><TAB><ids>`` line per entity is written to
    ``self.__entityTaxonPath`` through ``doExport(..., fmt="list")``.

    Host organism taxonomy ids are still requested in the selection list but
    are not part of the export.
    """
    tL = []
    try:
        obEx = ObjectExtractor(
            self.__cfgOb,
            databaseName="pdbx_core",
            collectionName="pdbx_core_polymer_entity",
            useCache=False,
            keyAttribute="entity",
            uniqueAttributes=["rcsb_id"],
            selectionQuery=None,
            selectionList=["rcsb_id", "rcsb_entity_source_organism.ncbi_taxonomy_id", "rcsb_entity_host_organism.ncbi_taxonomy_id"],
        )
        eCount = obEx.getCount()
        logger.info("Polymer entity count is %d", eCount)
        objD = obEx.getObjects()
        sD = {}
        for rId, eD in objD.items():
            try:
                for tD in eD["rcsb_entity_source_organism"]:
                    sD.setdefault(rId, []).append(str(tD["ncbi_taxonomy_id"]))
            except Exception:
                # Entities without source organism records or taxonomy ids are skipped.
                pass
        for rId, taxIdL in sD.items():
            # Build the joined id string once and reuse it for the output line.
            tS = "|".join(sorted(set(taxIdL)))
            if tS:
                tL.append("%s\t%s" % (rId, tS))
        self.__mU.doExport(self.__entityTaxonPath, tL, fmt="list")
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        self.fail()
def __extractEntityAnnotationIdentifiers(self, annotationType):
    """Extract unique rcsb_polymer_entity_annotation ids for the input annotation type.

    All polymer entities are fetched (no selection query) and the annotation
    records are filtered on ``type`` after retrieval.

    Args:
        annotationType (str): value compared with each annotation's ``type``

    Returns:
        list: unique ``annotation_id`` values in no particular order.  An empty
        list is returned when the extraction fails, so callers always receive
        a list.
    """
    try:
        databaseName = "pdbx_core"
        collectionName = "pdbx_core_polymer_entity"
        obEx = ObjectExtractor(
            self.__cfgOb,
            databaseName=databaseName,
            collectionName=collectionName,
            cacheFilePath=None,
            useCache=False,
            keyAttribute="entity",
            uniqueAttributes=["rcsb_id"],
            cacheKwargs=None,
            objectLimit=None,
            selectionQuery=None,
            selectionList=["rcsb_id", "rcsb_polymer_entity_annotation.annotation_id", "rcsb_polymer_entity_annotation.type"],
        )
        eCount = obEx.getCount()
        logger.info("For type %r polymer entity annotation object count is %d", annotationType, eCount)
        idS = set()
        objD = obEx.getObjects()
        for _, eD in objD.items():
            try:
                for tD in eD["rcsb_polymer_entity_annotation"]:
                    if tD["type"] == annotationType:
                        idS.add(tD["annotation_id"])
            except Exception:
                # Entities without annotation records are skipped on purpose.
                pass
        logger.info("Unique identifiers %d", len(idS))
        return list(idS)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    # Previously this path fell off the end and returned None.
    return []
def __extractEntityTaxons(self):
    """Extract the unique source and host organism taxonomy ids of the configured collection.

    The database and collection names are read from this object's
    ``__databaseName`` and ``__collectionName`` attributes.

    Returns:
        list: unique ``ncbi_taxonomy_id`` values found in the
        ``rcsb_entity_source_organism`` and ``rcsb_entity_host_organism``
        records, in no particular order.  An empty list is returned when the
        extraction fails, so callers always receive a list.
    """
    try:
        obEx = ObjectExtractor(
            self.__cfgOb,
            databaseName=self.__databaseName,
            collectionName=self.__collectionName,
            cacheFilePath=None,
            useCache=False,
            keyAttribute="entity",
            uniqueAttributes=["rcsb_id"],
            cacheKwargs=None,
            objectLimit=None,
            selectionQuery=None,
            selectionList=["rcsb_id", "rcsb_entity_source_organism.ncbi_taxonomy_id", "rcsb_entity_host_organism.ncbi_taxonomy_id"],
        )
        eCount = obEx.getCount()
        logger.info("Polymer entity count is %d", eCount)
        taxIdS = set()
        objD = obEx.getObjects()
        for _, eD in objD.items():
            # Source and host records are both optional; each is read independently.
            try:
                for tD in eD["rcsb_entity_source_organism"]:
                    taxIdS.add(tD["ncbi_taxonomy_id"])
            except Exception:
                pass
            try:
                for tD in eD["rcsb_entity_host_organism"]:
                    taxIdS.add(tD["ncbi_taxonomy_id"])
            except Exception:
                pass
        logger.info("Unique taxons %d", len(taxIdS))
        return list(taxIdS)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    # Previously this path fell off the end and returned None.
    return []
def testUpdateSelectedEntityContent(self):
    """Test case - update of selected entity reference sequence content

    Fetches protein entities (bounded by ``self.__objectLimitTest``), appends
    a fixed extra reference sequence identifier to each entity's list, and
    pushes the lists back with ``ObjectUpdater.update()``.  The reported
    update count must be at least the number of update requests.
    """
    try:
        databaseName = "pdbx_core"
        collectionName = "pdbx_core_polymer_entity"
        extractor = ObjectExtractor(
            self.__cfgOb,
            databaseName=databaseName,
            collectionName=collectionName,
            cacheFilePath=os.path.join(self.__workPath, "entity-selected-content-test-cache.json"),
            useCache=False,
            keyAttribute="entity",
            uniqueAttributes=["rcsb_id"],
            cacheKwargs=self.__testEntryCacheKwargs,
            objectLimit=self.__objectLimitTest,
            selectionQuery={"entity_poly.rcsb_entity_polymer_type": "Protein"},
            selectionList=["rcsb_id", "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers"],
        )
        entityCount = extractor.getCount()
        logger.info("Entity count is %d", entityCount)
        entityObjD = extractor.getObjects()
        updateRequests = []
        for entityKey, entityD in entityObjD.items():
            try:
                containerD = entityD["rcsb_polymer_entity_container_identifiers"]
                if "reference_sequence_identifiers" in containerD:
                    refSeqL = containerD["reference_sequence_identifiers"]
                else:
                    refSeqL = []
                # The extra identifier is appended in place to the fetched list.
                refSeqL.append({"database_accession": "1111111", "database_name": "PDB", "provenance_source": "RCSB"})
                updateRequests.append(
                    {
                        "selectD": {"rcsb_id": entityKey},
                        "updateD": {"rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers": refSeqL},
                    }
                )
            except Exception as err:
                logger.exception("Failing with %s", str(err))
        for index, requestD in enumerate(updateRequests):
            logger.debug(" >>>> (%d) selectD %r updateD %r", index, requestD["selectD"], requestD["updateD"])
        #
        updater = ObjectUpdater(self.__cfgOb)
        updatedCount = updater.update(databaseName, collectionName, updateRequests)
        self.assertGreaterEqual(updatedCount, len(updateRequests))
        logger.info("Update count is %d", updatedCount)
    except Exception as err:
        logger.exception("Failing with %s", str(err))
        self.fail()
def testExtractSelectedEntityContent(self):
    """Test case - extract selected entity content

    Compares the UniProt accessions assigned to each protein entity by the two
    provenance sources found in records such as::

        "reference_sequence_identifiers": [
            {"database_name": "UniProt", "database_accession": "Q5SHN1", "provenance_source": "SIFTS"},
            {"database_name": "UniProt", "database_accession": "Q5SHN1", "provenance_source": "PDB"}
        ]

    The counts of entities having both assignments, of entities where the two
    accession sets differ, and of unique accessions per source are logged.
    """
    try:
        obEx = ObjectExtractor(
            self.__cfgOb,
            databaseName="pdbx_core",
            collectionName="pdbx_core_polymer_entity",
            cacheFilePath=os.path.join(self.__workPath, "entity-selected-content-test-cache.json"),
            useCache=False,
            keyAttribute="entity",
            uniqueAttributes=["rcsb_id"],
            cacheKwargs=self.__testEntryCacheKwargs,
            objectLimit=None,
            selectionQuery={"entity_poly.rcsb_entity_polymer_type": "Protein"},
            selectionList=["rcsb_id", "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers"],
        )
        entityCount = obEx.getCount()
        logger.info("Entity count is %d", entityCount)
        #
        # The path/value listing is exercised only when a test object limit is configured.
        if self.__objectLimitTest is not None:
            self.assertGreaterEqual(entityCount, self.__objectLimitTest)
            limitedObjD = obEx.getObjects()
            for obj in limitedObjD.values():
                obEx.genPathList(obj, path=None)
            #
            unfilteredPathL = obEx.getPathList(filterList=False)
            logger.debug("Path list (unfiltered) %r", unfilteredPathL)
            #
            pathL = obEx.getPathList()
            logger.debug("Path list %r", pathL)
            obEx.setPathList(pathL)
            if self.__verbose:
                for ky, obj in limitedObjD.items():
                    obEx.genValueList(obj, path=None)
                    valueD = obEx.getValues()
                    logger.info("Index object %r %s", ky, pprint.pformat(valueD, indent=3, width=120))

        entityObjD = obEx.getObjects()
        bothAssignedCount = 0
        differingCount = 0
        pdbAccessionCounts = defaultdict(int)
        siftsAccessionCounts = defaultdict(int)
        differingPdbAccessionCounts = defaultdict(int)
        for entityKey, entityD in entityObjD.items():
            try:
                siftsAccessions = set()
                pdbAccessions = set()
                for refD in entityD["rcsb_polymer_entity_container_identifiers"]["reference_sequence_identifiers"]:
                    if refD["database_name"] != "UniProt":
                        logger.debug("No UniProt for %r", entityD["rcsb_polymer_entity_container_identifiers"])
                        continue
                    if refD["provenance_source"] == "SIFTS":
                        siftsAccessions.add(refD["database_accession"])
                        siftsAccessionCounts[refD["database_accession"]] += 1
                    elif refD["provenance_source"] == "PDB":
                        pdbAccessions.add(refD["database_accession"])
                        pdbAccessionCounts[refD["database_accession"]] += 1
                logger.debug("PDB assigned sequence length %d", len(pdbAccessions))
                logger.debug("SIFTS assigned sequence length %d", len(siftsAccessions))

                # Only entities assigned by both sources take part in the comparison.
                if not (pdbAccessions and siftsAccessions):
                    continue
                bothAssignedCount += 1
                if pdbAccessions != siftsAccessions:
                    differingCount += 1
                    for accession in pdbAccessions:
                        differingPdbAccessionCounts[accession] += 1
            except Exception as err:
                logger.warning("No identifiers for %s with %s", entityKey, str(err))
        logger.info("Total %d differences %d", bothAssignedCount, differingCount)
        logger.info("Unique UniProt ids PDB %d SIFTS %d", len(pdbAccessionCounts), len(siftsAccessionCounts))
        logger.info("Unique UniProt differences %d ", len(differingPdbAccessionCounts))
    except Exception as err:
        logger.exception("Failing with %s", str(err))
        self.fail()
def testExtractEntities(self):
    """Test case - extract entities

    Fetches up to ``self.__objectLimitTest`` polymer entities, requires that
    at least that many are counted, then generates the attribute path list
    and, in verbose mode, logs the extracted values of every object.
    """
    try:
        cachePath = os.path.join(self.__workPath, "entity-data-test-cache.json")
        extractor = ObjectExtractor(
            self.__cfgOb,
            databaseName="pdbx_core",
            collectionName="pdbx_core_polymer_entity",
            cacheFilePath=cachePath,
            useCache=False,
            keyAttribute="entity",
            uniqueAttributes=["rcsb_id"],
            cacheKwargs=self.__testEntryCacheKwargs,
            objectLimit=self.__objectLimitTest,
        )
        entityCount = extractor.getCount()
        logger.info("Entity count is %d", entityCount)
        self.assertGreaterEqual(entityCount, self.__objectLimitTest)
        entityObjD = extractor.getObjects()
        for obj in entityObjD.values():
            extractor.genPathList(obj, path=None)
        #
        unfilteredPathL = extractor.getPathList(filterList=False)
        logger.debug("Path list (unfiltered) %r", unfilteredPathL)
        #
        pathL = extractor.getPathList()
        logger.debug("Path list %r", pathL)
        extractor.setPathList(pathL)
        if self.__verbose:
            for ky, obj in entityObjD.items():
                extractor.genValueList(obj, path=None)
                valueD = extractor.getValues()
                logger.info("Index object %r %s", ky, pprint.pformat(valueD, indent=3, width=120))
    except Exception as err:
        logger.exception("Failing with %s", str(err))
        self.fail()
def getProteinSequenceDetails(self, minSeqLen=0):
    """Get protein sequence and taxonomy data (required to build protein sequence fasta file)

    Args:
        minSeqLen (int, optional): entities whose canonical one-letter sequence
            is shorter than this are skipped. Defaults to 0.

    Returns:
        (dict, dict): a tuple of

            rD: ``{rcsb_id: {"alignmentL": [...], "sourceOrgL": [...], "partCount": int,
            "taxCount": int, "sequence": str, "seqLen": int}, ...}`` for every
            protein entity that has a sequence and source organism records.

            missingSrcD: ``{scientific_name: [rcsb_id, ...], ...}`` for source
            organisms that have a scientific name but no ``ncbi_taxonomy_id``.

        Results collected before an unexpected failure are still returned.
    """
    missingSrcD = {}
    rD = {}
    try:
        unpEx = UniProtExtractor(self.__cfgOb)
        unpD = unpEx.getReferenceSequenceDetails()
        #
        obEx = ObjectExtractor(
            self.__cfgOb,
            databaseName="pdbx_core",
            collectionName="pdbx_core_polymer_entity",
            useCache=False,
            keyAttribute="entity",
            uniqueAttributes=["rcsb_id"],
            selectionQuery={"entity_poly.rcsb_entity_polymer_type": "Protein"},
            selectionList=[
                "rcsb_id",
                "rcsb_entity_source_organism",
                "rcsb_polymer_entity.rcsb_source_part_count",
                "rcsb_polymer_entity.rcsb_source_taxonomy_count",
                "rcsb_polymer_entity.src_method",
                "entity_poly",
                "rcsb_polymer_entity_align",
            ],
        )
        #
        eCount = obEx.getCount()
        logger.info("Polymer entity count is %d", eCount)
        objD = obEx.getObjects()
        for rId, eD in objD.items():
            try:
                pD = eD["entity_poly"]
                seqS = pD["pdbx_seq_one_letter_code_can"]
                seqLen = len(seqS)
            except Exception:
                logger.warning("%s no one-letter-code sequence", rId)
                # Skip the entity: falling through would reuse the previous
                # entity's sequence, or abort the whole run on the first entity.
                continue
            #
            if seqLen < minSeqLen:
                continue
            #
            srcMethod = None
            try:
                pD = eD["rcsb_polymer_entity"]
                srcMethod = pD["src_method"]
            except Exception:
                # The source method is only used in the diagnostic message below.
                pass
            #
            if "rcsb_entity_source_organism" not in eD:
                logger.debug("%s No source information (%r) skipping (seqLen %d)", rId, srcMethod, seqLen)
                continue
            # Reset per entity so the taxonomy-count fallback and the error log
            # below never see values left over from a previous entity.
            srcName = None
            srcType = None
            tD = None
            sL = []
            try:
                for tD in eD["rcsb_entity_source_organism"]:
                    srcName = tD["scientific_name"] if "scientific_name" in tD else None
                    if "beg_seq_num" in tD and "end_seq_num" in tD:
                        begSeqNum = tD["beg_seq_num"]
                        # Clip the source range to the length of the entity sequence.
                        endSeqNum = min(tD["end_seq_num"], seqLen)
                    else:
                        begSeqNum = 1
                        endSeqNum = seqLen
                    srcId = tD["pdbx_src_id"]
                    srcType = tD["source_type"]
                    taxId = tD["ncbi_taxonomy_id"] if "ncbi_taxonomy_id" in tD else -1
                    if srcName and taxId == -1:
                        missingSrcD.setdefault(srcName, []).append(rId)
                    orgName = tD["ncbi_scientific_name"] if "ncbi_scientific_name" in tD else ""
                    sL.append({"srcId": srcId, "taxId": taxId, "orgName": orgName, "entitySeqBeg": begSeqNum, "entitySeqEnd": endSeqNum})
                if len(sL) == 1:
                    # A single source part always spans the full sequence.
                    sL[0]["entitySeqBeg"] = 1
                    sL[0]["entitySeqEnd"] = seqLen
            except Exception as e:
                logger.exception("Failing for (%r) tD %r with %s", rId, tD, str(e))
            #
            try:
                pD = eD["rcsb_polymer_entity"]
                partCount = pD["rcsb_source_part_count"]
            except Exception:
                logger.warning("%s no source part count", rId)
                partCount = 1
            try:
                pD = eD["rcsb_polymer_entity"]
                taxCount = pD["rcsb_source_taxonomy_count"]
            except Exception:
                # No recorded count: estimate it from the last source record read.
                if srcType == "synthetic":
                    taxCount = 0
                else:
                    logger.warning("%s (srcName %r) no source taxonomy count type %r", rId, srcName, srcType)
                    if srcName:
                        taxCount = 1
                    else:
                        taxCount = 0
            #
            uDL = []
            try:
                for tD in eD["rcsb_polymer_entity_align"]:
                    uD = {}
                    if tD["reference_database_name"] in ["UniProt", "GenBank", "PIR", "EMBL", "NORINE", "PRF"]:
                        uD["refDbId"] = tD["reference_database_accession"]
                        uD["refDbName"] = tD["reference_database_name"]
                        uD["provSource"] = tD["provenance_source"]
                        if tD["reference_database_accession"] in unpD:
                            uD.update(unpD[tD["reference_database_accession"]])
                        aL = []
                        for qD in tD["aligned_regions"]:
                            # Trim aligned regions that run past the end of the entity sequence.
                            if qD["entity_beg_seq_id"] + qD["length"] - 1 > seqLen:
                                qD["length"] = seqLen - qD["entity_beg_seq_id"] + 1
                            srcId = self.__getSourcePart(rId, sL, qD["entity_beg_seq_id"], qD["length"])
                            aL.append({"srcId": srcId, "entitySeqBeg": qD["entity_beg_seq_id"], "refSeqBeg": qD["ref_beg_seq_id"], "length": qD["length"]})
                        uD["alignList"] = aL
                        uDL.append(uD)
                    else:
                        logger.info("%s reference database %s", rId, tD["reference_database_name"])
            except Exception:
                # Alignments are optional; keep whatever was collected for this entity.
                pass
            rD[rId] = {"alignmentL": uDL, "sourceOrgL": sL, "partCount": partCount, "taxCount": taxCount, "sequence": seqS, "seqLen": seqLen}
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return rD, missingSrcD