def get_compound_dot_dict(self, inchikey: str) -> NestedDotDict:
    """
    Fetches a compound record and wraps it in a dict.

    If the record has a ``molecule_hierarchy`` whose ``parent_chembl_id``
    differs from the record's own ``molecule_chembl_id``, the parent's record
    is fetched and returned instead.

    Args:
        inchikey: The compound query string. It is used as a SMILES flexmatch
                  filter when ``get_query_type`` reports ``QueryType.smiles``,
                  and passed to ``molecule.get`` otherwise.

    Returns:
        **Only** ``molecule_chembl_id``, ``pref_name``, and ``molecule_structures``
        are guaranteed to exist

    Raises:
        AssertionError: If a SMILES query does not match exactly one molecule
        ValueError: If the looked-up record is ``None``
    """
    # CHEMBL
    if self.get_query_type(inchikey) == QueryType.smiles:
        query = self.api.molecule.filter(
            molecule_structures__canonical_smiles__flexmatch=inchikey
        ).only(["molecule_chembl_id", "pref_name", "molecule_structures"])
        matches = list(query)
        assert len(matches) == 1, f"{len(matches)} matches for {inchikey}"
        record = matches[0]
    else:
        record = self.api.molecule.get(inchikey)
    if record is None:
        raise ValueError(f"Result for compound {inchikey} is null!")
    compound = NestedDotDict(record)
    # molecule_hierarchy can have the actual value None
    if compound.get("molecule_hierarchy") is None:
        return compound
    parent_id = compound["molecule_hierarchy"]["parent_chembl_id"]
    if parent_id == compound["molecule_chembl_id"]:
        return compound
    # the record is a child form: swap it for the parent's record
    return NestedDotDict(self.api.molecule.get(parent_id))
def should_include(self, lookup: str, compound: ChemblCompound,
                   data: NestedDotDict, target: Target) -> bool:
    """
    Decides whether an activity annotation should be kept.

    An annotation is rejected when any of these hold:
        - its ``data_validity_comment`` (lowercased) is one of the known bad flags
        - its ``standard_relation`` is not ``=``, ``<``, or ``<=``
        - its ``assay_type`` is not ``"B"``
        - its ``target_tax_id`` is missing or not contained in ``self.tax``
        - its ``pchembl_value`` is missing or below ``self.config.min_pchembl``
        - the assay's ``confidence_score`` is missing or below
          ``self.config.min_confidence_score``
        - the target type is "trash", or it is "strange" while
          ``self.config.min_confidence_score`` is above 3

    Args:
        lookup: The string the compound was looked up by (used in log messages)
        compound: The compound the annotation belongs to (not read here)
        data: The raw activity record
        target: The target the activity refers to

    Returns:
        True if the annotation passes every filter
    """
    bad_flags = {
        "potential missing data",
        "potential transcription error",
        "outside typical range",
    }
    if (
        data.get_as("data_validity_comment", lambda s: s.lower()) in bad_flags
        or data.req_as("standard_relation", str) not in ["=", "<", "<="]
        or data.req_as("assay_type", str) != "B"
        or data.get("target_tax_id") is None
        or data.get_as("target_tax_id", int) not in self.tax
        or data.get("pchembl_value") is None
        or data.req_as("pchembl_value", float) < self.config.min_pchembl
    ):
        return False
    # any remaining validity comment is not a known-bad flag: keep the record but mention it
    validity_comment = data.get("data_validity_comment")
    if validity_comment is not None:
        logger.warning(
            f"Activity annotation for {lookup} has flag '{validity_comment}' (ok)"
        )
    # The `target_organism` doesn't always match the `assay_organism`
    # Ex: see assay CHEMBL823141 / document CHEMBL1135642 for Homo sapiens in Xenopus laevis
    # However, it's often something like yeast expressing a human / mouse / etc receptor
    # So there's no need to filter by it
    assay = self.api.assay.get(data.req_as("assay_chembl_id", str))
    confidence_score = assay.get("confidence_score")
    if confidence_score is None or confidence_score < self.config.min_confidence_score:
        return False
    # `and` binds tighter than `or`; the parentheses only make that grouping explicit
    if target.type.is_trash or (
        target.type.is_strange and self.config.min_confidence_score > 3
    ):
        logger.warning(f"Excluding {target} with type {target.type}")
        return False
    return True
def process(self, lookup: str, compound: ChemblCompound,
            data: NestedDotDict) -> Sequence[H]:
    """
    Converts one raw record into hits, one batch per target yielded by the traversal strategy.

    Args:
        lookup: The string the compound was looked up by
        compound: The resolved compound
        data: The raw record; it needs a ``target_chembl_id`` to produce any hits

    Returns:
        The hits for every traversed target, in traversal order;
        empty if the record has no target or ``should_include`` rejects it
    """
    if data.get("target_chembl_id") is None:
        logger.debug(f"target_chembl_id missing from mechanism '{data}' for compound {lookup}")
        return []
    source_target = TargetFactory.find(data["target_chembl_id"], self.api)
    if not self.should_include(lookup, compound, data, source_target):
        return []
    # traverse() will return the source target if it's a non-traversable type (like DNA)
    # and the subclass decided whether to filter those
    # so don't worry about that here
    return [
        hit
        for ancestor in self.traversal_strategy(source_target)
        for hit in self.to_hit(lookup, compound, data, ancestor)
    ]
def find(cls, chembl: str) -> Target:
    """
    Looks up a single target by its ChEMBL ID and builds an instance from it.

    Args:
        chembl: The ``target_chembl_id`` to filter on

    Returns:
        An instance built from the record's ``target_chembl_id``,
        ``pref_name`` (may be absent), and ``target_type``

    Raises:
        AssertionError: If the filter does not return exactly one target
    """
    matches = cls.api().target.filter(target_chembl_id=chembl)
    assert len(matches) == 1, f"Found {len(matches)} targets for {chembl}"
    record = NestedDotDict(matches[0])
    chembl_id = record["target_chembl_id"]
    pref_name = record.get("pref_name")
    target_type = TargetType.of(record["target_type"])
    return cls(chembl=chembl_id, name=pref_name, type=target_type)
def test_settings(self):
    """Loads ``settings.toml`` and checks each parsed setting against its expected value."""
    settings_file = get_test_resource("settings.toml")
    settings = Settings.load(NestedDotDict.read_toml(settings_file))
    # filtering options
    assert settings.taxon == 1111
    assert settings.min_pchembl == 15
    assert settings.min_confidence_score == 2
    assert settings.min_phase == 0
    # cache / connection options
    assert str(settings.cache_path) == "~"
    assert settings.n_retries == 100
    assert not settings.fast_save
    assert settings.timeout_sec == 0
def process(self, lookup: str, compound: ChemblCompound,
            indication: NestedDotDict) -> IndicationHit:
    """
    Builds an indication hit from one raw indication record.

    Args:
        lookup: The string the compound was looked up by
        compound: The resolved compound
        indication: The raw record; ``drugind_id``, ``mesh_id``, ``mesh_heading``,
                    and ``max_phase_for_ind`` are all required

    Returns:
        The hit, with the MeSH ID and heading as its object
    """
    record_id = indication.req_as("drugind_id", str)
    mesh_id = indication.req_as("mesh_id", str)
    # strip newline characters from both ends of the heading
    mesh_heading = indication.req_as("mesh_heading", str).strip("\n")
    max_phase = indication.req_as("max_phase_for_ind", int)
    return IndicationHit(
        record_id,
        compound.chid,
        compound.inchikey,
        lookup,
        compound.name,
        object_id=mesh_id,
        object_name=mesh_heading,
        max_phase=max_phase,
    )
def get_target(self, chembl: str) -> NestedDotDict:
    """
    Queries for the target with the given ChEMBL ID.

    Args:
        chembl: The ``target_chembl_id`` to filter on

    Returns:
        The single matching target record

    Raises:
        AssertionError: If the filter does not return exactly one target
    """
    matches = self.api.target.filter(target_chembl_id=chembl)
    assert len(matches) == 1
    only_match = matches[0]
    return NestedDotDict(only_match)
def search_for(
    what: What,
    compounds: Union[Sequence[str], PurePath],
    config: Union[None, Mapping[str, Any], Path],
) -> Tup[pd.DataFrame, Sequence[Triple]]:
    """
    Runs one kind of search over a list of compounds.

    Args:
        what: The search to run; ``what.clazz`` is instantiated with the API,
              the settings, and the taxonomy
        compounds: Either the compound lookup strings themselves, or a path
                   (``str`` or ``PurePath``) to a UTF-8 text file with one per line.
                   Entries are stripped and blank ones are dropped.
        config: ``None`` for default settings, a path to a TOML file,
                or a mapping (including a ``NestedDotDict``) of settings

    Returns:
        A data frame with one row per hit and one column per hit field,
        and the sorted, de-duplicated triples of those hits
    """
    if isinstance(compounds, (PurePath, str)):
        compounds = Path(compounds).read_text(encoding="utf8").splitlines()
    compounds = [c.strip() for c in compounds if c.strip()]
    if config is None:
        settings = Settings.load(NestedDotDict({}))
    elif isinstance(config, PurePath):
        settings = Settings.load(NestedDotDict.read_toml(config))
    elif isinstance(config, NestedDotDict):
        # already wrapped: still has to be loaded into a Settings,
        # because .set() and .taxon are used below
        settings = Settings.load(config)
    else:
        settings = Settings.load(NestedDotDict(config))
    settings.set()
    api = ChemblApi.wrap(Chembl)
    taxonomy = TaxonomyCaches.load(settings.taxon)
    hits = what.clazz(api, settings, taxonomy).find_all(compounds)
    # collapse over and sort the triples
    triples = sorted({hit.to_triple() for hit in hits})
    # the field list is the same for every hit, so look it up once
    fields = tuple(what.clazz.hit_fields())
    df = pd.DataFrame(
        [pd.Series({f: getattr(h, f) for f in fields}) for h in hits]
    )
    return df, triples
def test_mocked(self):
    """Checks that a mocked API's ``target`` entrypoint supports ``get``, ``filter``, and ``only``."""
    expected = NestedDotDict({"x": ""})
    target_entrypoint = ChemblEntrypoint.mock({"DAT": {"x": ""}})
    api = ChemblApi.mock({"target": target_entrypoint})
    assert api.target is not None
    # get() on a known key returns the item wrapped as a NestedDotDict
    assert api.target.get("DAT") is not None
    assert isinstance(api.target.get("DAT"), NestedDotDict)
    assert api.target.get("DAT") == expected
    # get() on an unknown key raises
    with pytest.raises(KeyError):
        assert api.target.get("fasw")
    # filter() and only() both yield filter queries over the mocked items
    assert isinstance(api.target.filter(), ChemblFilterQuery)
    assert isinstance(api.target.filter().only([]), ChemblFilterQuery)
    items = list(api.target.filter().only([]))
    assert items == [expected]
def find(self, lookup: str) -> Sequence[H]:
    """
    Finds all hits for one compound.

    Args:
        lookup: The string to look the compound up by

    Returns:
        The hits from every query result, in query order
    """
    compound = self.get_compound(lookup)
    return [
        hit
        for raw in self.query(compound)
        for hit in self.process(lookup, compound, NestedDotDict(raw))
    ]
def load(cls, data: NestedDotDict) -> Settings:
    """
    Builds a ``Settings`` from parsed configuration, using a default for each missing key.

    Args:
        data: The parsed configuration; keys are read from ``is_testing``,
              ``mandos.*``, and ``chembl.*``

    Returns:
        The populated settings
    """
    # 117571
    # NOTE(review): the in-repo test cache is chosen when IN_CLI is truthy --
    # confirm the two branches are not swapped
    default_cache = (
        Path(__file__).parent.parent.parent / "tests" / "resources" / ".mandos-cache"
        if IN_CLI
        else Path.home() / ".mandos" / "chembl"
    )
    # positional order must match the Settings constructor
    values = (
        data.get_as("is_testing", bool, False),
        data.get_as("mandos.taxon", int, 7742),
        data.get_as("mandos.min_pchembl", float, 6.0),
        data.get_as("mandos.min_confidence_score", int, 4),
        data.get_as("mandos.min_phase", int, 3),
        data.get_as("chembl.cache_path", Path, default_cache),
        data.get_as("chembl.n_retries", int, 1),
        data.get_as("chembl.fast_save", bool, True),
        data.get_as("chembl.timeout_sec", int, 1),
    )
    return Settings(*values)
def to_hit(self, lookup: str, compound: ChemblCompound,
           data: NestedDotDict, target: Target) -> Sequence[MechanismHit]:
    """
    Builds the mechanism hit for one record and one (possibly traversed) target.

    Args:
        lookup: The string the compound was looked up by
        compound: The resolved compound
        data: The raw mechanism record
        target: The target that becomes the hit's object

    Returns:
        A one-element list containing the hit
    """
    # these must match the constructor of the Hit,
    # EXCEPT for object_id and object_name, which come from traversal
    fields = NestedDotDict(
        {
            "record_id": data["mec_id"],
            "compound_id": compound.chid,
            "inchikey": compound.inchikey,
            "compound_name": compound.name,
            "compound_lookup": lookup,
            "action_type": data["action_type"],
            "direct_interaction": data["direct_interaction"],
            "description": data["mechanism_of_action"],
            "exact_target_id": data["target_chembl_id"],
        }
    )
    hit = MechanismHit(**fields, object_id=target.chembl, object_name=target.name)
    return [hit]
def _extract(self, lookup: str, compound: ChemblCompound,
             data: NestedDotDict) -> NestedDotDict:
    """
    Pulls the hit fields shared by all targets out of one raw activity record.

    Logs a warning if the record's ``target_organism`` differs from the name
    of the taxon found for its ``target_tax_id``.

    Args:
        lookup: The string the compound was looked up by
        compound: The resolved compound
        data: The raw activity record

    Returns:
        The extracted fields, keyed by hit field name
    """
    # we know these exist from the query
    organism = data.req_as("target_organism", str)
    taxon = self.tax.req(data.req_as("target_tax_id", int))
    if taxon.name != organism:
        logger.warning(f"Target organism {organism} is not {taxon.name}")
    fields = {
        "record_id": data.req_as("activity_id", str),
        "compound_id": compound.chid,
        "inchikey": compound.inchikey,
        "compound_name": compound.name,
        "compound_lookup": lookup,
        "taxon_id": taxon.id,
        "taxon_name": taxon.name,
        "pchembl": data.req_as("pchembl_value", float),
        "std_type": data.req_as("standard_type", str),
        "src_id": data.req_as("src_id", str),
        "exact_target_id": data.req_as("target_chembl_id", str),
    }
    return NestedDotDict(fields)
def _process(self, match: ProteinHit, target: NestedDotDict) -> Sequence[GoHit]:
    """
    Builds one GO hit per distinct GO term cross-referenced by the target's components.

    Only cross-references whose ``xref_src_db`` equals ``"Go"`` followed by the
    capitalized name of ``self.go_type`` are used. Duplicate
    ``(xref_id, xref_name)`` pairs are collapsed, and hits are returned in the
    order the terms were first seen, so the output is the same on every run.

    Args:
        match: The protein hit that the GO hits are derived from
        target: The raw target record; ``target_components`` and each component's
                ``target_component_xrefs`` may be missing or ``None``

    Returns:
        The GO hits; empty if the target has no matching cross-references
    """
    # the same for every cross-reference, so build it once
    wanted_src_db = f"Go{self.go_type.name.capitalize()}"
    # dict keys de-duplicate like a set but keep insertion order;
    # a set of string tuples iterates in a hash-dependent order that varies between runs
    terms = {}
    if target.get("target_components") is not None:
        for comp in target["target_components"]:
            if comp.get("target_component_xrefs") is None:
                continue
            for xref in comp["target_component_xrefs"]:
                if xref["xref_src_db"] == wanted_src_db:
                    terms[(xref["xref_id"], xref["xref_name"])] = None
    return [
        GoHit(
            None,
            compound_id=match.compound_id,
            inchikey=match.inchikey,
            compound_lookup=match.compound_lookup,
            compound_name=match.compound_name,
            object_id=xref_id,
            object_name=xref_name,
            go_type=self.go_type.name,
            protein_hit=match,
        )
        for xref_id, xref_name in terms
    ]
def from_toml_file(
    cls,
    path: PathLike,
    *,
    warn: Union[bool, Callable[[Response], Any]] = True,
) -> Notifier:
    """
    Builds a notifier from a TOML file.

    Args:
        path: The TOML file to read
        warn: Passed through unchanged to ``from_dict``

    Returns:
        Whatever ``from_dict`` builds from the parsed file
    """
    parsed = NestedDotDict.read_toml(path)
    return cls.from_dict(parsed, warn=warn)
def __init__(self):
    """
    Reads the TOML config and creates the Valar handle.

    The config path comes from the ``VALARDAGGER_CONFIG`` environment variable,
    falling back to ``/etc/valardagger.toml``. ``model`` starts out as ``None``.
    """
    fallback_path = Path("/etc", "valardagger.toml")
    config_path = os.environ.get("VALARDAGGER_CONFIG", fallback_path)
    self.config = NestedDotDict.read_toml(config_path)
    self.valar = valarpy.Valar()
    self.model = None
def test_empty(self):
    """Loads ``settings-empty.toml`` and checks that ``min_phase`` comes out as 3."""
    settings_file = get_test_resource("settings-empty.toml")
    settings = Settings.load(NestedDotDict.read_toml(settings_file))
    assert settings.min_phase == 3
def get(self, arg: str) -> Optional[NestedDotDict]:
    """
    Returns the item stored under ``arg`` in ``get_items``, wrapped in a ``NestedDotDict``.

    Any lookup error from ``get_items`` propagates to the caller.
    """
    item = get_items[arg]
    return NestedDotDict(item)
def get(self, arg: str) -> Optional[NestedDotDict]:
    """
    Delegates to the wrapped object's ``get`` and wraps the result in a ``NestedDotDict``.

    NOTE(review): the result is wrapped unconditionally, so ``None`` is never
    returned from here despite the ``Optional`` annotation -- confirm what
    ``NestedDotDict(None)`` does.
    """
    fetched = obj.get(arg)
    return NestedDotDict(fetched)
def __getitem__(self, item: int) -> NestedDotDict:
    """Returns element ``item`` of the underlying ``query``, wrapped in a ``NestedDotDict``."""
    element = query[item]
    return NestedDotDict(element)
def __iter__(self) -> Iterator[NestedDotDict]:
    """Iterates over the underlying ``query`` with every element wrapped in a ``NestedDotDict``."""
    # the whole query is wrapped up front, before the first element is handed out
    wrapped = [NestedDotDict(element) for element in query]
    return iter(wrapped)