def fetch_all_by_trgm_sim(self, smiles, *expr, **kwargs):
    """
    Returns all fragments that are similar to the given SMILES string using
    trigram similarity (similar to LINGO).

    Parameters
    ----------
    smiles : str
        SMILES string that is compared against ``Fragment.ism``.
    *expr : BinaryExpressions, optional
        SQLAlchemy BinaryExpressions that will be used to filter the query.
    threshold : float, default=0.6
        Similarity threshold that is handed to ``set_limit()`` before the
        search is issued.
    limit : int, optional
        Maximum number of hits that will be returned. No limit is applied if
        the keyword is missing or falsy.

    Returns
    -------
    resultset : list
        List of rows; each row holds whatever ``self.query`` selects followed
        by the two added columns ``similarity`` and ``sim_thresh``.

    Queried Entities
    ----------------
    Fragment

    Examples
    --------
    >>> FragmentAdaptor().fetch_all_by_trgm_sim('Cc1ccc(cc1Nc2nccc(n2)c3cccnc3)NC(=O)c4ccc(cc4)CN5CC[NH+](CC5)C')

    Requires
    --------
    .. important:: `pg_trgm  <http://www.postgresql.org/docs/current/static/pgtrgm.html>`_ PostgreSQL extension.
    """
    session = Session()

    # set the similarity threshold for the index
    set_limit = text("SELECT set_limit(:threshold)")
    set_limit = set_limit.execution_options(autocommit=True)
    session.execute(set_limit.params(threshold=kwargs.get('threshold', 0.6)))

    trgm_similarity = func.similarity(Fragment.ism, smiles).label('similarity')
    active_limit = func.show_limit().label('sim_thresh')

    hits = self.query.add_columns(trgm_similarity, active_limit)
    hits = hits.filter(and_(Fragment.like(smiles), *expr))

    # KNN-GIST: order by the <-> operator applied to the query SMILES
    hits = hits.order_by(Fragment.ism.op('<->')(smiles))

    max_hits = kwargs.get('limit')
    if max_hits:
        hits = hits.limit(max_hits)

    # the session is deliberately left open (the original never closed it)
    return hits.all()
def pdb_atom_names(self):
    """
    Returns the PDB atom names recorded for this chemical component fragment,
    aggregated per hit.

    Returns
    -------
    rows : list
        List of rows in the form (hit, pdb_names), where pdb_names is the
        ``array_agg()`` of ``chem_comp_fragment_atoms.pdb_name`` for that hit.
    """
    session = Session()
    atoms = chem_comp_fragment_atoms.c

    # restrict to the atoms that belong to this fragment mapping
    own_atoms = atoms.chem_comp_fragment_id == self.chem_comp_fragment_id

    names_per_hit = session.query(atoms.hit, func.array_agg(atoms.pdb_name))
    names_per_hit = names_per_hit.filter(own_atoms).group_by(atoms.hit)

    return names_per_hit.all()
def abstracts(self):
    """
    Returns the abstract(s) of the journal articles that are associated with
    this PDB entry.

    The rows are taken from the citations table; they are linked to this
    structure through cross references whose source is 'PubMed' and whose
    xref value (cast to integer) equals the PubMed identifier of the citation.
    """
    session = Session()

    linked_through_pubmed = and_(
        citations.c.pubmed_id == cast(XRef.xref, Integer),
        XRef.source == 'PubMed',
        XRef.entity_type == 'Structure',
        XRef.entity_id == self.structure_id)

    statement = select([citations], linked_through_pubmed)
    rows = session.execute(statement).fetchall()

    return rows
def disordered_regions(self, *expr):
    """
    Returns a list of disordered regions inside this Chain (if any).

    The regions are matched on the PDB code of the parent structure and on
    the ``pdb_chain_asu_id`` of this chain.

    :param expr: Optional SQLAlchemy expressions that are AND-ed to the
                 where clause of the statement.
    """
    session = Session()
    columns = disordered_regions.c

    # match on the parent structure first, then on the chain identifier
    criteria = [columns.pdb == self.Biomolecule.Structure.pdb,
                columns.pdb_chain_id == self.pdb_chain_asu_id]
    criteria.extend(expr)

    statement = select([disordered_regions], and_(*criteria))

    return session.execute(statement).fetchall()
def pdbstring(self, **kwargs):
    """
    Returns the binding site environment of the ligand as PDB string.

    :param biomolecule_id: The biomolecule_id of the assembly that this
                           binding site is part of - required to pick the
                           right atom partition table. The biomolecule_id of
                           the parent ligand will be used if missing.
    :returns: The scalar result of the ``credo.binding_site_pdbstring``
              database function for (biomolecule_id, ligand_id).
    """
    # Only touch self.Ligand when the caller did not supply the identifier:
    # dict.get() would evaluate the fallback eagerly on every call.
    # NOTE(review): self.Ligand is presumably a lazily loaded relationship, in
    # which case this also saves a database round trip - confirm.
    if 'biomolecule_id' in kwargs:
        biomolecule_id = kwargs['biomolecule_id']
    else:
        biomolecule_id = self.Ligand.biomolecule_id

    fn = func.credo.binding_site_pdbstring(biomolecule_id, self.ligand_id)

    # the session is closed again as soon as the scalar has been fetched
    with closing(Session()) as session:
        return session.query(fn).scalar()
def fetch_all_by_sim_oe(self, smiles, *expr, **kwargs):
    """
    Returns a query for all Chemical Components that are similar to the given
    SMILES string using OpenEye chemical fingerprints.

    Parameters
    ----------
    smiles : str
        The query molecule in SMILES format.
    *expr : BinaryExpressions, optional
        SQLAlchemy BinaryExpressions that will be used to filter the query.
    threshold : float, optional
        The similarity threshold for the selected metric. If it is missing
        (None) the similarity limit is not set by this method.
    metric : {'tanimoto','dice','manhattan','cosine','euclidean'}
        Similarity metric, default='tanimoto'.
    fp : {'circular','maccs166','path','tree'}
        OpenEye fingerprint type to be used for similarity searching,
        default='circular'.
    limit : int, optional
        Accepted for backwards compatibility but not applied; use ``.limit()``
        on the returned query instead.

    Queried Entities
    ----------------
    ChemComp, ChemCompOEFP

    Returns
    -------
    query : Query
        The unexecuted query with the similarity added as extra column, i.e.
        its rows have the form (ChemComp, similarity).

    Raises
    ------
    ValueError
        If the fingerprint type or the similarity metric is unknown. Both are
        checked before the database session is touched.

    Requires
    --------
    .. important:: OpenEye cartridge.
    """
    fingerprints = ('circular', 'maccs166', 'path', 'tree')

    # similarity metric -> operator used for the KNN-GIST ordering
    operators = {
        'tanimoto': 'OPERATOR(openeye.<%%>)',  # escape %
        'dice': 'OPERATOR(openeye.<#>)',
        'manhattan': 'OPERATOR(openeye.<~>)',
        'cosine': 'OPERATOR(openeye.<@>)',
        'euclidean': 'OPERATOR(openeye.<->)',
    }

    threshold = kwargs.get('threshold')
    metric = kwargs.get('metric', 'tanimoto')
    fp = kwargs.get('fp', 'circular')

    # validate the arguments first so that a bad call has no side effects
    if fp not in fingerprints:
        raise ValueError(
            "cannot create fingerprint: type {0} does not exist.".format(fp))

    if metric not in operators:
        raise ValueError(
            "{} is not a valid similarity metric.".format(metric))

    session = Session()

    # set the similarity threshold for the selected metric; `is not None` so
    # that an explicit threshold of 0 is honoured as well
    if threshold is not None:
        statement = text(
            "SELECT openeye.set_oefp_similarity_limit(:threshold, :metric)")
        session.execute(statement.params(threshold=threshold, metric=metric))

    # the cartridge functions and the fingerprint columns follow a naming
    # scheme, hence they are looked up by name (and only the requested ones)
    query = getattr(func.openeye, 'make_{0}_fp'.format(fp))(smiles)
    target = getattr(ChemCompOEFP, '{0}_fp'.format(fp))

    # compile similarity metric and the corresponding GIST index / KNN-GIST
    similarity = getattr(func.openeye, metric)(query, target)
    index = getattr(func.openeye,
                    '{0}_is_above_limit'.format(metric))(target, query)
    orderby = target.op(operators[metric])(query)

    query = ChemComp.query.add_column(similarity)
    query = query.join('OEFP').filter(and_(index, *expr))
    query = query.order_by(orderby)

    return query
def do(controller):
    """
    Generates the FuzCav fingerprints of ligand binding sites and inserts them
    into the binding_site_fuzcav table.

    For every ligand with at least 7 heavy atoms that is not clashing, the
    C-alpha atoms and the FuzCav representative side chain atoms of its
    standard binding site peptides are retrieved. Ligands with fewer than 14
    of these atoms are skipped; for all others one fingerprint is generated
    from the C-alpha atoms and one from the representative atoms.

    :param controller: Controller of the command line application. Only its
                       parsed arguments (``controller.pargs``) are read:
                       ``incremental`` (only ligands with a ligand_id above
                       the current maximum in binding_site_fuzcav),
                       ``progressbar`` (show a progress bar) and ``dry_run``
                       (do not insert anything).
    """
    # timer to clock functions and parts of the program
    timer = Timer()
    timer.start("app")

    # get the command line arguments and options
    args = controller.pargs

    insert = binding_site_fuzcav.insert()
    tracker = fuzcav.get_tracker()

    # get the fuzcav side chain representative table from the credoscript metadata
    metadata.reflect(schema='bio', only=('fuzcav_rep_sc_atoms', ))
    fuzcav_rep_sc_atoms = Table('bio.fuzcav_rep_sc_atoms', metadata,
                                autoload=True)

    timer.start()

    session = Session()

    # try/finally so that the session is closed even if a query, the
    # fingerprint generation or an insert fails
    try:
        # get all ligands that have at least 7 heavy atoms and no clashes
        # (`== False` is intentional: it builds a SQL expression)
        query = session.query(Ligand.ligand_id, Ligand.biomolecule_id)
        query = query.filter(
            and_(Ligand.num_hvy_atoms >= 7, Ligand.is_clashing == False))

        if args.incremental:
            # subquery to get the current max ligand_id from the
            # binding_site_fuzcav table; max() over an empty table is NULL and
            # `ligand_id > NULL` would silently select nothing, hence the
            # fallback to 0 (assumes ligand identifiers are positive)
            max_ligand_id = func.coalesce(
                func.max(binding_site_fuzcav.c.ligand_id), 0)
            sq = session.query(
                max_ligand_id.label('ligand_id')).subquery('sq')

            # only include new ligands
            query = query.filter(Ligand.ligand_id > sq.c.ligand_id)

        ligand_ids = query.order_by(Ligand.ligand_id).all()

        # debug how much time it took to get the ligand identifiers
        app.log.debug(
            "all new ligand identifiers retrieved in {0:.2f} seconds.".format(
                timer.elapsed()))

        # base query for the FuzCav atoms: binding site residues joined to
        # their peptides and atoms; the outer join marks the atoms that are
        # FuzCav side chain representatives
        query = BindingSiteResidue.query.join('Peptide', 'Atoms')
        query = query.outerjoin(
            fuzcav_rep_sc_atoms,
            and_(fuzcav_rep_sc_atoms.c.res_name == Peptide.res_name,
                 fuzcav_rep_sc_atoms.c.atom_name == Atom.atom_name))

        # keep the CA atom and the representative atom of standard peptides
        # (`== False` / `!= None` are SQL expressions, not Python comparisons)
        query = query.filter(
            and_(
                Peptide.is_non_std == False,
                or_(Atom.atom_name == 'CA',
                    fuzcav_rep_sc_atoms.c.atom_name != None)))

        query = query.with_entities(Peptide.res_name, Atom)

        if args.progressbar:
            bar = ProgressBar(widgets=[
                'Binding Sites: ',
                SimpleProgress(), ' ',
                Percentage(),
                Bar()
            ], maxval=len(ligand_ids)).start()

        # iterate through ligands
        for counter, row in enumerate(ligand_ids, 1):
            if args.progressbar:
                bar.update(counter)

            ligand_id, biomolecule_id = row.ligand_id, row.biomolecule_id

            timer.start()

            # get all the fuzcav atoms (either CA or representative)
            # important to use the proper atom partition!
            atoms = query.filter(
                and_(BindingSiteResidue.ligand_id == ligand_id,
                     Atom.biomolecule_id == biomolecule_id)).all()

            # debug how much time it took to get the atoms
            app.log.debug(
                "all FuzCav atoms retrieved in {0:.2f} seconds.".format(
                    timer.elapsed()))

            # ignore binding sites with too few atoms
            if len(atoms) < 14:
                app.log.debug("Ligand {} has only {} FuzCav atoms and will be "
                              "ignored.".format(ligand_id, len(atoms)))
                continue

            # get the calpha atom and its features for each residue
            calphas = ((np.array(atom.coords, dtype=float),
                        (fuzcav.FEATURES[res_name]))
                       for res_name, atom in atoms
                       if atom.atom_name == 'CA')

            # get the representative atom and its features for each residue
            representatives = (
                (np.array(atom.coords, dtype=float),
                 (fuzcav.FEATURES[res_name]))
                for res_name, atom in atoms
                if atom.atom_name == fuzcav.REPRESENTATIVES[res_name])

            timer.start()

            calphafp = fuzcav.make_fp(calphas, tracker)
            repfp = fuzcav.make_fp(representatives, tracker)

            # debug how much time it took to generate the fingerprints
            app.log.debug("fingerprints generated in {0:.2f} seconds.".format(
                timer.elapsed()))

            # insert the fingerprints into the table
            if not args.dry_run:
                engine.execute(insert,
                               ligand_id=ligand_id,
                               calphafp=calphafp.tolist(),
                               repfp=repfp.tolist())

        # finish the optional progress bar
        if args.progressbar:
            bar.finish()
    finally:
        session.close()
class Base(object):
    """
    Declarative base model that is inherited by all CREDO models.

    Besides table reflection and a ``query`` property it offers a handful of
    helpers that expose the mapped columns of an entity as list, dictionary
    or HTML table as well as its primary key.
    """
    # automatically reflect the table
    __table_args__ = {'autoload': True}

    # attach a query object to every model that queries itself
    query = Session.query_property(BaseQuery)

    @ClassProperty
    @classmethod
    def __meta__(cls):
        """
        Returns the metadata information of this class as list of
        (column name, column data type) tuples in mapper column order.
        """
        columns = cls.__mapper__.c
        meta = []

        # the type has to be stringified column by column to catch the error
        # that might occur if the data type stems from an extension; such
        # columns are reported as "CUSTOM"
        for key in columns.keys():
            try:
                type_name = str(columns[key].type)
            except (NotImplementedError, CompileError):
                type_name = "CUSTOM"

            meta.append((str(key), type_name))

        return meta

    def _repr_list_(self):
        """
        Returns a list of values for this entity in proper order.
        """
        column_names = self._sa_class_manager.mapper.c.keys()
        return [getattr(self, name) for name in column_names]

    def _repr_dict_(self):
        """
        Returns a dictionary (column name, data) representation of this entity.
        """
        column_names = self._sa_class_manager.mapper.c.keys()
        return {name: getattr(self, name) for name in column_names}

    def _repr_html_(self):
        """
        Returns a HTML representation (table) of the entity. Only used in
        IPython notebooks.
        """
        row_template = "<tr><th>{}</th><td>{}</td></tr>"
        rows = [row_template.format(name, value)
                for name, value in self._repr_dict_().items()]

        return "<table>{}</table>".format(''.join(rows))

    @classmethod
    def get_cls(cls):
        """
        Returns the class itself.
        """
        return cls

    @property
    def __data__(self):
        """
        Returns a list of values for this entity in proper order.
        """
        return self._repr_list_()

    @property
    def _pkey(self):
        """
        Returns the value of the primary key as tuple. Also works for
        composite keys.
        """
        key_columns = self.__mapper__.primary_key
        return tuple([getattr(self, column.name) for column in key_columns])

    @property
    def _entity_id(self):
        """
        Returns the first column of the primary key as scalar value. Used in
        the PyMOL API.
        """
        primary_key = self._pkey
        return primary_key[0]
def fetch_all_by_sim(self, smi, *expr, **kwargs):
    """
    Returns all fragments that match the given SMILES string with at least the
    given similarity threshold using chemical fingerprints.

    Parameters
    ----------
    smi : str
        The query molecule in SMILES format.
    *expr : BinaryExpressions, optional
        SQLAlchemy BinaryExpressions that will be used to filter the query.
    threshold : float, default=0.5
        The similarity threshold that will be used for searching.
    metric : {'tanimoto','dice'}
        Similarity metric, default='tanimoto'.
    fp : {'circular','atompair','torsion','maccs','layered','avalon'}
        RDKit fingerprint type to be used for similarity searching,
        default='circular'.
    limit : int, optional
        Maximum number of hits that will be returned. No limit is applied if
        the keyword is missing or falsy.

    Queried Entities
    ----------------
    Fragments, FragmentRDFP

    Returns
    -------
    hits : list
        List of rows ordered by descending similarity; each row holds whatever
        ``self.query`` selects followed by the two added columns
        ``similarity`` and ``sim_thresh``.

    Raises
    ------
    RuntimeError
        If the fingerprint type or the similarity metric is unknown. Both are
        checked before the database session is touched.

    Requires
    --------
    .. important:: `RDKit  <http://www.rdkit.org>`_ PostgreSQL cartridge.
    """
    # fingerprint type -> (RDKit cartridge function, extra arguments)
    builders = {
        'circular': ('morganbv_fp', (2,)),
        'torsion': ('torsionbv_fp', ()),
        'atompair': ('atompairbv_fp', ()),
        'maccs': ('maccs_fp', ()),
        'layered': ('layered_fp', ()),
        'avalon': ('avalon_fp', ()),
    }

    # metric -> (statement setting the threshold, name of the setting)
    metrics = {
        'tanimoto': ("SET rdkit.tanimoto_threshold=:threshold",
                     'rdkit.tanimoto_threshold'),
        'dice': ("SET rdkit.dice_threshold=:threshold",
                 'rdkit.dice_threshold'),
    }

    threshold = kwargs.get('threshold', 0.5)
    metric = kwargs.get('metric', 'tanimoto')
    fp = kwargs.get('fp', 'circular')

    if fp not in builders:
        msg = "The fingerprint type [{0}] does not exist.".format(fp)
        raise RuntimeError(msg)

    # an unknown metric used to fall through and crash with an
    # UnboundLocalError further down; fail with a clear message instead
    if metric not in metrics:
        msg = "The similarity metric [{0}] does not exist.".format(metric)
        raise RuntimeError(msg)

    session = Session()

    # the cartridge functions and the fingerprint columns are looked up by
    # name so that only the requested ones are touched
    fp_function, fp_args = builders[fp]
    query = getattr(func.rdkit, fp_function)(smi, *fp_args).label('queryfp')
    target = getattr(FragmentRDFP, '{0}_fp'.format(fp))

    # set the similarity threshold for the index
    set_threshold, setting = metrics[metric]
    session.execute(
        text(set_threshold).execution_options(autocommit=True).params(
            threshold=threshold))

    sim_thresh = func.current_setting(setting).label('sim_thresh')
    similarity = getattr(func.rdkit, '{0}_sml'.format(metric))(
        query, target).label('similarity')
    index = getattr(func.rdkit, '{0}_sml_op'.format(metric))(query, target)

    query = self.query.add_columns(similarity, sim_thresh)
    query = query.join('RDFP').filter(and_(index, *expr))

    # order through the label itself rather than a raw 'similarity DESC'
    # string, which newer SQLAlchemy versions warn about or reject
    query = query.order_by(similarity.desc())

    if kwargs.get('limit'):
        query = query.limit(kwargs['limit'])

    # the session is deliberately left open (the original never closed it)
    return query.all()