def match(smiles):
    """Return the ids of the functional groups matched by a SMILES string.

    An empty list is returned for multi-component SMILES (containing '.'),
    for molecules with elements outside C/H/O/N/F/Cl/Br, and for molecules
    that lack carbon or lack all of H/F/Cl/Br.

    Relies on the module-level ``smarts_id`` and ``smarts_dict`` tables.
    """
    matched_ids = []

    # ignore complex
    if '.' in smiles:
        return matched_ids

    py_mol = pybel.readstring('smi', smiles)
    py_mol.removeh()
    elements = set(Formula.read(py_mol.formula).atomdict.keys())

    if not elements.issubset({'C', 'H', 'O', 'N', 'F', 'Cl', 'Br'}):
        return matched_ids
    if 'C' not in elements or ({'H', 'F', 'Cl', 'Br'} & elements) == set():
        return matched_ids

    if elements == {'C', 'H'}:
        matched_ids.append(smarts_id['hydrocarbon'])
        # A hydrocarbon is an alkane only if none of these carbon
        # environments is present.
        non_alkane_carbons = ('[CX2]', '[CX3]', 'c', '[#6;v0,v1,v2,v3]')
        if not any(pybel.Smarts(s).findall(py_mol) != []
                   for s in non_alkane_carbons):
            matched_ids.append(smarts_id['alkane'])

    if len(pybel.Smarts('[OH]').findall(py_mol)) > 1:
        matched_ids.append(smarts_id['diol'])

    # NOTE(review): the fixed groups above are looked up in smarts_id by
    # group name, whereas this loop looks up by the SMARTS string itself.
    # Confirm smarts_id is keyed both ways (or that the name was intended).
    for smarts in smarts_dict.values():
        if pybel.Smarts(smarts).findall(py_mol):
            matched_ids.append(smarts_id[smarts])

    return matched_ids
def index(self, smiles):
    """Build a numeric descriptor vector for a SMILES string.

    The returned array holds, in order: heavy-atom count, molecular weight
    (rounded to one decimal, times ten, truncated to int), the first element
    of ``self.get_shortest_wiener``, the rotatable-bond count of the
    H-added RDKit molecule, the match counts of four ring-topology SMARTS,
    and finally whatever ``self.get_ring_info`` yields.
    """
    ring_topology_matchers = [
        pybel.Smarts('[x3]'),         # bridged atoms
        pybel.Smarts('[x4]'),         # spiro atoms
        pybel.Smarts('[R]!@[R]'),     # linked rings
        pybel.Smarts('[R]!@*!@[R]'),  # separated rings
    ]

    rd_mol = Chem.MolFromSmiles(smiles)
    py_mol = pybel.readstring('smi', smiles)

    descriptors = [
        py_mol.OBMol.NumHvyAtoms(),
        int(round(py_mol.molwt, 1) * 10),
        self.get_shortest_wiener(rd_mol)[0],
        Chem.CalcNumRotatableBonds(Chem.AddHs(rd_mol)),
    ]
    descriptors.extend(
        len(matcher.findall(py_mol)) for matcher in ring_topology_matchers)
    descriptors.extend(self.get_ring_info(py_mol))
    return np.array(descriptors)
def run():
    """Predict a value for the molecule file named by ``sys.argv[1]``.

    The file format is taken from the file extension.  For every molecule
    read, fourteen Open Babel descriptors plus the number of ``[+]`` and
    ``[-]`` SMARTS matches are appended to one flat feature tuple, which is
    passed to the model loaded from ``volume_model/volume.pkl``.  The first
    prediction, rounded to two decimals, is returned (``None`` if the model
    yields no predictions).
    """
    descriptor_names = (
        'TPSA', 'HBD', 'logP', 'MW', 'tbonds', 'nF', 'bonds', 'atoms',
        'HBA1', 'HBA2', 'sbonds', 'dbonds', 'MR', 'abonds',
    )
    charge_patterns = ("[+]", "[-]")

    molecules = pybel.readfile(sys.argv[1].split(".")[-1], sys.argv[1])
    features = ()
    for mol in molecules:
        descriptors = mol.calcdesc()
        features += tuple(descriptors.get(name) for name in descriptor_names)
        features += tuple(
            len(pybel.Smarts(patt).findall(mol)) for patt in charge_patterns)

    # NOTE(review): the model is loaded and applied once, after all
    # molecules were read, so a multi-molecule file yields one concatenated
    # feature tuple — confirm this matches the intended nesting.
    # NOTE(review): features is a flat 1-D tuple; confirm the model's
    # predict() accepts that shape.
    model = joblib.load('volume_model/volume.pkl')
    for prediction in model.predict(features):
        return round(prediction, 2)
def get_properties_ext(mol):
    """Collect a dictionary of properties for a pybel molecule.

    Args:
        mol: molecule object providing ``calcdesc``, ``molwt``, ``write``,
            ``sssr`` and ``OBMol``.

    Returns:
        dict with the keys ``molwt``, ``logp``, ``donors``, ``acceptors``,
        ``psa``, ``mr``, ``rotbonds``, ``can``, ``inchi``, ``inchi_key``,
        ``rings``, ``atoms`` and ``spectrophore``.

    Raises:
        KeyError: if the descriptor table has neither ``logP`` nor ``LogP``,
            or lacks ``TPSA`` / ``MR``.
    """
    HBD = pybel.Smarts("[!#6;!H0]")
    HBA = pybel.Smarts("[$([$([#8,#16]);!$(*=N~O);"
                       "!$(*~N=O);X1,X2]),$([#7;v3;"
                       "!$([nH]);!$(*(-a)-a)])]")
    calc_desc_dict = mol.calcdesc()

    # The descriptor is looked up under both spellings.  Only a missing key
    # triggers the fallback; any other error now propagates instead of being
    # swallowed by a bare ``except``.
    try:
        logp = calc_desc_dict['logP']
    except KeyError:
        logp = calc_desc_dict['LogP']

    return {
        "molwt": mol.molwt,
        "logp": logp,
        "donors": len(HBD.findall(mol)),
        "acceptors": len(HBA.findall(mol)),
        "psa": calc_desc_dict['TPSA'],
        "mr": calc_desc_dict['MR'],
        "rotbonds": mol.OBMol.NumRotors(),
        # Only the first whitespace-separated token is kept; this works for
        # both zinc and chembl (no ZINC code is left after the canonical
        # SMILES string).
        "can": mol.write("can").split()[0].strip(),
        "inchi": mol.write("inchi").strip(),
        "inchi_key": get_inchikey(mol).strip(),
        "rings": len(mol.sssr),
        "atoms": mol.OBMol.NumHvyAtoms(),
        "spectrophore": OBspectrophore(mol),
    }
def main(substrate, group, sub_id, group_id, position, ref):
    """Build a substituted substrate, set up its folder and run zstruct.

    Args:
        substrate: SMILES of the substrate; also used as a SMARTS pattern and
            as the ``-s`` argument of ``obabel``.
        group: SMILES fragment to insert as a branch, or a falsy value to use
            the substrate unchanged.
        sub_id, group_id: forwarded to ``ip.iupac_name``.
        position: 1, 2 or 3 (ortho, meta, para); selects the character offset
            at which ``(group)`` is inserted into ``substrate``.
        ref: reference file to align to with ``obabel --align``, or ``None``
            to skip the alignment.

    Raises:
        ValueError: if ``group`` is given and ``position`` is not 1, 2 or 3
            (previously this surfaced as an unbound-variable error).

    Side effects: creates ``<folder>`` and ``<folder>/scratch``, changes the
    working directory into ``<folder>`` and may run ``obabel``.
    """
    import subprocess

    # ========================> Generate Smiles < ========================= #
    if not group:
        smiles = substrate
    elif position in (1, 2, 3):
        # ortho -> offset 2, meta -> offset 3, para -> offset 4
        cut = position + 1
        smiles = substrate[:cut] + "(" + group + ")" + substrate[cut:]
    else:
        raise ValueError("position must be 1, 2 or 3, got %r" % (position,))
    print(smiles)

    # ============== Generate Folder Name ========== #
    folder = ip.iupac_name(smiles, substrate, group, sub_id, group_id,
                           position)
    print(folder)
    # os.mkdir instead of a shell 'mkdir' so names with spaces or shell
    # metacharacters are handled; an existing directory is reused.
    if not os.path.isdir(folder):
        os.mkdir(folder)
    os.chdir(folder)
    if not os.path.isdir('scratch'):
        os.mkdir('scratch')

    # ============== Copy data to folder =========== #
    # TODO

    # ============== Generate Molecule ========== #
    mol = confab.gen3d(smiles)

    # ============ Driving Coordinate Idx Generation ==============#
    smarts1 = pybel.Smarts(substrate)
    smarts2 = pybel.Smarts("cccccc")
    r1 = smarts1.findall(mol)
    r2 = smarts2.findall(mol)

    # ============ confab and align to reference ==================#
    # Last two atoms of the first substrate match, and the first ring match.
    r1 = [r1[0][-2], r1[0][-1]]
    r2 = list(r2[0])
    mol = confab.confab(mol, r1, r2)

    # => align to substructure <=#
    path = os.getcwd() + "/tmp.xyz"
    if ref is not None:
        cmd_str = "obabel %s %s -O %s -s %s --align" % (ref, path, path,
                                                        substrate)
        print(cmd_str)
        # Argument list (no shell): SMILES characters such as '(' or '#'
        # would otherwise be interpreted by the shell.
        subprocess.call(
            ["obabel", ref, path, "-O", path, "-s", substrate, "--align"])
        # read aligned geom; the second molecule in the file is used
        mol = confab.read_molecules(path, single=False)
        mol = mol[1]

    # ============ do zstruct ===================================== #
    zstruct2.zstruct1(mol, r1, r2, None, doOne=False, doTwo=True)
def __init__(self, ec, substrate, product):
    """Store the rule definition and compile both sides as SMARTS.

    If compiling either pattern raises ``IOError`` a warning is logged and
    both ``smarts_subs`` and ``smarts_prod`` are set to ``None``.
    """
    self.ec, self.substrate, self.product = ec, substrate, product
    try:
        compiled = (pybel.Smarts(substrate), pybel.Smarts(product))
    except IOError:
        logging.warning("failed parsing: %s >> %s" % (substrate, product))
        compiled = (None, None)
    self.smarts_subs, self.smarts_prod = compiled
def teste():
    """Scan ``resources/base.mol2`` for phenol matches and draw a molecule.

    The match bookkeeping is computed but not returned; the function returns
    ``draw(True)`` of the last molecule read.
    """
    phenol_pattern = pybel.Smarts("[OH]c1ccccc1")  # phenol
    ethyl_pattern = pybel.Smarts("[#6][#6]")  # ethyl group (not used below)

    total_hits = 0
    hits_per_molecule = []
    for mol in pybel.readfile("mol2", "resources/base.mol2"):
        hits = phenol_pattern.findall(mol)
        total_hits += len(hits)
        hits_per_molecule.append(hits)

    # NOTE(review): the return is placed after the loop, so an empty input
    # file leaves ``mol`` unbound — confirm this nesting is the intended one.
    return mol.draw(True)
def count(self, smilesstr):
    """Count the occurrences of every group pattern in a SMILES string.

    Patterns come from ``self.commonattr()`` and are split by
    ``self.matchedpatt`` into plain SMARTS, eval-keyword entries, quoted
    expressions and bracketed expressions.  Returns the abundances selected
    by ``include``, cast to int.

    NOTE(review): three of the four categories are passed to ``eval``; the
    pattern table must therefore come from a trusted source.  The evaluated
    text may refer to local names of this method (e.g. ``molecule``), so
    locals must not be renamed.
    """
    ##
    groups, include, evalkw, brackets, quotes, unquote = self.commonattr()
    haskw, hasbracket, hasquote = self.matchedpatt(groups)
    if include is None:
        # no selection given: report every group
        include = [True] * len(groups)
    ##
    mol = pybel.readstring('smi', smilesstr)
    mol.addh()
    molecule = mol  # copy reference; keyword for userdef.py 29.09.2015
    abundances = pd.Series([np.nan] * len(groups), index=groups.index)
    ## SMARTS search
    for key in groups.index[~haskw & ~hasbracket & ~hasquote]:
        abundances[key] = len(pybel.Smarts(groups[key]).findall(mol))
    ## evaluate eval keyword
    for key in groups.index[haskw]:  # untested
        abundances[key] = round(eval(evalkw.search(groups[key]).group(1)))
    ## evaluated quoted expressions
    for key in groups.index[hasquote]:  # untested
        abundances[key] = round(eval(
            unquote(groups[key]).format(**groups)))
    ## evaluate expressions
    # bracketed expressions are formatted with the abundances computed so
    # far, in the order returned by __orderexpr
    orderedexpr = self.__orderexpr(groups, hasbracket, brackets)
    for key in orderedexpr:  #groups.index[hasbracket]:
        abundances[key] = round(eval(groups[key].format(**abundances)))
    ##
    return abundances[include].astype(int)
def pattern():
    """Compile every entry of the module-level ``SMARTS`` sequence.

    Returns a new list of ``pybel.Smarts`` objects, in the same order.
    """
    return [pybel.Smarts(smarts) for smarts in SMARTS]
def integer_fp(smile, smartsqfile='SMARTSFileFull.json'):
    '''Creates an integer-valued fingerprint.

    Probably takes longer since it is a VERY hacked version, but it
    generates data for SVMs.

    Args:
        smile: SMILES string of the compound.
        smartsqfile: name of a JSON file under ``CHEMPATH`` mapping 1-based
            integer keys to strings of the form ``"<label>: <SMARTS>"``.

    Returns:
        A list with one match count per SMARTS entry followed by the two
        values from ``chem.flag_chiral``, or ``None`` if RDKit cannot parse
        ``smile``.
    '''
    rdmol = Chem.MolFromSmiles(smile)
    if not rdmol:
        return None

    # ``with`` guarantees the file is closed even if json.load raises.
    with open(os.path.join(CHEMPATH, smartsqfile)) as smartsData:
        encodedSmartsDict = json.load(smartsData)

    # Python 2 idiom kept as-is: keys/values are encoded to byte strings and
    # the SMARTS is the part after the first ': '.
    smartsDict = {
        k.encode('utf-8'): v.encode('utf-8').split(': ')[1].strip(' \n')
        for k, v in encodedSmartsDict.iteritems()
    }

    fpt = [0] * len(smartsDict)
    cpd = pybel.readstring('smi', smile)
    for k, v in smartsDict.iteritems():
        fpt[int(k) - 1] = len(pybel.Smarts(v).findall(cpd))

    # SMARTS querying directly for chirality does not give RS isomerism, so
    # adding it at the end. May get flexed to _check_reverse for consistency.
    rect, sini = chem.flag_chiral(rdmol)
    fpt.append(rect)
    fpt.append(sini)
    return fpt
def get_abundances(self, smilesstr=None):
    """Count SMARTS pattern matches in a SMILES string.

    Args:
        smilesstr: SMILES string to analyse.

    Returns:
        pandas Series with one count per key of ``self.smartspatt`` (0 for
        patterns containing ``'nomatch'``) plus an ``'H'`` entry computed
        from the auxiliary patterns ``h1``..``h4`` of ``self.smartsaux`` as
        ``h1 + 2*h2 + 3*h3 + 4*h4``.
    """
    ## main body
    mol = pybel.readstring('smi', smilesstr)

    ## store SIMPOL patterns in ordered dictionary
    # Iterate the items view directly: the former ``.items()[0:]`` slice is a
    # needless copy on Python 2 and a TypeError on Python 3.
    abundances = OrderedDict()
    for key, patt in self.smartspatt.items():
        if 'nomatch' in patt:
            abundances[key] = 0
        else:
            abundances[key] = len(pybel.Smarts(patt).findall(mol))

    ## find auxiliary patterns
    aux = OrderedDict()
    for key, patt in self.smartsaux.items():
        aux[key] = len(pybel.Smarts(patt).findall(mol))

    ## combine and return pandas series
    abundances['H'] = (aux['h1'] + 2 * aux['h2'] + 3 * aux['h3']
                       + 4 * aux['h4'])
    return pd.Series(abundances)
def matchatoms(self, smilesstr):
    """Find which atoms of a SMILES string are matched by each group.

    Returns a tuple ``(matched, atomicmass)``: ``matched`` is the table built
    by ``self.__atomtable`` from the per-group match tuples selected by
    ``include``; ``atomicmass`` is the set of ``(type, atomicmass)`` pairs of
    all atoms that occur in any selected match.

    NOTE(review): eval-keyword patterns are passed to ``eval`` and may refer
    to local names of this method (e.g. ``molecule``), so locals must not be
    renamed and the pattern table must be trusted.
    NOTE(review): unlike ``count``, ``include`` is indexed unconditionally
    here — presumably it must not be None; verify against callers.
    """
    ##
    groups, include, evalkw, brackets, quotes, unquote = self.commonattr()
    haskw, hasbracket, hasquote = self.matchedpatt(groups)
    ##
    mol = pybel.readstring('smi', smilesstr)
    mol.addh()
    molecule = mol  # copy reference; keyword for userdef.py 29.09.2015
    tups = OrderedDict(zip(groups.index, [None] * len(groups)))
    ## SMARTS search
    for key in groups.index[~haskw & ~hasbracket & ~hasquote]:
        tups[key] = set(pybel.Smarts(groups[key]).findall(mol))
    ## evaluate eval keyword
    for key in groups.index[haskw]:
        tups[key] = eval(evalkw.search(groups[key]).group(1))
    ## evaluate quoted expressions
    for key in groups.index[hasquote]:  # untested
        tups[key] = self.__substitute(mol, groups[key], quotes, groups)
    ## evaluate expressions
    orderedexpr = self.__orderexpr(groups, hasbracket, brackets)
    for key in orderedexpr:  #groups.index[hasbracket]
        tups[key] = self.__substitute(mol, groups[key], brackets, tups)
    # keep only the groups selected by ``include``
    usetups = OrderedDict([(k, v) for (k, v) in tups.items()
                           if include.ix[k]])
    # union of all match tuples, then flattened into one list of atom indices
    alltups = reduce(set.union, usetups.values())
    allatoms = reduce(add, map(list, alltups), [])
    atomicmass = set([(atom.type, atom.atomicmass) for atom in mol.atoms
                      if atom.idx in allatoms])
    ##
    idxlabel = 'atom'
    atomtype = pd.DataFrame([(atom.idx, atom.type) for atom in mol.atoms],
                            columns=[idxlabel, 'type']).set_index(idxlabel)
    matched = self.__atomtable(atomtype, usetups)
    ##
    return (matched, atomicmass)
def detect_dihedrals(mol: pybel.Molecule) -> List[DihedralInfo]:
    """Detect the bonds to be treated as rotors.

    Uses the more generous rotatable-bond definition from RDKit:
    https://github.com/rdkit/rdkit/blob/1bf6ef3d65f5c7b06b56862b3fb9116a3839b229/rdkit/Chem/Lipinski.py#L47

    The SMARTS matches pairs of atoms joined by a single, non-ring bond where
    neither atom has a triple bond and neither atom is terminal (degree 1).

    Args:
        mol: Molecule to assess
    Returns:
        One entry per matched bond, as built by ``get_dihedral_info``.
    """
    # Compute the bonding graph
    graph = get_bonding_graph(mol)

    # Indices of the backbone atoms: graph nodes whose 'z' exceeds 1
    backbone = {node for node, data in graph.nodes(data=True) if data['z'] > 1}

    # Get the bonds from a simple matching; SMARTS indices are 1-based, the
    # helper is given 0-based pairs.
    rotor_pattern = pybel.Smarts('[!$(*#*)&!D1]-&!@[!$(*#*)&!D1]')
    return [
        get_dihedral_info(graph, (first - 1, second - 1), backbone)
        for first, second in rotor_pattern.findall(mol)
    ]
def get_multiring_atoms_bonds(self, rdk_mol: Mol, smiles):
    '''
    Not used

    Count the atoms and bonds that belong to more than one ring.

    Counts come from RDKit's ring info.  If RDKit's ring count differs from
    the length of Open Babel's ``sssr`` for the same SMILES, a warning is
    printed and the counts are replaced by the number of ``[R2]`` matches and
    that number minus one.

    Returns a tuple ``(n_atoms_multiring, n_bonds_multiring)``.
    '''
    rings_per_atom = [0] * rdk_mol.GetNumAtoms()
    rings_per_bond = [0] * rdk_mol.GetNumBonds()

    # TODO GetRingInfo gives SymmetricSSSR, not TRUE SSSR
    ring_info = rdk_mol.GetRingInfo()
    for atom_ring in ring_info.AtomRings():
        for atom_idx in atom_ring:
            rings_per_atom[atom_idx] += 1
    for bond_ring in ring_info.BondRings():
        for bond_idx in bond_ring:
            rings_per_bond[bond_idx] += 1

    n_atoms_multiring = sum(1 for times in rings_per_atom if times > 1)
    n_bonds_multiring = sum(1 for times in rings_per_bond if times > 1)

    py_mol = pybel.readstring('smi', smiles)
    if ring_info.NumRings() != len(py_mol.sssr):
        print(
            'WARNING: SymmetricSSSR not equal to TRUE SSSR in rdkit. Use Openbabel instead:',
            smiles)
        n_atoms_multiring = len(pybel.Smarts('[R2]').findall(py_mol))
        n_bonds_multiring = n_atoms_multiring - 1

    return n_atoms_multiring, n_bonds_multiring
def smarts(s):
    """Compile ``s`` into a ``pybel.Smarts`` object.

    Non-bytes input is first encoded as ASCII.

    Raises:
        ValueError: if pybel rejects the pattern (it raises IOError).
    """
    encoded = s if isinstance(s, bytes) else s.encode('ascii')
    try:
        compiled = pybel.Smarts(encoded)
    except IOError as err:
        # Convert pybel's IOError (?!) into a ValueError
        raise ValueError(str(err))
    return compiled
def _update(self):
    """Rebuild the item update loop and schedule it.

    A SMARTS pattern compiled from ``selectedFragment`` is passed to the
    update only when fragment display is not active (``showFragments`` and
    ``fragmentSmiles`` not both truthy) and both ``colorFragments`` and
    ``selectedFragment`` are truthy; otherwise items are updated without a
    pattern.
    """
    showing_fragments = self.showFragments and self.fragmentSmiles
    if (not showing_fragments
            and self.colorFragments and self.selectedFragment):
        highlight = pybel.Smarts(self.selectedFragment)
        loop = self.__update_items(self._items, self._widgets, highlight)
    else:
        loop = self.__update_items(self._items, self._widgets)
    self.__schedule(loop)
def db_select_molecules(cls=None, subcls=None, type=None, subtype=None,
                        tags=None, substructure="") -> pd.DataFrame:
    """Get a summary frame of molecules in the database

    :param cls: value required for ``metadata.class`` (ignored if None or "")
    :param subcls: value required for ``metadata.subclass`` (ignored if None or "")
    :param type: value required for ``metadata.type`` (ignored if None or "")
    :param subtype: value required for ``metadata.subtype`` (ignored if None or "")
    :param tags: a list of tags of the db records (if multiple an 'OR' is
        taken); None or an empty list selects all tag records
    :type tags: list
    :param substructure: substructure SMARTS string
    :type substructure: str
    :return: pandas.core.frame.DataFrame
    """
    db = db_connect()
    tags_coll = db['tags']
    mols_coll = db['molecules']
    feats_coll = db['qchem_descriptors']

    tags_cur = tags_coll.find({'tag': {'$in': tags}} if tags else {})
    tags_df = pd.DataFrame(tags_cur)

    # Build the molecule query; the local is named ``query`` so the builtin
    # ``filter`` is not shadowed.
    query = {}
    for field, value in (('class', cls), ('subclass', subcls),
                         ('type', type), ('subtype', subtype)):
        if value != "" and value is not None:
            query['metadata.' + field] = value
    query['_id'] = {'$in': tags_df.molecule_id.tolist()}

    mols_df = pd.DataFrame(mols_coll.find(query))
    if 'name' not in mols_df.columns:
        mols_df['name'] = None

    if substructure:
        pattern = pybel.Smarts(substructure)
        mols_df['pybel_mol'] = mols_df['can'].map(
            lambda can: pybel.readstring("smi", can))
        mols_df = mols_df[mols_df['pybel_mol'].map(
            lambda mol: bool(pattern.findall(mol)))]
        mols_df = mols_df.drop('pybel_mol', axis=1)

    # merge tags in an outer way
    df = pd.merge(mols_df, tags_df, how='outer', left_on='_id',
                  right_on='molecule_id', suffixes=('', '_tag'))

    # make tags into a list of tags; metadata is grouped via its repr
    df['metadata_str'] = df['metadata'].map(repr)
    grouped = df.groupby(['can', 'metadata_str'])
    # Columns are selected with a list: selecting several columns of a
    # GroupBy with a bare tuple is rejected by current pandas.
    df = pd.concat(
        [grouped[['metadata', 'molecule_id', 'name']].first(),
         grouped['tag'].apply(list)],
        axis=1).reset_index().drop('metadata_str', axis=1)

    # fetch ids of the descriptor records belonging to each molecule
    df['_ids'] = df['molecule_id'].map(
        lambda mid: [item['_id'] for item in feats_coll.find(
            {'molecule_id': ObjectId(mid)}, {'_id': 1})])
    df['num_conformers'] = df['_ids'].map(len)
    return df
def index(self, smiles):
    """Count five unsaturation-related SMARTS patterns in a SMILES string.

    Returns a numpy array with the number of matches of, in order:
    ``*=**=*``, ``*=**#*``, ``*=*[CX4;H0]``, ``*#*[CX4;H0]`` and
    ``C1=CC=CC=CC1``.
    """
    matchers = [
        pybel.Smarts('*=**=*'),        # double_double
        pybel.Smarts('*=**#*'),        # double_triple
        pybel.Smarts('*=*[CX4;H0]'),   # double_tert
        pybel.Smarts('*#*[CX4;H0]'),   # triple_tert
        pybel.Smarts('C1=CC=CC=CC1'),  # r7wired
    ]
    py_mol = pybel.readstring('smi', smiles)
    counts = [len(matcher.findall(py_mol)) for matcher in matchers]
    return np.array(counts)
def rearrange_smiles(aa_smiles):
    '''Rewrite an amino-acid smiles to start with the N-term and end with
    the C-term.

    The first atom of the first match of each terminus pattern is used; an
    IndexError results if a pattern does not match.
    '''
    mol = pybel.readstring('smi', aa_smiles)
    terminus_patterns = [
        pybel.Smarts('[$(NCC(O)=O)]'),  # N-term
        pybel.Smarts('[$(OC(=O)CN)]'),  # C-term
    ]

    # Find location of start and end atoms
    n_term_idx, c_term_idx = [
        patt.findall(mol)[0][0] for patt in terminus_patterns
    ]

    # Rewrite smiles N-term first ('f' option), then C-term ('l' option)
    converter = openbabel.OBConversion()
    converter.SetInAndOutFormats('smi', 'smi')
    for option, atom_idx in (('f', n_term_idx), ('l', c_term_idx)):
        converter.AddOption(option, openbabel.OBConversion.OUTOPTIONS,
                            str(atom_idx))

    outmol = openbabel.OBMol()
    converter.ReadString(outmol, aa_smiles)
    return converter.WriteString(outmol).strip()
def find_smarts_hb(dict_smarts, smarts_hb, lig_mol):
    """Collect the coordinates of all atoms matched by a SMARTS family.

    Finds atoms which can form a hydrogen bond: every pattern in
    ``dict_smarts[smarts_hb]`` is matched against ``lig_mol`` and the
    ``coords`` of each matched atom are gathered into one set.
    """
    hb_coords = set()
    for pattern_text in dict_smarts[smarts_hb]:
        matches = pybel.Smarts(pattern_text).findall(lig_mol)
        for match in matches:
            # SMARTS indices are 1-based, the atoms list is 0-based
            hb_coords.update(
                lig_mol.atoms[atom_idx - 1].coords for atom_idx in match)
    return hb_coords
def smarts_search(mollist, smarts):
    """Return the records of ``mollist`` whose SMILES match ``smarts``.

    Args:
        mollist: object whose ``all()`` yields records with a ``SMILES``
            attribute.
        smarts: SMARTS pattern string.

    Returns:
        list of the matching records, in iteration order.  Records whose
        SMILES cannot be parsed or matched are skipped.
    """
    ret = []
    query = pybel.Smarts(smarts)
    print(query)
    for mol in mollist.all():
        try:
            parsed = pybel.readstring("smi", str(mol.SMILES))
            if query.findall(parsed):
                ret.append(mol)
        except Exception:
            # Deliberate best-effort: skip records that fail to parse.
            # Narrowed from a bare ``except`` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            continue
    return ret
def compile_smarts():
    """Compile the five built-in SMARTS definitions.

    Returns a list of ``pybel.Smarts`` objects in definition order.
    """
    definitions = (
        '[#6+0!$(*~[#7,#8,F]),SH0+0v2,s+0,S^3,Cl+0,Br+0,I+0]',
        '[a]',
        '[!$([#1,#6,F,Cl,Br,I,o,s,nX3,#7v5,#15v5,#16v4,#16v6,*+1,*+2,*+3])]',
        '[!$([#6,H0,-,-2,-3]),$([!H0;#7,#8,#9])]',
        '[r]',
    )
    return [pybel.Smarts(definition) for definition in definitions]
def pybel_neutralise(pybmol):
    """Set the formal charge of matched charged atoms to zero, in place.

    For every match of the charge pattern, the first matched atom gets a
    formal charge of 0 and its implicit hydrogen count is reduced by the
    former charge.  The same (mutated) molecule object is returned.

    NOTE(review): ``pybmol`` is passed to ``Smarts.findall`` and also used
    via ``GetAtom`` — confirm which molecule type callers supply.
    """
    charged = pyb.Smarts(
        "[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]")
    for hit in charged.findall(pybmol):
        atom = pybmol.GetAtom(hit[0])
        old_charge = atom.GetFormalCharge()
        implicit_h = atom.GetImplicitHCount()
        atom.SetFormalCharge(0)
        atom.SetImplicitHCount(implicit_h - old_charge)
    return pybmol
def dict_toxi(self):
    """Count the keys of ``d`` that match more often than their sub-patterns.

    ``d`` (module level) maps a SMARTS pattern to a collection of related
    SMARTS patterns.  A key scores 1 when it matches ``self.smiles`` more
    often than all of its associated patterns together.

    Returns:
        A one-element list holding the score for ``self.smiles``.

    NOTE(review): previously the per-key accumulator was re-created inside
    the loop over the associated patterns, so only the last pattern was
    compared (and an empty collection left it unbound).  The counts are now
    summed over all associated patterns — confirm this is the intent.
    """
    mol_list = []
    for smi in [self.smiles]:
        mol = pb.readstring("smi", str(smi))
        score = 0
        # .items() works on both Python 2 and 3 (was .iteritems()).
        for key_smarts, related in d.items():
            n_key = len(pb.Smarts(key_smarts).findall(mol))
            if n_key == 0:
                continue
            n_related = sum(
                len(pb.Smarts(each).findall(mol)) for each in related)
            if n_key > n_related:
                score += 1
        mol_list.append(score)
    return mol_list
def substructure_embedding(mol, pattern):
    """Find all embeddings of a SMARTS pattern in an OASA molecule.

    ``pattern`` may be a ``pybel.Smarts`` object or anything accepted by its
    constructor.  Each match is returned as a list of the original objects,
    obtained by inverting the atom map produced by the converter.
    """
    converter = oasa.pybel_bridge.PybelConverter
    obmol, atom_map = converter.oasa_to_pybel_molecule_with_atom_map(mol)

    query = pattern if isinstance(pattern, pybel.Smarts) \
        else pybel.Smarts(pattern)
    matches = query.findall(obmol)

    # invert the atom map so matched entries can be looked up
    rev_map = {mapped: source for source, mapped in atom_map.iteritems()}
    return [[rev_map.get(idx) for idx in match] for match in matches]
def is_alkane(py_mol) -> bool:
    """Tell whether a pybel molecule is an alkane.

    True only if the formula contains exactly the elements C and H and none
    of the non-alkane carbon SMARTS below matches.
    """
    import pybel
    from .formula import Formula

    elements = set(Formula(py_mol.formula).atomdict.keys())
    if elements != {'C', 'H'}:
        return False

    non_alkane_carbons = ('[CX2]', '[CX3]', 'c', '[#6;v0,v1,v2,v3]')
    return not any(
        pybel.Smarts(s).findall(py_mol) != [] for s in non_alkane_carbons)
def find_smarts(dict_smarts, smarts, lig_mol):
    """Return the coordinates of every atom group matching a SMARTS family.

    Every pattern in ``dict_smarts[smarts]`` is matched against ``lig_mol``;
    each match contributes one list holding the ``coords`` of its atoms.
    """
    groups = []
    for pattern_text in dict_smarts[smarts]:
        for match in pybel.Smarts(pattern_text).findall(lig_mol):
            # SMARTS indices are 1-based, the atoms list is 0-based
            groups.append(
                [lig_mol.atoms[atom_idx - 1].coords for atom_idx in match])
    return groups
def unique_toxi(self):
    """Count how many patterns of ``all_toxi`` occur in ``self.smiles``.

    Each pattern counts once no matter how many times it matches.  Returns a
    one-element list holding that count.
    """
    mol_list = []
    for smi in [self.smiles]:
        mol = pb.readstring("smi", str(smi))
        present = [
            1 if len(pb.Smarts(toxi).findall(mol)) > 0 else 0
            for toxi in all_toxi
        ]
        mol_list.append(sum(present))
    return mol_list
def atoms_nitrophenols(mol, phenol, nitro):
    """Return phenol matches that share an aromatic ring with a nitro match.

    Args:
        mol: pybel molecule.
        phenol: SMARTS string for the phenol group.
        nitro: SMARTS string for the nitro group.

    Returns:
        List of phenol match tuples.  For every aromatic ring of ``mol.sssr``
        that contains at least one atom of a phenol match and at least one
        atom of a nitro match, all phenol matches touching that ring are
        appended, so a match touching several such rings appears once per
        ring.
    """
    def atom_indices_ring(ring, atom_ids):
        # indices of the molecule's atoms that belong to ``ring``
        return {idx for idx in atom_ids if ring.IsInRing(idx)}

    def is_part_of_ring(ring_atoms, group):
        # True if any atom of the matched group lies in the ring.  The ring
        # atoms are passed in explicitly; the old helper ignored its
        # argument and read the enclosing loop variable instead.
        return any(idx in ring_atoms for idx in group)

    _phenol = pybel.Smarts(phenol).findall(mol)
    _nitro = pybel.Smarts(nitro).findall(mol)
    _rings = [ring for ring in mol.sssr if ring.IsAromatic()]
    atom_ids = [a.idx for a in mol.atoms]

    atomlist_nitrophenol = []  # list of (nitro)phenol groups
    for ring in _rings:
        # computed once per ring instead of once per group
        ring_atoms = atom_indices_ring(ring, atom_ids)
        ring_phenols = [x for x in _phenol if is_part_of_ring(ring_atoms, x)]
        if not ring_phenols:
            continue
        if any(is_part_of_ring(ring_atoms, y) for y in _nitro):
            atomlist_nitrophenol += ring_phenols

    # the phenol groups are what are really counted so this is what is
    # returned
    return atomlist_nitrophenol
def MatchPlattsBGroups(self, smiles):
    """Compute the Platts B value of a SMILES string and store it in self.B.

    The functional groups are read from sheet ``PlattsB`` of ``groups.xls``
    in the current working directory (columns: SMARTS, name, contribution).
    Every group's match count times its contribution is summed and 0.071 is
    added.

    Processing stops at the first invalid SMARTS pattern; groups after it
    are not counted.

    Side effects: sets ``x.num`` on each processed group, prints one line
    per matched group and sets ``self.B``.
    """
    # Load functional group database
    filepath = os.path.join(os.getcwd(), 'groups.xls')
    wb = xlrd.open_workbook(filepath)
    data = wb.sheet_by_name(u'PlattsB')
    databaseB = [
        functionalgroup(SMART, name, B)
        for SMART, name, B in zip(data.col_values(0), data.col_values(1),
                                  data.col_values(2))
    ]

    platts_B = 0
    mol = pybel.readstring("smi", smiles)
    for x in databaseB:
        pattern_text = str(x.smarts)
        # Validate with a bare OBSmartsPattern first so an invalid pattern
        # is reported instead of raising from pybel.Smarts.
        if not ob.OBSmartsPattern().Init(pattern_text):
            print("Invalid SMARTS pattern %s" % pattern_text)
            break
        matched = pybel.Smarts(pattern_text).findall(mol)
        x.num = len(matched)
        if x.num > 0:
            print("Found group %s named %s with contribution %s to B %s times"
                  % (pattern_text, x.name, x.value, x.num))
            platts_B += x.num * x.value

    self.B = platts_B + 0.071