def name2id(name, outformat):
    """Resolve a chemical name to an identifier, trying several lookups in turn.

    The lookups are attempted in this order, stopping at the first that
    yields a result:

    1. ``cirpy.resolve(name, outformat)`` (source label ``'NCI'``)
    2. ``chemspipy.find_one(name)``, whose ``smiles`` attribute is then passed
       to ``cirpy.resolve`` (source label ``'ChemSpi'``)
    3. ``cirpy.resolve`` restricted to the ``'name_pattern'`` resolver
       (source label ``'NCI-pattern-match'``)

    Returns a 2-tuple ``(identifier, source)``; ``source`` is ``None`` when
    every lookup failed.

    NOTE(review): this block was reconstructed from a whitespace-collapsed
    source; the nesting of ``idstring = str(idstring)`` and of the final
    ``print`` is an assumption -- confirm against the original file.
    """
    # Imported at call time rather than at module level.
    import cirpy
    import chemspipy
    import queryDevice
    idstring = None
    source = None
    if idstring is None:
        source = 'NCI'
        idstring = cirpy.resolve(name, outformat)
    if idstring is None:
        source = 'ChemSpi'
        chemspid = chemspipy.find_one(name)
        try:
            smiles = chemspid.smiles
            idstring = cirpy.resolve(smiles, outformat)
        except AttributeError:
            # Raised when ``chemspid`` has no ``smiles`` attribute
            # (presumably because find_one returned None -- TODO confirm).
            idstring = None
    if idstring is None:
        source = 'NCI-pattern-match'
        idstring = cirpy.resolve(name, outformat,['name_pattern'])
    if idstring is None:
        # Nothing resolved: report no source and use the literal text 'None'.
        source = None
        idstring = str(idstring)
    try:
        idstring = (idstring.rstrip(),source)
    except AttributeError:
        # ``idstring`` has no ``rstrip`` -- presumably a list of several
        # results; keep only the first one and report the ambiguity.
        idstring = (idstring[0].rstrip(),source)
        print 'There were multiple results for: ', name, ' using: ', idstring[0], '\n', idstring
    return idstring
def search(self, query):
    """Search ChemSpider for *query* and describe the first few hits.

    Iterates ``self.cs.search(query)`` and stops once the hit index exceeds 5.
    For each hit the CAS number and IUPAC name are looked up through CIR from
    the hit's InChI. When CIR returns a list of CAS numbers, the one closest
    to the query text (per ``difflib.get_close_matches``) is kept.

    Returns a list of dicts with the keys ``csid``, ``name``, ``iupac_name``,
    ``cas``, ``inchi``, ``formula`` and ``image``.
    """
    print('Connected to ChemSpider API')
    print("Searching started")
    print("Searching for: " + query)
    results = []
    for position, compound in enumerate(self.cs.search(query)):
        if position > 5:
            break
        print("Compound " + str(position))
        formula = str(compound.molecular_formula)
        csid = str(compound.csid)
        inchi = compound.inchi
        name = compound.common_name
        cas = cirpy.resolve(inchi, 'cas')
        iupac_name = cirpy.resolve(inchi, 'iupac_name')
        if type(cas) is list:
            # Several CAS numbers came back: rank them against the query text.
            closest = difflib.get_close_matches(str(query), cas, 3, 0)
            print(closest)
            chosen_cas = closest[0]
        else:
            chosen_cas = cas
        image = compound.image_url
        print(image)
        results.append({
            'csid': csid,
            'name': name,
            'iupac_name': iupac_name,
            'cas': chosen_cas,
            'inchi': inchi,
            'formula': formula,
            'image': image,
        })
    print("Searching finished")
    print(results)
    return results
def test_tnt_smiles_custom_resolvers(self):
    """Check TNT resolves to the same SMILES for either custom resolver order."""
    expected = 'Cc1c(cc(cc1[N+]([O-])=O)[N+]([O-])=O)[N+]([O-])=O'
    resolver_orders = (
        ['name_by_opsin', 'name_by_cir'],
        ['name_by_cir', 'name_by_opsin'],
    )
    for resolvers in resolver_orders:
        self.assertEqual(
            resolve('2,4,6-trinitrotoluene', 'smiles', resolvers),
            expected)
def test_tnt_smiles_custom_resolvers(self):
    """Custom resolver lists must give the expected TNT SMILES in both orders."""
    tnt_name = '2,4,6-trinitrotoluene'
    tnt_smiles = 'Cc1c(cc(cc1[N+]([O-])=O)[N+]([O-])=O)[N+]([O-])=O'

    # OPSIN first, CIR second.
    opsin_first = resolve(tnt_name, 'smiles', ['name_by_opsin', 'name_by_cir'])
    self.assertEqual(opsin_first, tnt_smiles)

    # CIR first, OPSIN second.
    cir_first = resolve(tnt_name, 'smiles', ['name_by_cir', 'name_by_opsin'])
    self.assertEqual(cir_first, tnt_smiles)
def name2molecule(oname, smiles_code):
    """Write PDB and MOL files for a structure given as SMILES.

    All whitespace is stripped from *oname* to form the file stem. The
    structure is resolved through CIR, first as ``pdb`` and then as ``mol``,
    and written to ``PDB/<stem>.pdb`` and ``MOL/<stem>.mol`` respectively.

    Args:
        oname: Human-readable name used to build the output file names.
        smiles_code: SMILES string handed to ``cirpy.resolve``.

    Returns:
        None.
    """
    oname = "".join(oname.split())
    targets = (
        ("pdb", "PDB/" + oname + ".pdb"),
        ("mol", "MOL/" + oname + ".mol"),
    )
    for representation, path in targets:
        content = cirpy.resolve(smiles_code, representation)
        # ``with`` guarantees the handle is closed even if write() raises.
        with open(path, "w+") as output:
            output.write(content)
    return None
def parse_names(self):
    """Build a Species from the first name in ``self.names`` that resolves.

    Every name is tried with the ``name_by_opsin`` resolver first; only if
    none resolves are the names retried with ``name_by_cir``.

    Raises:
        ConversionError: if no name resolves with either resolver.
    """
    for resolver in ('name_by_opsin', 'name_by_cir'):
        for candidate in self.names:
            found = cirpy.resolve(candidate, 'smiles', [resolver])
            if found is not None:
                return Species().fromSMILES(found)
    raise ConversionError(
        'Could not resolve name for species {}.'.format(
            self.prime_id))
def process_bioactive_identifier(request):
    """Look up a compound by CAS number or InChIKey and answer with JSON.

    Reads ``cas_number`` and ``inchikey`` from ``request.GET``. If a matching
    ``Bioactive`` row already exists, the response carries its URL and name.
    Otherwise the compound is looked up via CIR / PubChem and the response
    carries its names, InChIKey, structure image URL, CID and SMILES. When
    the lookup fails, the response is a JSON object with an ``error`` key.
    """
    cas_no = request.GET.get('cas_number')
    inchikey = request.GET.get('inchikey', '').strip()
    obj = None
    # A CAS number takes precedence over an InChIKey when both are supplied.
    if cas_no:
        obj = Bioactive.objects.filter(
            chemical_properties__synonyms__icontains=cas_no).first()
    elif inchikey:
        obj = Bioactive.objects.filter(inchikey__exact=inchikey).first()
    if obj:
        # Already in the database: just point the caller at the record.
        data = {
            'object_exists': obj.get_absolute_url(),
            'object_exists_name': str(obj),
        }
        return JsonResponse(data)
    try:
        iupac_name = None
        if cas_no:
            smiles = cirpy.query(cas_no, 'smiles')[0].value
            if '.' in smiles:
                # Multi-component SMILES: keep the first fragment longer
                # than five characters.
                smiles = [i for i in smiles.split('.') if len(i) > 5][0]
            pcp_query = pcp.get_compounds(smiles, 'smiles')[0]
            if not pcp_query.iupac_name:
                # PubChem gave no IUPAC name; fall back to CIR.
                iupac_name = cirpy.resolve(smiles, 'iupac_name', ['smiles'])
        else:
            pcp_query = pcp.get_compounds(inchikey, 'inchikey')[0]
            if not pcp_query.iupac_name:
                iupac_name = cirpy.resolve(inchikey, 'iupac_name', ['stdinchikey'])
        if not pcp_query.cid:
            # Treat a hit without a CID like "no result" (handled below).
            raise IndexError
    except (IndexError, pcp.BadRequestError):
        # NOTE(review): this message mentions a CAS number even when the
        # lookup was made by InChIKey.
        return JsonResponse({'error': 'No compound found for this CAS number'})
    data = {
        'chemical_name': Bioactive.scrape_compound_name(pcp_query.cid),
        'iupac_name': pcp_query.iupac_name or iupac_name or 'n/a',
        'inchikey': pcp_query.inchikey,
        'structure_url': 'https://pubchem.ncbi.nlm.nih.gov/image/imgsrv.fcgi?cid={}&t=l'.
        format(pcp_query.cid),
        'hidden_cid': pcp_query.cid,
        'smiles': pcp_query.isomeric_smiles or pcp_query.canonical_smiles or '',
    }
    return JsonResponse(data)
def main():
    """Run main procedure."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("identifiers", nargs="+")
    args = parser.parse_args()
    atomnos = []
    atomcoords = []
    for identifier in args.identifiers:
        try:
            # First try the identifier as something CIR can turn into XYZ text.
            nos, _, coords = read_xyz(cirpy.resolve(identifier, "xyz"))
        except AttributeError:
            # Fallback: hand the identifier itself to read_xyz
            # (presumably a path to an XYZ file -- TODO confirm).
            nos, _, coords = read_xyz(identifier)
        # Keep the last entry returned by read_xyz, with coordinates shifted
        # so that their mean sits at the origin.
        atomnos.append(nos[-1])
        atomcoords.append(coords[-1] - np.mean(coords[-1], axis=0))
    # NOTE(review): curnos and curcoords are never updated inside the loop
    # below, and curnos is never read.
    curnos = atomnos[0]
    curcoords = atomcoords[0]
    for nos, coords in zip(atomnos[1:], atomcoords[1:]):
        # Bounding-box extents of the first structure and of this one.
        curdim = curcoords.max(axis=0) - curcoords.min(axis=0)
        extradim = coords.max(axis=0) - coords.min(axis=0)
        # Translate along the axis where the first structure is thinnest,
        # by half of each extent plus a fixed 2.83 offset.
        axis = curdim.argmin()
        v = np.zeros(3)
        v[axis] = (curdim[axis] + extradim[axis]) / 2 + 2.83
        coords = coords + v
    # NOTE(review): reconstructed from a whitespace-collapsed source; this
    # print is assumed to sit after the loop, so it emits only the last
    # (nos, coords) pair -- confirm against the original file.
    print(write_xyz(nos, coords))
def LoadMutagenicity():
    """Append-style import of the mutagenicity SMILES set into a new CSV.

    Reads the second column of ``constant.DATA + 'data.csv'`` as the set of
    SMILES already known, then walks ``./smiles_cas_N7090.smi``. Each
    compound's ``_Name`` property is resolved to SMILES through CIR, and every
    compound whose SMILES is not already known is written to
    ``new_data copy.csv`` as a ``name;smiles;property-'0'`` row.

    Compounds that raise ``AttributeError`` while being processed are
    reported and skipped.
    """
    print('Reading existing database')
    with open(constant.DATA + 'data.csv', newline='') as files:
        data = csv.reader(files, delimiter=';', quotechar=';')
        # A set gives a correct membership test regardless of row order;
        # np.searchsorted is only valid on sorted data, which is not
        # guaranteed here.
        known_smiles = {comp[1] for comp in data}

    print('Loading new data')
    suppl = Chem.SmilesMolSupplier('./smiles_cas_N7090.smi')
    with open('new_data copy.csv', 'w', newline='') as files:
        # writerow() expects a sequence of fields. Passing a pre-joined
        # string would emit one column per character, so write real fields
        # with ';' as the delimiter instead.
        f = csv.writer(files, delimiter=';')
        for compound in suppl:
            try:
                name = compound.GetProp('_Name')
                smile = str(cirpy.resolve(name, 'smiles'))
                row = [name, smile, str(compound.GetProp('0'))]
            except AttributeError as e:
                print(e)
                continue
            if smile in known_smiles:
                print('Skipped')
                continue
            f.writerow(row)
            print(';'.join(row))
def cas_to_smile(cas_id):
    """Return the SMILES for a CAS number, or '' when CIR has no answer.

    *cas_id* is converted to ``str`` and stripped of surrounding whitespace
    before the lookup.
    """
    resolved = cirpy.resolve(str(cas_id).strip(), "smiles")
    return '' if resolved is None else resolved
def parse_page(soup):
    """Extract CAS number, density, temperature and weight from a parsed page.

    Scans every ``<td>`` whose class list contains ``term2TD``. The element
    after the cell (``x.next``) is treated as the label and the one after that
    (``x.next.next``) as the value. The labels recognised are ``CAS No.``,
    ``Density``, ``TDENL`` and ``Molecular Wt.``.

    Args:
        soup: Parsed document exposing ``find_all("td")``.

    Returns:
        Tuple ``(cas, density, temperature, weight, smiles)``. Any field not
        present on the page is ``None``; a density of ``"NA"`` is also
        reported as ``None``. ``smiles`` is resolved from the CAS number via
        CIR and is ``None`` when no CAS number was found.
    """
    # Pre-initialise so a page missing a field yields None rather than
    # raising UnboundLocalError at the return statement.
    cas = density = temperature = weight = None
    for x in soup.find_all("td"):
        t = x.get("class")
        if t is None or "term2TD" not in t:
            continue
        label = x.next
        if "CAS No." in label:
            # Drop the first character of the value text.
            cas = x.next.next.text[1:]
        if "Density" in label:
            density = x.next.next.text
        if "TDENL" in label:
            temperature = x.next.next.text
        if "Molecular Wt." in label:
            weight = x.next.next.text
    if density == "NA":
        density = None
    # Only query CIR when there is a CAS number to look up.
    smiles = cirpy.resolve(cas, "smiles") if cas is not None else None
    return (cas, density, temperature, weight, smiles)
def set_chem_data(self):
    """Enrich every entry of ``self.drugs_data`` with PubChem data.

    For each dict the compound is fetched from PubChem by ``cid_number`` and
    the keys ``smiles``, ``inchikey``, ``iupac_name`` (falling back to CIR
    when PubChem has none) and ``chemical_properties`` are set. For
    multi-component SMILES (containing ``.``), ``cid_number_2`` is set to the
    CID of the first component.

    Entries whose lookup raises ``IndexError``, ``TypeError`` or
    ``pcp.BadRequestError`` are removed from ``self.drugs_data``.

    Returns:
        ``self.drugs_data`` (the same list object, filtered in place).
    """
    # Iterate over a snapshot. Removing from the list being iterated would
    # silently skip the element that follows each removed one.
    for d in list(self.drugs_data):
        try:
            pcp_query = pcp.get_compounds(d['cid_number'], 'cid')[0]
            smiles = pcp_query.canonical_smiles
            d.update({
                'smiles': smiles,
                'inchikey': pcp_query.inchikey,
                'iupac_name': pcp_query.iupac_name
                or cirpy.resolve(smiles, 'iupac_name', ['smiles']),
                'chemical_properties': dict_from_query_object(
                    smiles, pcp_query, additional=True),
            })
            components = smiles.split('.')
            if len(components) > 1:
                d.update({
                    'cid_number_2': pcp.get_compounds(components[0], 'smiles')[0].cid
                })
        except (IndexError, TypeError, pcp.BadRequestError):
            self.drugs_data.remove(d)
    return self.drugs_data
def LoadMutagenicity():
    """Import mutagenicity compounds that are not yet in the database.

    The second column of ``constant.DATA + 'data.csv'`` is loaded as the set
    of known SMILES. Every compound in ``./smiles_cas_N7090.smi`` is then
    resolved to SMILES through CIR (using its ``_Name`` property) and, unless
    that SMILES is already known, written to ``new_data copy.csv`` as
    ``name;smiles;property-'0'``.

    A compound that raises ``AttributeError`` is printed and skipped.
    """
    print('Reading existing database')
    with open(constant.DATA + 'data.csv', newline='') as files:
        data = csv.reader(files, delimiter=';', quotechar=';')
        # Membership must not depend on row order: np.searchsorted assumes a
        # sorted array, which the CSV does not guarantee. Use a set instead.
        existing = set()
        for comp in data:
            existing.add(comp[1])

    print('Loading new data')
    suppl = Chem.SmilesMolSupplier('./smiles_cas_N7090.smi')
    with open('new_data copy.csv', 'w', newline='') as files:
        # csv.writer.writerow takes a list of fields; a joined string would
        # be split into single-character columns.
        f = csv.writer(files, delimiter=';')
        for compound in suppl:
            try:
                label = compound.GetProp('_Name')
                smile = str(cirpy.resolve(label, 'smiles'))
                fields = [label, smile, str(compound.GetProp('0'))]
            except AttributeError as e:
                print(e)
                continue
            if smile in existing:
                print('Skipped')
                continue
            f.writerow(fields)
            print(';'.join(fields))
def parse_cas(self):
    """Build a Species from ``self.cas`` using CIR's ``cas_number`` resolver.

    Raises:
        ConversionError: if the CAS number cannot be resolved to SMILES.
    """
    resolved = cirpy.resolve(self.cas, 'smiles', ['cas_number'])
    if resolved is not None:
        return Species().fromSMILES(resolved)
    raise ConversionError(
        'Could not resolve CAS number for species {}.'.format(
            self.prime_id))
def query_inchi(chem):
    """Returns the stdinchi of the chem via cirpy.

    On ``urllib2.URLError`` the lookup is retried after a one-second pause,
    for as long as the error persists.
    """
    # A loop replaces the recursive retry, which discarded the retried
    # result (there was no ``return`` on the recursive call) and could hit
    # the recursion limit during a long outage.
    while True:
        print('Query for inchi')
        try:
            return cirpy.resolve(chem, 'stdinchi')
        except urllib2.URLError:
            print('Sleeping for inchikey')
            time.sleep(1)
def smiles2stdinchikey(smiles):
    """Return the standard InChIKey for a SMILES string via CIR.

    ``None`` is passed through unchanged without querying CIR.
    """
    # Imported at call time, before the None check.
    import cirpy
    import queryDevice
    if smiles is not None:
        return cirpy.resolve(smiles, 'stdinchikey')
    return None
def fetch_name(s):
    """
    Look up the IUPAC name of a structure through CIR.

    The input format is detected with ``get_format``. SMILES and InChI strings
    are resolved directly; for an ``xyz`` input the structure is first turned
    into an InChI via ``get_mol(s).write('inchi')``. Any other format gives
    ``None``.

    Requires the cirpy module and an internet connection.
    Example: ``fetch_name('C=O')`` is documented to give ``FORMALDEHYDE``.
    """
    import cirpy
    detected = get_format(s)
    if detected == 'smi':
        return cirpy.resolve(s,'iupac_name',resolvers=['smiles'])
    if detected == 'inchi':
        return cirpy.resolve(s,'iupac_name',resolvers=['inchi'])
    if detected == 'xyz':
        inchi_text = get_mol(s).write('inchi').strip()
        return cirpy.resolve(inchi_text,'iupac_name',resolvers=['inchi'])
    return None
def cas_odor_url(url_chemical):
    """Scrape CAS number, odor descriptors and SMILES from a chemical page.

    The page is fetched with ``request_func``. One hard-coded URL is treated
    as "no page" and yields a dict whose values are all ``'No Page'``.

    For every other URL:
      * the CAS number is the text of the first ``td.radw11`` inside the first
        ``table.cheminfo`` (``'No Cas'`` when that cell is absent);
      * descriptors are collected from every ``td`` whose first class is
        ``radw5`` and whose text contains ``Odor Description``, passed through
        ``descriptor_formatter``, flattened and de-duplicated;
      * the SMILES is resolved from the CAS number via CIR
        (``'No Smiles'`` when CIR returns nothing).

    Returns:
        Dict with keys ``cas_number``, ``descriptors``, ``smile_string`` and
        ``page``.
    """
    page = request_func(url_chemical)
    if url_chemical == 'http://www.thegoodscentscompany.com/data/rw1109421.html':
        print('No page')
        return {'cas_number': 'No Page', 'descriptors': 'No Page',
                'smile_string': 'No Page', 'page': 'No Page'}

    # Look the CAS cell up once instead of walking the DOM twice.
    cas_cell = page.find('table', 'cheminfo').find('tbody').find('td', 'radw11')
    cas_n = cas_cell.text if cas_cell is not None else 'No Cas'

    descriptor_list = []
    for table in page.find_all('table', class_='cheminfo'):
        for cell in table.find_all('td'):
            if cell.has_attr('class') and cell.attrs['class'][0] == 'radw5':
                text = cell.get_text()
                if 'Odor Description' in text:
                    cleaned = text.replace('Odor Description:', '').lower()
                    descriptor_list.append(descriptor_formatter(cleaned))
    descriptor_list_final = list(set(itertools.chain(*descriptor_list)))

    # The original issued every CIR request twice (once to test, once to
    # fetch) in two identical branches; a single request gives the same value.
    smiles_str = cirpy.resolve(cas_n, 'smiles') or 'No Smiles'

    return {'cas_number': cas_n, 'descriptors': descriptor_list_final,
            'smile_string': smiles_str, 'page': url_chemical}
def find_smiles(cas):
    """Look up the SMILES representation of a compound with CirPy.

    Args:
        cas (str): CAS number in its original format.

    Returns:
        Whatever ``cirpy.resolve(cas, 'smiles')`` returns.
    """
    smiles = cirpy.resolve(cas, 'smiles')
    return smiles
def resolve_structure(compound):
    """Resolve *compound* to SMILES with CIRpy.

    If the resolver cannot be reached (``URLError``), a warning is logged and
    *compound* is returned unchanged.
    """
    try:
        return cirpy.resolve(compound, 'smiles')
    except URLError:
        log.warning(
            'Cannot connect to Chemical Identify Resolver - chemical names may not be resolved.'
        )
    return compound
def test_cml(self):
    """Aspirin resolved as CML parses to a list holding exactly one molecule."""
    namespace = '{http://www.xml-cml.org/schema/cml2/core}'
    root = etree.fromstring(resolve('Aspirin', 'cml'))
    self.assertEqual(root.tag, namespace + 'list')
    molecules = root.findall('.//' + namespace + 'molecule')
    self.assertEqual(len(molecules), 1)
def query_smiles(chem):
    """Return the SMILES of *chem* via cirpy.

    On ``urllib2.URLError`` the lookup is retried after a one-second pause,
    for as long as the error persists. Any other exception is printed and
    ``None`` is returned.
    """
    # A loop replaces the recursive retry, which discarded the retried
    # result (there was no ``return`` on the recursive call).
    while True:
        print('Query for smiles')
        try:
            return cirpy.resolve(chem, 'smiles')
        except urllib2.URLError:
            print('Sleeping for smiles')
            time.sleep(1)
        except Exception as e:
            print(e)
            return None
def canonicalize_smiles(result):
    """Replace ``result.smiles`` with the SMILES CIR returns for it.

    Nothing is looked up when ``result.smiles`` is falsy, and the value is
    kept when CIR returns nothing. The same *result* object is returned.
    """
    # Run the current SMILES through NCI CIR
    print('SMILES before cirpy: %s' % result.smiles)
    original = result.smiles
    if original:
        resolved = cirpy.resolve(original, 'smiles')
        if resolved:
            result.smiles = resolved
    print('SMILES after cirpy: %s' % result.smiles)
    return result
def fetch_smiles(s):
    """
    Resolve a chemical name to its SMILES string through CIR.

    Requires the cirpy module and an internet connection.
    >>> fetch_smiles('methane')
    'C'
    """
    import cirpy
    if not cirpy:
        return None
    return cirpy.resolve(s,'smiles')
def resolve_via_cirpy(identifier, target, source):
    """Convert *identifier* from *source* to *target* via CIR, with caching.

    A value found in ``caches[source][target]`` is returned directly.
    Otherwise CIR is queried and the answer is stored in that cache. The
    resolver hint is ``cas_number`` for a ``cas`` source, the OPSIN/CIR name
    resolvers for a ``name`` source, and the source name itself otherwise.

    Raises:
        CirpyError: code 504 when CIR answers with HTTP 504 or 408, code 500
            for any other ``HTTPError``.
    """
    try:
        cache = caches[source][target]
        converted = cache.get(identifier)
        if converted is not None:
            return converted
        if source == 'cas':
            hint = 'cas_number'
        else:
            hint = source
        if hint == 'name':
            hints = ['name_by_opsin', 'name_by_cir']
        else:
            hints = [hint]
        converted = cirpy.resolve(identifier, target, hints)
        cache.set(identifier, converted)
        return converted
    except HTTPError as err:
        if err.code in (504, 408):
            raise CirpyError(504, "Timeout while waiting for identifier resolution service")
        raise CirpyError(500, "HTTPError while communicating with identifier resolution service" + err.reason)
def cactus_search(comp_name, type):
    """Resolve *comp_name* through CIR and post-process two result types.

    * ``stdinchikey``: the ``InChIKey=`` prefix is removed.
    * ``names``: the synonyms accepted by ``get_relevant_synonym`` are
      returned as one string, each preceded by ``;``.

    Any other type, or a falsy CIR result, is returned as CIR gave it.
    """
    outcome = cirpy.resolve(comp_name, type)
    if not outcome:
        return outcome
    if type == 'stdinchikey':
        return outcome.replace('InChIKey=', '')
    if type == 'names':
        return "".join(
            ';' + candidate
            for candidate in outcome
            if get_relevant_synonym(candidate)
        )
    return outcome
def fetch_IUPAC_name(s):
    """
    Return IUPAC name for a given smiles or inchi string.

    The input format is detected with ``get_format``; ``xyz`` input is first
    converted to InChI via ``get_mol(s).write('inchi')``. Any other format
    gives ``None``.

    Requires cirpy module and internet connection. Returns ``None`` when
    cirpy is not installed (see http://cirpy.readthedocs.io/).
    >>> print(fetch_IUPAC_name('C=O'))
    FORMALDEHYDE
    """
    try:
        import cirpy
    except ImportError:
        # Only a missing module is expected here; a bare ``except`` would
        # also swallow KeyboardInterrupt and unrelated errors.
        return None
    frm = get_format(s)
    if frm == 'smi':
        name = cirpy.resolve(s, 'iupac_name', resolvers=['smiles'])
    elif frm == 'inchi':
        name = cirpy.resolve(s, 'iupac_name', resolvers=['inchi'])
    elif frm == 'xyz':
        mol = get_mol(s)
        name = cirpy.resolve(mol.write('inchi').strip(), 'iupac_name', resolvers=['inchi'])
    else:
        name = None
    return name
def fetch_inchi(s):
    """
    Returns the InChI string for a given chemical name.

    Requires cirpy module and internet connection. When cirpy is not
    installed, the marker string ``'requires_cirpy'`` is returned instead.
    >>> fetch_inchi('methane')
    'InChI=1/CH4/h1H4'
    """
    try:
        import cirpy
    except ImportError:
        # Return the marker right away. Falling through to use ``cirpy``
        # after a failed import raises UnboundLocalError.
        return 'requires_cirpy'
    return cirpy.resolve(s, 'inchi')
def get_smiles(com, user=None):
    """Return the SMILES strings known or derivable for compound *com*.

    SMILES identifiers already attached to ``com.identifiers`` are returned
    as-is. Otherwise SMILES are derived, in order, from ChemSpider
    identifiers (only when *user* is given), ChEBI identifiers and CAS
    numbers (via CIR). Each newly found SMILES is also added to *com* with
    ``add_identifier``. If none of these yields anything, KEGG identifiers
    are used as a last resort.

    NOTE(review): reconstructed from a whitespace-collapsed source; the
    nesting shown here is an assumption -- confirm against the original file.
    With this nesting the function returns None implicitly when no KEGG
    identifier exists either.
    """
    smiles_array = []
    # 1. SMILES that are already stored on the compound.
    for iden in gnomics.objects.auxiliary_files.identifier.filter_identifiers(com.identifiers, ["smiles"]):
        if iden["identifier"] not in smiles_array:
            smiles_array.append(iden["identifier"])
    if smiles_array:
        return smiles_array
    # Identifiers already processed, so each one is only converted once.
    ids_completed = []
    # 2. ChemSpider (needs a user).
    for iden in gnomics.objects.auxiliary_files.identifier.filter_identifiers(com.identifiers, ["chemspider", "chemspider id", "chemspider identifier", "cs id", "csid"]):
        if iden["identifier"] not in ids_completed and user is not None:
            ids_completed.append(iden["identifier"])
            for cs_com in gnomics.objects.compound.Compound.chemspider_compound(com, user):
                if cs_com.smiles not in smiles_array:
                    gnomics.objects.compound.Compound.add_identifier(com, identifier = cs_com.smiles, language = None, identifier_type = "SMILES", source = "ChemSpider")
                    smiles_array.append(cs_com.smiles)
        elif iden["identifier"] not in ids_completed and user is None:
            ids_completed.append(iden["identifier"])
            print("Cannot use ChemSpider conversion when user is None. \nPlease create and pass a valid user with a ChemSpider security token to this method.")
    # 3. ChEBI.
    for iden in gnomics.objects.auxiliary_files.identifier.filter_identifiers(com.identifiers, ["chebi", "chebi id", "chebi identifier"]):
        if iden["identifier"] not in ids_completed:
            ids_completed.append(iden["identifier"])
            for sub_com in gnomics.objects.compound.Compound.chebi_entity(com):
                if sub_com.get_smiles() not in smiles_array:
                    gnomics.objects.compound.Compound.add_identifier(com, identifier = sub_com.get_smiles(), language = None, identifier_type = "SMILES", source = "ChEBI")
                    smiles_array.append(sub_com.get_smiles())
    # 4. CAS numbers, resolved through CIR.
    for iden in gnomics.objects.auxiliary_files.identifier.filter_identifiers(com.identifiers, ["cas", "cas registry", "cas registry number", "cas rn"]):
        if iden["identifier"] not in ids_completed:
            ids_completed.append(iden["identifier"])
            smiles = cirpy.resolve(iden["identifier"], "smiles")
            # Skip unresolved lookups (None, or the literal text "None").
            if smiles not in smiles_array and smiles is not None and smiles != "None":
                gnomics.objects.compound.Compound.add_identifier(com, identifier = smiles, language = None, identifier_type = "SMILES", source = "CIR")
                smiles_array.append(smiles)
    if smiles_array:
        return smiles_array
    # 5. Last resort: call chebi_id for the first KEGG identifier, then
    # recompute the SMILES through Compound.smiles.
    for iden in gnomics.objects.auxiliary_files.identifier.filter_identifiers(com.identifiers, ["kegg compound", "kegg compound id", "kegg compound identifier", "kegg", "kegg compound accession", "kegg id", "kegg identifier", "kegg accession"]):
        gnomics.objects.compound.Compound.chebi_id(com)
        return gnomics.objects.compound.Compound.smiles(com)
def fetch_inchi(s):
    """
    Returns the InChI string for a given chemical name.

    Requires cirpy module and internet connection. Returns ``None`` when
    cirpy is not installed (see http://cirpy.readthedocs.io/).
    >>> fetch_inchi('methane')
    'InChI=1/CH4/h1H4'
    """
    try:
        import cirpy
    except ImportError:
        # Only a missing module is expected here; a bare ``except`` would
        # also swallow KeyboardInterrupt and unrelated errors.
        return None
    return cirpy.resolve(s, 'inchi')
def fetch_smiles(s):
    """
    Returns the smiles string for a given chemical name.

    Requires cirpy module and internet connection. Returns ``None`` when
    cirpy is not installed (see http://cirpy.readthedocs.io/).
    >>> fetch_smiles('methane')
    'C'
    """
    try:
        import cirpy
    except ImportError:
        # Only a missing module is expected here; a bare ``except`` would
        # also swallow KeyboardInterrupt and unrelated errors.
        return None
    return cirpy.resolve(s, 'smiles')
def get_standard_inchi_key(com, user=None):
    """Return the standard InChIKeys known or derivable for compound *com*.

    Keys already attached to ``com.identifiers`` are returned as-is.
    Otherwise keys are derived from ChemSpider identifiers (only when *user*
    is given), CAS numbers (via CIR) and ChEMBL identifiers. Each newly found
    key is also added to *com* with ``add_identifier``.

    Args:
        com: Compound object exposing ``identifiers``.
        user: User object passed through to the ChemSpider lookup; ChemSpider
            conversion is skipped, with a printed notice, when it is None.

    Returns:
        List of standard InChIKey strings (possibly empty).
    """
    filter_identifiers = gnomics.objects.auxiliary_files.identifier.filter_identifiers
    compound_api = gnomics.objects.compound.Compound

    inchi_key_array = []
    for iden in filter_identifiers(com.identifiers, ["standard inchi key", "standard iupac international chemical id key", "standard iupac international chemical identifier key", "stdinchikey"]):
        if iden["identifier"] not in inchi_key_array:
            inchi_key_array.append(iden["identifier"])
    if inchi_key_array:
        return inchi_key_array

    # Identifiers already processed, so each one is only converted once.
    ids_completed = []

    for iden in filter_identifiers(com.identifiers, ["chemspider", "chemspider id", "chemspider identifier", "cs id", "csid"]):
        if iden["identifier"] not in ids_completed and user is not None:
            ids_completed.append(iden["identifier"])
            for sub_com in compound_api.chemspider_compound(com, user):
                temp_inchi_key = sub_com.stdinchikey
                if temp_inchi_key not in inchi_key_array:
                    compound_api.add_identifier(com, identifier = temp_inchi_key, identifier_type = "Standard InChI Key", language = None, source = "ChemSpider")
                    inchi_key_array.append(temp_inchi_key)
        elif user is None:
            print("Cannot use ChemSpider conversion when user is None. \nPlease create and pass a valid user with a ChemSpider security token to this method.")

    for iden in filter_identifiers(com.identifiers, ["cas", "cas registry", "cas registry number", "cas rn"]):
        if iden["identifier"] not in ids_completed:
            ids_completed.append(iden["identifier"])
            temp_inchi_key = cirpy.resolve(iden["identifier"], "stdinchikey")
            # CIR returns None for an unresolved identifier; do not record
            # that as a key.
            if temp_inchi_key is not None and temp_inchi_key not in inchi_key_array:
                compound_api.add_identifier(com, identifier = temp_inchi_key, identifier_type = "Standard InChI Key", language = None, source = "CIR")
                inchi_key_array.append(temp_inchi_key)

    for iden in filter_identifiers(com.identifiers, ["chembl", "chembl compound", "chembl compound id", "chembl compound identifier", "chembl id", "chembl identifier"]):
        if iden["identifier"] not in ids_completed:
            ids_completed.append(iden["identifier"])
            temp_inchi_key = compound_api.chembl_molecule(com)[0]["molecule_structures"]["standard_inchi_key"]
            # The original tested an undefined name ``temp_inchi`` here,
            # which raised NameError on the first ChEMBL identifier.
            if temp_inchi_key not in inchi_key_array and temp_inchi_key is not None and temp_inchi_key != "None":
                compound_api.add_identifier(com, identifier = temp_inchi_key, identifier_type = "Standard InChI Key", language = None, source = "ChEMBL")
                inchi_key_array.append(temp_inchi_key)

    return inchi_key_array
def resolve_via_cirpy(identifier, target, source):
    """Resolve *identifier* (of kind *source*) into *target* using CIR.

    Results are memoised in ``caches[source][target]``: a cached non-None
    value is returned without contacting CIR, and every fresh answer is
    stored before being returned.

    Raises:
        CirpyError: with code 504 for HTTP 504/408 responses from CIR, and
            with code 500 for every other ``HTTPError``.
    """
    try:
        store = caches[source][target]
        answer = store.get(identifier)
        if answer is None:
            # Map our source names onto CIR resolver names.
            resolver = 'cas_number' if source == 'cas' else source
            if resolver == 'name':
                resolvers = ['name_by_opsin', 'name_by_cir']
            else:
                resolvers = [resolver]
            answer = cirpy.resolve(identifier, target, resolvers)
            store.set(identifier, answer)
        return answer
    except HTTPError as err:
        timed_out = err.code == 504 or err.code == 408
        if timed_out:
            raise CirpyError(
                504, "Timeout while waiting for identifier resolution service")
        raise CirpyError(
            500,
            "HTTPError while communicating with identifier resolution service"
            + err.reason)
def resolveID(file, column):  # TODO Consider incorporation of this function in load_csv()
    """Resolve chemical identifiers to SMILES using the cirpy package (NCI CIR).

    Args:
        file: Either a pandas DataFrame or the path (str) of a CSV file.
        column: Header of the column holding the identifiers to resolve.

    Returns:
        DataFrame with an added ``smiles`` column, after ``dropna()`` -- i.e.
        rows containing a missing value in any column (including an unresolved
        SMILES) are dropped. A DataFrame passed in is modified in place by the
        addition of the ``smiles`` column.

    Raises:
        TypeError: if *file* is neither a DataFrame nor a str.
    """
    if isinstance(file, pd.core.frame.DataFrame):
        df = file
    elif isinstance(file, str):
        df = pd.read_csv(file)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``df``.
        raise TypeError(
            'file must be a pandas DataFrame or a CSV path, got %s'
            % type(file).__name__)

    for index in df.index:
        identifier = df.loc[index, column]
        df.loc[index, 'smiles'] = cirpy.resolve(identifier, 'smiles')

    # NOTE(review): dropna() also drops rows that have missing values in
    # columns other than 'smiles' -- confirm that is intended.
    return df.dropna()
def __init__(self, cas_or_aa, min_atoms=6):
    """
    Set up the object from CAS numbers OR amino acid names.

    Requires openmoltools.openeye and cirpy.

    Arguments
        cas_or_aa (list of strings) CAS numbers or amino acid names
    Optional Arguments
        min_atoms (int) minimum number of atoms for a substructure match
            (default: 6)

    Attributes set here:
        self.cas_or_aa (list of strings) the input identifiers
        self.smiles_strings (list of strings) SMILES resolved via CIR, one per input
        self.ligands (list of OEMol) charged OpenEye molecules, one per input
        self.title (string) first identifier followed by "_and_analogs"
        self.min_atoms (int) minimum number of common atoms for a substructure match
    The remaining attributes start out as None or as empty lists.
    """
    self.cas_or_aa = cas_or_aa
    self.smiles_strings = []
    self.ligands = []
    for identifier in cas_or_aa:
        resolved = cirpy.resolve(identifier, 'smiles')
        self.smiles_strings.append(resolved)
        molecule = openeye.smiles_to_oemol(resolved)
        self.ligands.append(openeye.get_charges(molecule, strictStereo=False))
    self.title = self.cas_or_aa[0] + "_and_analogs"
    self.min_atoms = min_atoms
    # Filled in by later processing steps.
    self.common_substructure = None
    self.dual_topology = None
    self.each_molecule_N = []
    self.mapping_dictionaries = []
    self.pdb_filename = None
    self.ffxml_filename = None
def __init__(self, cas_or_aa, min_atoms=6):
    """
    Initialise from a list of CAS numbers OR amino acid names.

    Requires openmoltools.openeye and cirpy.

    Arguments
        cas_or_aa (list of strings) CAS number or amino acid name per molecule
    Optional Arguments
        min_atoms (int) minimum number of atoms for a substructure match
            (default: 6)

    Creates:
        self.cas_or_aa (list of strings) identifiers of the molecules to combine
        self.smiles_strings (list of strings) SMILES for each molecule, from CIR
        self.ligands (list of OEMol) OpenEye molecule (with charges) for each molecule
        self.title (string) identifier for the group: first input + "_and_analogs"
        self.min_atoms (int) minimum common atoms for a substructure match
    All other attributes are initialised to None or to an empty list.
    """
    self.cas_or_aa = cas_or_aa
    self.smiles_strings = []
    self.ligands = []
    for entry in cas_or_aa:
        entry_smiles = cirpy.resolve(entry,'smiles')
        self.smiles_strings.append(entry_smiles)
        uncharged = openeye.smiles_to_oemol(entry_smiles)
        charged = openeye.get_charges(uncharged, strictStereo=False)
        self.ligands.append(charged)
    self.title = "%s_and_analogs" % "" + self.cas_or_aa[0] + "_and_analogs" if False else self.cas_or_aa[0]+"_and_analogs"
    self.min_atoms = min_atoms
    # Placeholders populated later.
    self.common_substructure = None
    self.dual_topology = None
    self.each_molecule_N = []
    self.mapping_dictionaries = []
    self.pdb_filename = None
    self.ffxml_filename = None
def find_iupac_names():
    """Look up an IUPAC name through CIR for every chemical in the data frame.

    Chemicals come from the ``chemical_`` column of ``read_df()``, ordered by
    descending group size. Progress is printed for every lookup, and the
    function sleeps one second between lookups.

    Returns a list with one entry per non-empty chemical: whatever
    ``cirpy.resolve`` returned for it (falsy when nothing was found).

    NOTE(review): reconstructed from a whitespace-collapsed source; the
    statements after the if/else are assumed to run once per chemical.
    """
    df_ = read_df()
    # Distinct chemicals, most frequent first.
    chems = df_.groupby("chemical_").size().sort_values()[::-1].index.values
    iupac_names = []
    num_found = 0
    for i, chem in enumerate(chems):
        if not chem:
            continue
        print chem
        iupac = cirpy.resolve(chem, "iupac_name")
        if iupac:
            print "Found:", iupac
            num_found += 1
        else:
            print "Not found."
        iupac_names.append(iupac)
        # Running tally: names found so far / position in chems (1-based).
        print "{} / {}".format(num_found, i + 1)
        print ""
        time.sleep(1)
    return iupac_names
def add_structures(result):
    """Attach a ``smiles`` entry to every record that has resolvable names.

    Resolution happens in two passes over ``result['records']``:

    1. All names are written to a temporary file and converted in one OPSIN
       run (``app.config['OPSIN_PATH']``). The n-th output line belongs to the
       n-th name written.
    2. Records still without ``smiles`` are looked up in the local
       ``ChemDict`` table and, failing that, through NCI CIR. Every CIR answer
       (including a miss) is added to the DB session as a new ``ChemDict`` row.

    Args:
        result: Dict with a ``records`` list; each record may carry ``names``.

    Returns:
        The same *result* object, with records updated in place.
    """
    # Run OPSIN
    tf = tempfile.NamedTemporaryFile(delete=False)
    result_path = '%s.result' % tf.name
    try:
        with tf:
            for record in result['records']:
                for name in record.get('names', []):
                    tf.write(('%s\n' % name).encode('utf-8'))
        subprocess.call([
            app.config['OPSIN_PATH'], '--allowRadicals', '--wildcardRadicals',
            '--allowAcidsWithoutAcid', '--allowUninterpretableStereo',
            tf.name, result_path
        ])
        with open(result_path) as res:
            structures = [line.strip() for line in res]
        i = 0
        for record in result['records']:
            for name in record.get('names', []):
                # Guard against OPSIN producing fewer lines than names; a
                # missing line is treated as "not resolved".
                structure = structures[i] if i < len(structures) else ''
                if 'smiles' not in record and structure:
                    log.debug('Resolved with OPSIN: %s = %s', name, structure)
                    record['smiles'] = structure
                i += 1
    finally:
        # Always clean up both temporary files, even if OPSIN or the parsing
        # above failed part-way through.
        for path in (tf.name, result_path):
            if os.path.exists(path):
                os.remove(path)

    # For failures, use NCI CIR (with local cache of results)
    for record in result['records']:
        for name in record.get('names', []):
            if 'smiles' not in record:
                local_entry = ChemDict.query.filter_by(name=name).first()
                if local_entry:
                    log.debug('Resolved with local dict: %s = %s', name,
                              local_entry.smiles)
                    if local_entry.smiles:
                        record['smiles'] = local_entry.smiles
                else:
                    smiles = cirpy.resolve(
                        chem_normalize(name).encode('utf-8'), 'smiles')
                    log.debug('Resolved with CIR: %s = %s', name, smiles)
                    db.session.add(ChemDict(name=name, smiles=smiles))
                    if smiles:
                        record['smiles'] = smiles
    return result
def LoadAMES():
    """Import AMES compounds that are not yet in the database.

    The second column of ``constant.DATA + 'data.csv'`` is loaded as the set
    of known SMILES. Every record of ``./AMESdata.sdf`` is resolved to SMILES
    through CIR (using its ``IDNUMBER`` property) and, unless that SMILES is
    already known, written to ``new_data.csv`` as
    ``IDNUMBER;smiles;AMES_Activity``.
    """
    print('Reading existing database')
    with open(constant.DATA + 'data.csv', newline='') as files:
        data = csv.reader(files, delimiter=';', quotechar=';')
        # A set gives a correct membership test regardless of row order;
        # np.searchsorted is only valid on sorted data.
        known_smiles = {comp[1] for comp in data}

    print('Loading new data')
    suppl = Chem.SDMolSupplier('./AMESdata.sdf')
    with open('new_data.csv', 'w', newline='') as files:
        # writerow() expects a sequence of fields; a joined string would be
        # written one character per column.
        f = csv.writer(files, delimiter=';')
        for compound in suppl:
            if compound is None:
                # RDKit's SDMolSupplier yields None for records it cannot
                # parse; skip those instead of crashing on GetProp.
                print('Skipped')
                continue
            id_number = compound.GetProp('IDNUMBER')
            smile = str(cirpy.resolve(id_number, 'smiles'))
            row = [id_number, smile, compound.GetProp('AMES_Activity')]
            if smile in known_smiles:
                print('Skipped')
                continue
            f.writerow(row)
            print(';'.join(row))
def resolve(self):
    """Fill atoms, coordinates and masses from the CIR XYZ text of ``self.name``.

    The first line of the XYZ text gives the atom count, the second line is
    skipped, and each following atom line is split into an element symbol
    plus three coordinates. ``self.coord``, ``self.atom`` and ``self.mass``
    are appended to; ``self.natoms`` and ``self.frag`` are overwritten.
    """
    xyz_text = cirpy.resolve(self.name, 'xyz')
    print("Resolved chemical identification of %s" % (self.name))
    rows = xyz_text.split("\n")
    self.natoms = int(rows[0])
    self.frag = [0] * self.natoms
    # Atom records occupy rows 2 .. natoms + 1.
    last_row = min(len(rows), self.natoms + 2)
    for index in range(2, last_row):
        row = rows[index]
        position = [
            float(row.split()[1]),
            float(row.split()[2]),
            float(row.split()[3])
        ]
        self.coord.append(position)
        self.atom.append(row.split()[0])
        self.mass.append(element.ELEMENTS[self.atom[-1]].mass)
    return
def canonicalize_smiles(smiles, sanitize=True, iso=False, SLN=False):
    """Return the canonical SMILES of a compound, using RDKit.

    :arguments:
        smiles   -- (string) the compound; SMILES, or SLN when ``SLN`` is set
        sanitize -- (bool) sanitize the molecule before the final output
        iso      -- (bool) include isomeric data in the SMILES
        SLN      -- (bool) the input is in SLN format and is first converted
                    to SMILES through CIR

    :return: canonicalized SMILES
    """
    source_smiles = cirpy.resolve(smiles, "smiles") if SLN else smiles
    first_pass = Chem.MolToSmiles(
        Chem.MolFromSmiles(source_smiles), canonical=True, isomericSmiles=iso
    )
    mol = Chem.MolFromSmiles(first_pass)
    if sanitize:
        mol.UpdatePropertyCache(strict=False)
        mol = Chem.RemoveHs(
            mol, implicitOnly=False, updateExplicitCount=True, sanitize=True
        )
        Chem.SanitizeMol(
            mol, Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, catchErrors=False
        )
        AllChem.AssignStereochemistry(
            mol, cleanIt=True, force=True, flagPossibleStereoCenters=True
        )
    # Both the sanitized and the unsanitized path end with the same conversion.
    return Chem.MolToSmiles(mol, canonical=True, isomericSmiles=iso)
def download_cas_to_mol(molecule_cas, sanitize=True):
    """
    Fetch a molecule by CAS number, add hydrogens and embed a rough geometry.

    The structure is requested from CIR as ``sdf3000`` with ``get_3d=True``,
    parsed with ``sdbs_util.sdfstr_to_mol``, sanitized with all RDKit
    sanitize operations and embedded with ETKDG.

    The ``sanitize`` argument is not used by this function.
    """
    sdf_text = cirpy.resolve(molecule_cas, 'sdf3000', get_3d=True)
    molecule = Chem.AddHs(sdbs_util.sdfstr_to_mol(sdf_text))

    # An earlier revision (commented out in the original) randomly renumbered
    # the atoms at this point to work around an "INITROT -- Rotation about
    # 1 4 occurs more than once in Z-matrix" error, and set the "_Name"
    # property to the CAS number.

    # rough geometry
    Chem.SanitizeMol(molecule, sanitizeOps=Chem.SanitizeFlags.SANITIZE_ALL)
    AllChem.EmbedMolecule(molecule, AllChem.ETKDG())
    return molecule
def resolve_cached(x, rtype):
    """Resolve *x* into representation *rtype* via ``cirpy.resolve``.

    NOTE(review): no caching happens inside this function itself; presumably
    a decorator outside this view adds it -- confirm.
    """
    resolved = cirpy.resolve(x, rtype)
    return resolved
def MOL(SMILES):
    """Return the MOL-file text for a SMILES string, resolved through CIR.

    Returns:
        The MOL text from ``cirpy.resolve``, or ``None`` when CIR has no
        result or answers with an ``HTTPError`` (whose code is printed).
    """
    molfile = None
    try:
        molfile = cirpy.resolve(SMILES, "mol")
    except HTTPError as e:
        print("HTTPError: %s " % e.code)
    # The original computed molfile but never returned it, so callers always
    # received None.
    return molfile
import cirpy
import sys
import pandas as pd

# Download a PDB file for each of the first 24 molecule names in SAMPL4.csv.
# Each name is resolved to SMILES through CIR, the SMILES is resolved to PDB
# text, and the text is written to "<name without whitespace>.pdb".

#molecule = sys.argv[1]
df = pd.read_csv('SAMPL4.csv')
for molecule in df.NAME[:24]:
    smiles_code = cirpy.resolve(molecule, 'smiles')
    print("%s %s" % (smiles_code, molecule))
    if smiles_code is None:
        # Name could not be resolved; there is nothing to convert.
        continue
    pdbfile = cirpy.resolve(smiles_code, 'pdb')
    if pdbfile is None:
        # No PDB representation available; writing None would raise.
        continue
    oname = ''.join(molecule.split())
    # ``with`` closes the file even if the write fails.
    with open(oname + '.pdb', "w+") as file_output:
        file_output.write(pdbfile)
def iupac(SMILES):
    """Return the IUPAC name for a SMILES string, resolved through CIR.

    Returns:
        The name from ``cirpy.resolve``, or ``None`` when CIR has no result or
        answers with an ``HTTPError`` (whose code is printed).
    """
    name = None
    try:
        name = cirpy.resolve(SMILES, "iupac_name")
    except HTTPError as e:
        print("HTTPError: %s" % e.code)
    # The original computed name but never returned it, so callers always
    # received None.
    return name
def test_invalid_representation_resolve(self):
    """An unknown output representation must make resolve raise HTTPError."""
    self.assertRaises(HTTPError, resolve, 'Morphine', 'ogiuewrgpw')
def test_tnt_smiles(self):
    """TNT resolved by name must give the expected SMILES string."""
    expected = 'Cc1c(cc(cc1[N+]([O-])=O)[N+]([O-])=O)[N+]([O-])=O'
    actual = resolve('2,4,6-trinitrotoluene', 'smiles')
    self.assertEqual(actual, expected)
def test_cml(self):
    """The CML output for Aspirin is a CML list with a single molecule."""
    cml_ns = 'http://www.xml-cml.org/schema/cml2/core'
    document = etree.fromstring(resolve('Aspirin', 'cml'))
    self.assertEqual(document.tag, '{%s}list' % cml_ns)
    found = document.findall('.//{%s}molecule' % cml_ns)
    self.assertEqual(len(found), 1)
def test_pdb(self):
    """The PDB output for Aspirin contains the expected record types."""
    pdb_text = resolve('Aspirin', 'pdb')
    for record_type in ('HEADER', 'ATOM', 'CONECT'):
        self.assertIn(record_type, pdb_text)
def test_no_results_resolve(self):
    """resolve gives None for an input that matches nothing."""
    outcome = resolve('aruighaelirugaerg', 'inchi')
    self.assertEqual(outcome, None)
import cirpy
from simtk.openmm import app
import builder
import mdtraj as md
import pymbar
import scipy.interpolate
import os
import pandas as pd
import glob

# For every equilibrated PDB under ./data/equil, load the matching production
# trajectory and compute the mean density (with its standard error) and the
# static dielectric constant.

filenames = glob.glob("./data/equil/*.pdb")
# Split "<a>_<b>_<c>_<d>.pdb" into its underscore-separated fields.
filename_munger = lambda filename: os.path.splitext(os.path.split(filename)[1])[0].split("_")
# NOTE(review): the result of this lookup is discarded.
cirpy.resolve("71-23-8", "formula")
# NOTE(review): nothing is appended to ``data`` in the visible part of the script.
data = []
for pdb_filename in filenames:
    cas, n_molecules, temperature, stage = filename_munger(pdb_filename)
    print(cas, temperature)
    dcd_filename = "./data/production/%s_%s_%s_production.dcd" % (cas, n_molecules, temperature)
    try:
        traj = md.load(dcd_filename, top=pdb_filename)
    except IOError:
        # No (readable) production trajectory for this system; skip it.
        continue
    if traj.unitcell_lengths is None:
        # Skip trajectories that carry no unit cell lengths.
        continue
    rho = md.geometry.density(traj)
    # Discard the frames before the detected equilibration point t0.
    [t0, g, Neff] = pymbar.timeseries.detectEquilibration(rho)
    mu = rho[t0:].mean()
    # Standard error of the mean, using the effective sample count Neff.
    sigma = rho[t0:].std() * Neff ** -0.5
    forcefield = app.ForceField("./data/ffxml/%s.xml" % cas)
    system, charges = builder.build_simulation(traj, forcefield)
    # The temperature parsed from the file name is a string; convert it.
    temperature = float(temperature)
    dielectric = md.geometry.static_dielectric(traj, charges, temperature)
def test_alanine_smiles(self):
    """Resolving 'Alanine' to 'smiles' must give the expected string."""
    actual = resolve('Alanine', 'smiles')
    expected = 'C[C@H](N)C(O)=O'
    self.assertEqual(actual, expected)
def _resolved_text(value):
    """Render a CIR lookup result as text that is safe to concatenate.

    A lookup may yield None (nothing found) or a list (several results)
    instead of a single string; either would break string concatenation.
    """
    if value is None:
        return 'not available'
    if isinstance(value, (list, tuple)):
        return ', '.join(value)
    return value


def handleword(word):
    """Reply to the current mention with chemical information about ``word``.

    Uses module-level state: ``chemlist``, ``failedlist``, ``mention``,
    ``already_done``, ``appid``, ``server``, plus ``cirpy``, ``wap`` and
    ``footerGen``.

    Args:
        word: Candidate chemical name taken from the mention.

    Returns:
        1 when an informational reply was posted; 0 when the word is not in
        ``chemlist``, is in ``failedlist``, or no CAS number was found.
    """

    def send_apology():
        # The same message is used for known failures and new failures.
        mention.reply('I\'m pretty sure '+word+' is either a chemical compound or a portion of what you ment but I wasn\'t able to find a CAS number for it.\n\nIf this is a problem, please '+footerGen(mention.permalink))

    # Compare the lower-cased word to the chem list.
    if word.lower() not in chemlist:
        print(word + ' not in my list')
        return 0

    # Words that already failed are not looked up again.
    if word in failedlist:
        print(word + " is in my failed list... not trying it again")
        send_apology()
        return 0

    print("Resolving " + word)
    # Look up the CAS number, SMILES and formula of the compound from CIR.
    smiles = None
    formula = None
    try:
        cas_num = cirpy.resolve(word, 'cas')
        smiles = cirpy.resolve(word, 'smiles')
        formula = cirpy.resolve(word, 'formula')
    except Exception as err:
        # Best effort: a lookup error is treated as "no CAS number found".
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        print("Lookup error for " + word + ": " + str(err))
        cas_num = False

    if not cas_num:
        # A chemical from our list that did not resolve is recorded in the failed file.
        print("Failure on " + word)
        with open("failedat.txt", "a") as failedat:
            failedat.write(word + '\n')
        send_apology()
        return 0

    # A single result arrives as one string, several results as a list.
    # Normalising to a list keeps the grammar check from measuring the
    # length of a string (which made every reply say "numbers are").
    if isinstance(cas_num, (list, tuple)):
        cas_list = list(cas_num)
    else:
        cas_list = [cas_num]
    several = len(cas_list) > 1
    isare = "s are" if several else " is"
    formattedcas = ', '.join(cas_list)

    # Build the NIST WebBook links, one per CAS number.
    links = ["[" + cas + "](http://webbook.nist.gov/cgi/cbook.cgi?ID=" + cas + "&Units=SI)"
             for cas in cas_list]
    if several:
        # Each link keeps its trailing space, as in the multi-CAS reply format.
        link = ''.join(item + " " for item in links)
    else:
        link = links[0]

    # Wolfram|Alpha portion of the query.
    waeo = wap.WolframAlphaEngine(appid, server)
    query = waeo.CreateQuery(word)
    result = waeo.PerformQuery(query)
    waeqr = wap.WolframAlphaQueryResult(result)
    pods = waeqr.Pods()
    structureimage = ""
    propertiesimage = ""
    propertiestext = ""
    # See Wolfram|Alpha API docs for descriptions of pods.
    for pod in pods:
        title = str(pod[1][1])
        if title == "Structure diagram":
            structureimage = pod[6][3][1][1]
        elif title == "Basic properties":
            propertiesimage = pod[6][3][1][1]
            propertiestext = pod[6][3][5][1]

    # Build the reply.
    # NOTE(review): the original literal was broken by a line-breaking
    # character right after 'The chemical structure is '; that character is
    # dropped here. Confirm nothing else was intended at that position.
    mention.reply(
        'How about some more info on '+word
        + ':\n\nThe CAS number'+isare+' '+formattedcas
        + '\n\nThe chemical structure is '+_resolved_text(smiles)
        + '\n\nChemical formula: '+_resolved_text(formula)
        + '\n\nNIST WebBook '+link
        + '.\n\n\nThe following is from Wolfram|Alpha:\n\n[structure image]('+structureimage
        + ')\n\n[basic properties image]('+propertiesimage
        + ')\n\nBasic properties: '+propertiestext
        + '\n\n\n\nProvided by your friendly neighborhood Chemistry_Bot\n\n'+footerGen(mention.permalink)
    )

    # Add the mention to the already-done set.
    already_done.add(mention.id)
    print("Success on " + word)
    # Add the mention to the commented-on file.
    with open("commentedonchem.txt", "a") as commentedfile:
        commentedfile.write(mention.id + '\n')
    return 1
# Keep only the rows flagged in X_is_good.
X["is_good"] = X_is_good
X = X[X.is_good]

# Keep rows whose `components` value has exactly one "__"-separated entry.
X["n_components"] = X.components.apply(lambda x: len(x.split("__")))
X = X[X.n_components == 1]
X.dropna(axis=1, how='all', inplace=True)

# Keep rows whose atom count (count_atoms with its default arguments) is at most 10.
# NOTE(review): presumably the default counts heavy atoms only; confirm in thermoml_lib.
X["n_heavy_atoms"] = X.components.apply(lambda x: thermoml_lib.count_atoms(name_to_formula[x]))
X = X[X.n_heavy_atoms <= 10]
X.dropna(axis=1, how='all', inplace=True)

# Keep rows with at most 100 atoms of the kinds listed in which_atoms.
X["n_atoms"] = X.components.apply(lambda x: thermoml_lib.count_atoms(name_to_formula[x], which_atoms=which_atoms))
X = X[X.n_atoms <= 100]
X.dropna(axis=1, how='all', inplace=True)

X["smiles"] = X.components.apply(lambda x: cirpy.resolve(x, "smiles"))  # This should be cached via sklearn.
# Drop rows where the lookup found nothing. A notnull() mask replaces the
# `!= None` comparison and the `.ix` lookup (`.ix` was removed in pandas 1.0).
X = X[X.smiles.notnull()]

X["cas"] = X.components.apply(lambda x: thermoml_lib.get_first_entry(cirpy.resolve(x, "cas")))  # This should be cached via sklearn.
X = X[X.cas.notnull()]

X["Pressure, kPa"] = 101.325  # Assume everything within range is comparable.

# Mean and standard deviation of each experiment column per group.
group_keys = ["components", "smiles", "cas", "Temperature, K", "Pressure, kPa"]
grouped = X.groupby(group_keys)[experiments]
mu = grouped.mean()
sigma = grouped.std().dropna()
mu = mu.dropna()