def test_organic_user_db():
    """Load the example organic user database and cross-check its indexes.

    Checks that CAS, SMILES, InChI, InChI key and PubChem identifiers passed
    to ``CAS_from_any`` come back as the stored CAS number, that formulas are
    already in serialized form, that every CAS passes ``check_CAS``, that the
    stored molecular weights match the formulas to within 0.05, and that the
    CAS, SMILES, InChI and InChI key indexes all have the same size.
    """
    user_db_path = os.path.join(
        folder, 'chemical identifiers example user db.tsv')
    db = ChemicalMetadataDB(
        elements=False, main_db=None, user_dbs=[user_db_path])

    # A stored CAS string must resolve to itself
    for entry in db.CAS_index.values():
        assert CAS_from_any(entry.CASs) == entry.CASs

    # Check something was loaded
    assert len(db.CAS_index) > 100

    # Check smiles are unique / can lookup by smiles (blank keys are skipped)
    for smiles, entry in db.smiles_index.items():
        if smiles:
            assert CAS_from_any('smiles=' + smiles) == entry.CASs

    # Check formula is formatted right
    formulas_serialized = [
        entry.formula == serialize_formula(entry.formula)
        for entry in db.CAS_index.values()
    ]
    assert all(formulas_serialized)

    # Check CAS validity
    cas_checks = [check_CAS(entry.CASs) for entry in db.CAS_index.values()]
    assert all(cas_checks)

    # MW checker: recompute the weight from the formula and compare
    for entry in db.CAS_index.values():
        atoms = nested_formula_parser(
            serialize_formula(entry.formula), check=False)
        assert_allclose(molecular_weight(atoms), entry.MW, atol=0.05)

    # Index keys are converted with int_to_CAS before comparing
    for cas_key, entry in db.CAS_index.items():
        assert CAS_from_any('InChI=1S/' + entry.InChI) == int_to_CAS(cas_key)

    for cas_key, entry in db.CAS_index.items():
        assert CAS_from_any('InChIKey=' + entry.InChI_key) == int_to_CAS(cas_key)

    # Test the pubchem ids which aren't -1
    for cas_key, entry in db.CAS_index.items():
        if entry.pubchemid != -1:
            pubchem_query = 'PubChem=' + str(entry.pubchemid)
            assert CAS_from_any(pubchem_query) == int_to_CAS(cas_key)

    # Every index must hold the same number of entries
    entry_count = len(db.CAS_index)
    assert entry_count == len(db.smiles_index)
    assert entry_count == len(db.InChI_index)
    assert entry_count == len(db.InChI_key_index)
def test_database_formulas():
    """Check that every formula in ``pubchem_db`` is already serialized."""
    # Failures are things like 3He, C2D4Br2, C14H18N3NaO10[99Tc], [1H]I.
    # The fix here is adding an isotope db and making the formula parser
    # handle isotopes as well.
    # This worked until isotopes were added to formulas.
    formulas_serialized = [
        entry.formula == serialize_formula(entry.formula)
        for entry in pubchem_db.CAS_index.values()
    ]
    assert all(formulas_serialized)
def test_inorganic_db():
    """Load the inorganic user database and cross-check its indexes.

    Checks that CAS, formula and SMILES identifiers passed to
    ``CAS_from_any`` come back as the stored CAS number, that formulas are
    already in serialized form, that every CAS passes ``check_CAS``, and that
    the stored molecular weights match the formulas to within 0.05.
    """
    db_path = os.path.join(folder, 'Inorganic db.tsv')
    db = ChemicalMetadataDB(elements=False, main_db=None, user_dbs=[db_path])

    # Check CAS lookup
    for entry in db.CAS_index.values():
        assert CAS_from_any(entry.CASs) == entry.CASs

    # Try to check formula lookups
    # Formulas which are not unique by design
    ambiguous_formulas = {'H2MgO2', 'F2N2'}
    for formula, entry in db.formula_index.items():
        if formula not in ambiguous_formulas:
            assert CAS_from_any(formula) == entry.CASs

    # Check smiles are unique / can lookup by smiles (blank keys are skipped)
    for smiles, entry in db.smiles_index.items():
        if smiles:
            assert CAS_from_any('smiles=' + smiles) == entry.CASs

    # Check formula is formatted right
    formulas_serialized = [
        entry.formula == serialize_formula(entry.formula)
        for entry in db.CAS_index.values()
    ]
    assert all(formulas_serialized)

    # Check CAS validity
    cas_checks = [check_CAS(entry.CASs) for entry in db.CAS_index.values()]
    assert all(cas_checks)

    # MW checker: recompute the weight from the formula and compare
    for entry in db.CAS_index.values():
        atoms = nested_formula_parser(
            serialize_formula(entry.formula), check=False)
        assert_allclose(molecular_weight(atoms), entry.MW, atol=0.05)
def test_db_vs_ChemSep():
    """Cross-check ``pubchem_db`` against the ChemSep component list.

    The CAS numbers are checked, as are most of the chemical formulas. Some
    chemical structural formulas aren't supported by the current formula
    parser and are ignored; otherwise it is a very effective test.

    Do not try to optimize this function - it has already been tried and
    failed at. The time is only taken by the XML parse call. Even that was
    reduced by 80% by using the C-accelerated parser (historically imported
    as ``cElementTree``) instead of the pure-Python ``ElementTree``.
    """
    # xml.etree.cElementTree was removed in Python 3.9. Since Python 3.3
    # xml.etree.ElementTree picks up the C accelerator automatically, so
    # this import keeps the speed and works on every Python 3 version.
    import xml.etree.ElementTree as ET

    folder = os.path.join(os.path.dirname(__file__), 'Data')
    root = ET.parse(os.path.join(folder, 'chemsep1.xml')).getroot()

    data = {}
    for child in root:
        # Keep only the first occurrence of each tag of interest
        CAS = name = smiles = formula = None
        for i in child:
            tag = i.tag
            if CAS is None and tag == 'CAS':
                CAS = i.attrib['value']
            elif name is None and tag == 'CompoundID':
                name = i.attrib['value']
            elif smiles is None and tag == 'Smiles':
                smiles = i.attrib['value']
            elif formula is None and tag == 'StructureFormula':
                formula = i.attrib['value']

        if formula is not None:
            if '-' in formula:
                # Structural formulas containing dashes are not compared
                formula = None
            else:
                try:
                    formula = serialize_formula(formula)
                except Exception:
                    # Best effort: keep the raw text when it cannot be parsed
                    pass

        # Store the whole SMILES string (or None when missing/empty); the
        # previous ``smiles[0]`` kept only its first character.
        smiles = smiles or None

        data[CAS] = {'name': name, 'smiles': smiles, 'formula': formula}

    # Every ChemSep CAS must be found by a direct CAS search
    for CAS in data:
        hit = pubchem_db.search_CAS(CAS)
        assert hit.CASs == CAS

    # ... and must resolve to itself through the generic lookup
    for CAS in data:
        assert CAS_from_any(CAS) == CAS

    # Formulas must agree, apart from the known exceptions skipped below
    for CAS, d in data.items():
        f = d['formula']
        if f is None or f == '1,4-COOH(C6H4)COOH' or d['name'] == 'Air':
            continue
        assert pubchem_db.search_CAS(CAS).formula == f
def _search_chemical(ID, autoload):
    """Resolve a chemical identifier string through ``pubchem_db``.

    The identifier is stripped of surrounding whitespace and then tried, in
    order, as: a periodic-table entry, a CAS number, a prefixed identifier
    (``InChI=1S/``, ``InChI=1/``, ``InChIKey=``, ``PubChem=``, ``SMILES=``;
    prefixes are matched case-insensitively), a bare SMILES string, a
    chemical formula, a name (also with spaces and dashes removed, and in
    lower case), and finally the combined form ``'water (H2O)'``.

    Parameters
    ----------
    ID : str
        Identifier to look up.
    autoload : bool
        Forwarded to the ``pubchem_db`` search methods. When it is false and
        nothing was found, the search is repeated with ``autoload=True``
        before an error is raised.

    Returns
    -------
    object
        Whatever the matching ``pubchem_db`` search method returned.

    Raises
    ------
    ValueError
        If the identifier is not found in the database.
    """
    ID = ID.strip()
    ID_lower = ID.lower()

    if ID in periodic_table:
        # Special handling for homonuclear elements.
        # Search '1'> H, 'H'> H, monotomic CAS > H but "Hydrogen"> H2.
        # pubchem_db does not contain atomic numbers, so searching in the
        # periodic table is necessary.
        element = periodic_table[ID]
        if (ID in periodic_table._symbol_to_elements
                or ID in periodic_table._number_to_elements
                or ID in periodic_table._CAS_to_elements):
            return pubchem_db.search_CAS(element.CAS)
        return pubchem_db.search_CAS(element.CAS_standard)

    if check_CAS(ID):
        CAS_lookup = pubchem_db.search_CAS(ID, autoload)
        if CAS_lookup:
            return CAS_lookup
        # handle the case of synonyms
        CAS_alternate_lookup = pubchem_db.search_name(ID, autoload)
        if CAS_alternate_lookup:
            return CAS_alternate_lookup
        if not autoload:
            return search_chemical(ID, autoload=True)
        raise ValueError(
            'A valid CAS number (%s) was recognized, but is not in the database'
            % (ID))

    ID_len = len(ID)
    if ID_len > 9:
        inchi_search = None
        # normal upper case is 'InChI=1S/'
        if ID_lower.startswith('inchi=1s/'):
            inchi_search = ID[9:]
        elif ID_lower.startswith('inchi=1/'):
            inchi_search = ID[8:]
        if inchi_search:
            inchi_lookup = pubchem_db.search_InChI(inchi_search, autoload)
            if inchi_lookup:
                return inchi_lookup
            if not autoload:
                return search_chemical(ID, autoload=True)
            raise ValueError(
                'A valid InChI name (%s) was recognized, but it is not in the database'
                % (inchi_search))
        if ID_lower.startswith('inchikey='):
            inchi_key = ID[9:]
            inchi_key_lookup = pubchem_db.search_InChI_key(inchi_key, autoload)
            if inchi_key_lookup:
                return inchi_key_lookup
            if not autoload:
                return search_chemical(ID, autoload=True)
            # Report the key that was searched for, not the (falsy) result
            raise ValueError(
                'A valid InChI Key (%s) was recognized, but it is not in the database'
                % (inchi_key))
    if ID_len > 8:
        if ID_lower.startswith('pubchem='):
            pubchem_lookup = pubchem_db.search_pubchem(ID[8:], autoload)
            if pubchem_lookup:
                return pubchem_lookup
            if not autoload:
                return search_chemical(ID, autoload=True)
            raise ValueError(
                'A PubChem integer (%s) identifier was recognized, but it is not in the database.'
                % (ID[8:]))
    if ID_len > 7:
        if ID_lower.startswith('smiles='):
            smiles_lookup = pubchem_db.search_smiles(ID[7:], autoload)
            if smiles_lookup:
                return smiles_lookup
            if not autoload:
                return search_chemical(ID, autoload=True)
            raise ValueError(
                'A SMILES identifier (%s) was recognized, but it is not in the database.'
                % (ID[7:]))

    # Try the smiles lookup anyway
    # Parsing SMILES is an option, but this is faster
    # Pybel API also prints messages to console on failure
    smiles_lookup = pubchem_db.search_smiles(ID, autoload)
    if smiles_lookup:
        return smiles_lookup

    try:
        formula_query = pubchem_db.search_formula(
            serialize_formula(ID), autoload)
        # Only a single record counts as a match here
        if formula_query and isinstance(formula_query, ChemicalMetadata):
            return formula_query
    except Exception:
        # Best effort: the identifier simply is not a usable formula
        pass

    # Try the name as given first (the fastest), then permutations with
    # spaces and dashes removed, each also in lower case. Variants equal to
    # a string that was already tried are skipped.
    ID_no_space = ID.replace(' ', '')
    ID_no_space_dash = ID_no_space.replace('-', '')
    tried = set()
    for variant in (ID, ID_no_space, ID_no_space_dash):
        for candidate in (variant, variant.lower()):
            if candidate in tried:
                continue
            tried.add(candidate)
            name_lookup = pubchem_db.search_name(candidate, autoload)
            if name_lookup:
                return name_lookup

    # endswith() is safe on an empty string, unlike ID[-1]
    if ID.endswith(')') and '(' in ID:
        # Try to match in the form 'water (H2O)'
        first_identifier, second_identifier = ID[0:-1].split('(', 1)
        try:
            first_hit = search_chemical(first_identifier, autoload)
            second_hit = search_chemical(second_identifier, autoload)
            # Explicit comparison: an assert would be stripped under -O
            if first_hit == second_hit:
                return first_hit
        except Exception:
            # Either part failed to resolve; fall through to the final error
            pass

    if not autoload:
        return _search_chemical(ID, autoload=True)

    raise ValueError('Chemical name (%s) not recognized' % (ID))
def _search(self, ID): if not ID: raise ValueError('ID cannot be empty') ID = ID.replace('_', ' ') ID_lower = ID.lower() ID_len = len(ID) if ID_len > 9: inchi_search = False # normal upper case is 'InChI=1S/' if ID_lower[0:9] == 'inchi=1s/': inchi_search = ID[9:] elif ID_lower[0:8] == 'inchi=1/': inchi_search = ID[8:] if inchi_search: inchi_lookup = self.search_InChI(inchi_search) if inchi_lookup: return inchi_lookup raise LookupError( 'A valid InChI name was recognized, but it is not in the database' ) if ID_lower[0:9] == 'inchikey=': inchi_key_lookup = self.search_InChI_key(ID[9:]) if inchi_key_lookup: return inchi_key_lookup raise LookupError( 'A valid InChI Key was recognized, but it is not in the database' ) if ID_len > 8: if ID_lower[0:8] == 'pubchem=': pubchem_lookup = self.search_pubchem(ID[8:]) if pubchem_lookup: return pubchem_lookup raise LookupError( 'A PubChem integer identifier was recognized, but it is not in the database.' ) if ID_len > 7: if ID_lower[0:7] == 'smiles=': smiles_lookup = self.search_smiles(ID[7:]) if smiles_lookup: return smiles_lookup raise LookupError( 'A SMILES identifier was recognized, but it is not in the database.' ) # Permutate through various name options ID_search = spaceout_words(ID).lower() for name in (ID_lower, ID_search): name_lookup = self.search_name(name) if name_lookup: return name_lookup if check_CAS(ID): CAS_lookup = self.search_CAS(ID) if CAS_lookup: return CAS_lookup # Handle the case of synonyms CAS_alternate_loopup = self.search_name(ID) if CAS_alternate_loopup: return CAS_alternate_loopup raise LookupError( 'a valid CAS number was recognized, but its not in the database' ) try: formula = serialize_formula(ID) except: pass else: formula_query = self.search_formula(formula) if formula_query: return formula_query raise LookupError(f'chemical {repr(ID)} not recognized')