Пример #1
0
def test_dissociation_reactions():
    """Cross-check the electrolyte dissociation table held in ``df``.

    Verifies that each electrolyte appears once, that electrolytes and ions
    agree with the database lookups, that every reaction is charge neutral,
    and that the ions produced contain exactly the atoms of the electrolyte.
    """
    electrolyte_formulas = df['Electrolyte Formula']

    # Each electrolyte may have only a single dissociation reaction
    distinct_formulas = set(electrolyte_formulas.values.tolist())
    assert len(electrolyte_formulas) == len(distinct_formulas)

    # Electrolytes must agree with the database
    electrolyte_rows = zip(df['Electrolyte name'], df['Electrolyte CAS'],
                           df['Electrolyte Formula'])
    for _name, elec_CAS, elec_formula in electrolyte_rows:
        assert CAS_from_any(elec_CAS) == elec_CAS
        assert pubchem_db.search_CAS(elec_CAS).formula == serialize_formula(elec_formula)

    def check_ions(formulas, CASs, charges):
        # An ion must be found by both CAS and formula, and the database
        # record must carry the tabulated charge and formula.
        for ion_formula, ion_CAS, ion_charge in zip(formulas, CASs, charges):
            assert CAS_from_any(ion_CAS) == ion_CAS
            assert CAS_from_any(ion_formula) == ion_CAS
            record = pubchem_db.search_CAS(ion_CAS)
            assert record.charge == ion_charge
            assert record.formula == serialize_formula(ion_formula)

    # Anions first, then cations
    check_ions(df['Anion formula'], df['Anion CAS'], df['Anion charge'])
    check_ions(df['Cation formula'], df['Cation CAS'], df['Cation charge'])

    # Charges weighted by ion counts must cancel for every reaction
    charge_rows = zip(df['Anion charge'].tolist(), df['Anion count'].tolist(),
                      df['Cation charge'].tolist(), df['Cation count'].tolist())
    for an_charge, an_count, cat_charge, cat_count in charge_rows:
        assert an_charge*an_count + cat_charge*cat_count == 0

    # Atoms must be conserved: the ions, taken `count` times each, must add
    # up to exactly the atoms of the electrolyte
    reaction_rows = zip(df['Electrolyte Formula'].tolist(),
                        df['Cation formula'].tolist(),
                        df['Cation count'].tolist(),
                        df['Anion formula'].tolist(),
                        df['Anion count'].tolist())
    for elec, cat, cat_count, an, an_count in reaction_rows:
        expected_atoms = nested_formula_parser(elec)
        cation_atoms = nested_formula_parser(cat)
        anion_atoms = nested_formula_parser(an)

        produced_atoms = Counter()
        for ion_atoms, ion_count in ((cation_atoms, cat_count),
                                     (anion_atoms, an_count)):
            for _ in range(ion_count):
                produced_atoms.update(ion_atoms)
        assert dict(produced_atoms) == expected_atoms
Пример #2
0
def test_organic_user_db():
    """Sanity-check the example organic user database.

    Loads only the user db file (no elements, no main db) and checks that
    every record can be found again through each of its identifiers, that
    formulas are in serialized form, that CAS numbers pass `check_CAS`, that
    stored molecular weights match the formula, and that all indexes have the
    same number of entries.
    """
    user_db_path = os.path.join(folder,
                                'chemical identifiers example user db.tsv')
    db = ChemicalMetadataDB(elements=False, main_db=None,
                            user_dbs=[user_db_path])

    # Lookup by CAS string returns the same CAS string
    for record in db.CAS_index.values():
        assert CAS_from_any(record.CASs) == record.CASs

    # Something must actually have been loaded
    assert len(db.CAS_index) > 100

    # SMILES strings are unique and usable for lookup; blank ones are skipped
    for smiles, record in db.smiles_index.items():
        if smiles:
            assert CAS_from_any('smiles=' + smiles) == record.CASs

    # Stored formulas are already in serialized form
    for record in db.CAS_index.values():
        assert record.formula == serialize_formula(record.formula)

    # Every CAS number is valid
    for record in db.CAS_index.values():
        assert check_CAS(record.CASs)

    # Stored molecular weight agrees with the one computed from the formula
    for record in db.CAS_index.values():
        atoms = nested_formula_parser(serialize_formula(record.formula),
                                      check=False)
        assert_allclose(molecular_weight(atoms), record.MW, atol=0.05)

    # Lookup by InChI
    for cas_int, record in db.CAS_index.items():
        assert CAS_from_any('InChI=1S/' + record.InChI) == int_to_CAS(cas_int)

    # Lookup by InChI key
    for cas_int, record in db.CAS_index.items():
        assert CAS_from_any('InChIKey=' + record.InChI_key) == int_to_CAS(cas_int)

    # Lookup by PubChem id, for the records whose id is not -1
    for cas_int, record in db.CAS_index.items():
        if record.pubchemid == -1:
            continue
        pubchem_query = 'PubChem=' + str(record.pubchemid)
        assert CAS_from_any(pubchem_query) == int_to_CAS(cas_int)

    # All indexes hold the same number of entries as the CAS index
    record_count = len(db.CAS_index)
    assert record_count == len(db.smiles_index)
    assert record_count == len(db.InChI_index)
    assert record_count == len(db.InChI_key_index)
Пример #3
0
def test_inorganic_db():
    """Sanity-check the inorganic database file.

    Loads only 'Inorganic db.tsv' (no elements, no main db) and checks that
    records can be looked up by CAS, formula and SMILES, that formulas are in
    serialized form, that CAS numbers pass `check_CAS`, and that stored
    molecular weights match the formula.
    """
    inorganic_path = os.path.join(folder, 'Inorganic db.tsv')
    db = ChemicalMetadataDB(elements=False, main_db=None,
                            user_dbs=[inorganic_path])

    # Lookup by CAS string returns the same CAS string
    for record in db.CAS_index.values():
        assert CAS_from_any(record.CASs) == record.CASs

    # Lookup by formula, skipping formulas which are not unique by design
    ambiguous_formulas = {'H2MgO2', 'F2N2'}
    for formula, record in db.formula_index.items():
        if formula not in ambiguous_formulas:
            assert CAS_from_any(formula) == record.CASs

    # SMILES strings are unique and usable for lookup; blank ones are skipped
    for smiles, record in db.smiles_index.items():
        if smiles:
            assert CAS_from_any('smiles=' + smiles) == record.CASs

    # Stored formulas are already in serialized form
    for record in db.CAS_index.values():
        assert record.formula == serialize_formula(record.formula)

    # Every CAS number is valid
    for record in db.CAS_index.values():
        assert check_CAS(record.CASs)

    # Stored molecular weight agrees with the one computed from the formula
    for record in db.CAS_index.values():
        atoms = nested_formula_parser(serialize_formula(record.formula),
                                      check=False)
        assert_allclose(molecular_weight(atoms), record.MW, atol=0.05)
Пример #4
0
def _mol_from_inchi(inchi):
    '''Call `MolFromInchi` on `inchi`, prepending "InChI=1S/" if it is missing.'''
    if not inchi.startswith("InChI=1S/"):
        inchi = "InChI=1S/" + inchi
    return MolFromInchi(inchi)


def _collect_synonyms(name, sources):
    '''Merge the synonym iterables in `sources` (skipping None), drop
    duplicates and `name` itself, and return them sorted shortest-first.'''
    merged = set()
    for source in sources:
        if source is not None:
            merged.update(source)
    merged.discard(name)
    # The string itself is the last tie-breaker so that synonyms differing
    # only in case do not end up in hash-dependent set iteration order.
    return sorted(merged, key=lambda s: (len(s), s.lower(), s))


def process(init_data, use_cache=True):
    '''Assemble one consistent metadata record for a chemical.

    The identifiers given in `init_data` are combined with the results of
    `common_chemistry_data` (queried by CAS) and `find_pubchem_from_ids`.
    A structure is then built from the first source that yields one, in this
    order: user mol file, user SMILES, user InChI, common chemistry SMILES,
    common chemistry InChI, PubChem SMILES, PubChem InChI. SMILES, InChI,
    InChI key, formula and MW are derived from that structure, after which
    any of those values present in `init_data` override the derived ones.

    Parameters
    ----------
    init_data : dict
        User-supplied data; a shallow copy is taken so the caller's dict is
        not modified. Keys read (all optional): 'CAS', 'mol' (passed to
        `Chem.MolFromMolFile`; None is treated as absent), 'pubchem',
        'CASRN', 'inchi', 'inchikey', 'smiles', 'formula', 'MW', 'name',
        'synonyms'. When a mol file is parsed successfully, the InChI and
        InChI key derived from it replace any 'inchi'/'inchikey' given.
    use_cache : bool, optional
        Forwarded unchanged to `find_pubchem_from_ids`.

    Returns
    -------
    dict
        Keys 'cid' (-1 when no PubChem id is known), 'CAS', 'formula', 'MW',
        'smiles', 'inchi', 'inchikey', 'name' and 'synonyms' (list without
        duplicates and without `name`, sorted by length then
        case-insensitively).

    Raises
    ------
    ValueError
        If no structure can be built from any source; if no CAS is available
        from common chemistry or `init_data`; if the final SMILES, InChI or
        InChI key is '*'; or if no name is available.

    Examples
    --------
    
    >>> res = process({'CAS': '10170-69-1', 'synonyms': ['14267-36-8', 'NSC 22319'], 'name': 'Manganese, decacarbonyldi-, (Mn-Mn)'})
    >>> res['inchi'], res['smiles'], res['cid'], res['CAS']
    ('InChI=1S/10CO.2Mn/c10*1-2;;', '[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[Mn].[Mn]', 517769, '10170-69-1')
    '''
    init_data = init_data.copy()

    # --- Common chemistry lookup (by CAS) ---
    cc_CAS = cc_name = cc_inchi = cc_inchikey = cc_smiles = cc_synonyms = cc_deprecated_CASs = None
    if 'CAS' in init_data:
        try:
            (cc_CAS, cc_name, cc_inchi, cc_inchikey, cc_smiles, cc_synonyms,
             cc_deprecated_CASs) = common_chemistry_data(init_data['CAS'])
        except ValueError:
            # Compound is not in common chemistry; this is OK
            pass

    # --- User mol file ---
    # Parse it once. Its InChI and InChI key feed the PubChem lookup below,
    # and the parsed structure itself is reused as the first-choice structure.
    user_mol = None
    mol_file = init_data.get('mol')
    if mol_file is not None:
        user_mol = Chem.MolFromMolFile(mol_file)
        if user_mol is not None:
            init_data['inchi'] = MolToInchi(user_mol)
            init_data['inchikey'] = InchiToInchiKey(init_data['inchi'])

    # --- PubChem lookup ---
    # User values take precedence over common chemistry ones.
    # NOTE(review): the CAS is read from the 'CASRN' key here while the rest
    # of this function uses 'CAS', so a user CAS unknown to common chemistry
    # is not forwarded to PubChem - confirm whether that is intended.
    lookup_ids = {
        'pubchem': init_data.get('pubchem'),
        'CASRN': init_data.get('CASRN', cc_CAS),
        'inchi': init_data.get('inchi', cc_inchi),
        'inchikey': init_data.get('inchikey', cc_inchikey),
        'smiles': init_data.get('smiles', cc_smiles),
    }
    cid = iupac_name = p_inchi = p_smiles = p_synonyms = None
    if any(value is not None for value in lookup_ids.values()):
        try:
            p = find_pubchem_from_ids(use_cache=use_cache, **lookup_ids)
        except Exception as e:
            # Best effort: a failed lookup is reported and otherwise ignored
            p = None
            print(e, 'exception')
        if p is not None:
            (cid, iupac_name, _p_MW, p_inchi, _p_inchikey, p_smiles,
             _p_formula, p_synonyms) = p

    # --- Pick a structure ---
    # Be aware some smiles descriptions are wrong.
    # Start with what the user supplied
    mol = user_mol
    if mol is None and 'smiles' in init_data:
        mol = Chem.MolFromSmiles(init_data['smiles'])
    if mol is None and 'inchi' in init_data:
        mol = _mol_from_inchi(init_data['inchi'])
    # Trust common chemistry next
    if mol is None and cc_smiles is not None:
        mol = Chem.MolFromSmiles(cc_smiles)
    if mol is None and cc_inchi is not None:
        mol = _mol_from_inchi(cc_inchi)
    # Fall back to whatever PubChem returned
    if mol is None and p_smiles is not None:
        mol = Chem.MolFromSmiles(p_smiles)
    if mol is None and p_inchi is not None:
        mol = _mol_from_inchi(p_inchi)
    if mol is None:
        raise ValueError("No structure found")

    # --- Values derived from the structure ---
    smiles = Chem.MolToSmiles(mol, True)
    inchi = MolToInchi(mol)
    inchikey = InchiToInchiKey(inchi)
    formula = serialize_formula(CalcMolFormula(mol, True, True))
    MW = molecular_weight(nested_formula_parser(formula))

    # --- Output values ---
    if 'pubchem' in init_data:
        cid = init_data['pubchem']
    elif cid is None:
        cid = -1

    if cc_CAS is not None:
        CAS = cc_CAS
    elif 'CAS' in init_data:
        CAS = init_data['CAS']
    else:
        raise ValueError("CAS culd not be found")

    # User-specified values override the ones derived from the structure
    if 'formula' in init_data:
        formula = init_data['formula']
    if 'MW' in init_data:
        MW = init_data['MW']
    if 'smiles' in init_data:
        smiles = init_data['smiles']
    if 'inchi' in init_data:
        inchi = init_data['inchi']
    if 'inchikey' in init_data:
        inchikey = init_data['inchikey']

    if inchikey == '*' or smiles == '*' or inchi == '*':
        raise ValueError("Failure in rdkit")

    # Name priority: user setting, then common chemistry, then PubChem
    if 'name' in init_data:
        name = init_data['name']
    elif cc_name is not None:
        name = cc_name
    elif iupac_name is not None:
        name = iupac_name
    else:
        raise ValueError("There is no name for this compound")

    synonyms = _collect_synonyms(name, (cc_synonyms, cc_deprecated_CASs,
                                        p_synonyms,
                                        init_data.get('synonyms')))

    return {
        'cid': cid,
        'CAS': CAS,
        'formula': formula,
        'MW': MW,
        'smiles': smiles,
        'inchi': inchi,
        'inchikey': inchikey,
        'name': name,
        'synonyms': synonyms
    }