Exemplo n.º 1
0
# Resolve each compound name to a SMILES string for more refined filtering
# (resolved through the cached resolver here; this can also be done with OpenEye).
# This should be cached via sklearn.
df["SMILES"] = df.components.apply(lambda x: resolve_cached(x, "smiles"))

# Drop rows whose name did not resolve.  `df.SMILES != None` is an elementwise
# comparison that is True for every row, so it never removed anything; and
# `.ix` no longer exists in pandas >= 1.0.  `dropna` does both jobs.
df = df.dropna(subset=["SMILES"])

# Get rid of data sets with C=O and C=C occurrences, and with `#` (triple
# bonding in SMILES).  A row is kept only if none of the substrings occurs;
# na=True removes any value that cannot be searched, as the old
# `... == False` comparison did.
df = df[~df["SMILES"].str.contains('=O|#|O=|=C|C=', na=True)]

# Create CAS and InChI identifiers as well.  Note that the "InChI" column is
# filled from the "stdinchikey" representation.
df["cas"] = df.components.apply(
    lambda x: thermoml_lib.get_first_entry(resolve_cached(x, "cas")))
df["InChI"] = df.components.apply(
    lambda x: thermoml_lib.get_first_entry(resolve_cached(x, "stdinchikey")))
# Rows without a CAS number cannot be keyed in the lookups below.
df = df.dropna(subset=["cas"])

# Neither names (components) nor smiles are unique.  Use CAS to ensure consistency.
cannonical_smiles_lookup = df.groupby("cas").SMILES.first()
cannonical_components_lookup = df.groupby("cas").components.first()

# Replace every row's SMILES / name with the first one seen for its CAS.
df["SMILES"] = df.cas.map(cannonical_smiles_lookup)
df["components"] = df.cas.map(cannonical_components_lookup)

# Extract rows with temperature strictly between 250 and 400 K
df = df[(df['Temperature, K'] > 250.) & (df['Temperature, K'] < 400.)]
Exemplo n.º 2
0
# Keep only mixtures in which both components have at least one heavy atom,
# then discard columns that are empty for every remaining row.
df = df[(df.n_heavy_atoms1 > 0) & (df.n_heavy_atoms2 > 0)]
df = df.dropna(axis=1, how='all')

# Resolve a SMILES string for each component and drop rows where resolution
# failed.  The old `!= None` comparison was True for every row (a no-op) and
# `.ix` no longer exists in pandas >= 1.0; `dropna` covers both.
# Component 2 is resolved only after the component-1 drop, so fewer names
# are sent to the resolver.
df["SMILES1"] = df.x1.apply(lambda x: resolve_cached(x, "smiles"))  # This should be cached via sklearn.
df = df.dropna(subset=["SMILES1"])
df["SMILES2"] = df.x2.apply(lambda x: resolve_cached(x, "smiles"))  # This should be cached via sklearn.
df = df.dropna(subset=["SMILES2"])

# CAS and InChI identifiers per component.  The "InChI*" columns are filled
# from the "stdinchikey" representation.  Rows without a CAS are dropped.
df["cas1"] = df.x1.apply(lambda x: thermoml_lib.get_first_entry(resolve_cached(x, "cas")))  # This should be cached via sklearn.
df["InChI1"] = df.x1.apply(lambda x: thermoml_lib.get_first_entry(resolve_cached(x, "stdinchikey")))
df = df.dropna(subset=["cas1"])
df["cas2"] = df.x2.apply(lambda x: thermoml_lib.get_first_entry(resolve_cached(x, "cas")))  # This should be cached via sklearn.
df["InChI2"] = df.x2.apply(lambda x: thermoml_lib.get_first_entry(resolve_cached(x, "stdinchikey")))
df = df.dropna(subset=["cas2"])


# Neither names (components) nor smiles are unique.  Use CAS to ensure consistency.
# Each lookup maps a CAS number to the first SMILES / name seen for it.
cannonical_smiles_lookup1 = df.groupby("cas1").SMILES1.first()
cannonical_components_lookup1 = df.groupby("cas1").x1.first()
cannonical_smiles_lookup2 = df.groupby("cas2").SMILES2.first()
cannonical_components_lookup2 = df.groupby("cas2").x2.first()
Exemplo n.º 3
0
# Keep compounds with 1 to 10 heavy atoms, then discard columns that are
# empty for every remaining row.
df = df[(df.n_heavy_atoms > 0) & (df.n_heavy_atoms <= 10)]
df = df.dropna(axis=1, how='all')

df["smiles"] = df.components.apply(lambda x: resolve_cached(x, "smiles"))  # This should be cached via sklearn.
# Drop rows whose name did not resolve.  The old `!= None` comparison was True
# for every row (a no-op) and `.ix` no longer exists in pandas >= 1.0.
df = df.dropna(subset=["smiles"])

# Getting rid of data sets with C=O and C=C occurrences, and with `#`.
# A row is kept only if none of the substrings occurs; na=True removes any
# value that cannot be searched, as the old `... == False` comparison did.
df = df[~df["smiles"].str.contains('=O|#|O=|=C|C=', na=True)]


df["cas"] = df.components.apply(lambda x: thermoml_lib.get_first_entry(resolve_cached(x, "cas")))  # This should be cached via sklearn.
# Rows without a CAS number cannot be keyed in the lookups below.
df = df.dropna(subset=["cas"])

# Neither names (components) nor smiles are unique.  Use CAS to ensure consistency.
cannonical_smiles_lookup = df.groupby("cas").smiles.first()
cannonical_components_lookup = df.groupby("cas").components.first()


# Replace every row's SMILES / name with the first one seen for its CAS.
df["smiles"] = df.cas.map(cannonical_smiles_lookup)
df["components"] = df.cas.map(cannonical_components_lookup)

# Extract rows with temperature strictly between 250 and 400 K
# (the previous comment said 128 and 399 K, which did not match the code).
df = df[(df['Temperature, K'] > 250.) & (df['Temperature, K'] < 400.)]
# Per-row atom counts derived from the formula string: all atoms, atoms in
# `heavy_atoms`, and atoms in `desired_atoms`; the remainder is "other".
X["n_atoms"] = X.formula.apply(thermoml_lib.count_atoms)
X["n_heavy_atoms"] = X.formula.apply(lambda formula_string: thermoml_lib.count_atoms_in_set(formula_string, heavy_atoms))
X["n_desired_atoms"] = X.formula.apply(lambda formula_string: thermoml_lib.count_atoms_in_set(formula_string, desired_atoms))
X["n_other_atoms"] = X.n_atoms - X.n_desired_atoms

# Keep compounds made only of desired atoms, with 1 to 10 heavy atoms, then
# discard columns that are empty for every remaining row.
X = X[X.n_other_atoms == 0]
X = X[(X.n_heavy_atoms > 0) & (X.n_heavy_atoms <= 10)]
X = X.dropna(axis=1, how='all')

X["smiles"] = X.components.apply(lambda x: cirpy.resolve(x, "smiles"))  # This should be cached via sklearn.
# Drop rows whose name did not resolve.  The old `!= None` comparison was True
# for every row (a no-op) and `.ix` no longer exists in pandas >= 1.0.
X = X.dropna(subset=["smiles"])

X["cas"] = X.components.apply(lambda x: thermoml_lib.get_first_entry(cirpy.resolve(x, "cas")))  # This should be cached via sklearn.
# Rows without a CAS number cannot be keyed in the lookups below.
X = X.dropna(subset=["cas"])

# Neither names (components) nor smiles are unique.  Use CAS to ensure consistency.
cannonical_smiles_lookup = X.groupby("cas").smiles.first()
cannonical_components_lookup = X.groupby("cas").components.first()

# Replace every row's SMILES / name with the first one seen for its CAS.
X["smiles"] = X.cas.map(cannonical_smiles_lookup)
X["components"] = X.cas.map(cannonical_components_lookup)

# Temperature strictly between 270 and 330 K.
X = X[(X["Temperature, K"] > 270) & (X["Temperature, K"] < 330)]

# Pressure strictly between 100 and 102 kPa.
X = X[(X["Pressure, kPa"] > 100.) & (X["Pressure, kPa"] < 102.)]
Exemplo n.º 5
0
# Keep only compounds with no atoms outside the desired set, and record the
# per-experiment non-null counts after this stage.
X = X[X.n_other_atoms == 0]

counts_data["1.  Druglike Elements"] = X.count()[experiments]

# Keep compounds with 1 to 10 heavy atoms, then discard columns that are
# empty for every remaining row.
X = X[(X.n_heavy_atoms > 0) & (X.n_heavy_atoms <= 10)]
X = X.dropna(axis=1, how='all')

counts_data["2.  Heavy Atoms"] = X.count()[experiments]

X["smiles"] = X.components.apply(lambda x: cirpy.resolve(x, "smiles"))  # This should be cached via sklearn.
# Drop rows whose name did not resolve.  The old `!= None` comparison was True
# for every row (a no-op) and `.ix` no longer exists in pandas >= 1.0.
X = X.dropna(subset=["smiles"])

X["cas"] = X.components.apply(lambda x: thermoml_lib.get_first_entry(cirpy.resolve(x, "cas")))  # This should be cached via sklearn.
# Rows without a CAS number cannot be keyed in the lookups below.
X = X.dropna(subset=["cas"])

# Neither names (components) nor smiles are unique.  Use CAS to ensure consistency.
cannonical_smiles_lookup = X.groupby("cas").smiles.first()
cannonical_components_lookup = X.groupby("cas").components.first()

# Replace every row's SMILES / name with the first one seen for its CAS.
X["smiles"] = X.cas.map(cannonical_smiles_lookup)
X["components"] = X.cas.map(cannonical_components_lookup)


# Temperature strictly between 270 and 330 K.
X = X[(X["Temperature, K"] > 270) & (X["Temperature, K"] < 330)]

counts_data["3.  Temperature"] = X.count()[experiments]
Exemplo n.º 6
0
# Finish filtering component 1: drop unresolved rows, then rows whose SMILES
# contains '=C' or 'C='.  `.ix` no longer exists in pandas >= 1.0, so `dropna`
# is used instead; na=True removes any value that cannot be searched, as the
# old `... == False` comparison did.
df = df.dropna(subset=["SMILES1"])
df = df[~df["SMILES1"].str.contains('=C|C=', na=True)]

# Component 2: resolve the SMILES, drop unresolved rows (the old `!= None`
# comparison was True for every row, i.e. a no-op), then get rid of data
# sets with C=O and C=C occurrences, and with `#`.
df["SMILES2"] = df.x2.apply(lambda x: resolve_cached(x, "smiles"))  # This should be cached via sklearn.
df = df.dropna(subset=["SMILES2"])
df = df[~df["SMILES2"].str.contains('=O|#|O=|=C|C=', na=True)]


# CAS and InChI identifiers per component.  The "InChI*" columns are filled
# from the "stdinchikey" representation.  Rows without a CAS are dropped.
df["cas1"] = df.x1.apply(lambda x: thermoml_lib.get_first_entry(resolve_cached(x, "cas")))  # This should be cached via sklearn.
df["InChI1"] = df.x1.apply(lambda x: thermoml_lib.get_first_entry(resolve_cached(x, "stdinchikey")))
df = df.dropna(subset=["cas1"])
df["cas2"] = df.x2.apply(lambda x: thermoml_lib.get_first_entry(resolve_cached(x, "cas")))  # This should be cached via sklearn.
df["InChI2"] = df.x2.apply(lambda x: thermoml_lib.get_first_entry(resolve_cached(x, "stdinchikey")))
df = df.dropna(subset=["cas2"])


# Neither names (components) nor smiles are unique.  Use CAS to ensure consistency.
# Each lookup maps a CAS number to the first SMILES / name seen for it.
cannonical_smiles_lookup1 = df.groupby("cas1").SMILES1.first()
cannonical_components_lookup1 = df.groupby("cas1").x1.first()
cannonical_smiles_lookup2 = df.groupby("cas2").SMILES2.first()
cannonical_components_lookup2 = df.groupby("cas2").x2.first()