Пример #1
0
def tran_feat_composition(
    df,
    var_formula="FORMULA",
    preset_name="magpie",
    append=True,
    ignore_errors=True,
    **kwargs,
):
    r"""Featurize a dataset using matminer

    Featurize chemical composition using matminer package. The formula
    strings are first converted to composition objects, which are then
    passed to an ElementProperty featurizer built from the named preset.

    Args:
        df (DataFrame): Data to featurize
        var_formula (string): Column in df with chemical formula; formula
            given as string
        preset_name (string): Matminer featurization preset
        append (bool): Append results to original columns?
        ignore_errors (bool): Do not throw an error while parsing formulae; set to
            True to return NaN's for invalid formulae.

    Kwargs:
        Any additional keyword arguments are forwarded to the
        ElementProperty featurizer's featurize_dataframe() call.

    Returns:
        DataFrame: The computed feature columns; when append is True these
        are concatenated column-wise onto the original df.

    Raises:
        KeyError: If var_formula is not a column of df.

    Notes:
        - A pre-processor and wrapper for matminer.featurizers.composition

    References:
        Ward, L., Dunn, A., Faghaninia, A., Zimmermann, N. E. R., Bajaj, S., Wang, Q., Montoya, J. H., Chen, J., Bystrom, K., Dylla, M., Chard, K., Asta, M., Persson, K., Snyder, G. J., Foster, I., Jain, A., Matminer: An open source toolkit for materials data mining. Comput. Mater. Sci. 152, 60-69 (2018).

    Examples:
        >>> import grama as gr
        >>> from grama.tran import tf_feat_composition
        >>> (
        >>>     gr.df_make(FORMULA=["C6H12O6"])
        >>>     >> gr.tf_feat_composition()
        >>> )

    """
    ## Check invariants
    # Fail fast with a readable message; KeyError is the same exception type
    # that the column selection below would raise for a missing column.
    if var_formula not in df.columns:
        raise KeyError(
            "var_formula = '{}' is not a column of df".format(var_formula)
        )

    ## Featurize
    featurizer = ElementProperty.from_preset(preset_name=preset_name)
    # Only the formula column is featurized, so the result holds nothing but
    # the formula, the intermediate "composition" column, and the features.
    df_res = StrToComposition().featurize_dataframe(
        df[[var_formula]],
        var_formula,
        ignore_errors=ignore_errors,
    )
    df_res = featurizer.featurize_dataframe(
        df_res,
        col_id="composition",
        ignore_errors=ignore_errors,
        **kwargs,
    )
    # Strip the input and intermediate columns, leaving just the features
    df_res = df_res.drop(columns=[var_formula, "composition"])

    ## Concatenate as necessary
    if append:
        df_res = concat((df, df_res), axis=1)

    return df_res
Пример #2
0
    data_3, "composition")

# Oxidation-state descriptors, computed from the "composition_oxid" column.
os_feat = OxidationStates()
data_3 = os_feat.featurize_dataframe(data_3, col_id="composition_oxid")

from matminer.featurizers.structure import DensityFeatures

# Density descriptors, computed from the "structure" column.
df_feat = DensityFeatures()
data_3 = df_feat.featurize_dataframe(data_3, col_id="structure")

# Columns removed before modelling.
unwanted_columns = [
    "elasticity",
    "material_id",
    "nsites",
    "compliance_tensor",
    "elastic_tensor",
    "elastic_tensor_original",
    "K_Voigt",
    "G_Voigt",
    "K_Reuss",
    "G_Reuss",
    "warnings",
]
data_4 = data_3.drop(columns=unwanted_columns)

# In[ ]:

# Additional data cleaning after some trial runs
# Target: the K_VRH column as a plain array.
y = data_4["K_VRH"].values

# Columns that must not be used as model inputs (targets, identifiers,
# non-numeric objects, and closely related elastic quantities).
excluded = [
    "G_VRH",
    "K_VRH",
    "elastic_anisotropy",
    "pretty_formula",
    "poisson_ratio",
    "structure",
    "composition",
    "composition_oxid",
    "G_Voigt_Reuss_Hill",
    "K_Voigt_Reuss_Hill",
    "homogeneous_poisson",
    "universal_anisotropy",
]

X = data_4.drop(columns=excluded)

# The final row of the excluded label list is dropped to minimize training interference.
# An exceptionally good, near-perfect linear fit was first obtained without dropping those columns, which is abnormal.
Пример #3
0
'''
Block 3 - Loading and making predictions
'''

# Output dataframe (odf): keeps the identifying columns of fdf so they can be
# reported later, since they are removed from the feature table below.
odf = pd.DataFrame({
    'Id': fdf['Id'],
    'Reduced Formula': fdf['Reduced Formula'],
})

# Identifier, object-valued, and categorical columns that are not model inputs.
excluded = [
    'Id',
    'Reduced Formula',
    'composition',
    'composition_oxid',
    'HOMO_character',
    'HOMO_element',
    'LUMO_character',
    'LUMO_element',
]

# A few additional adjustments: drop the excluded columns, then turn
# infinities into NaN so that every missing/invalid entry becomes 0.
fdf = (
    fdf.drop(columns=excluded)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# No target variables here, so the whole table is the feature matrix.
X = fdf

# Normalizing the data
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to [0, 1]; the scaler is fitted on X itself.
scaler = MinMaxScaler(feature_range=(0, 1))
X = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

# Check dimensionality
print('There are {} possible descriptors:\n{}'.format(
    X.shape[1], X.columns.values))
import numpy as np
from matminer.featurizers.composition import OxidationStates
from matminer.featurizers.structure import DensityFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score

# Featurize the "composition" column; the "composition_oxid" column used by
# the next step is presumably produced here (confirm against matminer docs).
df = CompositionToOxidComposition().featurize_dataframe(df, "composition")

# Oxidation-state descriptors.
os_feat = OxidationStates()
df = os_feat.featurize_dataframe(df, "composition_oxid")

# Density descriptors from the "structure" column.
df_feat = DensityFeatures()
df = df_feat.featurize_dataframe(df, col_id='structure')

# Target is K_VRH; drop the targets, identifiers and non-numeric object
# columns to obtain the feature matrix.
y = df['K_VRH'].values
excluded = ["G_VRH", "K_VRH", "elastic_anisotropy", "formula", "material_id",
            "poisson_ratio", "structure", "composition", "composition_oxid"]
X = df.drop(excluded, axis=1)
print("There are {} possible descriptors:\n\n{}".format(X.shape[1], X.columns.values))

# Baseline linear model, scored on the data it was fitted to
# (R^2 followed by RMSE).
lr = LinearRegression()
lr.fit(X, y)
print(lr.score(X, y))
print(np.sqrt(mean_squared_error(y_true=y, y_pred=lr.predict(X))))

# 10-fold cross-validation splitter. random_state is only meaningful (and,
# in current scikit-learn, only accepted) when shuffle=True; combining it
# with shuffle=False raises ValueError. Shuffling with a fixed seed keeps
# the folds reproducible.
crossvalidation = KFold(n_splits=10, shuffle=True,
                        random_state=1)
Пример #5
0
import pandas as pd

# Widen pandas' console output so large frames print without truncation.
# pd.set_option('display.height', 1000)
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

# Load the dataset and keep only the formula (renamed "composition") and
# the "gfa" label.
df = load_dataset("glass_ternary_landolt")
df = df.rename(columns={"formula": "composition"})[["composition", "gfa"]]

# Normalize every formula string to its reduced formula by round-tripping
# through a temporary composition-object column.
df = StrToComposition(target_col_id="composition_obj").featurize_dataframe(
    df, "composition"
)
df["composition"] = [comp.reduced_formula for comp in df["composition_obj"]]
df = df.drop(columns=["composition_obj"])

# print("Ground truth")
# print(df[df["composition"]=="ZrTi9"])  # should be False in final dataframe also!!
# print(df[df["composition"]=="ZrVCo8"]) # should be True in final dataframe also!
# print(df["gfa"].value_counts())    # proportion is about 5000 GFA 2054 no GFA
# raise ValueError

# Report how many rows there are versus how many distinct compositions.
unique = df["composition"].unique()
print(len(df))
print(len(unique))

# Accumulators filled while walking the distinct compositions.
problem_compositions = []
new_df_dict = {"composition": [], "gfa": []}
for c in tqdm(unique):
    # All rows that share this reduced formula.
    df_per_comp_gfa = df[df["composition"] == c]
Пример #6
0
'''
from matminer.featurizers.conversions import (
    CompositionToOxidComposition,
    StrToComposition,
)

df = load_dataframe_from_json('data/Batteries_raw.json')

# Select the working ion among {Li, Al, Zr, Mg}
select = 'Li'

# Initial filter based on the selected element: parse the 'Ion' column into
# composition objects and keep rows whose ion consists solely of `select`.
fdf = StrToComposition().featurize_dataframe(df, 'Ion')

select_at = fdf["composition"].apply(lambda x: x.get_atomic_fraction(select))
fdf = fdf[select_at == 1]

# Debug
# describe must be *called*; formatting the bare attribute printed the bound
# method object rather than the summary statistics.
print("Remaining samples: {}".format(fdf.describe()))
# Drop the ion composition so the column name is free for the next step.
fdf = fdf.drop(['composition'], axis=1)

## Initial conversion to matminer objects
fdf = StrToComposition().featurize_dataframe(fdf, 'Reduced Formula')
fdf = CompositionToOxidComposition().featurize_dataframe(fdf, 'composition')

print("The initial dataset has {}".format(fdf.shape))
# fdf.to_csv(r"Batteries_feat.csv", index = None, header = True)
'''
Block 2 - Featurization
'''
#
# -- start F1