def test_small_moddata_featurization():
    """Re-featurize the small MP 2018.6 reference MODData and check that the
    newly computed features exactly match the stored reference columns and
    (almost) match the stored reference values.
    """
    data_file = Path(__file__).parent.joinpath("data/MP_2018.6_small.zip")
    # Loading pickles can be dangerous, so let's at least check that the
    # SHA-512 matches what it was when created.  (Fixed: the old comment
    # said "MD5", but the helper computes a SHA-512 digest.)
    assert (
        get_sha512_of_file(data_file)
        == "37bd4f8ce6f29c904a13e5670dd53af9a8779094727052ec85ccd6362b1b3765"
        "ac613426331811b3f626242896d87c3f6bc1884cc5545875b5ae66a712f9e218"
    )

    old = MODData.load(data_file)
    new = MODData(old.structures, old.targets, target_names=old.names)
    new.featurize(fast=False)

    new_cols = sorted(new.df_featurized.columns.tolist())
    old_cols = sorted(old.df_featurized.columns.tolist())

    # Compare pairwise (printing each pair so a failure shows the offending
    # columns); the array_equal afterwards also catches length mismatches.
    for new_col, old_col in zip(new_cols, old_cols):
        print(new_col, old_col)
        assert new_col == old_col
    np.testing.assert_array_equal(old_cols, new_cols)

    for col in new.df_featurized.columns:
        np.testing.assert_almost_equal(
            new.df_featurized[col].to_numpy(),
            old.df_featurized[col].to_numpy(),
        )
def test_small_moddata_featurization(small_moddata):
    """Featurize the MP 2018.6 structures from the `small_moddata` fixture
    and verify the result reproduces the fixture's stored features.
    """
    old = small_moddata
    new = MODData(old.structures, old.targets, target_names=old.names)
    new.featurize(fast=False, n_jobs=1)

    old_cols = sorted(old.df_featurized.columns.tolist())
    new_cols = sorted(new.df_featurized.columns.tolist())

    # Column names must agree position-by-position after sorting.
    for index in range(len(old_cols)):
        print(new_cols[index], old_cols[index])
        assert new_cols[index] == old_cols[index]
    np.testing.assert_array_equal(old_cols, new_cols)

    # Feature values must agree to numerical precision.
    for column in new.df_featurized.columns:
        np.testing.assert_almost_equal(
            new.df_featurized[column].to_numpy(),
            old.df_featurized[column].to_numpy(),
        )
def featurize(task, n_jobs=1):
    """Featurize a matbench dataset with MODNet and cache the result.

    Loads the dataset for ``task``, sanitizes its column names, featurizes
    either the structures or (when only compositions are present) the
    compositions, then saves the resulting MODData to
    ``./precomputed/{task}_moddata.pkl.gz``.

    Parameters
    ----------
    task : str
        Matbench task name (e.g. ``"matbench_log_gvrh"``); the pseudo-task
        ``"matbench_elastic"`` joins the log_gvrh and log_kvrh datasets.
    n_jobs : int
        Number of parallel featurization jobs (default 1).

    Returns
    -------
    MODData
        The featurized dataset (also written to disk).
    """
    import warnings

    warnings.filterwarnings("ignore", category=RuntimeWarning)

    from modnet.preprocessing import MODData
    from modnet.featurizers.presets import DeBreuck2020Featurizer
    from matminer.datasets import load_dataset

    if task == "matbench_elastic":
        # "elastic" is a combined task: join shear (gvrh) and bulk (kvrh)
        # moduli on their shared structures.
        df_g = load_dataset("matbench_log_gvrh")
        df_k = load_dataset("matbench_log_kvrh")
        df = df_g.join(df_k.drop("structure", axis=1))
    else:
        df = load_dataset(task)

    # Normalize column names to identifier-friendly form.
    # (Fixed: the original comprehension used enumerate() but discarded the
    # index.)
    mapping = {
        col: col.replace(" ", "_").replace("(", "").replace(")", "")
        for col in df.columns
    }
    df.rename(columns=mapping, inplace=True)

    targets = [
        col for col in df.columns if col not in ("id", "structure", "composition")
    ]

    # NOTE(review): CompositionOnlyFeaturizer, Composition and os are assumed
    # to be imported at module level — confirm against the file header.
    if "structure" not in df.columns:
        featurizer = CompositionOnlyFeaturizer()
    else:
        # fast_oxid skips the expensive oxidation-state analysis.
        featurizer = DeBreuck2020Featurizer(fast_oxid=True)

    try:
        if "structure" in df.columns:
            materials = df["structure"]
        else:
            materials = df["composition"].map(Composition)
    except KeyError:
        raise RuntimeError(
            f"Could not find any materials data dataset for task {task!r}!"
        )

    data = MODData(
        materials=materials.tolist(),
        targets=df[targets].values,
        target_names=targets,
        featurizer=featurizer,
    )
    data.featurize(n_jobs=n_jobs)

    os.makedirs("./precomputed", exist_ok=True)
    data.save(f"./precomputed/{task}_moddata.pkl.gz")
    return data
def test_small_moddata_composition_featurization(small_moddata_composition):
    """Featurize the fixture's compositions into a fresh MODData and verify
    the result matches the fixture's stored feature columns and values.
    """
    reference = small_moddata_composition
    new = MODData(materials=reference.compositions)
    new.featurize(fast=False, n_jobs=1)

    ref_cols = sorted(reference.df_featurized.columns.tolist())
    new_cols = sorted(new.df_featurized.columns.tolist())

    # Sorted column names must agree position-by-position.
    for index, ref_col in enumerate(ref_cols):
        assert new_cols[index] == ref_col

    # Feature values must agree to numerical precision.
    for column in new.df_featurized.columns:
        np.testing.assert_almost_equal(
            new.df_featurized[column].to_numpy(),
            reference.df_featurized[column].to_numpy(),
        )
materials = train_df[ "structure"] if "structure" in train_df.columns else train_df[ "composition"].map(Composition) except KeyError: raise RuntimeError( f"Could not find any materials data dataset for task {task!r}!" ) fast_oxid_featurizer = DeBreuck2020Featurizer(fast_oxid=True) train_data = MODData( materials=materials.tolist(), targets=train_df[targets].values, target_names=targets, featurizer=fast_oxid_featurizer, ) train_data.featurize(n_jobs=32) train_data.feature_selection(n=-1, use_precomputed_cross_nmi=True) # create model targets_hierarchy = [[[field for field in targets]]] weights = {field: 1 for field in targets} model = EnsembleMODNetModel(targets_hierarchy, weights) # fit model if USE_GA: # you can either use a GA for hyper-parameter optimization or... from modnet.hyper_opt import FitGenetic ga = FitGenetic(train_data) model = ga.run( size_pop=20,