def test_small_moddata_featurization():
    """Re-featurize the small MP 2018.6 reference MODData and check that the
    newly computed features exactly match the stored reference columns and
    (almost) match the stored reference values.
    """
    data_file = Path(__file__).parent.joinpath("data/MP_2018.6_small.zip")
    # Loading pickles can be dangerous, so let's at least check that the
    # SHA-512 matches what it was when created.  (Fixed: the old comment
    # said "MD5", but the helper computes a SHA-512 digest.)
    assert (
        get_sha512_of_file(data_file)
        == "37bd4f8ce6f29c904a13e5670dd53af9a8779094727052ec85ccd6362b1b3765"
        "ac613426331811b3f626242896d87c3f6bc1884cc5545875b5ae66a712f9e218"
    )

    old = MODData.load(data_file)
    new = MODData(old.structures, old.targets, target_names=old.names)
    new.featurize(fast=False)

    new_cols = sorted(new.df_featurized.columns.tolist())
    old_cols = sorted(old.df_featurized.columns.tolist())

    # Compare pairwise (printing each pair so a failure shows the offending
    # columns); the array_equal afterwards also catches length mismatches.
    for new_col, old_col in zip(new_cols, old_cols):
        print(new_col, old_col)
        assert new_col == old_col
    np.testing.assert_array_equal(old_cols, new_cols)

    for col in new.df_featurized.columns:
        np.testing.assert_almost_equal(
            new.df_featurized[col].to_numpy(),
            old.df_featurized[col].to_numpy(),
        )
def test_small_moddata_featurization(small_moddata):
    """Featurize the MP 2018.6 structures from the `small_moddata` fixture
    and verify the result reproduces the fixture's stored features.
    """
    old = small_moddata
    new = MODData(old.structures, old.targets, target_names=old.names)
    new.featurize(fast=False, n_jobs=1)

    old_cols = sorted(old.df_featurized.columns.tolist())
    new_cols = sorted(new.df_featurized.columns.tolist())

    # Column names must agree position-by-position after sorting.
    for index in range(len(old_cols)):
        print(new_cols[index], old_cols[index])
        assert new_cols[index] == old_cols[index]
    np.testing.assert_array_equal(old_cols, new_cols)

    # Feature values must agree to numerical precision.
    for column in new.df_featurized.columns:
        np.testing.assert_almost_equal(
            new.df_featurized[column].to_numpy(),
            old.df_featurized[column].to_numpy(),
        )
def featurize(task, n_jobs=1):
    """Featurize a matbench dataset with MODNet and cache the result.

    Loads the dataset for ``task``, sanitizes its column names, featurizes
    either the structures or (when only compositions are present) the
    compositions, then saves the resulting MODData to
    ``./precomputed/{task}_moddata.pkl.gz``.

    Parameters
    ----------
    task : str
        Matbench task name (e.g. ``"matbench_log_gvrh"``); the pseudo-task
        ``"matbench_elastic"`` joins the log_gvrh and log_kvrh datasets.
    n_jobs : int
        Number of parallel featurization jobs (default 1).

    Returns
    -------
    MODData
        The featurized dataset (also written to disk).
    """
    import warnings

    warnings.filterwarnings("ignore", category=RuntimeWarning)

    from modnet.preprocessing import MODData
    from modnet.featurizers.presets import DeBreuck2020Featurizer
    from matminer.datasets import load_dataset

    if task == "matbench_elastic":
        # "elastic" is a combined task: join shear (gvrh) and bulk (kvrh)
        # moduli on their shared structures.
        df_g = load_dataset("matbench_log_gvrh")
        df_k = load_dataset("matbench_log_kvrh")
        df = df_g.join(df_k.drop("structure", axis=1))
    else:
        df = load_dataset(task)

    # Normalize column names to identifier-friendly form.
    # (Fixed: the original comprehension used enumerate() but discarded the
    # index.)
    mapping = {
        col: col.replace(" ", "_").replace("(", "").replace(")", "")
        for col in df.columns
    }
    df.rename(columns=mapping, inplace=True)

    targets = [
        col for col in df.columns if col not in ("id", "structure", "composition")
    ]

    # NOTE(review): CompositionOnlyFeaturizer, Composition and os are assumed
    # to be imported at module level — confirm against the file header.
    if "structure" not in df.columns:
        featurizer = CompositionOnlyFeaturizer()
    else:
        # fast_oxid skips the expensive oxidation-state analysis.
        featurizer = DeBreuck2020Featurizer(fast_oxid=True)

    try:
        if "structure" in df.columns:
            materials = df["structure"]
        else:
            materials = df["composition"].map(Composition)
    except KeyError:
        raise RuntimeError(
            f"Could not find any materials data dataset for task {task!r}!"
        )

    data = MODData(
        materials=materials.tolist(),
        targets=df[targets].values,
        target_names=targets,
        featurizer=featurizer,
    )
    data.featurize(n_jobs=n_jobs)

    os.makedirs("./precomputed", exist_ok=True)
    data.save(f"./precomputed/{task}_moddata.pkl.gz")
    return data
def test_small_moddata_composition_featurization(small_moddata_composition):
    """Featurize the fixture's compositions into a fresh MODData and verify
    the result matches the fixture's stored feature columns and values.
    """
    reference = small_moddata_composition
    new = MODData(materials=reference.compositions)
    new.featurize(fast=False, n_jobs=1)

    ref_cols = sorted(reference.df_featurized.columns.tolist())
    new_cols = sorted(new.df_featurized.columns.tolist())

    # Sorted column names must agree position-by-position.
    for index, ref_col in enumerate(ref_cols):
        assert new_cols[index] == ref_col

    # Feature values must agree to numerical precision.
    for column in new.df_featurized.columns:
        np.testing.assert_almost_equal(
            new.df_featurized[column].to_numpy(),
            reference.df_featurized[column].to_numpy(),
        )
materials = train_df[ "structure"] if "structure" in train_df.columns else train_df[ "composition"].map(Composition) except KeyError: raise RuntimeError( f"Could not find any materials data dataset for task {task!r}!" ) fast_oxid_featurizer = DeBreuck2020Featurizer(fast_oxid=True) train_data = MODData( materials=materials.tolist(), targets=train_df[targets].values, target_names=targets, featurizer=fast_oxid_featurizer, ) train_data.featurize(n_jobs=32) train_data.feature_selection(n=-1, use_precomputed_cross_nmi=True) # create model targets_hierarchy = [[[field for field in targets]]] weights = {field: 1 for field in targets} model = EnsembleMODNetModel(targets_hierarchy, weights) # fit model if USE_GA: # you can either use a GA for hyper-parameter optimization or... from modnet.hyper_opt import FitGenetic ga = FitGenetic(train_data) model = ga.run( size_pop=20,