# NOTE(review): the entries below close a list literal whose opening bracket
# and variable name lie before this view.
'CoulombMatrix', 'SineCoulombMatrix', 'OrbitalFieldMatrix',
'MinimumRelativeDistances', 'ElectronicRadialDistributionFunction'
]
# Column names handed to auto_featurize() as input_cols further down.
FEATUREIZE_THESE_COLUMNS = ["formula", "structure"]
# The same flag is passed to Featurize(multiindex=...); when it is on, the
# target is addressed by the tuple ('Input Data', <original TARGET>).
MULTIINDEX = True
if MULTIINDEX:
    TARGET = ('Input Data', TARGET)

# actual pipeline:
df_init = load_castelli_perovskites()
# When LIMIT is truthy and smaller than the dataset, keep LIMIT randomly
# chosen rows (positions drawn without replacement).
if LIMIT and LIMIT < len(df_init):
    df_init = df_init.iloc[np.random.choice(len(df_init), LIMIT, replace=False)]
featzer = Featurize(ignore_cols=IGNORE_THESE_COLUMNS,
                    exclude=EXCLUDED_FEATURIZERS,
                    multiindex=MULTIINDEX,
                    drop_featurized_col=True)
df = featzer.auto_featurize(df_init,
                            input_cols=FEATUREIZE_THESE_COLUMNS,
                            guess_oxidstates=True)

prep = PreProcess(target=TARGET)
df = prep.preprocess(df)

# Features are every column except TARGET; labels are the TARGET column.
# No split sizes or random_state are passed, so train_test_split's defaults
# apply here.
X_train, X_test, y_train, y_test = train_test_split(df.drop(TARGET, axis=1), df[TARGET])
print('start timing...')
# time() is sampled immediately before the TPOTAutoML object is built.
start_time = time()
# NOTE(review): the TPOTAutoML(...) call below continues past this view.
tpot = TPOTAutoML(mode=MODE,
from automatminer.preprocess import PreProcess
from matminer import PlotlyFig
from scipy.stats import linregress
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# inputs
target = 'gap expt'
# NOTE(review): RS and mode are not read anywhere in the visible part of this
# script; RS is presumably a random seed -- confirm against the rest of the file.
RS = 24
mode = 'regression'
# The same flag is passed to Featurize(multiindex=...); when it is on, the
# target is addressed by the tuple ('Input Data', 'gap expt').
MULTIINDEX = True
if MULTIINDEX:
    target = ('Input Data', target)

df_init = load_expt_gap()
# featurize_formula is asked for featurizers='all', with two featurizer names
# passed to Featurize through exclude; guess_oxidstates is switched off.
featzer = Featurize(exclude=['CohesiveEnergy', 'AtomicPackingEfficiency'],
                    multiindex=MULTIINDEX)
df = featzer.featurize_formula(df_init, featurizers='all', guess_oxidstates=False)

prep = PreProcess(target=target)
df = prep.preprocess(df)
print(df.head())
# Writes the preprocessed frame to 'test.csv' relative to the working directory.
df.to_csv('test.csv')

# Features are every column except target; labels are the target column.
# No split sizes or random_state are passed to train_test_split here.
X_train, X_test, y_train, y_test = train_test_split(df.drop(target, axis=1), df[target])
# NOTE(review): the RandomForestRegressor(...) call below continues past this view.
model = RandomForestRegressor(n_estimators=100,
# NOTE(review): this fragment was recovered from whitespace-collapsed text; the
# nesting of the statements after the first `else:` is reconstructed from the
# fact that `prof` is only assigned inside that branch -- confirm against the
# original file.
model_tmp_path = r'example_data/matbench_data/autosklearn_output/tmp/'
model_output_path = r'example_data/matbench_data/autosklearn_output/output/'
# feature_output_path and data_name are defined outside this view.
feature_output_file = \
    os.path.join(feature_output_path,
                 "{}_all_featurized_data.csv".format(data_name))
# Reuse the featurized CSV when it already exists; otherwise load the raw
# data, featurize and preprocess it while a profiler is running.
if os.path.exists(feature_output_file):
    df = pd.read_csv(feature_output_file, index_col=0)
else:
    df_init = load_glass_ternary_landolt()
    prof = Profile()
    prof.enable()

    featzer = Featurize()
    df_feats = featzer.featurize_formula(df_init, featurizers="all")
    prep = PreProcess(max_colnull=0.1)
    df = prep.preprocess(df_feats)

    prof.create_stats()
    print("featurize time:\n")
    # Print the first 5 profile entries sorted by the "time" key, with
    # directory prefixes stripped from file names.
    pstats.Stats(prof).strip_dirs().sort_stats("time").print_stats(5)

    # Create the output directory only when it is missing, and report which
    # of the two cases occurred.
    if os.path.exists(feature_output_path):
        print("output path: {} exists!".format(feature_output_path))
    else:
        os.makedirs(feature_output_path)
        print("create output path: {} successful!".format(feature_output_path))
    # NOTE(review): the dump_stats(...) call below continues past this view.
    prof.dump_stats(
from automatminer.featurize import Featurize
from automatminer.preprocess import PreProcess
from sklearn.model_selection import train_test_split
from time import time

# ---- settings chosen by the user ----
target = 'gfa'
RS = 29
model_type = 'classification'
scoring = 'f1'
timelimitmins = None
print('timelimitmins = ', timelimitmins)

# ---- load the raw data, then featurize the formulas ----
df_init = load_glass_ternary_landolt()
featzer = Featurize(ignore_cols=['phase'], ignore_errors=True)
df_feats = featzer.featurize_formula(df_init, featurizers='all')

# ---- preprocess, persist and summarize the featurized frame ----
prep = PreProcess(max_colnull=0.1)
df = prep.preprocess(df_feats)
csv_name = '{}_tpot_trained_data.csv'.format(target)
df.to_csv(csv_name)
print(df.shape)
print(df.head())

# The preprocessed frame is required to contain no missing values at all.
n_missing = df.isnull().sum().sum()
assert n_missing == 0

# ---- hold-out split; model development happens inside tpot's own
# cross-validation ----
features = df.drop(target, axis=1).values
labels = df[target]
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, train_size=0.75, test_size=0.25, random_state=RS)