Exemplo n.º 1
0
    'CoulombMatrix', 'SineCoulombMatrix', 'OrbitalFieldMatrix',
    'MinimumRelativeDistances', 'ElectronicRadialDistributionFunction'
]
FEATUREIZE_THESE_COLUMNS = ["formula", "structure"]
MULTIINDEX = True
if MULTIINDEX:
    # With multiindex enabled the target is addressed by a
    # ('Input Data', <name>) tuple rather than by a bare column name.
    TARGET = ('Input Data', TARGET)

# actual pipeline:
df_init = load_castelli_perovskites()
if LIMIT and LIMIT < len(df_init):
    # Keep a random subset of LIMIT distinct rows (drawn without replacement).
    sampled_rows = np.random.choice(len(df_init), LIMIT, replace=False)
    df_init = df_init.iloc[sampled_rows]

# Featurize the formula/structure columns; the featurized source columns are
# dropped afterwards (drop_featurized_col=True).
featzer = Featurize(
    ignore_cols=IGNORE_THESE_COLUMNS,
    exclude=EXCLUDED_FEATURIZERS,
    multiindex=MULTIINDEX,
    drop_featurized_col=True,
)
df = featzer.auto_featurize(
    df_init,
    input_cols=FEATUREIZE_THESE_COLUMNS,
    guess_oxidstates=True,
)

# Preprocess the featurized frame with respect to the target column.
prep = PreProcess(target=TARGET)
df = prep.preprocess(df)

# Separate the feature columns from the target, then split train/test
# (default split proportions, no fixed seed).
feature_frame = df.drop(TARGET, axis=1)
target_values = df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(feature_frame,
                                                    target_values)

print('start timing...')
start_time = time()
tpot = TPOTAutoML(mode=MODE,
from automatminer.preprocess import PreProcess
from matminer import PlotlyFig
from scipy.stats import linregress
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# inputs
target = 'gap expt'
RS = 24  # random seed, used so that the train/test split is reproducible
mode = 'regression'
MULTIINDEX = True
if MULTIINDEX:
    # With multiindex enabled the target is addressed by a
    # ('Input Data', <name>) tuple rather than by a bare column name.
    target = ('Input Data', target)

# Load the dataset and compute all formula-based features, skipping the two
# excluded featurizers.
df_init = load_expt_gap()
featzer = Featurize(exclude=['CohesiveEnergy', 'AtomicPackingEfficiency'],
                    multiindex=MULTIINDEX)

df = featzer.featurize_formula(df_init,
                               featurizers='all',
                               guess_oxidstates=False)

# Preprocess the featurized frame with respect to the target column.
prep = PreProcess(target=target)
df = prep.preprocess(df)

# Show the first rows and keep a copy of the preprocessed data on disk.
print(df.head())
df.to_csv('test.csv')

# Seed the split with RS: without random_state every run would draw a
# different train/test partition and the results would not be comparable.
X_train, X_test, y_train, y_test = train_test_split(df.drop(target, axis=1),
                                                    df[target],
                                                    random_state=RS)
model = RandomForestRegressor(n_estimators=100,
# Working and output directories handed to the model.
model_tmp_path = r'example_data/matbench_data/autosklearn_output/tmp/'
model_output_path = r'example_data/matbench_data/autosklearn_output/output/'

# CSV file, named after the dataset, that holds the featurized data.
feature_output_file = os.path.join(
    feature_output_path,
    "{}_all_featurized_data.csv".format(data_name),
)

# Reuse the featurized data when the CSV already exists; otherwise build it
# from scratch while profiling the featurization step.
if os.path.exists(feature_output_file):
    # The first CSV column is read back as the DataFrame index.
    df = pd.read_csv(feature_output_file, index_col=0)
else:
    df_init = load_glass_ternary_landolt()

    # Start profiling just before featurization/preprocessing.
    # NOTE(review): Profile is presumably cProfile.Profile -- confirm import.
    prof = Profile()
    prof.enable()

    # Compute all formula-based features, then preprocess with
    # max_colnull=0.1.
    featzer = Featurize()
    df_feats = featzer.featurize_formula(df_init, featurizers="all")
    prep = PreProcess(max_colnull=0.1)
    df = prep.preprocess(df_feats)

    # Stop collecting and print the first 5 entries sorted by the "time" key,
    # with directory prefixes stripped from file names.
    prof.create_stats()
    print("featurize time:\n")
    pstats.Stats(prof).strip_dirs().sort_stats("time").print_stats(5)

    # Make sure the output directory exists before anything is written to it.
    if os.path.exists(feature_output_path):
        print("output path: {} exists!".format(feature_output_path))
    else:
        os.makedirs(feature_output_path)
        print("create output path: {} successful!".format(feature_output_path))
    prof.dump_stats(
from automatminer.featurize import Featurize
from automatminer.preprocess import PreProcess
from sklearn.model_selection import train_test_split
from time import time

# user inputs
target = 'gfa'
RS = 29
model_type = 'classification'
scoring = 'f1'
timelimitmins = None
print('timelimitmins = ', timelimitmins)

# load and featurize: compute all formula-based features, leaving the 'phase'
# column out and passing ignore_errors=True to the featurizer.
df_init = load_glass_ternary_landolt()
featzer = Featurize(ignore_cols=['phase'], ignore_errors=True)
df_feats = featzer.featurize_formula(df_init, featurizers='all')

# preprocessing of the data; the result is also written to a CSV named after
# the target.
prep = PreProcess(max_colnull=0.1)
df = prep.preprocess(df_feats)
df.to_csv('{}_tpot_trained_data.csv'.format(target))
print(df.shape)
print(df.head())
# Sanity check: no missing values may remain after preprocessing.
assert df.isnull().sum().sum() == 0

# train/test split (development is within tpot crossvalidation): seeded 75/25
# split; features are passed as a plain array, the target as a column.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(target, axis=1).values,
    df[target],
    train_size=0.75,
    test_size=0.25,
    random_state=RS,
)