def gen_all_lin_model_inp():
    """
    Build the linearised full-model inputs once and persist them.

    The result of ``all_full_model_lin`` (computed from database 0) is
    inserted into database 1 as a single record under the key
    ``'All_Lin_Full_Model_List'``. When database 1 already evaluates as
    truthy the work is skipped and a debug message is emitted instead.
    """
    tag = 'All_Lin_Full_Model_List'
    store = access_db(1, True)

    if not store:
        single_values = access_db(0, True)
        full_models = all_full_model_lin(single_values)
        store.insert({tag: full_models})
    else:
        # Something is already stored: do not regenerate.
        debug('Already generated all linearised full model inputs')
def get_data_req_to_score_model():
    """
    Collect the inputs of ``score_models_per_data_type`` that only need to
    be computed once.

    Returns a 4-tuple ``(sv_db, model, all_full_models, all_model_codes)``:
    the database opened with ``access_db(0, True)``, an intercept-free
    ``LinearRegression``, the stored full-model inputs, and the
    concatenated ``'mc'`` values extracted from the
    ``All_Poss_Mod_<n>_Terms`` databases for n = 1..28.
    """
    query = Query()

    model_codes = []
    for n_terms in range(1, 29):
        db_name = 'All_Poss_Mod_{}_Terms'.format(n_terms)
        candidates = access_db(db_name, False)
        records = candidates.search(query.mc.exists())
        model_codes.extend(extractnames(records, 'mc'))

    single_values = access_db(0, True)
    regressor = LinearRegression(fit_intercept=False)
    full_models = get_all_lin_model_inp()
    return single_values, regressor, full_models, model_codes
def gen_and_score_mod(self, column):
    """Score every valid 1- and 2-term model for ``column`` and store the best.

    Candidate models are generated on the fly as combinations of the 28
    term indices, so no database of all possible models is needed. For
    each term count not yet present in the ``Top_score_results_<equip>_
    <d_type>`` database, the model with the highest mean cross-validation
    score is inserted together with that score.

    ``column`` must be of the form ``'<equipment> <data type>'`` (exactly
    one space), and must be a column of ``self.Ys``.

    Raises:
        ValueError: if no candidate produced a comparable (non-NaN) score
            for a term count, instead of recording an unbound or stale
            model code.
    """
    observed = self.Ys[column].dropna()
    Y = observed.values
    sn_Y = observed.index

    my_cv = ShuffleSplit(len(Y), n_iter=3, test_size=0.333, random_state=0)

    equip, d_type = column.split(' ')
    top_db = access_db('Top_score_results_' + equip + '_' + d_type, False)

    for number_of_terms in (1, 2):
        if top_db.contains(self.Q.n_terms == number_of_terms):
            # Result for this term count was stored by an earlier run.
            continue

        terms_key = gen_terms_key()

        # -inf (not a finite sentinel) so that any real score, however
        # negative, is accepted; reset per term count so a previous
        # winner can never leak into this entry.
        top_score = float('-inf')
        top_mcode = None

        for mcode in combinations(range(28), number_of_terms):
            # Terms with index >= 7 are only allowed when both terms that
            # terms_key lists for them are part of the model as well
            # (presumably interaction terms and their parent terms).
            valid = all(
                terms_key[j][0] in mcode and terms_key[j][1] in mcode
                for j in mcode if j >= 7
            )
            if not valid:
                continue

            # Generate X for this model and score it against Y
            X = gen_X(sn_Y, self.all_full_input, mcode)
            scores = cross_val_score(self.model_obj, X, Y, cv=my_cv)
            score = mean(scores)

            # ">=" keeps the original tie rule (the later model wins);
            # a NaN score never compares true and is therefore ignored.
            if score >= top_score:
                top_score = score
                top_mcode = list(mcode)

        if top_mcode is None:
            raise ValueError(
                'No model with {} terms could be scored for {!r}'.format(
                    number_of_terms, column))

        entry = {
            'equipment_name': equip,
            'data_type': d_type,
            'n_terms': number_of_terms,
            'top_score': top_score,
            'top_mcode': top_mcode
        }
        top_db.insert(entry)
def score_models(column):
    """Score the pre-generated candidate models for ``column`` and store the best.

    For every term count 1..28 that is not yet present in the
    ``Top_score_results_<equip>_<d_type>`` database, the model codes are
    loaded from the pickled ``All_Poss_Mod_<n>_Terms`` file, each model is
    scored with 3-split shuffle cross-validation, and the best model code
    and its mean score are inserted.

    ``column`` must be of the form ``'<equipment> <data type>'`` (exactly
    one space) and must be a column of the frame returned by ``get_Ys``.

    Raises:
        ValueError: if a term count has no candidate with a comparable
            (non-NaN) score, instead of recording an unbound or stale
            model code.
    """
    Ys = get_Ys()
    all_full_input = get_all_lin_model_inp()
    model_obj = LinearRegression(fit_intercept=False)

    observed = Ys[column].dropna()
    Y = observed.values
    sn_Y = observed.index

    my_cv = ShuffleSplit(len(Y), n_iter=3, test_size=0.333, random_state=0)

    equip, d_type = column.split(' ')
    top_db = access_db('Top_score_results_' + equip + '_' + d_type, False)

    for number_of_terms in range(1, 29):
        if top_db.contains(Q.n_terms == number_of_terms):
            # Result for this term count was stored by an earlier run.
            continue

        f_name = 'All_Poss_Mod_{}_Terms'.format(number_of_terms)
        f_obj = access_file(f_name, write=False)
        try:
            # NOTE: unpickling is only safe for trusted files; these are
            # assumed to be produced by this project itself.
            mcodes = cPickle.load(f_obj)
        finally:
            # Close the handle even when unpickling fails.
            f_obj.close()

        # -inf (not a finite sentinel) so that any real score, however
        # negative, is accepted; reset per term count so a previous
        # winner can never leak into this entry.
        top_score = float('-inf')
        top_mcode = None

        for mcode in mcodes:
            # Generate X for this model and score it against Y
            X = gen_X(sn_Y, all_full_input, mcode)
            scores = cross_val_score(model_obj, X, Y, cv=my_cv)
            score = mean(scores)

            # ">=" keeps the original tie rule (the later model wins);
            # a NaN score never compares true and is therefore ignored.
            if score >= top_score:
                top_score = score
                top_mcode = list(mcode)

        if top_mcode is None:
            raise ValueError(
                'No model with {} terms could be scored for {!r}'.format(
                    number_of_terms, column))

        entry = {
            'equipment_name': equip,
            'data_type': d_type,
            'n_terms': number_of_terms,
            'top_score': top_score,
            'top_mcode': top_mcode
        }
        top_db.insert(entry)
def min_max_df(const_list, full=False):
    """
    Return the maximum and minimum of the measurements as a two-column
    frame with columns ``'max'`` and ``'min'``.

    const_list -- selection applied to the measurements when ``full`` is
                  false (presumably a list of column labels; confirm
                  against ``get_msrmnts``)
    full       -- when true, ``const_list`` is ignored and all
                  measurements are used
    """
    single_values = access_db(0, True)
    measurements = get_msrmnts(single_values, Q)

    selected = measurements if full else measurements[const_list]

    extremes = concat([selected.max(), selected.min()], axis=1)
    extremes.columns = ['max', 'min']
    return extremes
def get_mod_info():
    """
    Load all records of database 3 into a DataFrame indexed by
    ``'<equipment_name> <data_type>'``.

    The identifying columns and the statistics columns ``p_vals``,
    ``r_sqrd``, ``select_score`` and ``t_vals`` are dropped from the
    returned frame.
    """
    records = access_db(3, True).all()
    info = DataFrame(records)

    info['name'] = info['equipment_name'] + ' ' + info['data_type']
    info = info.set_index('name')

    unwanted = ['data_type',
                'equipment_name',
                'p_vals',
                'r_sqrd',
                'select_score',
                't_vals']
    return info.drop(unwanted, axis=1)
def preprocessing():
    """
    Feed the raw data of every piece of equipment into the single values
    database by calling ``raw_data_to_db`` on each equipment object.
    """
    single_values = access_db(0, True)

    # All equipment objects are created up front, in this order, before
    # any of them writes to the database.
    sources = (
        equipment.Rheomix(),
        equipment.Thermomat(),
        equipment.Colour(),
        equipment.LOI(),
        equipment.MCC(),
        equipment.ConeCal(),
        equipment.Tensile(),
        equipment.MassFrac(),
    )

    for source in sources:
        source.raw_data_to_db(single_values)
def pca_X(impute=False, exclude_inp=True):
    """
    Build the standardised data matrix used for PCA.

    impute      -- when true, missing values are replaced by the column
                   mean; otherwise rows containing missing values are
                   dropped
    exclude_inp -- when true only the measurements are used; otherwise
                   the pivoted composition columns are included as well

    Returns ``(X_std, use)``: the output of ``StandardScaler`` and the
    DataFrame the matrix was built from.
    """
    single_values = access_db(0, True)
    query = Query()

    # Records that carry an ingredient become one column per
    # '<data_type> <ingredient>', one row per sample number.
    ingredient_rows = DataFrame(single_values.search(query.ingredient.exists()))
    ingredient_rows['name'] = (ingredient_rows['data_type'] + ' ' +
                               ingredient_rows['ingredient'])
    compositions = ingredient_rows[['name', 'sample_number', 'value']].pivot(
        index='sample_number', columns='name', values='value')

    measurements = get_msrmnts(single_values, query)
    alldata = concat([compositions, measurements], axis=1)

    use = measurements if exclude_inp else alldata

    if impute:
        # Keep incomplete rows and fill the gaps with the column means.
        X = use.values.tolist()
        filler = Imputer(missing_values='NaN', strategy='mean', axis=0)
        filler.fit(X)
        X = filler.transform(X)
    else:
        # Incomplete rows are excluded from X.
        use = use.dropna()
        X = use.values.tolist()

    scaled = StandardScaler().fit_transform(X)
    return scaled, use
def get_Ys(do_pca=False):
    """
    Return the response variables as a DataFrame for fitting.

    With ``do_pca`` the frame holds the PCA-transformed data of
    ``pca_X()`` (enough components to explain 99 % of the variance),
    indexed like the frame ``pca_X`` returns and with columns
    ``'PCA Comp_1'``, ``'PCA Comp_2'``, ... Without it, every measurement
    column is rescaled linearly to the range -1 to 1.
    """
    single_values = access_db(0, True)
    measurements = get_msrmnts(single_values, Q)

    if not do_pca:
        shifted = measurements - measurements.min()
        unit = shifted / shifted.max()
        return unit * 2 - 1

    X, source_df = pca_X()
    reducer = PCA(n_components=0.99)
    reducer.fit(X)
    transformed = reducer.transform(X)

    row_labels = list(source_df.index)
    col_labels = []
    for k in range(reducer.n_components_):
        col_labels.append('PCA Comp_' + str(k + 1))

    return DataFrame(transformed, index=row_labels, columns=col_labels)
def get_all_lin_model_inp():
    """
    Return the value stored under ``'All_Lin_Full_Model_List'`` in the
    first record of database 1.
    """
    records = access_db(1, True).all()
    first_record = records[0]
    return first_record['All_Lin_Full_Model_List']
def get_select_models():
    """
    Select the 'best' model per response column from the top models found
    at each number of terms, fit it, and store the result in database 3.

    For each column of ``get_Ys()`` the first stored top score that lies
    above ``max_score - |max_score * 5 / 105|`` is chosen; the matching
    model code is refitted with ``model_stats`` and its parameters,
    R squared, p-values and t-values are written to the model-selection
    database. An existing record for the same equipment/data type is
    updated; otherwise a new record is inserted.

    The "least number of terms" intent relies on the top-score records
    being returned in increasing number of terms (the order in which the
    scoring functions insert them).
    """
    model_select_db = access_db(3, True)
    Ys = get_Ys()
    all_full_input = get_all_lin_model_inp()

    for column in Ys.columns:
        equip, d_type = column.split(' ')
        top_db = access_db('Top_score_results_' + equip + '_' + d_type, False)
        df = DataFrame(top_db.all())
        scores = list(df['top_score'].values)
        mcodes = list(df['top_mcode'].values)
        max_score = max(scores)

        # Select model with least number of terms where prediction improves
        # no more than 5 % at max prediction.
        lim = max_score - (abs(max_score * 5 / 105))

        # First score strictly above the limit. When max_score is 0 the
        # limit equals max_score and nothing is strictly above it, so fall
        # back to the best score instead of leaving select_score unbound
        # or carried over from the previous column.
        select_score = next((s for s in scores if s > lim), max_score)
        select_model = mcodes[scores.index(select_score)]

        observed = Ys[column].dropna()
        Y = observed.values
        sn_Y = observed.index
        X = gen_X(sn_Y, all_full_input, select_model)
        params, _conf_int, r_sqrd, p_vals, t_vals = model_stats(X, Y)

        # Fields shared by the update and the insert path.
        fields = {
            'select_score': select_score,
            'select_mcode': select_model,
            'model_params': list(params),
            'r_sqrd': r_sqrd,
            'p_vals': list(p_vals),
            't_vals': list(t_vals)
        }

        my_Q = ((Q.equipment_name == equip) & (Q.data_type == d_type))
        if model_select_db.contains(my_Q):
            model_select_db.update(fields, my_Q)
        else:
            entry = {'equipment_name': equip, 'data_type': d_type}
            entry.update(fields)
            model_select_db.insert(entry)