def init_system():
    global count, total_to_download, mode, app_name, data_splits
    debug.debug_text('Initialising System', level_1=True)
    count, total_to_download = 1, 1
    collect_paths()
    debug.debug_text('Setting up logging file', update=True)
    # remove any handlers left over from a previous run
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    app_name = paths['Output'].parents[1].name
    logging_name = app_name + '.log'
    logging.basicConfig(filename=(paths['Output'] / logging_name).as_posix(),
                        level=logging.DEBUG,
                        format='%(asctime)s %(message)s',
                        filemode='w')
    logging.info('Logging for {} stored at {}'.format(app_name, paths['Output']))
    create_models()
    # track which data splits need to be produced and stored
    if val_per > 0:
        data_splits = ['train', 'test', 'val']
    else:
        data_splits = ['train', 'test']
    # set up output csv
    return

def store_results(single_model):
    debug.debug_text('Storing results for {}'.format(single_model.model_to_run),
                     update=True)
    single_model.store_result()
    single_model.display_state()
    return

def one_hot_encode(df, one_hot_columns=con.one_hot_columns):
    debug.debug_text('one hot encoding {}'.format(one_hot_columns), update=True)
    for column in one_hot_columns:
        # expand each categorical column into indicator columns,
        # then drop the original column
        df_dummies = pd.get_dummies(df[column], prefix=column)
        df = pd.concat([df, df_dummies], axis=1)
        df.drop(columns=[column], inplace=True)
    return df

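# For illustration, a minimal sketch of the transformation above (the column
# name is hypothetical, not taken from the project config):
#
#   df = pd.DataFrame({'Sex': ['male', 'female', 'male']})
#   one_hot_encode(df, one_hot_columns=['Sex'])
#   # -> gains 'Sex_female' and 'Sex_male' indicator columns,
#   #    and the original 'Sex' column is dropped
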
def create_models():
    global models
    debug.debug_text('Creating list of models', update=True)
    # each key encodes the model class and its module as 'ModelName|module.path';
    # each value is a list of parameter dictionaries to evaluate
    for key, values in models_to_evaluate.items():
        for value in values:
            model_to_run, module = key.split('|')
            models.append(
                v.model(model_to_run=model_to_run, params=value, module=module))

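# A minimal sketch of the expected models_to_evaluate shape, inferred from the
# 'ModelName|module' key split above (the entries shown are hypothetical):
#
#   models_to_evaluate = {
#       'LogisticRegression|sklearn.linear_model': [{'C': 1.0}, {'C': 0.1}],
#       'RandomForestClassifier|sklearn.ensemble': [{'n_estimators': 100}],
#   }
#
# Each parameter dictionary becomes one model instance to train and compare.
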
def run_models():
    for single_model in con.models:
        debug.debug_text('Running model -> {}'.format(single_model.name),
                         level_2=True)
        train_model(single_model)
        predict_results(single_model)
        validate_model(single_model)
        store_results(single_model)
    return

def predict_results(single_model):
    debug.debug_text('Predicting {} with {}'.format(single_model.model_to_run,
                                                    single_model.params),
                     update=True)
    # predict
    for val in con.data_splits:
        single_model.predicted[val] = single_model.trained_model.predict(
            con.data_dic['x_' + val])
    return

def train_model(single_model):
    debug.debug_text('Training {} with {}'.format(single_model.model_to_run,
                                                  single_model.params),
                     update=True)
    # resolve the model class dynamically from its module and class name
    module = importlib.import_module(single_model.module)
    model_to_run = getattr(module, single_model.model_to_run)
    # instantiate with the configured parameters and fit on the training split
    single_model.trained_model = model_to_run(**single_model.params).fit(
        con.data_dic['x_train'], con.data_dic['y_train'])
    return

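# A hedged illustration of the dynamic lookup above: with a config entry such
# as 'RandomForestClassifier|sklearn.ensemble' (hypothetical), the two lines
# resolve to the same class as a direct import:
#
#   module = importlib.import_module('sklearn.ensemble')
#   model_to_run = getattr(module, 'RandomForestClassifier')
#   # equivalent to: from sklearn.ensemble import RandomForestClassifier
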
def collect_paths():
    global paths
    debug.debug_text('Collecting Paths', update=True)
    base_path = Path(__file__).parents[2]  # get base path
    paths = {
        'Base': base_path,
        'Input': (base_path / 'Input'),
        'Output': (base_path / 'Output'),
        'Raw_Files': (base_path / 'Raw_Files')
    }
    return

def load_file(name, file_type):
    debug.debug_text('loading file', update=True)
    loc = con.paths['Input'] / (name + '.' + file_type)
    if file_type == 'csv':
        return pd.read_csv(loc)
    elif file_type == 'excel':
        return pd.read_excel(loc)
    else:
        debug.debug_text('Incorrect file type', error=True)
        exit()

def replace_missing(df, field_with_missing_dic=con.field_with_missing_dic):
    debug.debug_text('replacing missing values for {}'.format(
        list(field_with_missing_dic.keys())), update=True)
    for key, value in field_with_missing_dic.items():
        if value == 'Mean':
            df[key].fillna(df[key].mean(), inplace=True)
        elif value == 'Median':
            df[key].fillna(df[key].median(), inplace=True)
        elif value == '0':
            df[key].fillna(0, inplace=True)
        else:
            # any other value is used as the literal replacement
            df[key].fillna(value, inplace=True)
    return df

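# A sketch of the expected field_with_missing_dic shape (column names are
# hypothetical): the value selects the fill strategy, with anything other than
# 'Mean', 'Median' or '0' used as the literal replacement.
#
#   field_with_missing_dic = {
#       'Age': 'Median',
#       'Fare': 'Mean',
#       'Embarked': 'S',   # literal fill value
#   }
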
def write_to_output_csv(df):
    file_name = app_name + '_output.csv'
    check_dir((paths['Output'] / file_name).as_posix())
    with open((paths['Output'] / file_name).as_posix(), 'a') as f:
        # only write the header if the file is empty
        df.to_csv(f, header=f.tell() == 0, sep=',', index=False)
    count = len(df)
    debug.debug_text('{} records saved to {}'.format(count, file_name),
                     update=True)
    return

def split_data(df, target=con.target, train_per=con.train_per,
               test_per=con.test_per, val_per=con.val_per):
    debug.debug_text('Splitting Data: Train ({}), Test ({}), Val ({}) with a target field of {}'.format(
        train_per, test_per, val_per, target), update=True)
    x = df.drop(columns=[target])
    y = df[target]
    x_val, y_val = '', ''
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        train_size=train_per)
    if val_per > 0:
        # carve the validation set out of the test split, keeping the
        # configured test/val proportions
        x_test, x_val, y_test, y_val = train_test_split(
            x_test, y_test, train_size=test_per / (test_per + val_per))
    debug.debug_text(
        'Sizes ({}): x_train ({}), x_test ({}), x_val ({}), y_train ({}), y_test ({}), y_val ({})'.format(
            len(df), len(x_train), len(x_test), len(x_val),
            len(y_train), len(y_test), len(y_val)), update=True)
    if val_per > 0:
        return {'x_train': x_train, 'x_test': x_test, 'x_val': x_val,
                'y_train': y_train, 'y_test': y_test, 'y_val': y_val}
    return {'x_train': x_train, 'x_test': x_test,
            'y_train': y_train, 'y_test': y_test}

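# Worked example of the second split above (proportions are illustrative,
# assuming train_per=0.7, test_per=0.2, val_per=0.1): after the first split,
# 30% of the rows remain; train_size = 0.2 / (0.2 + 0.1) ≈ 0.667 keeps two
# thirds of that remainder as test (0.2 of the whole) and leaves 0.1 as
# validation.
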
def rename_column(df, name_dic=con.name_change_dic):
    debug.debug_text('renaming columns {}'.format(list(name_dic.keys())),
                     update=True)
    return df.rename(columns=name_dic)

def validate_model(single_model):
    debug.debug_text('Validating {}'.format(single_model.model_to_run),
                     update=True)
    single_model.validate()
    return

def normalise_fields(df):
    debug.debug_text('normalising fields', update=True)
    x = df.values  # returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    return pd.DataFrame(x_scaled, columns=df.columns, index=df.index)

def change_index_column(df, index_column=con.index_column):
    debug.debug_text('resetting index to {}'.format(index_column), update=True)
    return df.set_index(index_column)

def replace_values(df, replace_field_with_dic=con.replace_field_with_dic):
    debug.debug_text('replacing values for {}'.format(
        list(replace_field_with_dic.keys())), update=True)
    for key, value in replace_field_with_dic.items():
        df[key].replace(value, inplace=True)
    return df

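# A sketch of the expected replace_field_with_dic shape (entries are
# hypothetical): each key is a column and each value is the mapping handed to
# pandas.Series.replace.
#
#   replace_field_with_dic = {
#       'Embarked': {'S': 0, 'C': 1, 'Q': 2},
#   }
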
def display_state(self):
    debug.debug_text('Model is Currently', level_4=True)
    attrs = vars(self)
    debug.debug_text(',\n'.join("%s: %s" % item for item in attrs.items()),
                     results=True)
    return

# main file, undertakes the core components
from frameworks import data_prep as dp
from frameworks import modelling as m
from frameworks import validation as v
from utilities import config as con
from utilities import debugging as debug

# initialise system
con.init_system()

#################
# DATA CLEANING #
#################
debug.debug_text('Pre-processing Data', level_1=True)

# 1) Load data
df = dp.load_file(name='titanic', file_type='csv')

# 2) rename fields
df = dp.rename_column(df)

# 3) remove fields
df = dp.remove_columns(df)

# 4) replace missing
df = dp.replace_missing(df)

# 5) replace values
df = dp.replace_values(df)

# 6) one hot encoding
df = dp.one_hot_encode(df)

# 7) change index column
df = dp.change_index_column(df)

# 8) normalise fields
df = dp.normalise_fields(df)

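# A plausible continuation of the pipeline, not shown in this section: the
# modelling code reads the splits from con.data_dic, so a wiring along these
# lines is assumed (names are taken from the functions above).
#
#   con.data_dic = dp.split_data(df)
#   m.run_models()
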
def remove_columns(df, required_field_list=con.required_field_list):
    to_remove = list(set(df.columns) - set(required_field_list))
    debug.debug_text('removing columns {}'.format(to_remove), update=True)
    return df[required_field_list]