Example #1
def init_system():
    global count, total_to_download, mode, app_name, data_splits
    debug.debug_text('Initialising System', level_1=True)
    count, total_to_download = 1, 1

    collect_paths()

    debug.debug_text('Setting up logging file', update=True)
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)

    app_name = paths['Output'].parents[1].name
    logging_name = app_name + '.log'
    logging.basicConfig(filename=(paths['Output'] / logging_name).as_posix(),
                        level=logging.DEBUG,
                        format='%(asctime)s %(message)s',
                        filemode='w')
    logging.info('Logging for {} stored at {}'.format(app_name,
                                                      paths['Output']))

    create_models()

    # track what data needs to be produced and stored
    if val_per > 0:
        data_splits = ['train', 'test', 'val']
    else:
        data_splits = ['train', 'test']

    # set up output csv

    return
Example #2
def store_results(single_model):
    debug.debug_text('Storing results for {}'.format(
        single_model.model_to_run),
                     update=True)
    single_model.store_result()
    single_model.display_state()
    return
Example #3
def one_hot_encode(df, one_hot_columns=con.one_hot_columns):
    debug.debug_text('one hot encoding {}'.format(one_hot_columns), update=True)
    for column in one_hot_columns:
        dfDummies = pd.get_dummies(df[column], prefix=column)
        df = pd.concat([df, dfDummies], axis=1)
        df.drop(columns=[column], inplace=True)

    return df
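
A minimal standalone sketch of what the loop does with pd.get_dummies; the 'Embarked' column and the toy values are illustrative only, not taken from the project's con.one_hot_columns:

import pandas as pd

# toy frame with one categorical column (illustrative data only)
df = pd.DataFrame({'Embarked': ['S', 'C', 'S'], 'Fare': [7.25, 71.28, 8.05]})

# same pattern as one_hot_encode: dummies prefixed with the source column name,
# concatenated onto the frame, then the original column dropped
dummies = pd.get_dummies(df['Embarked'], prefix='Embarked')
df = pd.concat([df, dummies], axis=1).drop(columns=['Embarked'])
print(df.columns.tolist())  # ['Fare', 'Embarked_C', 'Embarked_S']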
Example #4
def create_models():
    global models
    debug.debug_text('Creating list of models', update=True)
    for key, values in models_to_evaulate.items():
        for value in values:
            model_to_run, module = key.split('|')
            models.append(
                v.model(model_to_run=model_to_run, params=value,
                        module=module))
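
create_models assumes models_to_evaulate maps 'ClassName|module.path' keys to lists of parameter dicts. A hypothetical dict of that shape (the model names and parameters below are placeholders, not the project's actual config):

# assumed shape of the config dict consumed by create_models
models_to_evaulate = {
    'LogisticRegression|sklearn.linear_model': [{'C': 1.0}, {'C': 0.1}],
    'RandomForestClassifier|sklearn.ensemble': [{'n_estimators': 100}],
}

for key, values in models_to_evaulate.items():
    model_to_run, module = key.split('|')
    for params in values:
        print(module, model_to_run, params)  # one model instance per parameter set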
Example #5
def run_models():
    for single_model in con.models:
        debug.debug_text('Running model -> {}'.format(single_model.name),
                         level_2=True)
        train_model(single_model)
        predict_results(single_model)
        validate_model(single_model)
        store_results(single_model)
    return
Example #6
def predict_results(single_model):
    debug.debug_text('Predicting {} with {}'.format(single_model.model_to_run,
                                                    single_model.params),
                     update=True)
    # predict
    for val in con.data_splits:
        single_model.predicted[val] = single_model.trained_model.predict(
            con.data_dic['x_' + val])

    return
Example #7
def train_model(single_model):
    debug.debug_text('Training {} with {}'.format(single_model.model_to_run,
                                                  single_model.params),
                     update=True)
    # train model based on class values
    module = importlib.import_module(single_model.module)
    model_to_run = getattr(module, single_model.model_to_run)
    # run model based on function and train
    single_model.trained_model = model_to_run(**single_model.params).fit(
        con.data_dic['x_train'], con.data_dic['y_train'])
    return
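
A self-contained sketch of the dynamic-import pattern train_model relies on; the module path, class name, and toy data are assumptions for illustration only:

import importlib
from sklearn.datasets import make_classification

# resolve the class from its module path and name, as train_model does;
# 'sklearn.linear_model' / 'LogisticRegression' are placeholders, not project config
module = importlib.import_module('sklearn.linear_model')
model_cls = getattr(module, 'LogisticRegression')

x, y = make_classification(n_samples=60, random_state=0)
trained = model_cls(C=1.0).fit(x, y)  # mirrors model_to_run(**single_model.params).fit(...)
print(type(trained).__name__)         # LogisticRegression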
Example #8
def collect_paths():
    global paths
    debug.debug_text('Collecting Paths', update=True)
    basePath = Path(__file__).parents[2]  # get base path
    paths = {
        'Base': basePath,
        'Input': (basePath / 'Input'),
        'Output': (basePath / 'Output'),
        'Raw_Files': (basePath / 'Raw_Files')
    }
    return
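
Path(__file__).parents[2] implies the module sits two directory levels below the project root; a quick sketch of the resulting dictionary with a stand-in base path (the layout is assumed, not confirmed by the snippet):

from pathlib import Path

base = Path('/project')  # stand-in for Path(__file__).parents[2]
paths = {
    'Base': base,
    'Input': base / 'Input',
    'Output': base / 'Output',
    'Raw_Files': base / 'Raw_Files',
}
print(paths['Input'].as_posix())  # /project/Input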
Example #9
def load_file(name, type):
    debug.debug_text('loading file', update=True)
    loc = con.paths['Input'] / (name + '.' + type)
    if type == 'csv':
        return pd.read_csv(loc)
    elif type == 'excel':
        return pd.read_excel(loc)
    else:
        debug.debug_text('Incorrect file type', error=True)
        exit()
Example #10
def replace_missing(df, field_with_missing_dic=con.field_with_missing_dic):
    debug.debug_text('replacing missing values for {}'.format(list(field_with_missing_dic.keys())), update=True)
    for key, value in field_with_missing_dic.items():
        if value == 'Mean':
            df[key].fillna(df[key].mean(), inplace=True)
        elif value == 'Median':
            df[key].fillna(df[key].median(), inplace=True)
        elif value == '0':
            df[key].fillna(0, inplace=True)
        else:
            df[key].fillna(value, inplace=True)

    return df
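
A small demonstration of the fill strategies using an illustrative frame and mapping (not the project's con.field_with_missing_dic):

import numpy as np
import pandas as pd

# toy frame and strategy mapping, for illustration only
df = pd.DataFrame({'Age': [22.0, np.nan, 30.0], 'Cabin': [np.nan, 'C85', np.nan]})
strategies = {'Age': 'Median', 'Cabin': 'Unknown'}

for key, value in strategies.items():
    if value == 'Mean':
        df[key] = df[key].fillna(df[key].mean())
    elif value == 'Median':
        df[key] = df[key].fillna(df[key].median())
    elif value == '0':
        df[key] = df[key].fillna(0)
    else:
        df[key] = df[key].fillna(value)

print(df)  # Age NaN -> 26.0 (median), Cabin NaN -> 'Unknown'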
Example #11
def write_to_output_csv(df):
    file_name = app_name + '_output.csv'
    check_dir((paths['Output'] / file_name).as_posix())

    with open((paths['Output'] / file_name).as_posix(), 'a') as f:
        df.to_csv(f, header=f.tell() == 0, sep=',', index=False)

    count = len(df)
    debug.debug_text("{} records saved to {}".format(count, file_name),
                     update=True)

    return
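
The header=f.tell() == 0 check is what writes the header only while the file is still empty; a throwaway sketch of the same append pattern (the file name here is hypothetical):

import pandas as pd

df = pd.DataFrame({'model': ['a', 'b'], 'score': [0.8, 0.9]})

# append twice: the header is written only on the first pass, while tell() is 0
for _ in range(2):
    with open('demo_output.csv', 'a') as f:
        df.to_csv(f, header=f.tell() == 0, index=False)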
Example #12
def split_data(df, target=con.target, train_per=con.train_per, test_per=con.test_per, val_per=con.val_per):
    debug.debug_text('Splitting Data: Train ({}), Test ({}), Val ({}) with a target field of {}'.format(
        train_per, test_per, val_per, target), update=True)
    x = df.drop(columns=[target])
    y = df[target]

    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=train_per)

    if val_per > 0:
        # carve the validation set out of the held-out portion
        x_test, x_val, y_test, y_val = train_test_split(
            x_test, y_test, train_size=test_per / (test_per + val_per))

        debug.debug_text(
            'Sizes ({}): x_train ({}), x_test ({}), x_val ({}), y_train ({}), y_test ({}), y_val ({})'.format(
                len(df), len(x_train), len(x_test), len(x_val),
                len(y_train), len(y_test), len(y_val)),
            update=True)

        return {'x_train': x_train, 'x_test': x_test, 'x_val': x_val,
                'y_train': y_train, 'y_test': y_test, 'y_val': y_val}
    else:
        debug.debug_text(
            'Sizes ({}): x_train ({}), x_test ({}), y_train ({}), y_test ({})'.format(
                len(df), len(x_train), len(x_test), len(y_train), len(y_test)),
            update=True)

        return {'x_train': x_train, 'x_test': x_test, 'y_train': y_train, 'y_test': y_test}
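
The second train_test_split keeps test_per / (test_per + val_per) of the held-out rows as the test set, so a 70/15/15 split halves the remaining 30%. A quick check with assumed percentages and toy data (not read from con):

import pandas as pd
from sklearn.model_selection import train_test_split

train_per, test_per, val_per = 0.7, 0.15, 0.15
df = pd.DataFrame({'f': range(100), 'y': [0, 1] * 50})

x, y = df[['f']], df['y']
x_train, x_rest, y_train, y_rest = train_test_split(x, y, train_size=train_per)
x_test, x_val, y_test, y_val = train_test_split(
    x_rest, y_rest, train_size=test_per / (test_per + val_per))  # 0.15 / 0.30 = 0.5

print(len(x_train), len(x_test), len(x_val))  # 70 15 15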
Example #13
def rename_column(df, name_dic=con.name_change_dic):
    debug.debug_text('renaming columns {}'.format(list(name_dic.keys())), update=True)
    return df.rename(columns=name_dic)
Example #14
def validate_model(single_model):
    debug.debug_text('Validating {}'.format(single_model.model_to_run),
                     update=True)
    single_model.validate()
    return
Example #15
def normalise_fields(df):
    debug.debug_text('normalising fields', update=True)
    x = df.values  # returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    return pd.DataFrame(x_scaled, columns=df.columns, index=df.index)
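
A minimal sketch of what MinMaxScaler does to each column, using an illustrative frame rather than the project's data:

import pandas as pd
from sklearn import preprocessing

# toy numeric frame; MinMaxScaler maps each column independently to [0, 1]
df = pd.DataFrame({'Age': [20.0, 30.0, 40.0], 'Fare': [10.0, 20.0, 50.0]})
scaled = preprocessing.MinMaxScaler().fit_transform(df.values)
print(pd.DataFrame(scaled, columns=df.columns, index=df.index))
# Age becomes 0.0, 0.5, 1.0; Fare becomes 0.0, 0.25, 1.0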
Example #16
def change_index_column(df, index_column=con.index_column):
    debug.debug_text('resetting index to {}'.format(index_column), update=True)
    return df.set_index(index_column)
Example #17
def replace_values(df, replace_field_with_dic=con.replace_field_with_dic):
    debug.debug_text('replacing values for {}'.format(list(replace_field_with_dic.keys())), update=True)
    for key, value in replace_field_with_dic.items():
        df[key].replace(value, inplace=True)

    return df
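
replace_values expects con.replace_field_with_dic to map column names to {old: new} dictionaries; a tiny illustrative example of that pattern (column name and mapping are placeholders):

import pandas as pd

df = pd.DataFrame({'Sex': ['male', 'female', 'male']})
df['Sex'] = df['Sex'].replace({'male': 0, 'female': 1})
print(df['Sex'].tolist())  # [0, 1, 0]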
Example #18
def display_state(self):
    debug.debug_text('Model is Currently', level_4=True)
    attrs = vars(self)
    debug.debug_text(',\n'.join("%s: %s" % item for item in attrs.items()),
                     results=True)
    return
Example #19
File: main.py  Project: camwas92/utilities
# main file, runs the core components of the pipeline

from frameworks import data_prep as dp
from frameworks import modelling as m
from frameworks import validation as v
from utilities import config as con
from utilities import debugging as debug

# initialise system
con.init_system()

#################
# DATA CLEANING #
#################
debug.debug_text('Pre-processing Data', level_1=True)
# 1) Load data
df = dp.load_file(name='titanic', type='csv')
# 2) rename fields
df = dp.rename_column(df)
# 3) remove fields
df = dp.remove_columns(df)
# 4) replace missing
df = dp.replace_missing(df)
# 5) replace values
df = dp.replace_values(df)
# 6) one hot encoding
df = dp.one_hot_encode(df)
# 7) change index column
df = dp.change_index_column(df)
# 8) normalise fields
df = dp.normalise_fields(df)
Example #20
def remove_columns(df, required_field_list=con.required_field_list):
    to_remove = list(set(list(df.columns)) - set(required_field_list))
    debug.debug_text('removing columns {}'.format(to_remove), update=True)
    return df[required_field_list]