def split_by_file(run_id, train_fname, test_fname):
    """ The method splits the training data by file """
    logger = custom_logger.CustomLogger(run_id + ':' + file_id)
    train_df = load_data.load_from_tsv(train_fname, 'training')
    test_df = load_data.load_from_tsv(test_fname, 'test')
    return train_df, test_df
def unpickle_model(model_fname):
    """ Load model from 'config.info.model_file' """
    logger = custom_logger.CustomLogger(result_filing.run_id + ':' + file_id)
    if model_fname == '':
        logger.error('Filepath for model is not provided')
        sys.exit('Filepath for model is not provided')
    # context manager ensures the file handle is closed after loading
    with open(model_fname, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    return loaded_model
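# A minimal sketch of the saving counterpart, so the pickle round trip is
# visible; 'pickle_model' is a hypothetical helper, not part of this module.
def pickle_model(trained_model, model_fname):
    """ Persist a trained model so unpickle_model() can reload it """
    with open(model_fname, 'wb') as model_file:
        pickle.dump(trained_model, model_file)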
def save_meta_file(gen_dict, f_name):
    """ Write train_model or test_results metadata to the specified file """
    logger = custom_logger.CustomLogger(run_id + ':' + file_id)
    filename = run_id + '_' + f_name + '.meta'
    out_path = os.path.join(unique_op_dir, filename)
    print('Output stored in %s' % (str(out_path)))
    logger.info('Output stored in %s' % (str(out_path)))
    # context manager ensures the file is flushed and closed
    with open(out_path, 'a') as f:
        for key, val in gen_dict.items():
            f.write(str(key) + " : " + str(val) + "\n")
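# A sketch of reading a '.meta' file back, assuming the "key : value" line
# format written above; 'load_meta_file' is a hypothetical helper and all
# values come back as strings (types are not recovered).
def load_meta_file(meta_path):
    """ Parse a .meta file written by save_meta_file() into a dict """
    meta = dict()
    with open(meta_path) as f:
        for line in f:
            key, _, val = line.partition(" : ")
            meta[key] = val.rstrip("\n")
    return meta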
def split_by_date(run_id, train_fname, split_date):
    """ The method splits the training data by date """
    logger = custom_logger.CustomLogger(run_id + ':' + file_id)
    if split_date == '':
        logger.error('No split date provided')
        sys.exit('No split date provided')
    df_intraday = load_data.load_from_tsv(train_fname, 'training')
    date_start = str(df_intraday.head(1).index.date[0])
    date_end = str(df_intraday.tail(1).index.date[0])
    # training range runs from the first timestamp up to and including split_date
    train_df = df_intraday[date_start:split_date]
    # test range starts the day after the split date
    date_split_1 = datetime.strptime(split_date, "%Y-%m-%d") + timedelta(days=1)
    test_df = df_intraday[date_split_1:date_end]
    return train_df, test_df
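# pandas slices a DatetimeIndex by label inclusively at both ends, which is
# why split_by_date() starts the test range one day after split_date. A
# self-contained sketch with synthetic data (values are illustrative only):
import pandas as pd

_idx = pd.date_range('2021-01-01', periods=6, freq='D')
_df = pd.DataFrame({'weight_avg': range(6)}, index=_idx)
_train = _df['2021-01-01':'2021-01-03']  # includes all of Jan 3
_test = _df['2021-01-04':'2021-01-06']   # starts the day after the split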
def main():
    """ The start of the flow. Handles all initializations and configuration
    loading, and performs training or deployment based upon
    'config.info.operation_type'. For more details on configuration options,
    look up the comments in config.yaml """
    parser = get_parser()
    config = parser.parse_args(['--cfg', 'config.yaml'])
    result_filing.init_config_vars(config)
    run_id = config.info.run_id
    logger = custom_logger.CustomLogger(run_id + ':' + file_id)
    operation = config.info.operation_type
    logger.info("Selected operation type %s." % (operation))
    if operation == const.TRAIN_OP:
        train.train_model(config)
    elif operation == const.DEPLOY_OP:
        test.test_model(config)
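# A hypothetical sketch of the config.yaml keys this flow reads (key paths
# are taken from the code; every value below is a placeholder, and the
# actual constants behind const.TRAIN_OP / const.DEPLOY_OP may differ):
#
#   info:
#     run_id: run_001
#     operation_type: train        # const.TRAIN_OP or const.DEPLOY_OP
#     m: 12                        # past samples used as model input
#     n: 3                         # future steps to predict
#     model_file: model.pkl
#   train_test_split:
#     test: test_data.tsv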
def test_model(config):
    """ The method loads the given model and test data, and saves the test
    results at the specified output directory. """
    run_id = config.info.run_id
    logger = custom_logger.CustomLogger(run_id + ':' + file_id)
    test_df = load_data.load_from_tsv(config.train_test_split.test, 'testing')
    if not test_df.empty:
        logger.info('The data is loaded successfully')
    else:
        logger.error('Empty dataframe loaded')
        sys.exit('Empty dataframe loaded')
    print('Test df : %s' % (str(test_df.shape)))
    # m past samples to consider for prediction
    m = config.info.m
    # n next steps to predict
    n = config.info.n
    model_file = config.info.model_file
    logger.info('Test is to predict next %d steps using past %d steps using model %s.'
                % (n, m, model_file))
    test_dict = dict()
    X_test, y_test = load_data.create_custom_data_structure(test_df, m, n)
    test_dict['X_test_shape'] = X_test.shape
    test_dict['y_test_shape'] = y_test.shape
    print('X_test : %s and y_test : %s' % (str(X_test.shape), str(y_test.shape)))
    has_null = y_test.isnull().sum().sum() + X_test.isnull().sum().sum()
    if not has_null:
        logger.info('Successfully built custom data structure for (%d input steps, %d output steps) supervised prediction'
                    % (m, n))
    else:
        logger.error('Built custom dataframes have %d NaN values' % (has_null))
        sys.exit('Built custom dataframes have %d NaN values' % (has_null))
    model = model_manipulation.unpickle_model(model_file)
    y_test_predict = model.predict(X_test)
    avg_test_error = explained_variance_score(y_test, y_test_predict,
                                              multioutput='uniform_average')
    test_dict['avg_test_error'] = avg_test_error
    result_filing.save_meta_file(test_dict, 'test_results')
    logger.info('Loaded model predicts with %f average explained variance score.'
                % (avg_test_error))
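# explained_variance_score is a goodness-of-fit measure (best value 1.0), and
# multioutput='uniform_average' averages the per-step scores across the n
# predicted outputs. A self-contained sketch with made-up numbers:
import numpy as np
from sklearn.metrics import explained_variance_score

_y_true = np.array([[1.0, 2.0], [2.0, 4.0], [3.0, 6.0]])
_y_pred = np.array([[1.1, 2.0], [1.9, 4.1], [3.0, 5.9]])
_score = explained_variance_score(_y_true, _y_pred,
                                  multioutput='uniform_average')
# _score is close to 1.0 because the predictions track the targets closely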
def load_from_tsv(filename, data_type):
    """ The method loads a tsv file """
    # run_id is a global variable
    logger = custom_logger.CustomLogger(result_filing.run_id + ':' + file_id)
    if filename == '':
        logger.error('Filepath for %s is not provided' % (data_type))
        sys.exit('Filepath for %s is not provided' % (data_type))
    loaded_df = pd.read_csv(filename, sep="\t",
                            infer_datetime_format=True,
                            parse_dates=['delivery_start'],
                            index_col=['delivery_start'])
    # NaN check
    has_null = loaded_df.isnull().sum().sum()
    if not has_null:
        logger.info('No NaN values in dataframe loaded from %s' % (filename))
    else:
        logger.error('Dataframe loaded from %s has %d NaN values' % (filename, has_null))
        sys.exit('Dataframe loaded from %s has %d NaN values' % (filename, has_null))
    return loaded_df
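# The loader expects a tab-separated file whose 'delivery_start' column
# becomes the DatetimeIndex. A hypothetical layout (placeholder values;
# visualize_train_data() below additionally expects 'low', 'high' and
# 'weight_avg' columns):
#
#   delivery_start        low    high   weight_avg
#   2021-01-01 00:00:00   30.1   45.8   38.2
#   2021-01-01 01:00:00   28.4   44.0   36.9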
def visualize_train_data(train_df, fname):
    """ Visualize the time series input """
    logger = custom_logger.CustomLogger(run_id + ':' + file_id)
    fig, axs = plt.subplots(3, figsize=(15, 15))
    fig.suptitle('EPEX Intraday Continuous market electricity prices')
    axs[0].plot(train_df.index, train_df['low'], color='red')
    axs[0].set_title("Lowest Price")
    axs[0].set(xlabel='time', ylabel='price (Euros)')
    axs[1].plot(train_df.index, train_df['high'], color='green')
    axs[1].set_title("Highest Price")
    axs[1].set(xlabel='time', ylabel='price (Euros)')
    axs[2].plot(train_df.index, train_df['weight_avg'], color='blue')
    axs[2].set_title("Volume-weighted Average Price")
    axs[2].set(xlabel='time', ylabel='price (Euros)')
    fig.savefig(os.path.join(unique_op_dir, fname))
    logger.info('Training data plots stored at %s' % (os.path.join(unique_op_dir, fname)))