def tune_fitness_function(params, **kwargs):
    '''Fitness function used by ABC

    Args:
        params (list): hyperparameters selected by a bee, in order: beta_1,
            beta_2, decay, epsilon, learning_rate, then one size per hidden
            layer
        kwargs (dict): additional arguments

    Returns:
        float: error of NN with supplied hyperparams
    '''

    vars = default_config()
    vars['beta_1'] = params[0]
    vars['beta_2'] = params[1]
    vars['decay'] = params[2]
    vars['epsilon'] = params[3]
    vars['learning_rate'] = params[4]
    vars['hidden_layers'] = kwargs['hidden_layers']
    for l_idx in range(len(vars['hidden_layers'])):
        vars['hidden_layers'][l_idx][0] = params[5 + l_idx]
    df = kwargs['df']
    if kwargs['shuffle'] is not None:
        df.shuffle(kwargs['shuffle'], kwargs['split'])
    sets = df.package_sets()
    return train_model(sets, vars, kwargs['eval_set'], kwargs['eval_fn'],
                       validate=kwargs['validate'], save=False)
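
# A minimal usage sketch (not part of the library source): evaluating a single
# ABC candidate by hand with tune_fitness_function. The import path and the
# database filename are assumptions; the parameter ordering follows the
# indexing in the function above.
if __name__ == '__main__':

    from ecnet.utils.data_utils import DataFrame  # assumed module path

    # beta_1, beta_2, decay, epsilon, learning_rate, hidden layer 1/2 sizes
    candidate = [0.9, 0.999, 0.0, 1e-8, 0.001, 32, 32]
    df = DataFrame('my_data.csv')  # hypothetical ECNet-formatted database
    df.create_sets(random=True)
    error = tune_fitness_function(
        candidate,
        df=df,
        hidden_layers=default_config()['hidden_layers'],
        shuffle=None,
        split=[0.7, 0.2, 0.1],
        eval_set='valid',
        eval_fn='med_abs_error',
        validate=True
    )
    print('Candidate error: {}'.format(error))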
def test_init(self):

    print('\nUNIT TEST: Server init')
    sv = Server()
    self.assertTrue(exists('config.yml'))
    self.assertEqual(sv._vars, default_config())
    remove('config.yml')
def test_use_model(self):

    print('\nUNIT TEST: use_model')
    df = data_utils.DataFrame(DB_LOC)
    df.create_sets(random=True)
    pd = df.package_sets()
    config = server_utils.default_config()
    config['epochs'] = 100
    _ = server_utils.train_model(pd, config, 'test', 'rmse',
                                 filename='test_use.h5')
    self.assertEqual(
        len(server_utils.use_model(pd, 'learn', 'test_use.h5')),
        len(pd.learn_y))
    self.assertEqual(
        len(server_utils.use_model(pd, 'valid', 'test_use.h5')),
        len(pd.valid_y))
    self.assertEqual(
        len(server_utils.use_model(pd, 'test', 'test_use.h5')),
        len(pd.test_y))
    self.assertEqual(
        len(server_utils.use_model(pd, 'train', 'test_use.h5')),
        len(pd.learn_y) + len(pd.valid_y))
    self.assertEqual(
        len(server_utils.use_model(pd, None, 'test_use.h5')),
        len(pd.learn_y) + len(pd.valid_y) + len(pd.test_y))
    remove('test_use.h5')
def test_check_config(self):

    print('\nUNIT TEST: check_config')
    dc = server_utils.default_config()
    del dc['batch_size']
    self.assertFalse('batch_size' in list(dc.keys()))
    dc = server_utils.check_config(dc)
    self.assertTrue('batch_size' in list(dc.keys()))
    self.assertEqual(dc['batch_size'], 32)
def optimize_ecnet(param_dict, args):
    '''Fitness function used during hyperparameter optimization: trains a
    model with the supplied hyperparameters and returns its test-set RMSE

    Args:
        param_dict (dict): hyperparameter objects (each exposing a `.value`
            attribute), keyed by hyperparameter name
        args (dict): additional arguments; requires a `dataframe` entry

    Returns:
        float: test-set RMSE of the trained model
    '''

    vars = default_config()
    vars['beta_1'] = param_dict['beta_1'].value
    vars['beta_2'] = param_dict['beta_2'].value
    vars['epsilon'] = param_dict['epsilon'].value
    vars['learning_rate'] = param_dict['learning_rate'].value
    vars['decay'] = param_dict['decay'].value
    vars['hidden_layers'][0][0] = param_dict['hidden_1'].value
    vars['hidden_layers'][1][0] = param_dict['hidden_2'].value
    dataframe = args['dataframe']
    sets = dataframe.package_sets()
    return train_model(sets, vars, 'test', 'rmse', validate=True, save=False)
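
# Hedged sketch (an assumption, not taken from the source) of the interface
# optimize_ecnet expects: each param_dict entry only needs a `.value`
# attribute, and `args` only needs a 'dataframe' entry. The Param namedtuple
# and database filename below are hypothetical stand-ins for whatever the
# hyperparameter optimizer actually supplies.
if __name__ == '__main__':

    from collections import namedtuple

    from ecnet.utils.data_utils import DataFrame  # assumed module path

    Param = namedtuple('Param', ['value'])
    example_params = {
        'beta_1': Param(0.9),
        'beta_2': Param(0.999),
        'epsilon': Param(1e-8),
        'learning_rate': Param(0.001),
        'decay': Param(0.0),
        'hidden_1': Param(32),
        'hidden_2': Param(32)
    }
    df = DataFrame('my_data.csv')  # hypothetical ECNet-formatted database
    df.create_sets(random=True)
    print('RMSE: {}'.format(optimize_ecnet(example_params, {'dataframe': df})))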
def test_train_model(self):

    print('\nUNIT TEST: train_model')
    df = data_utils.DataFrame(DB_LOC)
    df.create_sets(random=True)
    pd = df.package_sets()
    config = server_utils.default_config()
    config['epochs'] = 100
    _ = server_utils.train_model(pd, config, 'test', 'r2',
                                 filename='test_train.h5')
    self.assertTrue(exists('test_train.h5'))
    remove('test_train.h5')
def __init__(self, model_config: str = 'config.yml', prj_file: str = None,
             num_processes: int = 1):
    '''Server object: handles data loading, model creation, data-to-model
    hand-off, data input parameter selection, hyperparameter tuning

    Args:
        model_config (str): path to multilayer perceptron .yml config file;
            if not found, default config is generated
        prj_file (str): path to pre-existing ECNet .prj file, if using for
            retraining/new predictions
        num_processes (int): number of parallel processes to utilize for
            training and tuning processes
    '''

    logger.log('debug', 'Arguments:\n\t| model_config:\t\t{}\n\t|'
               ' prj_file:\t\t{}\n\t| num_processes:\t{}'.format(
                   model_config, prj_file, num_processes), call_loc='INIT')
    self._num_processes = num_processes

    if prj_file is not None:
        self._prj_name, self._num_pools, self._num_candidates, self._df,\
            self._cf_file, self._vars = open_project(prj_file)
        check_config(self._vars)
        self._sets = self._df.package_sets()
        logger.log('info', 'Opened project {}'.format(prj_file),
                   call_loc='INIT')
        return

    self._cf_file = model_config
    self._prj_name = None
    self._vars = {}
    try:
        self._vars.update(open_config(self._cf_file))
        check_config(self._vars)
    except FileNotFoundError:
        logger.log('warn', '{} not found, generating default config'.format(
            model_config), call_loc='INIT')
        self._vars = default_config()
        save_config(self._vars, self._cf_file)
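
# Brief usage sketch (assumed, not taken from the source): constructing a
# Server from a config file versus reopening a saved project. The filenames
# below are hypothetical.
#
#   from ecnet import Server
#
#   sv = Server(model_config='config.yml', num_processes=4)
#   sv.load_data('my_data.csv')             # hypothetical database
#
#   sv = Server(prj_file='my_project.prj')  # reopen an existing project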
def test_default_config(self):

    print('\nUNIT TEST: default_config')
    dc = server_utils.default_config()
    self.assertEqual(dc, {
        'epochs': 3000,
        'learning_rate': 0.01,
        'beta_1': 0.9,
        'beta_2': 0.999,
        'epsilon': 1e-8,
        'decay': 0.0,
        'hidden_layers': [[32, 'relu'], [32, 'relu']],
        'output_activation': 'linear',
        'batch_size': 32,
        'patience': 128
    })
def test_open_save_config(self):

    print('\nUNIT TEST: open/save config')
    config = server_utils.default_config()
    server_utils.save_config(config, 'config.yml')
    config = server_utils.open_config('config.yml')
    self.assertEqual(config, {
        'epochs': 3000,
        'learning_rate': 0.01,
        'beta_1': 0.9,
        'beta_2': 0.999,
        'epsilon': 1e-8,
        'decay': 0.0,
        'hidden_layers': [[32, 'relu'], [32, 'relu']],
        'output_activation': 'linear',
        'batch_size': 32,
        'patience': 128
    })
    remove('config.yml')
def test_th_multiprocess(self):

    print('\nUNIT TEST: tune_hyperparameters multiprocessed')
    df = DataFrame(DB_LOC)
    df.create_sets(random=True)
    config = default_config()
    new_hp = tune_hyperparameters(df, config, 2, 1, 2, epochs=100)
    self.assertGreaterEqual(new_hp['beta_1'], 0)
    self.assertLessEqual(new_hp['beta_1'], 1)
    self.assertGreaterEqual(new_hp['beta_2'], 0)
    self.assertLessEqual(new_hp['beta_2'], 1)
    self.assertGreaterEqual(new_hp['decay'], 0)
    self.assertLessEqual(new_hp['decay'], 1)
    self.assertGreaterEqual(new_hp['epsilon'], 0)
    self.assertLessEqual(new_hp['epsilon'], 1)
    self.assertGreaterEqual(new_hp['learning_rate'], 0)
    self.assertLessEqual(new_hp['learning_rate'], 1)
    self.assertGreaterEqual(new_hp['batch_size'], 1)
    self.assertLessEqual(new_hp['batch_size'], len(df.learn_set))
    self.assertGreaterEqual(new_hp['hidden_layers'][0][0], 1)
    self.assertLessEqual(new_hp['hidden_layers'][0][0], 600)
    self.assertGreaterEqual(new_hp['hidden_layers'][1][0], 1)
    self.assertLessEqual(new_hp['hidden_layers'][1][0], 600)
def create_model(prop_abvr: str, smiles: list = None, targets: list = None,
                 db_name: str = None, qspr_backend: str = 'padel',
                 create_plots: bool = True, data_split: list = [0.7, 0.2, 0.1],
                 log_level: str = 'info', log_to_file: bool = True,
                 num_processes: int = 1):
    '''create_model: ECRL's database/model creation workflow for all
    publications

    Args:
        prop_abvr (str): abbreviation for the property name (e.g. CN)
        smiles (list): if supplied with targets, creates a new database
        targets (list): if supplied with smiles, creates a new database
        db_name (str): you may supply an existing ECNet-formatted database
        qspr_backend (str): if creating new database, generation software to
            use (`padel`, `alvadesc`)
        create_plots (bool): if True, creates plots for median absolute error
            vs. number of descriptors as inputs, parity plot for all sets
        data_split (list): [learn %, valid %, test %] for all supplied data
        log_level (str): `debug`, `info`, `warn`, `error`, `crit`
        log_to_file (bool): if True, saves workflow logs to a file in `logs`
            directory
        num_processes (int): number of concurrent processes to use for
            various tasks
    '''

    # Initialize logging
    logger.stream_level = log_level
    if log_to_file:
        logger.file_level = log_level

    # If database not supplied, create database from supplied SMILES, targets
    if db_name is None:
        if smiles is None or targets is None:
            raise ValueError('Must supply SMILES and target values')
        db_name = datetime.now().strftime('{}_model_%Y%m%d.csv'.format(
            prop_abvr))
        logger.log('info', 'Creating database {}...'.format(db_name),
                   'WORKFLOW')
        create_db(smiles, db_name, targets, prop_abvr, backend=qspr_backend)
        logger.log('info', 'Created database {}'.format(db_name), 'WORKFLOW')

    # Create database split, each subset has proportionally equal number of
    #   compounds based on range of experimental/target values
    logger.log('info', 'Creating optimal data split...', 'WORKFLOW')
    prop_range_from_split(db_name, data_split)
    logger.log('info', 'Created optimal data split', 'WORKFLOW')
    df = DataFrame(db_name)
    df.create_sets()
    logger.log('info', '\tLearning set: {}'.format(len(df.learn_set)),
               'WORKFLOW')
    logger.log('info', '\tValidation set: {}'.format(len(df.valid_set)),
               'WORKFLOW')
    logger.log('info', '\tTest set: {}'.format(len(df.test_set)), 'WORKFLOW')

    # Find optimal number of QSPR input variables
    logger.log('info', 'Finding optimal number of inputs...', 'WORKFLOW')
    errors, desc = find_optimal_num_inputs(db_name, 'valid', num_processes)
    df = DataFrame(db_name)
    df.set_inputs(desc)
    df.save(db_name.replace('.csv', '_opt.csv'))
    logger.log('info', 'Found optimal number of inputs', 'WORKFLOW')
    logger.log('info', '\tNumber of inputs: {}'.format(len(df._input_names)),
               'WORKFLOW')

    # Plot the curve of MAE vs. num. desc. added, if desired
    if create_plots:
        logger.log('info', 'Creating plot of MAE vs. descriptors...',
                   'WORKFLOW')
        num_add = [e[0] for e in errors]
        maes = [e[1] for e in errors]
        opt_num = len(desc)
        plt.clf()
        plt.rcParams['font.family'] = 'Times New Roman'
        plt.plot(num_add, maes, c='blue')
        plt.axvline(x=opt_num, c='red', linestyle='--')
        plt.xlabel('Number of Descriptors as ANN Input Variables')
        plt.ylabel('Median Absolute Error of {} Predictions'.format(
            prop_abvr))
        plt.savefig(db_name.replace('.csv', '_desc_curve.png'))
        logger.log('info', 'Created plot of MAE vs. descriptors', 'WORKFLOW')

    # Tune ANN hyperparameters according to validation set performance
    logger.log('info', 'Tuning ANN hyperparameters...', 'WORKFLOW')
    config = default_config()
    config = tune_hyperparameters(df, config, 25, 10, num_processes,
                                  shuffle='train', split=[0.7, 0.2, 0.1],
                                  validate=True, eval_set='valid',
                                  eval_fn='med_abs_error', epochs=300)
    config['epochs'] = default_config()['epochs']
    config_filename = db_name.replace('.csv', '.yml')
    save_config(config, config_filename)
    logger.log('info', 'Tuned ANN hyperparameters', 'WORKFLOW')
    logger.log('info', '\tLearning rate: {}'.format(config['learning_rate']),
               'WORKFLOW')
    logger.log('info', '\tLR decay: {}'.format(config['decay']), 'WORKFLOW')
    logger.log('info', '\tBatch size: {}'.format(config['batch_size']),
               'WORKFLOW')
    logger.log('info', '\tPatience: {}'.format(config['patience']),
               'WORKFLOW')
    logger.log('info', '\tHidden layers: {}'.format(config['hidden_layers']),
               'WORKFLOW')

    # Create model
    logger.log('info', 'Generating ANN...', 'WORKFLOW')
    sv = Server(db_name.replace('.csv', '.yml'), num_processes=num_processes)
    sv.load_data(db_name.replace('.csv', '_opt.csv'))
    sv.create_project(db_name.replace('.csv', ''), 5, 75)
    sv.train(validate=True, selection_set='valid', shuffle='train',
             split=[0.7, 0.2, 0.1], selection_fn='med_abs_error')
    logger.log('info', 'ANN Generated', 'WORKFLOW')

    # Measure model performance on training and test sets
    logger.log('info', 'Measuring ANN performance...', 'WORKFLOW')
    preds_test = sv.use(dset='test')
    preds_train = sv.use(dset='train')
    test_errors = sv.errors('r2', 'med_abs_error', dset='test')
    train_errors = sv.errors('r2', 'med_abs_error', dset='train')
    logger.log('info', 'Measured ANN performance', 'WORKFLOW')
    logger.log('info', '\tTraining set:\t R2: {}\t MAE: {}'.format(
        train_errors['r2'], train_errors['med_abs_error']), 'WORKFLOW')
    logger.log('info', '\tTesting set:\t R2: {}\t MAE: {}'.format(
        test_errors['r2'], test_errors['med_abs_error']), 'WORKFLOW')
    sv.save_project(del_candidates=True)

    # Create parity plot of experimental vs. predicted values, if desired
    if create_plots:
        logger.log('info', 'Creating parity plot...', 'WORKFLOW')
        plt.clf()
        parity_plot = ParityPlot(
            '',
            'Experimental {} Value'.format(prop_abvr),
            'Predicted {} Value'.format(prop_abvr)
        )
        parity_plot.add_series(concatenate(
            (sv._sets.learn_y, sv._sets.valid_y)
        ), preds_train, 'Training Set', 'blue')
        parity_plot.add_series(sv._sets.test_y, preds_test, 'Test Set', 'red')
        parity_plot.add_error_bars(test_errors['med_abs_error'], 'Test MAE')
        parity_plot._add_label('Test $R^2$', test_errors['r2'])
        parity_plot._add_label('Training MAE', train_errors['med_abs_error'])
        parity_plot._add_label('Training $R^2$', train_errors['r2'])
        parity_plot.save(db_name.replace('.csv', '_parity.png'))
        logger.log('info', 'Created parity plot', 'WORKFLOW')
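
# Hedged example of invoking the workflow above on an existing ECNet-formatted
# database; the filename and process count are hypothetical.
if __name__ == '__main__':

    create_model(prop_abvr='CN', db_name='cn_data.csv', create_plots=True,
                 num_processes=4)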
def find_optimal_num_inputs(db_name: str, eval_set: str,
                            num_processes: int) -> tuple:
    '''find_optimal_num_inputs: find the optimal number of input variables,
    return names of variables; optimal number of variables produces lowest
    median absolute error; variables added 10 at a time, according to RFR
    importance score (most-to-least important)

    Args:
        db_name (str): name/location of ECNet-formatted database
        eval_set (str): set to evaluate (`learn`, `valid`, `train`, `test`,
            None (all))
        num_processes (int): number of concurrent processes to run for RFR,
            training

    Returns:
        tuple: ([[addition_1, error_1], ..., [addition_N, error_N]], opt_desc)
    '''

    conf = default_config()
    conf['epochs'] = 300
    df = DataFrame(db_name)
    df.create_sets()
    conf['batch_size'] = len(df.learn_set)
    desc = limit_rforest(df, len(df._input_names),
                         num_processes=num_processes, eval_set=eval_set)
    desc = [d[0] for d in desc]

    errors = []
    if num_processes > 1:
        if name != 'nt':
            set_start_method('spawn', force=True)
        train_pool = Pool(processes=num_processes)

    for d_idx in range(0, len(desc), 10):
        if d_idx >= len(desc) - 1:
            to_use = desc[:]
        else:
            to_use = desc[:d_idx + 1]
        df = DataFrame(db_name)
        df.set_inputs(to_use)
        df.create_sets()
        sets = df.package_sets()
        if num_processes > 1:
            errors.append([
                d_idx,
                train_pool.apply_async(train_model, [
                    sets, conf, eval_set, 'med_abs_error', False, '_.h5',
                    False, False
                ])
            ])
        else:
            errors.append([
                d_idx,
                train_model(sets, conf, eval_set, 'med_abs_error', False,
                            '_.h5', False, False)[0]
            ])

    if num_processes > 1:
        train_pool.close()
        train_pool.join()
        for idx, err in enumerate(errors):
            errors[idx][1] = err[1].get()[0]

    min_error = errors[0][1]
    opt_num_desc = 1
    for err in errors[1:]:
        if err[1] < min_error:
            min_error = err[1]
            opt_num_desc = err[0]
    return (errors, desc[:opt_num_desc])
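
# Hedged usage sketch: running the descriptor-selection routine above on a
# hypothetical database and rebuilding a DataFrame limited to the selected
# descriptors, mirroring how create_model consumes the return value. The
# filenames and process count are placeholders.
if __name__ == '__main__':

    errors, opt_desc = find_optimal_num_inputs('my_data.csv', 'valid', 4)
    print('Lowest median absolute error: {}'.format(
        min(err[1] for err in errors)))
    df = DataFrame('my_data.csv')
    df.set_inputs(opt_desc)
    df.save('my_data_opt.csv')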