示例#1
0
def predict(input_txt,
            results_file,
            prj_file,
            form='name',
            temp_db='_new_mols.csv',
            clean_up=True):
    '''Predicts values for new data using pre-existing .prj file

    Side effect: sets `logger.stream_level` to 'disable' (it is not restored
    afterwards).

    Args:
        input_txt (str): path to .txt file containing either molecule names or
            SMILES strings
        results_file (str): path to results file generated by this function
        prj_file (str): path to pre-existing .prj file
        form (str): `name` if supplying molecule names, `SMILES` if supplying
            SMILES strings
        temp_db (str): path to temporary database generated by this function
        clean_up (bool): if True, removes `temp_db` and the directory named
            after `prj_file` (with '.prj' stripped) once the function exits,
            whether or not the prediction succeeded
    '''

    logger.stream_level = 'disable'
    try:
        sv = Server(prj_file=prj_file)
        # The new data must expose the same input columns as the data the
        # project was loaded with
        input_names = sv._df.input_names
        create_db(input_txt, temp_db, form=form, clean_up=clean_up)
        new_data = DataFrame(temp_db)
        new_data.set_inputs(input_names)
        sv._df = new_data
        sv.use(output_filename=results_file)
    finally:
        # Clean up even when a step above raised, so a failed run does not
        # leave the temporary database / project directory behind. A target
        # that was never created is skipped so it cannot mask the original
        # exception.
        if clean_up:
            targets = (
                (remove, temp_db),
                (rmtree, prj_file.replace('.prj', ''))
            )
            for cleanup, target in targets:
                try:
                    cleanup(target)
                except FileNotFoundError:
                    pass
示例#2
0
def main():
    '''Builds a parity plot for the `kinetic_viscosity.prj` project

    Opens the project, collects experimental and predicted values for the
    training (learning + validation) and test sets, computes RMSE and R2 for
    both, and saves the resulting plot to `../kv_parity_plot.png`.
    '''

    logger.stream_level = 'info'
    server = Server(prj_file='kinetic_viscosity.prj')

    # Training series: learning targets followed by validation targets
    observed_train = [*server._sets.learn_y, *server._sets.valid_y]
    predicted_train = server.use(dset='train')
    metrics_train = server.errors('rmse', 'r2', dset='train')

    # Test series
    observed_test = server._sets.test_y
    predicted_test = server.use(dset='test')
    metrics_test = server.errors('rmse', 'r2', dset='test')

    plot = ParityPlot(title='Predicted vs. Experimental Kinematic Viscosity',
                      x_label='Experimental KV',
                      y_label='Predicted KV')
    plot.add_series(
        observed_train, predicted_train, name='Training Set', color='blue'
    )
    plot.add_series(
        observed_test, predicted_test, name='Test Set', color='red'
    )
    plot.add_error_bars(metrics_test['rmse'], label='Test RMSE')

    # Remaining statistics are attached as plain labels, in this order
    extra_labels = (
        ('Test R-Squared', metrics_test['r2']),
        ('Train RMSE', metrics_train['rmse']),
        ('Train R-Squared', metrics_train['r2']),
    )
    for caption, value in extra_labels:
        plot._add_label(caption, value)

    plot.save('../kv_parity_plot.png')
示例#3
0
def train(validate, dset=None):
    '''Trains a model on `cn_model_v1.0.csv`, then runs use/errors on `dset`

    Return values of `Server.use` and `Server.errors` are discarded.

    Args:
        validate: forwarded to `Server.train` as `validate`
        dset: forwarded to `Server.train` as `selection_set`, and to
            `Server.use` / `Server.errors` as `dset`
    '''

    metrics = ('rmse', 'med_abs_error', 'mean_abs_error', 'r2')

    server = Server()
    server.load_data('cn_model_v1.0.csv')
    server.train(validate=validate, selection_set=dset)
    server.use(dset=dset)
    server.errors(*metrics, dset=dset)
示例#4
0
def train_project(validate,
                  shuffle,
                  split=[0.7, 0.2, 0.1],
                  num_processes=1,
                  dset=None,
                  sel_fn='rmse',
                  output_filename=None):
    '''Creates, trains, uses and saves the `_training_test` project

    Loads `cn_model_v1.0.csv`, creates a project with 2 pools of 2
    candidates, trains it, runs `Server.use` and `Server.errors`, then saves
    the project. Return values of `use`/`errors` are discarded.

    Args:
        validate: forwarded to `Server.train` as `validate`
        shuffle: forwarded to `Server.train` as `shuffle`
        split (list): forwarded to `Server.train` as `split`; a copy is
            passed so the caller's list (and the shared default) cannot be
            modified by the callee
        num_processes (int): forwarded to the `Server` constructor
        dset: forwarded to `Server.train` as `selection_set` and to
            `Server.use` as `dset`
        sel_fn (str): forwarded to `Server.train` as `selection_fn`
        output_filename (str): forwarded to `Server.use`
    '''

    # The default for `split` is a single list object shared by every call;
    # hand the server its own copy so an in-place change there can never leak
    # into later calls. `None` is passed through untouched.
    split_arg = split if split is None else list(split)

    sv = Server(num_processes=num_processes)
    sv.load_data('cn_model_v1.0.csv')
    sv.create_project('_training_test', num_pools=2, num_candidates=2)
    sv.train(shuffle=shuffle,
             split=split_arg,
             selection_set=dset,
             selection_fn=sel_fn,
             validate=validate)
    sv.use(dset=dset, output_filename=output_filename)
    # NOTE(review): errors() is called without `dset` here although use()
    # receives it -- confirm this is intentional
    sv.errors('rmse', 'med_abs_error', 'mean_abs_error', 'r2')
    sv.save_project()
示例#5
0
    def test_use_project(self):
        '''Server.use on a trained project returns one result per data entry

        Checks that `len(sv.use())` equals `len(sv._df)` after training a
        2-pool, 2-candidate project for 100 epochs.
        '''

        print('\nUNIT TEST: Server.use')
        try:
            sv = Server()
            sv.load_data(DB_LOC, random=True, split=[0.7, 0.2, 0.1])
            sv.create_project('test_project', 2, 2)
            sv._vars['epochs'] = 100
            sv.train()
            results = sv.use()
            self.assertEqual(len(results), len(sv._df))
        finally:
            # Remove artifacts even when the test fails, so a failing run
            # does not leave files behind for later tests. Targets that were
            # never created are skipped so they cannot hide the real failure.
            targets = ((remove, 'config.yml'), (rmtree, 'test_project'))
            for cleanup, target in targets:
                try:
                    cleanup(target)
                except FileNotFoundError:
                    pass
示例#6
0
def create_model(prop_abvr: str, smiles: list = None, targets: list = None,
                 db_name: str = None, qspr_backend: str = 'padel',
                 create_plots: bool = True, data_split: list = [0.7, 0.2, 0.1],
                 log_level: str = 'info', log_to_file: bool = True,
                 num_processes: int = 1):
    ''' create_model: ECRL's database/model creation workflow for all
    publications

    Files written next to the database (`<base>` is `db_name` without its
    extension): `<base>_opt.csv`, `<base>.yml`, the project `<base>`, and, if
    `create_plots` is True, `<base>_desc_curve.png` and `<base>_parity.png`.

    Args:
        prop_abvr (str): abbreviation for the property name (e.g. CN)
        smiles (list): if supplied with targets, creates a new database
        targets (list): if supplied with smiles, creates a new database
        db_name (str): you may supply an existing ECNet-formatted database
        qspr_backend (str): if creating new database, generation software to
            use (`padel`, `alvadesc`)
        create_plots (bool): if True, creates plots for median absolute error
            vs. number of descriptors as inputs, parity plot for all sets
        data_split (list): [learn %, valid %, test %] for all supplied data;
            used for the initial split, hyperparameter tuning and training
        log_level (str): `debug`, `info`, `warn`, `error`, `crit`
        log_to_file (bool): if True, saves workflow logs to a file in `logs`
            directory
        num_processes (int): number of concurrent processes to use for various
            tasks

    Raises:
        ValueError: if `db_name` is None and `smiles` or `targets` is missing
    '''

    from os.path import splitext

    # Work on a private copy: the default is one list object shared between
    # calls, and it is handed to several external functions below
    data_split = list(data_split)

    # Initialize logging
    logger.stream_level = log_level
    if log_to_file:
        logger.file_level = log_level

    # If database not supplied, create database from supplied SMILES, targets
    if db_name is None:
        if smiles is None or targets is None:
            raise ValueError('Must supply SMILES and target values')
        # Format the date separately so a `%` inside `prop_abvr` is never
        # interpreted as a strftime directive
        db_name = '{}_model_{}.csv'.format(
            prop_abvr, datetime.now().strftime('%Y%m%d')
        )
        logger.log('info', 'Creating database {}...'.format(db_name),
                   'WORKFLOW')
        create_db(smiles, db_name, targets, prop_abvr, backend=qspr_backend)
        logger.log('info', 'Created database {}'.format(db_name), 'WORKFLOW')

    # Derive every output name from the extension-less database path. Using
    # str.replace('.csv', ...) would return `db_name` unchanged when it has
    # no '.csv' (overwriting the database itself) and would rewrite '.csv'
    # occurring anywhere else in the path.
    base_name = splitext(db_name)[0]
    opt_db_name = base_name + '_opt.csv'
    config_filename = base_name + '.yml'

    # Create database split, each subset has proportionally equal number of
    #   compounds based on range of experimental/target values
    logger.log('info', 'Creating optimal data split...', 'WORKFLOW')
    prop_range_from_split(db_name, data_split)
    logger.log('info', 'Created optimal data split', 'WORKFLOW')
    df = DataFrame(db_name)
    df.create_sets()
    logger.log('info', '\tLearning set: {}'.format(len(df.learn_set)),
               'WORKFLOW')
    logger.log('info', '\tValidation set: {}'.format(len(df.valid_set)),
               'WORKFLOW')
    logger.log('info', '\tTest set: {}'.format(len(df.test_set)), 'WORKFLOW')

    # Find optimal number of QSPR input variables
    logger.log('info', 'Finding optimal number of inputs...', 'WORKFLOW')
    errors, desc = find_optimal_num_inputs(db_name, 'valid', num_processes)
    df = DataFrame(db_name)
    df.set_inputs(desc)
    df.save(opt_db_name)
    logger.log('info', 'Found optimal number of inputs', 'WORKFLOW')
    logger.log('info', '\tNumber of inputs: {}'.format(len(df._input_names)),
               'WORKFLOW')

    # Plot the curve of MAE vs. num. desc. added, if desired
    if create_plots:
        logger.log('info', 'Creating plot of MAE vs. descriptors...',
                   'WORKFLOW')
        num_add = [e[0] for e in errors]
        maes = [e[1] for e in errors]
        opt_num = len(desc)
        plt.clf()
        plt.rcParams['font.family'] = 'Times New Roman'
        plt.plot(num_add, maes, c='blue')
        # Dashed vertical line marks the selected number of descriptors
        plt.axvline(x=opt_num, c='red', linestyle='--')
        plt.xlabel('Number of Descriptors as ANN Input Variables')
        plt.ylabel('Median Absolute Error of {} Predictions'.format(prop_abvr))
        plt.savefig(base_name + '_desc_curve.png')
        logger.log('info', 'Created plot of MAE vs. descriptors', 'WORKFLOW')

    # Tune ANN hyperparameters according to validation set performance
    logger.log('info', 'Tuning ANN hyperparameters...', 'WORKFLOW')
    config = default_config()
    # Use the caller's split proportions (previously hard-coded to
    # [0.7, 0.2, 0.1] regardless of `data_split`)
    config = tune_hyperparameters(df, config, 25, 10, num_processes,
                                  shuffle='train', split=list(data_split),
                                  validate=True, eval_set='valid',
                                  eval_fn='med_abs_error', epochs=300)
    # Tuning ran with epochs=300; the saved config gets the default value
    config['epochs'] = default_config()['epochs']
    save_config(config, config_filename)
    logger.log('info', 'Tuned ANN hyperparameters', 'WORKFLOW')
    logger.log('info', '\tLearning rate: {}'.format(config['learning_rate']),
               'WORKFLOW')
    logger.log('info', '\tLR decay: {}'.format(config['decay']), 'WORKFLOW')
    logger.log('info', '\tBatch size: {}'.format(config['batch_size']),
               'WORKFLOW')
    logger.log('info', '\tPatience: {}'.format(config['patience']), 'WORKFLOW')
    logger.log('info', '\tHidden layers: {}'.format(config['hidden_layers']),
               'WORKFLOW')

    # Create Model
    logger.log('info', 'Generating ANN...', 'WORKFLOW')
    sv = Server(config_filename, num_processes=num_processes)
    sv.load_data(opt_db_name)
    sv.create_project(base_name, 5, 75)
    sv.train(validate=True, selection_set='valid', shuffle='train',
             split=list(data_split), selection_fn='med_abs_error')
    logger.log('info', 'ANN Generated', 'WORKFLOW')
    logger.log('info', 'Measuring ANN performance...', 'WORKFLOW')
    preds_test = sv.use(dset='test')
    preds_train = sv.use(dset='train')
    test_errors = sv.errors('r2', 'med_abs_error', dset='test')
    train_errors = sv.errors('r2', 'med_abs_error', dset='train')
    logger.log('info', 'Measured ANN performance', 'WORKFLOW')
    logger.log('info', '\tTraining set:\t R2: {}\t MAE: {}'.format(
        train_errors['r2'], train_errors['med_abs_error']), 'WORKFLOW')
    logger.log('info', '\tTesting set:\t R2: {}\t MAE: {}'.format(
        test_errors['r2'], test_errors['med_abs_error']), 'WORKFLOW')
    sv.save_project(del_candidates=True)

    if create_plots:
        logger.log('info', 'Creating parity plot...', 'WORKFLOW')
        plt.clf()
        parity_plot = ParityPlot(
            '',
            'Experimental {} Value'.format(prop_abvr),
            'Predicted {} Value'.format(prop_abvr)
        )
        # Training series = learning targets followed by validation targets
        parity_plot.add_series(concatenate(
            (sv._sets.learn_y, sv._sets.valid_y)
        ), preds_train, 'Training Set', 'blue')
        parity_plot.add_series(sv._sets.test_y, preds_test, 'Test Set', 'red')
        parity_plot.add_error_bars(test_errors['med_abs_error'], 'Test MAE')
        parity_plot._add_label('Test $R^2$', test_errors['r2'])
        parity_plot._add_label('Training MAE', train_errors['med_abs_error'])
        parity_plot._add_label('Training $R^2$', train_errors['r2'])
        parity_plot.save(base_name + '_parity.png')
        logger.log('info', 'Created parity plot', 'WORKFLOW')
示例#7
0
def main():
    '''Evaluates the `kinetic_viscosity.prj` project on its test set

    Writes test-set results to `../kv_test_results.csv` and computes RMSE,
    mean/median absolute error and R2 for the test set (the returned errors
    are discarded). Logging is streamed at `debug` level.
    '''

    logger.stream_level = 'debug'

    metrics = ('rmse', 'mean_abs_error', 'med_abs_error', 'r2')
    server = Server(prj_file='kinetic_viscosity.prj')
    server.use(dset='test', output_filename='../kv_test_results.csv')
    server.errors(*metrics, dset='test')
def use_project():
    '''Runs the `use_project.prj` project on its train and test sets

    Results are written to `use_project_train.csv` and
    `use_project_test.csv`, in that order.
    '''

    server = Server(prj_file='use_project.prj')
    outputs = (
        ('train', 'use_project_train.csv'),
        ('test', 'use_project_test.csv'),
    )
    for dset, filename in outputs:
        server.use(dset, output_filename=filename)