def create_scaffold_split(dset_key, res_dir):
    params = {
        "dataset_key": dset_key,
        "datastore": "False",
        "uncertainty": "False",
        "splitter": "scaffold",
        "split_valid_frac": "0.1",
        "split_test_frac": "0.1",
        "split_strategy": "train_valid_test",
        "previously_split": "False",
        "prediction_type": "classification",
        "model_choice_score_type": "roc_auc",
        "response_cols": "active",
        "id_col": "compound_id",
        "smiles_col": "base_rdkit_smiles",
        "result_dir": res_dir,
        "system": "LC",
        "transformers": "True",
        "model_type": "NN",
        "featurizer": "computed_descriptors",
        "descriptor_type": "rdkit_raw",
        "learning_rate": ".0007",
        "layer_sizes": "512,128",
        "dropouts": "0.3,0.3",
        "save_results": "False",
        "max_epochs": "500",
        "early_stopping_patience": "50",
        "verbose": "False"
    }
    pparams = parse.wrapper(params)
    MP = mp.ModelPipeline(pparams)
    split_uuid = MP.split_dataset()
    return split_uuid
def train_model(input, output):
    """ Retrain a model saved in a model_metadata.json file

    Args:
        input (str): path to model_metadata.json file

        output (str): path to output directory

    Returns:
        the trained ModelPipeline object
    """
    # Train model
    # -----------
    # Read parameter JSON file
    with open(input) as f:
        config = json.loads(f.read())

    # Parse parameters
    params = parse.wrapper(config)
    params.result_dir = output
    # otherwise this will have the same uuid as the source model
    params.model_uuid = None
    # use the same split
    params.previously_split = True
    params.split_uuid = config['splitting_parameters']['split_uuid']

    logger.debug("model params %s" % str(params))

    # Create model pipeline
    model = mp.ModelPipeline(params)

    # Train model
    model.train_model()

    return model
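# A minimal usage sketch for train_model() above, assuming a saved model's
# metadata file is on disk. The file and directory names here are hypothetical
# placeholders; predict_on_dataframe() is used the same way as in the tests
# further down.
def example_retrain_and_predict():
    pipeline = train_model("model_metadata.json", "retrain_result")
    new_df = pd.read_csv("new_compounds.csv")  # hypothetical input compounds
    return pipeline.predict_on_dataframe(new_df, contains_responses=False)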
def featurize_from_shortlist(shortlist_path=None, split_json=None):
    """ Featurize and split each dataset listed in a shortlist CSV, using the
    splitting parameters from a hyperparameter-search config JSON.
    """
    sl = pd.read_csv(shortlist_path)
    with open(split_json, "r") as f:
        hp_params = json.load(f)

    print('Featurizing shortlist')
    hp_params.pop('use_shortlist')
    hp_params.pop('shortlist_key')
    for i, row in sl.iterrows():
        hp_params['dataset_key'] = row.dataset_key
        hp_params['response_cols'] = row.response_cols
        pparams = parse.wrapper(hp_params)
        print('-----------------------------------------------')
        print(hp_params['dataset_key'])
        print(pparams.dataset_key)
        print('-----------------------------------------------')

        # Create a ModelPipeline object
        pipe = mp.ModelPipeline(pparams)

        # Featurize and split the dataset
        split_uuid = pipe.split_dataset()

        # Delete the split file to keep the directory cleaner
        dkey = row.dataset_key.replace('.csv', '')
        os.remove(f'{dkey}_train_valid_test_scaffold_{split_uuid}.csv')
def train_model_from_tracker(model_uuid, output_dir):
    """ Retrain a model saved in the model tracker, but save it to output_dir and
    don't insert it into the model tracker

    Args:
        model_uuid (str): model tracker model_uuid

        output_dir (str): path to output directory

    Returns:
        the model pipeline object with trained model
    """
    if not mlmt_supported:
        logger.debug(
            "Model tracker not supported in your environment; can load models from filesystem only."
        )
        return None

    mlmt_client = dsf.initialize_model_tracker()
    collection_name = mt.get_model_collection_by_uuid(model_uuid, mlmt_client=mlmt_client)

    # get metadata from tracker
    config = mt.get_metadata_by_uuid(model_uuid)

    # check if the training dataset lives in the datastore
    try:
        result = dsf.retrieve_dataset_by_datasetkey(
            config['training_dataset']['dataset_key'],
            bucket=config['training_dataset']['bucket'])
        if result is not None:
            config['datastore'] = True
    except Exception:
        pass

    # Parse parameters
    params = parse.wrapper(config)
    params.result_dir = output_dir
    # otherwise this will have the same uuid as the source model
    params.model_uuid = None
    # use the same split
    params.previously_split = True
    params.split_uuid = config['splitting_parameters']['split_uuid']
    # specify collection
    params.collection_name = collection_name

    logger.debug("model params %s" % str(params))

    # Create model pipeline
    model = mp.ModelPipeline(params)

    # Train model
    model.train_model()

    return model
def split(pparams):
    split_params = copy.copy(pparams)
    split_params.split_only = True
    split_params.previously_split = False

    model_pipeline = mp.ModelPipeline(split_params)
    # comment out this line after splitting once so you don't re-split
    split_uuid = model_pipeline.split_dataset()
    return split_uuid
def delaney_pipeline(y=["measured log solubility in mols per litre"],
                     featurizer="ecfp",
                     split_strategy="train_valid_test",
                     splitter="random"):
    delaney_inp_file = currentdir + '/config_delaney.json'
    inp_params = parse.wrapper(delaney_inp_file)
    inp_params.response_cols = y
    inp_params.featurizer = featurizer
    inp_params.split_strategy = split_strategy
    inp_params.splitter = splitter
    mp = MP.ModelPipeline(inp_params)
    return mp
def test():
    """ Test full model pipeline: Curate data, fit model, and predict property for new compounds """

    # Clean
    # -----
    clean()

    # Run HyperOpt
    # ------------
    with open("H1_RF.json", "r") as f:
        hp_params = json.load(f)

    # str.strip() removes a character set rather than a suffix, so derive the
    # script directory with os.path.dirname instead
    script_dir = os.path.dirname(os.path.dirname(parse.__file__))
    python_path = sys.executable
    hp_params["script_dir"] = script_dir
    hp_params["python_path"] = python_path

    params = parse.wrapper(hp_params)
    if not os.path.isfile(params.dataset_key):
        params.dataset_key = os.path.join(params.script_dir, params.dataset_key)

    train_df = pd.read_csv(params.dataset_key)

    print("Train an RF model with ECFP")
    pl = mp.ModelPipeline(params)
    pl.train_model()

    print("Calculate AD index with the just trained model.")
    pred_df_mp = pl.predict_on_dataframe(train_df[:10],
                                         contains_responses=True,
                                         AD_method="z_score")
    assert ("AD_index" in pred_df_mp.columns.values), 'Error: No AD_index column in pred_df_mp'

    print("Calculate AD index with the saved model tarball file.")
    pred_df_file = pfm.predict_from_model_file(
        model_path=pl.params.model_tarball_path,
        input_df=train_df[:10],
        id_col="compound_id",
        smiles_col="rdkit_smiles",
        response_col="pKi_mean",
        dont_standardize=True,
        AD_method="z_score")
    assert ("AD_index" in pred_df_file.columns.values), 'Error: No AD_index column in pred_df_file'
def train_model_w_balan(dset_key, split_uuid, res_dir):
    # Now train models on the same dataset with balancing weights
    params = {
        "dataset_key": dset_key,
        "datastore": "False",
        "uncertainty": "False",
        "splitter": "scaffold",
        "split_valid_frac": "0.1",
        "split_test_frac": "0.1",
        "split_strategy": "train_valid_test",
        "previously_split": "True",
        "split_uuid": split_uuid,
        "prediction_type": "classification",
        "model_choice_score_type": "roc_auc",
        "response_cols": "active",
        "id_col": "compound_id",
        "smiles_col": "base_rdkit_smiles",
        "result_dir": res_dir,
        "system": "LC",
        "transformers": "True",
        "model_type": "NN",
        "featurizer": "computed_descriptors",
        "descriptor_type": "rdkit_raw",
        "weight_transform_type": "balancing",
        "learning_rate": ".0007",
        "layer_sizes": "512,128",
        "dropouts": "0.3,0.3",
        "save_results": "False",
        "max_epochs": "500",
        "early_stopping_patience": "50",
        "verbose": "False"
    }
    # nreps and the subset/balanced/metrics/vals accumulators are assumed to be
    # module-level variables defined elsewhere in this script
    for i in range(nreps):
        pparams = parse.wrapper(params)
        MP = mp.ModelPipeline(pparams)
        MP.train_model()

        wrapper = MP.model_wrapper
        for ss in ['valid', 'test']:
            metvals = wrapper.get_pred_results(ss, 'best')
            for metric in [
                    'roc_auc_score', 'prc_auc_score', 'cross_entropy',
                    'precision', 'recall_score', 'npv', 'accuracy_score',
                    'bal_accuracy', 'kappa', 'matthews_cc'
            ]:
                subset.append(ss)
                balanced.append('yes')
                metrics.append(metric)
                vals.append(metvals[metric])
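# A minimal usage sketch tying create_scaffold_split() to train_model_w_balan():
# split once, then train the balanced models against the same partitions. The
# dataset path and result directory are hypothetical placeholders.
def example_balanced_training_run():
    dset_key = "data/herg_classification.csv"  # hypothetical dataset
    res_dir = "balancing_results"              # hypothetical output directory
    split_uuid = create_scaffold_split(dset_key, res_dir)
    train_model_w_balan(dset_key, split_uuid, res_dir)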
def test():
    """ Test full model pipeline: Curate data, fit model, and predict property for new compounds """

    # Clean
    # -----
    clean()

    # Run HyperOpt
    # ------------
    with open("H1_hybrid.json", "r") as f:
        hp_params = json.load(f)

    # str.strip() removes a character set rather than a suffix, so derive the
    # script directory with os.path.dirname instead
    script_dir = os.path.dirname(os.path.dirname(parse.__file__))
    python_path = sys.executable
    hp_params["script_dir"] = script_dir
    hp_params["python_path"] = python_path

    params = parse.wrapper(hp_params)
    if not os.path.isfile(params.dataset_key):
        params.dataset_key = os.path.join(params.script_dir, params.dataset_key)

    train_df = pd.read_csv(params.dataset_key)

    print("Train a hybrid model with MOE descriptors")
    pl = mp.ModelPipeline(params)
    pl.train_model()

    print("Check the model performance on validation data")
    pred_data = pl.model_wrapper.get_perf_data(subset="valid", epoch_label="best")
    pred_results = pred_data.get_prediction_results()
    print(pred_results)

    pred_score = pred_results['r2_score']
    score_threshold = 0.4
    assert pred_score > score_threshold, \
        f'Error: Score is too low {pred_score}. Must be higher than {score_threshold}'

    print("Make predictions with the hybrid model")
    predict = pl.predict_on_dataframe(train_df[:10], contains_responses=False)
    assert (predict['pred'].shape[0] == 10), 'Error: Incorrect number of predictions'
    assert (np.all(np.isfinite(predict['pred'].values))), 'Error: Predictions are not numbers'
def train_and_get_tar(input_json, ds_key_file):
    script_path = os.path.dirname(os.path.realpath(__file__))
    json_file = os.path.join(script_path, input_json)

    pparams = parse.wrapper(['--config_file', json_file])
    pparams.dataset_key = os.path.join(script_path, ds_key_file)
    pparams.result_dir = os.path.join(script_path, 'result')

    train_pipe = mp.ModelPipeline(pparams)
    train_pipe.train_model()

    # look in the configured result_dir rather than './result', which breaks
    # when the test is run from another working directory
    list_of_files = glob.glob(os.path.join(pparams.result_dir, '*.gz'))  # check all *.gz
    latest_file = max(list_of_files, key=os.path.getctime)  # get the latest gz
    return latest_file
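# A minimal usage sketch for train_and_get_tar(): train from a config file,
# then predict from the resulting tarball with pfm.predict_from_model_file(),
# called the same way as in the tests above. The file and column names below
# are hypothetical placeholders.
def example_predict_from_latest_tar():
    tar_path = train_and_get_tar('config_train.json', 'dataset.csv')
    input_df = pd.read_csv('new_compounds.csv')
    return pfm.predict_from_model_file(model_path=tar_path,
                                       input_df=input_df,
                                       id_col='compound_id',
                                       smiles_col='rdkit_smiles')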
def train_model(input, output, dskey=''):
    """ Retrain a model saved in a model_metadata.json file

    Args:
        input (str): path to model_metadata.json file

        output (str): path to output directory

        dskey (str): new dataset key if file location has changed

    Returns:
        the trained ModelPipeline object
    """
    # Train model
    # -----------
    # Read parameter JSON file
    with open(input) as f:
        config = json.loads(f.read())

    # set a new dataset key if necessary
    if dskey != '':
        config['dataset_key'] = dskey

    # Parse parameters
    params = parse.wrapper(config)
    params.result_dir = output
    # otherwise this will have the same uuid as the source model
    params.model_uuid = None
    # use the same split
    params.previously_split = True
    params.split_uuid = config['splitting_parameters']['split_uuid']

    logger.debug("model params %s" % str(params))
    logger.debug(params.__dict__.items())

    # Create model pipeline
    model = mp.ModelPipeline(params)

    # Train model
    model.train_model()

    return model
def train(pparams):
    train_pipe = mp.ModelPipeline(pparams)
    train_pipe.train_model()
    return train_pipe
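# A minimal sketch showing how split() and train() compose: split once, then
# flip previously_split and pass the returned UUID back in so training reuses
# the same partitions (the same pattern train_model() uses above).
def example_split_then_train(pparams):
    split_uuid = split(pparams)
    pparams.previously_split = True
    pparams.split_uuid = split_uuid
    return train(pparams)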
def test():
    """ Test full model pipeline: Curate data, fit model, and predict property for new compounds """

    # Clean
    # -----
    integrative_utilities.clean_fit_predict()
    clean()

    # Download
    # --------
    download()

    # Curate
    # ------
    curate()

    # Train model
    # -----------
    # Read parameter JSON file
    with open('config_delaney_train_NN.json') as f:
        config = json.loads(f.read())

    # Parse parameters
    params = parse.wrapper(config)

    # Create model pipeline
    model = mp.ModelPipeline(params)

    # Train model
    model.train_model()

    # Get uuid and reload directory
    # -----------------------------
    uuid = integrative_utilities.get_subdirectory(
        'result/delaney-processed_curated_fit/NN_graphconv_scaffold_regression')
    reload_dir = 'result/delaney-processed_curated_fit/NN_graphconv_scaffold_regression/' + uuid

    # Check training statistics
    # -------------------------
    integrative_utilities.training_statistics_file(reload_dir, 'test', 0.6)

    # Make prediction parameters
    # --------------------------
    # Read prediction parameter JSON file
    with open('config_delaney_predict_NN.json', 'r') as f:
        predict_parameters_dict = json.loads(f.read())

    # Set transformer key here because model uuid is not known before fit
    predict_parameters_dict['transformer_key'] = os.path.join(reload_dir, 'transformers.pkl')

    predict_parameters = parse.wrapper(predict_parameters_dict)

    # Load second test set
    # --------------------
    data = pd.read_csv('delaney-processed_curated_external.csv')

    # Select columns and rename response column
    data = data[[
        predict_parameters.id_col, predict_parameters.smiles_col,
        predict_parameters.response_cols[0]
    ]]
    data = data.rename(
        columns={predict_parameters.response_cols[0]: 'experimental_values'})

    # Make prediction pipeline
    # ------------------------
    pp = mp.create_prediction_pipeline_from_file(predict_parameters, reload_dir)

    # Predict
    # -------
    predict = pp.predict_on_dataframe(data)

    # Check predictions
    # -----------------
    assert (predict['pred'].shape[0] == 117), 'Error: Incorrect number of predictions'
    assert (np.all(np.isfinite(predict['pred'].values))), 'Error: Predictions are not numbers'

    # Save predictions with experimental values
    # -----------------------------------------
    predict.reset_index(level=0, inplace=True)
    combined = pd.merge(data, predict, on=predict_parameters.id_col, how='inner')
    combined.to_csv('delaney-processed_curated_predict.csv')
    assert (os.path.isfile('delaney-processed_curated_predict.csv')
            and os.path.getsize('delaney-processed_curated_predict.csv') > 0
            ), 'Error: Prediction file not created'
def base_feature_importance(model_pipeline=None, params=None):
    """ Minimal baseline feature importance function.

    Given an AMPL model (or the parameters to train a model), returns a data frame
    with a row for each feature. The columns of the data frame depend on the model
    type and prediction type. If the model is a binary classifier, the columns
    include t-statistics and p-values for the differences between the means of the
    active and inactive compounds. If the model is a random forest, the columns
    will include the mean decrease in impurity (MDI) of each feature, computed by
    the scikit-learn feature_importances_ function. See the scikit-learn
    documentation for warnings about interpreting the MDI importance. For all
    models, the returned data frame will include feature names, means and standard
    deviations for each feature.

    This function has been tested on RFs and NNs with rdkit descriptors. Other
    models and feature combinations may not be supported.

    Args:
        model_pipeline (`ModelPipeline`): A pipeline object for a model that was
        trained in the current Python session or loaded from the model tracker or
        a tarball file. Either model_pipeline or params must be provided.

        params (`dict`): Parameter dictionary for a model to be trained and
        analyzed. Either model_pipeline or a params argument must be passed; if
        both are passed, params is ignored and the parameters from model_pipeline
        are used.

    Returns:
        (imp_df, model_pipeline, pparams) (tuple):
            imp_df (`DataFrame`): Table of feature importance metrics.
            model_pipeline (`ModelPipeline`): Pipeline object for model that was
            passed to or trained by function.
            pparams (`Namespace`): Parsed parameters of model.
    """
    log = logging.getLogger('ATOM')
    if model_pipeline is None:
        if params is None:
            raise ValueError("Either model_pipeline or params can be None but not both")
        # Train a model based on the parameters given
        pparams = parse.wrapper(params)
        model_pipeline = mp.ModelPipeline(pparams)
        model_pipeline.train_model()
    else:
        if params is not None:
            log.info("model_pipeline and params were both passed; ignoring params argument and using params from model")
        pparams = model_pipeline.params

    # Get the list of feature column names
    features = model_pipeline.featurization.get_feature_columns()
    nfeat = len(features)
    imp_df = pd.DataFrame({'feature': features})

    # Get the training, validation and test sets (we assume we're not using
    # K-fold CV). These are DeepChem Dataset objects.
    (train_dset, valid_dset) = model_pipeline.data.train_valid_dsets[0]
    test_dset = model_pipeline.data.test_dset
    imp_df['mean_value'] = train_dset.X.mean(axis=0)
    imp_df['std_value'] = train_dset.X.std(axis=0)

    if pparams.prediction_type == 'classification':
        # Compute a t-statistic for each feature for the difference between its
        # mean values for active and inactive compounds
        tstats = []
        pvalues = []
        active = train_dset.X[train_dset.y[:, 0] == 1, :]
        inactive = train_dset.X[train_dset.y[:, 0] == 0, :]
        log.debug("Computing t-statistics")
        for ifeat in range(nfeat):
            res = stats.ttest_ind(active[:, ifeat], inactive[:, ifeat],
                                  equal_var=True, nan_policy='omit')
            tstats.append(res.statistic)
            pvalues.append(res.pvalue)
        imp_df['t_statistic'] = tstats
        imp_df['ttest_pvalue'] = pvalues

    if pparams.model_type == 'RF':
        # Tabulate the MDI-based feature importances for random forest models
        # TODO: Does this work for XGBoost models too?
        rf_model = model_pipeline.model_wrapper.model.model
        imp_df['mdi_importance'] = rf_model.feature_importances_

    return imp_df, model_pipeline, pparams
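# A minimal usage sketch for base_feature_importance() on an already-trained
# pipeline. Sorting by ttest_pvalue (one of the columns the function adds for
# classifiers) is just one reasonable way to inspect the result.
def example_feature_ranking(trained_pipeline):
    imp_df, pipeline, pparams = base_feature_importance(model_pipeline=trained_pipeline)
    if 'ttest_pvalue' in imp_df.columns:
        imp_df = imp_df.sort_values('ttest_pvalue')
    return imp_df.head(20)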
def test_train_NN_graphconv_scaffold_inputs():
    """
    Args:
        pipeline (ModelPipeline): The ModelPipeline instance for this model run.

    Dependencies:
        ModelPipeline creation
        featurization creation
        creation of model_wrapper
        mp.load_featurize_data

    Calls:
        create_perf_data
        perf_data.accumulate_preds
        perf_data.compute_perf_metrics
        data.combined_training_data()
        self._copy_model
    """
    # checking that the layers, dropouts, and learning rate are properly added
    # to the deepchem graphconv model
    general_params['featurizer'] = 'graphconv'
    general_params['layer_sizes'] = '100,100,10'
    general_params['dropouts'] = '0.3,0.3,0.1'
    general_params['uncertainty'] = False

    inp_params = parse.wrapper(general_params)
    mp = MP.ModelPipeline(inp_params)
    mp.featurization = feat.create_featurization(inp_params)
    mp.model_wrapper = model_wrapper.create_model_wrapper(
        inp_params, mp.featurization, mp.ds_client)

    # asserting that the correct model is created with the correct layer sizes,
    # dropouts, model_dir, and mode by default
    test1 = []
    test1.append(mp.model_wrapper.params.layer_sizes == [100, 100, 10])
    test1.append(mp.model_wrapper.params.dropouts == [0.3, 0.3, 0.1])

    # checking that parameters are properly passed to the deepchem model object
    test1.append(isinstance(mp.model_wrapper.model, GraphConvModel))
    test1.append(mp.model_wrapper.model.model_dir == mp.model_wrapper.model_dir)
    test1.append(
        [i.out_channel for i in mp.model_wrapper.model.model.graph_convs] == [100, 100])
    test1.append(
        [i.rate for i in mp.model_wrapper.model.model.dropouts] == [0.3, 0.3, 0.1])
    test1.append(mp.model_wrapper.model.mode == 'regression')
    test1.append(mp.model_wrapper.model.model.dense.units == 10)

    assert all(test1)


#***********************************************************************************
def test_super_get_train_valid_pred_results():
    """
    Args:
        perf_data: A PerfData object that stores the predicted values and metrics

    Returns:
        dict: A dictionary of the prediction results

    Raises:
        None

    Dependencies:
        create_perf_data

    Calls:
        perf_data.get_prediction_results()
    """
    pass  # should be tested in perf_data.get_prediction_results()
    # should still be called to make sure that the function is callable


#***********************************************************************************
def test_super_get_test_perf_data():
    """
    Args:
        model_dir (str): Directory where the saved model is stored

        model_dataset (DiskDataset): Stores the current dataset and related methods

    Returns:
        perf_data: PerfData object containing the predicted values and metrics
        for the current test dataset

    Raises:
        None

    Dependencies:
        A model must be in model_dir
        model_dataset.test_dset must exist

    Calls:
        create_perf_data
        self.generate_predictions
        perf_data.accumulate_preds
    """
    pass  # mostly tested in accumulate_preds, but should be tested to ensure
    # that the predictions are properly being called


#***********************************************************************************
def test_super_get_test_pred_results():
    """
    Args:
        model_dir (str): Directory where the saved model is stored

        model_dataset (DiskDataset): Stores the current dataset and related methods

    Returns:
        dict: A dictionary containing the prediction values and metrics for the
        current dataset.

    Raises:
        None

    Dependencies:
        A model must be in model_dir
        model_dataset.test_dset must exist

    Calls:
        self.get_test_perf_data
        perf_data.get_prediction_results
    """
    pass  # mostly tested in perf_data.get_prediction_results


#***********************************************************************************
def test_super_get_full_dataset_perf_data():
    """
    Args:
        model_dataset (DiskDataset): Stores the current dataset and related methods

    Returns:
        perf_data: PerfData object containing the predicted values and metrics
        for the current full dataset

    Raises:
        None

    Dependencies:
        A model must already be trained

    Calls:
        create_perf_data
        self.generate_predictions
        self.accumulate_preds
    """
    pass


#***********************************************************************************
def test_super_get_full_dataset_pred_results():
    """
    Args:
        model_dataset (DiskDataset): Stores the current dataset and related methods

    Returns:
        dict: A dictionary containing predicted values and metrics for the
        current full dataset

    Raises:
        None

    Dependencies:
        A model must already be trained.

    Calls:
        get_full_dataset_perf_data
        self.get_prediction_results()
    """
    pass
def train_and_predict(train_json_f, prefix='delaney-processed'):
    # Train model
    # -----------
    # Read parameter JSON file
    with open(train_json_f) as f:
        config = json.loads(f.read())

    # Parse parameters
    params = parse.wrapper(config)

    # Create model pipeline
    model = mp.ModelPipeline(params)

    # Train model
    model.train_model()

    # Get uuid and reload directory
    # -----------------------------
    model_type = params.model_type
    prediction_type = params.prediction_type
    descriptor_type = params.descriptor_type
    featurizer = params.featurizer
    splitter = params.splitter
    model_dir = 'result/%s_curated_fit/%s_%s_%s_%s' % (
        prefix, model_type, featurizer, splitter, prediction_type)
    uuid = model.params.model_uuid
    tar_f = 'result/%s_curated_fit_model_%s.tar.gz' % (prefix, uuid)
    reload_dir = model_dir + '/' + uuid

    # Check training statistics
    # -------------------------
    if prediction_type == 'regression':
        threshold = 0.6
        if 'perf_threshold' in config:
            threshold = float(config['perf_threshold'])
        integrative_utilities.training_statistics_file(reload_dir, 'test',
                                                       threshold, 'r2_score')
        score = integrative_utilities.read_training_statistics_file(
            reload_dir, 'test', 'r2_score')
    else:
        threshold = 0.7
        if 'perf_threshold' in config:
            threshold = float(config['perf_threshold'])
        integrative_utilities.training_statistics_file(reload_dir, 'test',
                                                       threshold, 'accuracy_score')
        score = integrative_utilities.read_training_statistics_file(
            reload_dir, 'test', 'accuracy_score')

    print("Final test score:", score)

    # Load second test set
    # --------------------
    data = pd.read_csv('%s_curated_external.csv' % prefix)

    predict = pfm.predict_from_model_file(tar_f, data,
                                          id_col=params.id_col,
                                          smiles_col=params.smiles_col,
                                          response_col=params.response_cols)
    pred_cols = [f for f in predict.columns if f.endswith('_pred')]
    pred = predict[pred_cols].to_numpy()

    # Check predictions
    # -----------------
    assert (pred.shape[0] == len(data)), 'Error: Incorrect number of predictions'
    assert (np.all(np.isfinite(pred))), 'Error: Predictions are not numbers'

    # Save predictions with experimental values
    # -----------------------------------------
    predict.reset_index(level=0, inplace=True)
    combined = pd.merge(data, predict, on=params.id_col, how='inner')
    pred_csv_name = '%s_curated_%s_%s_%s_%s_%d_%s_predict.csv' % (
        prefix, model_type, prediction_type, descriptor_type, featurizer,
        len(model.params.response_cols), model.params.splitter)
    combined.to_csv(pred_csv_name)
    assert (os.path.isfile(pred_csv_name) and os.path.getsize(pred_csv_name) > 0
            ), 'Error: Prediction file not created'

    return tar_f
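# A minimal usage sketch for train_and_predict(); the config file name below is
# a hypothetical placeholder following the naming pattern of the other test
# configs in this suite.
def example_full_train_and_predict():
    tar_f = train_and_predict('config_delaney_train_RF.json', prefix='delaney-processed')
    print('Model tarball written to', tar_f)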