def get_model_training_data_by_uuid(uuid):
    """Retrieve the data used to train, validate, and test a model, given its UUID.

    Args:
        uuid (str): Model UUID.

    Returns:
        tuple: Dataframes containing the training, validation, and test data, each including
        the compound ID, RDKit SMILES, and response value columns.
    """
    if not mlmt_supported:
        print("Model tracker not supported in your environment; can load models from filesystem only.")
        return None

    model_meta = get_metadata_by_uuid(uuid)
    response_col = model_meta['training_dataset']['response_cols']
    smiles_col = model_meta['training_dataset']['smiles_col']
    id_col = model_meta['training_dataset']['id_col']
    full_data = dsf.retrieve_dataset_by_dataset_oid(model_meta['training_dataset']['dataset_oid'])

    # Pull the split table and merge it into the initial dataset
    split_meta = dsf.search_datasets_by_key_value('split_dataset_uuid',
                                                  model_meta['splitting_parameters']['split_uuid'])
    split_oid = split_meta['dataset_oid'].values[0]
    split_data = dsf.retrieve_dataset_by_dataset_oid(split_oid)
    split_data['compound_id'] = split_data['cmpd_id']
    split_data = split_data.drop(columns=['cmpd_id'])
    full_data = pd.merge(full_data, split_data, how='inner', left_on=[id_col], right_on=['compound_id'])

    # Partition the merged dataset by split subset
    subset_cols = ['compound_id', smiles_col, id_col, *response_col]
    train_data = full_data[full_data['subset'] == 'train'][subset_cols].reset_index(drop=True)
    valid_data = full_data[full_data['subset'] == 'valid'][subset_cols].reset_index(drop=True)
    test_data = full_data[full_data['subset'] == 'test'][subset_cols].reset_index(drop=True)

    return train_data, valid_data, test_data
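# Example usage (a minimal sketch, assuming a configured model tracker connection;
# the UUID below is a hypothetical placeholder):
#
#     train_df, valid_df, test_df = get_model_training_data_by_uuid(
#         '00000000-0000-0000-0000-000000000000')
#     print("train/valid/test sizes: %d/%d/%d" % (len(train_df), len(valid_df), len(test_df)))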
# ------------------------------------------------------------------------------------------------------------------
def get_dset_diversity(dset_key, ds_client, bucket='gsk_ml', feat_type='descriptors', dist_metric='cosine',
                       **metric_kwargs):
    """Load a dataset from the datastore, featurize it, and plot the distribution of its
    inter-compound distances.
    """
""" log = logging.getLogger('ATOM') dset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket, ds_client) if feat_type == 'descriptors': params = parse.wrapper(dict( dataset_key=dset_key, bucket=bucket, descriptor_key='/ds/projdata/gsk_data/GSK_Descriptors/GSK_2D_3D_MOE_Descriptors_By_Variant_ID_With_Base_RDKit_SMILES.feather', descriptor_type='moe', featurizer='descriptors', system='twintron-blue', datastore=True, transformers=True)) elif feat_type == 'ECFP': params = parse.wrapper(dict( dataset_key=dset_key, bucket=bucket, featurizer='ECFP', system='twintron-blue', datastore=True, ecfp_radius=2, ecfp_size=1024, transformers=True)) else: log.error("Feature type %s not supported" % feat_type) return metadata = dsf.get_keyval(dataset_key=dset_key, bucket=bucket) if 'id_col' in metadata.keys(): params.id_col = metadata['id_col'] if 'param' in metadata.keys(): params.response_cols = [metadata['param']] elif 'response_col' in metadata.keys(): params.response_cols = [metadata['response_col']] elif 'response_cols' in metadata.keys(): params.response_cols = metadata['response_cols'] if 'smiles_col' in metadata.keys(): params.smiles_col = metadata['smiles_col'] if 'class_number' in metadata.keys(): params.class_number = metadata['class_number'] params.dataset_name = dset_key.split('/')[-1].rstrip('.csv') log.warning("Featurizing data with %s featurizer" % feat_type) featurization = feat.create_featurization(params) model_dataset = md.MinimalDataset(params, featurization) model_dataset.get_featurized_data(dset_df) num_cmpds = model_dataset.dataset.X.shape[0] if num_cmpds > 50000: log.warning("Too many compounds to compute distance matrix: %d" % num_cmpds) return # plot_dataset_dist_distr(model_dataset.dataset, feat_type, dist_metric, params.response_cols, **metric_kwargs) dists = cd.calc_dist_diskdataset('descriptors', dist_metric, model_dataset.dataset, calc_type='all') import scipy dists = scipy.spatial.distance.squareform(dists) res_dir = '/ds/projdata/gsk_data/model_analysis/' plt_dir = '%s/Plots' % res_dir file_prefix = dset_key.split('/')[-1].rstrip('.csv') mcs_linkage = linkage(dists, method='complete') pdf_path = '%s/%s_mcs_clustermap.pdf' % (plt_dir, file_prefix) pdf = PdfPages(pdf_path) g = sns.clustermap(dists, row_linkage=mcs_linkage, col_linkage=mcs_linkage, figsize=(12, 12), cmap='plasma') if plt_dir is not None: pdf.savefig(g.fig) pdf.close() return dists
def analyze_split(params, id_col='compound_id', smiles_col='rdkit_smiles', active_col='active'):
    """Evaluate the AVE bias for the training/validation and training/test set splits of the given
    dataset. Also show the active frequencies in each subset and for the dataset as a whole.
    id_col, smiles_col and active_col are defaults to be used in case they aren't found in the
    dataset metadata; if found, the metadata values are used instead.

    Args:
        params (argparse.Namespace): Pipeline parameters.

        id_col (str): Dataset column containing compound IDs.

        smiles_col (str): Dataset column containing SMILES strings.

        active_col (str): Dataset column containing binary classifications.

    Returns:
        :obj:`pandas.DataFrame`: Table of split subsets showing sizes, numbers and fractions of
        active compounds.
    """
    log = logging.getLogger('ATOM')
    dset_key = params.dataset_key
    bucket = params.bucket
    split_uuid = params.split_uuid
    ds_client = dsf.config_client()

    # Load the split table and the dataset it refers to from the datastore
    try:
        split_metadata = dsf.search_datasets_by_key_value('split_dataset_uuid', split_uuid, ds_client,
                                                          operator='in', bucket=bucket)
        split_oid = split_metadata['dataset_oid'].values[0]
        split_df = dsf.retrieve_dataset_by_dataset_oid(split_oid, client=ds_client)
    except Exception as e:
        print("Error when loading split file:\n%s" % str(e))
        raise
    try:
        dataset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket, client=ds_client)
        dataset_meta = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket, client=ds_client,
                                                          return_metadata=True)
    except Exception as e:
        print("Error when loading dataset:\n%s" % str(e))
        raise

    # Prefer the column names recorded in the dataset metadata over the defaults
    kv_dict = dsf.get_key_val(dataset_meta['metadata'])
    id_col = kv_dict.get('id_col', id_col)
    smiles_col = kv_dict.get('smiles_col', smiles_col)
    active_col = kv_dict.get('response_col', active_col)

    try:
        print('Dataset has %d unique compound IDs' % len(set(dataset_df[id_col].values)))
        print('Split table has %d unique compound IDs' % len(set(split_df.cmpd_id.values)))
        dset_df = dataset_df.merge(split_df, how='inner', left_on=id_col,
                                   right_on='cmpd_id').drop('cmpd_id', axis=1)
    except Exception as e:
        print("Error when joining dataset with split dataset:\n%s" % str(e))
        raise

    # Featurize the dataset and partition the feature matrix into actives and inactives
    featurization = feat.create_featurization(params)
    data = md.create_model_dataset(params, featurization, ds_client)
    data.get_featurized_data()
    feat_arr = data.dataset.X
    # TODO: impute missing values if necessary
    y = data.dataset.y.flatten()
    if len(set(y) - set([0, 1])) > 0:
        raise ValueError('AVEMinSplitter only works on binary classification datasets')
    ids = data.dataset.ids
    active_ind = np.where(y == 1)[0]
    inactive_ind = np.where(y == 0)[0]
    active_feat = feat_arr[active_ind, :]
    inactive_feat = feat_arr[inactive_ind, :]
    num_active = len(active_ind)
    num_inactive = len(inactive_ind)
    active_ids = ids[active_ind]
    inactive_ids = ids[inactive_ind]
    active_id_ind = dict(zip(active_ids, range(num_active)))
    inactive_id_ind = dict(zip(inactive_ids, range(num_inactive)))

    if params.featurizer == 'ecfp':
        metric = 'jaccard'
    elif params.featurizer == 'graphconv':
        raise ValueError("ave_min splitter doesn't support graphconv features")
    else:
        metric = 'euclidean'

    # Calculate the distance thresholds at which the nearest-neighbor function will be evaluated
    if metric == 'jaccard':
        max_nn_dist = 1.0
    else:
        # Replace NaN feature values with the column means before computing distances
        nan_mat = np.isnan(feat_arr)
        nnan = np.sum(nan_mat)
        if nnan > 0:
            log.info('Input feature matrix has %d NaN elements' % nnan)
            not_nan = ~nan_mat
            for i in range(feat_arr.shape[1]):
                feat_arr[nan_mat[:, i], i] = np.mean(feat_arr[not_nan[:, i], i])
        nn_dist = np.sort(squareform(pdist(feat_arr, metric)))[:, 1]
        med_nn_dist = np.median(nn_dist)
        max_nn_dist = 3.0 * med_nn_dist
    ndist = 100
    dist_thresh = np.linspace(0.0, max_nn_dist, ndist)

    # Compute distance matrices between the active/inactive subsets
    num_workers = 1
    aa_dist = _calc_dist_mat(active_feat, active_feat, metric, None, num_workers)
    ii_dist = _calc_dist_mat(inactive_feat, inactive_feat, metric, None, num_workers)
    ai_dist = _calc_dist_mat(active_feat, inactive_feat, metric, None, num_workers)
    ia_dist = ai_dist.transpose()

    subsets = sorted(set(dset_df.subset.values))
    subset_active_ind = {}
    subset_inactive_ind = {}
    if 'train' in subsets:
        # This is a train/valid/test (TVT) split
        subsets = ['train', 'valid', 'test']
        for subset in subsets:
            subset_df = dset_df[dset_df.subset == subset]
            active_df = subset_df[subset_df[active_col] == 1]
            inactive_df = subset_df[subset_df[active_col] == 0]
            subset_active_ids = active_df[id_col].values
            subset_inactive_ids = inactive_df[id_col].values
            subset_active_ind[subset] = [active_id_ind[cmpd_id] for cmpd_id in subset_active_ids]
            subset_inactive_ind[subset] = [inactive_id_ind[cmpd_id] for cmpd_id in subset_inactive_ids]
        taI = subset_active_ind['train']
        tiI = subset_inactive_ind['train']
        print("Results for %s split with %s %s features:" % (params.splitter, params.descriptor_type,
                                                             params.featurizer))
        for valid_set in ['valid', 'test']:
            vaI = subset_active_ind[valid_set]
            viI = subset_inactive_ind[valid_set]
            split_params = ((vaI, viI, taI, tiI), aa_dist, ii_dist, ai_dist, ia_dist, dist_thresh)
            _plot_nn_dist_distr(split_params)
            bias = _plot_bias(split_params, niter=0)
            print("For train/%s split: AVE bias = %.5f" % (valid_set, bias))
    else:
        # TODO: deal with k-fold splits later
        print('k-fold CV splits not supported yet')
        return

    # Tabulate the fraction of active compounds in the full dataset and each subset
    subset_list = []
    size_list = []
    frac_list = []
    active_frac_list = []

    dset_size = data.dataset.X.shape[0]
    dset_active = sum(data.dataset.y)
    subset_list.append('full dataset')
    size_list.append(dset_size)
    frac_list.append(1.0)
    active_frac_list.append(dset_active / dset_size)
    for subset in subsets:
        active_size = len(subset_active_ind[subset])
        inactive_size = len(subset_inactive_ind[subset])
        subset_size = active_size + inactive_size
        active_frac = active_size / subset_size
        subset_list.append(subset)
        size_list.append(subset_size)
        frac_list.append(subset_size / dset_size)
        active_frac_list.append(active_frac)
    frac_df = pd.DataFrame(dict(subset=subset_list, size=size_list, fraction=frac_list,
                                active_frac=active_frac_list))
    print('\nSplit subsets:')
    print(frac_df)
    return frac_df
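# Example usage (a minimal sketch; the dataset key, bucket, and split UUID are hypothetical
# placeholders, and parse.wrapper() is assumed to fill in the remaining pipeline defaults):
#
#     params = parse.wrapper(dict(
#         dataset_key='gsk_ml/some_assay/dataset.csv',
#         bucket='gsk_ml',
#         split_uuid='00000000-0000-0000-0000-000000000000',
#         splitter='ave_min',
#         featurizer='ecfp',
#         prediction_type='classification',
#         response_cols=['active'],
#         datastore=True))
#     frac_df = analyze_split(params, active_col='active')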
def _liability_dset_diversity(bucket='public', feat_type='descriptors', dist_metric='cosine', **metric_kwargs):
    """Load datasets from the datastore, featurize them, and plot distributions of their
    inter-compound distances.
    """
    log = logging.getLogger('ATOM')
    ds_client = dsf.config_client()
    ds_table = dsf.search_datasets_by_key_value(key='param', value=['PIC50', 'PEC50'], operator='in',
                                                bucket=bucket, client=ds_client)
    dset_keys = ds_table.dataset_key.values
    metadata = ds_table.metadata.values
    split = 'random'
    for i, dset_key in enumerate(dset_keys):
        md_dict = dsf.metadata_to_dict(metadata[i])
        task_name = md_dict['task_name']
        num_cmpds = md_dict['CMPD_COUNT'][0]
        log.warning("Loading dataset for %s, %d compounds" % (task_name, num_cmpds))
        dset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket, ds_client)
        dataset_dir = os.path.dirname(dset_key)
        dataset_file = os.path.basename(dset_key)
        if feat_type == 'descriptors':
            params = argparse.Namespace(
                dataset_dir=dataset_dir,
                dataset_file=dataset_file,
                y=task_name,
                bucket=bucket,
                descriptor_key='all_GSK_Compound_2D_3D_MOE_Descriptors_Scaled_With_Smiles_And_Inchi',
                descriptor_type='MOE',
                splitter=split,
                id_col='compound_id',
                smiles_col='rdkit_smiles',
                featurizer='descriptors',
                prediction_type='regression',
                system='twintron-blue',
                datastore=True,
                transformers=True)
        elif feat_type == 'ECFP':
            params = argparse.Namespace(
                dataset_dir=dataset_dir,
                dataset_file=dataset_file,
                y=task_name,
                bucket=bucket,
                splitter=split,
                id_col='compound_id',
                smiles_col='rdkit_smiles',
                featurizer='ECFP',
                prediction_type='regression',
                system='twintron-blue',
                datastore=True,
                ecfp_radius=2,
                ecfp_size=1024,
                transformers=True)
        else:
            log.error("Feature type %s not supported" % feat_type)
            return
        log.warning("Featurizing data with %s featurizer" % feat_type)
        model_dataset = md.MinimalDataset(params)
        model_dataset.get_featurized_data(dset_df)
        num_cmpds = model_dataset.dataset.X.shape[0]
        if num_cmpds > 50000:
            log.warning("Too many compounds to compute distance matrix: %d" % num_cmpds)
            continue
        plot_dataset_dist_distr(model_dataset.dataset, feat_type, dist_metric, task_name, **metric_kwargs)
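# Example usage (a minimal sketch, assuming datastore access to the given bucket):
#
#     _liability_dset_diversity(bucket='public', feat_type='descriptors', dist_metric='cosine')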