Example #1
def get_model_training_data_by_uuid(uuid):
    """Retrieve data used to train, validate, and test a model given the uuid

    Args:
        uuid (str): model uuid
    Returns:
        a tuple of dataframes containing the training data, validation data, and test data, including the compound ID, RDKit SMILES, and response value
    """
    model_meta = get_metadata_by_uuid(uuid)
    response_col = model_meta['training_dataset']['response_cols']
    smiles_col = model_meta['training_dataset']['smiles_col']
    full_data  = dsf.retrieve_dataset_by_dataset_oid(model_meta['training_dataset']['dataset_oid'], verbose=False)

    # Pull split data and merge into initial dataset
    split_meta = dsf.search_datasets_by_key_value('split_dataset_uuid', model_meta['splitting_parameters']['Splitting']['split_uuid'])
    split_oid  = split_meta['dataset_oid'].values[0]
    split_data = dsf.retrieve_dataset_by_dataset_oid(split_oid, verbose=False)
    split_data['compound_id'] = split_data['cmpd_id']
    split_data = split_data.drop(columns=['cmpd_id'])
    full_data = pd.merge(full_data, split_data, how='inner', on=['compound_id'])

    train_data = full_data[full_data['subset'] == 'train'][['compound_id',smiles_col,*response_col]].reset_index(drop=True)
    valid_data = full_data[full_data['subset'] == 'valid'][['compound_id',smiles_col,*response_col]].reset_index(drop=True)
    test_data  = full_data[full_data['subset'] == 'test'][['compound_id',smiles_col,*response_col]].reset_index(drop=True)

    return train_data, valid_data, test_data
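
A minimal usage sketch for the function above, assuming the datastore client and model tracker are configured in the calling environment; the uuid shown is a hypothetical placeholder:

train_df, valid_df, test_df = get_model_training_data_by_uuid(
    'xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx')  # hypothetical model uuid
print("train/valid/test rows: %d/%d/%d" %
      (len(train_df), len(valid_df), len(test_df)))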
Example #2
def get_model_training_data_by_uuid(uuid):
    """Retrieve data used to train, validate, and test a model given the uuid

    Args:
        uuid (str): model uuid

    Returns:
        a tuple of dataframes containing the training data, validation data, and test data, including the compound ID, RDKit SMILES, and response value
    """
    if not mlmt_supported:
        print(
            "Model tracker not supported in your environment; can load models from filesystem only."
        )
        return None

    model_meta = get_metadata_by_uuid(uuid)
    response_col = model_meta['training_dataset']['response_cols']
    smiles_col = model_meta['training_dataset']['smiles_col']
    id_col = model_meta['training_dataset']['id_col']
    full_data = dsf.retrieve_dataset_by_dataset_oid(
        model_meta['training_dataset']['dataset_oid'])

    # Pull split data and merge into initial dataset
    split_meta = dsf.search_datasets_by_key_value(
        'split_dataset_uuid', model_meta['splitting_parameters']['split_uuid'])
    split_oid = split_meta['dataset_oid'].values[0]
    split_data = dsf.retrieve_dataset_by_dataset_oid(split_oid)
    split_data['compound_id'] = split_data['cmpd_id']
    split_data = split_data.drop(columns=['cmpd_id'])
    full_data = pd.merge(full_data,
                         split_data,
                         how='inner',
                         left_on=[id_col],
                         right_on=['compound_id'])

    train_data = full_data[full_data['subset'] == 'train'][[
        'compound_id', smiles_col, id_col, *response_col
    ]].reset_index(drop=True)
    valid_data = full_data[full_data['subset'] == 'valid'][[
        'compound_id', smiles_col, id_col, *response_col
    ]].reset_index(drop=True)
    test_data = full_data[full_data['subset'] == 'test'][[
        'compound_id', smiles_col, id_col, *response_col
    ]].reset_index(drop=True)

    return train_data, valid_data, test_data
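
This variant returns None when the model tracker isn't available, so a caller should check the result before unpacking; a minimal sketch, again with a placeholder uuid:

result = get_model_training_data_by_uuid('xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx')
if result is not None:
    train_df, valid_df, test_df = result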
Example #3
def liability_dset_diversity(bucket='gsk_ml', feat_type='descriptors', dist_metric='cosine', **metric_kwargs):
    """
    Load datasets from datastore, featurize them, and plot distributions of their inter-compound
    distances.
    """
    log = logging.getLogger('ATOM')
    ds_client = dsf.config_client()
    ds_table = dsf.search_datasets_by_key_value(key='param', value=['PIC50','PEC50'], operator='in', 
                                                bucket=bucket, client=ds_client)
    dset_keys = ds_table.dataset_key.values
    metadata = ds_table.metadata.values
    split = 'random'
    task_names = []
    num_cmpds = []
    for i, dset_key in enumerate(dset_keys):
        md_dict = dsf.metadata_to_dict(metadata[i])
        task_name = md_dict['task_name']
        num_cmpds = md_dict['CMPD_COUNT'][0]
        log.warning("Loading dataset for %s, %d compounds" % (task_name, num_cmpds))
        dset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket, ds_client)
        dataset_dir = os.path.dirname(dset_key)
        dataset_file = os.path.basename(dset_key)
        if feat_type == 'descriptors':
            params = argparse.Namespace(dataset_dir=dataset_dir,
                            dataset_file=dataset_file,
                            y=task_name,
                            bucket=bucket,
                            descriptor_key='all_GSK_Compound_2D_3D_MOE_Descriptors_Scaled_With_Smiles_And_Inchi',
                            descriptor_type='MOE',
                            splitter=split,
                            id_col='compound_id',
                            smiles_col='rdkit_smiles',
                            featurizer='descriptors',
                            prediction_type='regression', 
                            system='twintron-blue',
                            datastore=True,
                            transformers=True)
        elif feat_type == 'ECFP':
            params = argparse.Namespace(dataset_dir=dataset_dir,
                            dataset_file=dataset_file,
                            y=task_name,
                            bucket=bucket,
                            splitter=split,
                            id_col='compound_id',
                            smiles_col='rdkit_smiles',
                            featurizer='ECFP',
                            prediction_type='regression', 
                            system='twintron-blue',
                            datastore=True,
                            ecfp_radius=2, ecfp_size=1024, 
                            transformers=True)
        else:
            log.error("Feature type %s not supported" % feat_type)
            return
        log.warning("Featurizing data with %s featurizer" % feat_type)
        model_dataset = md.MinimalDataset(params)
        model_dataset.get_featurized_data(dset_df)
        num_cmpds = model_dataset.dataset.X.shape[0]
        if num_cmpds > 50000:
            log.warning("Too many compounds to compute distance matrix: %d" % num_cmpds)
            continue
        plot_dataset_dist_distr(model_dataset.dataset, feat_type, dist_metric, task_name, **metric_kwargs)
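
A hedged usage sketch for liability_dset_diversity; the arguments simply echo its defaults, and datastore access is assumed to be configured:

liability_dset_diversity(bucket='gsk_ml', feat_type='descriptors', dist_metric='cosine')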

def get_dset_diversity(dset_key, ds_client, bucket='gsk_ml', feat_type='descriptors', dist_metric='cosine',
                       **metric_kwargs):
    """
    Load a dataset from the datastore, featurize it, and plot a clustered heatmap of its
    inter-compound distances.
    """
    log = logging.getLogger('ATOM')

    dset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket, ds_client)

    if feat_type == 'descriptors':
        params = parse.wrapper(dict(
            dataset_key=dset_key,
            bucket=bucket,
            descriptor_key='/ds/projdata/gsk_data/GSK_Descriptors/GSK_2D_3D_MOE_Descriptors_By_Variant_ID_With_Base_RDKit_SMILES.feather',
            descriptor_type='moe',
            featurizer='descriptors',
            system='twintron-blue',
            datastore=True,
            transformers=True))
    elif feat_type == 'ECFP':
        params = parse.wrapper(dict(
            dataset_key=dset_key,
            bucket=bucket,
            featurizer='ECFP',
            system='twintron-blue',
            datastore=True,
            ecfp_radius=2,
            ecfp_size=1024,
            transformers=True))
    else:
        log.error("Feature type %s not supported" % feat_type)
        return
    metadata = dsf.get_keyval(dataset_key=dset_key, bucket=bucket)
    if 'id_col' in metadata.keys():
        params.id_col = metadata['id_col']
    if 'param' in metadata.keys():
        params.response_cols = [metadata['param']]
    elif 'response_col' in metadata.keys():
        params.response_cols = [metadata['response_col']]
    elif 'response_cols' in metadata.keys():
        params.response_cols = metadata['response_cols']

    if 'smiles_col' in metadata.keys():
        params.smiles_col = metadata['smiles_col']

    if 'class_number' in metadata.keys():
        params.class_number = metadata['class_number']
    # rstrip('.csv') would strip any trailing '.', 'c', 's' or 'v' characters rather than
    # the '.csv' suffix, so use os.path.splitext to get the dataset name
    params.dataset_name = os.path.splitext(os.path.basename(dset_key))[0]

    log.warning("Featurizing data with %s featurizer" % feat_type)
    featurization = feat.create_featurization(params)
    model_dataset = md.MinimalDataset(params, featurization)
    model_dataset.get_featurized_data(dset_df)
    num_cmpds = model_dataset.dataset.X.shape[0]
    if num_cmpds > 50000:
        log.warning("Too many compounds to compute distance matrix: %d" % num_cmpds)
        return
    # plot_dataset_dist_distr(model_dataset.dataset, feat_type, dist_metric, params.response_cols, **metric_kwargs)
    dists = cd.calc_dist_diskdataset('descriptors', dist_metric, model_dataset.dataset, calc_type='all')
    from scipy.spatial.distance import squareform
    dists = squareform(dists)
    res_dir = '/ds/projdata/gsk_data/model_analysis/'
    plt_dir = '%s/Plots' % res_dir
    file_prefix = os.path.splitext(os.path.basename(dset_key))[0]
    mcs_linkage = linkage(dists, method='complete')
    pdf_path = '%s/%s_mcs_clustermap.pdf' % (plt_dir, file_prefix)
    pdf = PdfPages(pdf_path)
    g = sns.clustermap(dists, row_linkage=mcs_linkage, col_linkage=mcs_linkage, figsize=(12, 12), cmap='plasma')
    pdf.savefig(g.fig)
    pdf.close()
    return dists
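
A minimal sketch of calling get_dset_diversity with a configured datastore client; the dataset key is a hypothetical placeholder, and the return value is a square distance matrix:

ds_client = dsf.config_client()
dists = get_dset_diversity('path/to/dataset.csv', ds_client, bucket='gsk_ml',
                           feat_type='ECFP', dist_metric='cosine')  # hypothetical key
if dists is not None:
    print("distance matrix shape:", dists.shape)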
Example #4
def analyze_split(params,
                  id_col='compound_id',
                  smiles_col='rdkit_smiles',
                  active_col='active'):
    """
    Evaluate the AVE bias for the training/validation and training/test set splits of the given dataset.

    Also show the active frequencies in each subset and for the dataset as a whole.
    id_col, smiles_col and active_col are fallback defaults, used only when the corresponding
    values aren't found in the dataset metadata; when present, the metadata values take precedence.

    Args:
        params (argparse.Namespace): Pipeline parameters.

        id_col (str): Dataset column containing compound IDs.

        smiles_col (str): Dataset column containing SMILES strings.

        active_col (str): Dataset column containing binary classifications.

    Returns:
        :obj:`pandas.DataFrame`: Table of split subsets showing sizes, numbers and fractions of active compounds

    """
    dset_key = params.dataset_key
    bucket = params.bucket
    split_uuid = params.split_uuid

    ds_client = dsf.config_client()
    try:
        split_metadata = dsf.search_datasets_by_key_value('split_dataset_uuid',
                                                          split_uuid,
                                                          ds_client,
                                                          operator='in',
                                                          bucket=bucket)
        split_oid = split_metadata['dataset_oid'].values[0]
        split_df = dsf.retrieve_dataset_by_dataset_oid(split_oid,
                                                       client=ds_client)
    except Exception as e:
        print("Error when loading split file:\n%s" % str(e))
        raise

    try:
        dataset_df = dsf.retrieve_dataset_by_datasetkey(dset_key,
                                                        bucket,
                                                        client=ds_client)
        dataset_meta = dsf.retrieve_dataset_by_datasetkey(dset_key,
                                                          bucket,
                                                          client=ds_client,
                                                          return_metadata=True)
    except Exception as e:
        print("Error when loading dataset:\n%s" % str(e))
        raise
    kv_dict = dsf.get_key_val(dataset_meta['metadata'])
    id_col = kv_dict.get('id_col', id_col)
    smiles_col = kv_dict.get('smiles_col', smiles_col)
    active_col = kv_dict.get('response_col', active_col)

    try:
        print('Dataset has %d unique compound IDs' %
              len(set(dataset_df[id_col].values)))
        print('Split table has %d unique compound IDs' %
              len(set(split_df.cmpd_id.values)))

        dset_df = dataset_df.merge(split_df,
                                   how='inner',
                                   left_on=id_col,
                                   right_on='cmpd_id').drop('cmpd_id', axis=1)
    except Exception as e:
        print("Error when joining dataset with split dataset:\n%s" % str(e))
        raise

    featurization = feat.create_featurization(params)
    data = md.create_model_dataset(params, featurization, ds_client)
    data.get_featurized_data()
    feat_arr = data.dataset.X
    # TODO: impute missing values if necessary
    y = data.dataset.y.flatten()
    if len(set(y) - set([0, 1])) > 0:
        raise ValueError(
            'AVEMinSplitter only works on binary classification datasets')
    ids = data.dataset.ids
    active_ind = np.where(y == 1)[0]
    inactive_ind = np.where(y == 0)[0]
    active_feat = feat_arr[active_ind, :]
    inactive_feat = feat_arr[inactive_ind, :]
    num_active = len(active_ind)
    num_inactive = len(inactive_ind)
    active_ids = ids[active_ind]
    inactive_ids = ids[inactive_ind]
    active_id_ind = dict(zip(active_ids, range(len(active_ids))))
    inactive_id_ind = dict(zip(inactive_ids, range(len(inactive_ids))))
    if params.featurizer == 'ecfp':
        metric = 'jaccard'
    elif params.featurizer == 'graphconv':
        raise ValueError(
            "ave_min splitter doesn't support graphconv features")
    else:
        metric = 'euclidean'

    # Calculate distance thresholds at which the nearest neighbor function should be evaluated
    if metric == 'jaccard':
        max_nn_dist = 1.0
    else:
        nan_mat = np.isnan(feat_arr)
        nnan = np.sum(nan_mat)
        if nnan > 0:
            print('Input feature matrix has %d NaN elements' % nnan)
            not_nan = ~nan_mat
            for i in range(feat_arr.shape[1]):
                feat_arr[nan_mat[:, i], i] = np.mean(feat_arr[not_nan[:, i],
                                                              i])
        nn_dist = np.sort(squareform(pdist(feat_arr, metric)))[:, 1]
        med_nn_dist = np.median(nn_dist)
        max_nn_dist = 3.0 * med_nn_dist
    ndist = 100
    dist_thresh = np.linspace(0.0, max_nn_dist, ndist)

    # Compute distance matrices between subsets
    num_workers = 1
    aa_dist = _calc_dist_mat(active_feat, active_feat, metric, None,
                             num_workers)
    ii_dist = _calc_dist_mat(inactive_feat, inactive_feat, metric, None,
                             num_workers)
    ai_dist = _calc_dist_mat(active_feat, inactive_feat, metric, None,
                             num_workers)
    ia_dist = ai_dist.transpose()

    subsets = sorted(set(dset_df.subset.values))
    subset_active_ind = {}
    subset_inactive_ind = {}

    if 'train' in subsets:
        # this is a TVT split
        subsets = ['train', 'valid', 'test']
        for subset in subsets:
            subset_df = dset_df[dset_df.subset == subset]
            active_df = subset_df[subset_df[active_col] == 1]
            inactive_df = subset_df[subset_df[active_col] == 0]
            subset_active_ids = active_df[id_col].values
            subset_inactive_ids = inactive_df[id_col].values
            subset_active_ind[subset] = [
                active_id_ind[id] for id in subset_active_ids
            ]
            subset_inactive_ind[subset] = [
                inactive_id_ind[id] for id in subset_inactive_ids
            ]

        taI = subset_active_ind['train']
        tiI = subset_inactive_ind['train']
        print("Results for %s split with %s %s features:" %
              (params.splitter, params.descriptor_type, params.featurizer))
        for valid_set in ['valid', 'test']:
            vaI = subset_active_ind[valid_set]
            viI = subset_inactive_ind[valid_set]
            split_params = ((vaI, viI, taI, tiI), aa_dist, ii_dist, ai_dist,
                            ia_dist, dist_thresh)
            _plot_nn_dist_distr(split_params)
            bias = _plot_bias(split_params, niter=0)
            print("For train/%s split: AVE bias = %.5f" % (valid_set, bias))
    else:
        # TODO: deal with k-fold splits later
        print('k-fold CV splits not supported yet')
        return

    # Tabulate the fractions of actives in the full dataset and each subset
    subset_list = []
    size_list = []
    frac_list = []
    active_frac_list = []

    dset_size = data.dataset.X.shape[0]
    dset_active = sum(data.dataset.y)
    subset_list.append('full dataset')
    size_list.append(dset_size)
    frac_list.append(1.0)
    active_frac_list.append(dset_active / dset_size)

    for subset in subsets:
        active_size = len(subset_active_ind[subset])
        inactive_size = len(subset_inactive_ind[subset])
        subset_size = active_size + inactive_size
        active_frac = active_size / subset_size
        subset_list.append(subset)
        size_list.append(subset_size)
        frac_list.append(subset_size / dset_size)
        active_frac_list.append(active_frac)
    frac_df = pd.DataFrame(
        dict(subset=subset_list,
             size=size_list,
             fraction=frac_list,
             active_frac=active_frac_list))
    print('\nSplit subsets:')
    print(frac_df)

    return frac_df
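
A sketch of how analyze_split might be invoked; the Namespace fields shown are the ones the function references directly, their values are placeholders, and additional pipeline parameters may be required by feat.create_featurization and md.create_model_dataset:

import argparse
params = argparse.Namespace(
    dataset_key='path/to/dataset.csv',                  # placeholder
    bucket='public',                                    # placeholder
    split_uuid='xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx',  # placeholder
    splitter='scaffold',
    featurizer='ecfp',
    descriptor_type='moe')
frac_df = analyze_split(params)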
Example #5
def _liability_dset_diversity(bucket='public',
                              feat_type='descriptors',
                              dist_metric='cosine',
                              **metric_kwargs):
    """
    Load datasets from datastore, featurize them, and plot distributions of their inter-compound
    distances.
    """
    log = logging.getLogger('ATOM')
    ds_client = dsf.config_client()
    ds_table = dsf.search_datasets_by_key_value(key='param',
                                                value=['PIC50', 'PEC50'],
                                                operator='in',
                                                bucket=bucket,
                                                client=ds_client)
    dset_keys = ds_table.dataset_key.values
    metadata = ds_table.metadata.values
    split = 'random'
    task_names = []
    num_cmpds = []
    for i, dset_key in enumerate(dset_keys):
        md_dict = dsf.metadata_to_dict(metadata[i])
        task_name = md_dict['task_name']
        num_cmpds = md_dict['CMPD_COUNT'][0]
        log.warning("Loading dataset for %s, %d compounds" %
                    (task_name, num_cmpds))
        dset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket,
                                                     ds_client)
        dataset_dir = os.path.dirname(dset_key)
        dataset_file = os.path.basename(dset_key)
        if feat_type == 'descriptors':
            params = argparse.Namespace(
                dataset_dir=dataset_dir,
                dataset_file=dataset_file,
                y=task_name,
                bucket=bucket,
                descriptor_key=
                'all_GSK_Compound_2D_3D_MOE_Descriptors_Scaled_With_Smiles_And_Inchi',
                descriptor_type='MOE',
                splitter=split,
                id_col='compound_id',
                smiles_col='rdkit_smiles',
                featurizer='descriptors',
                prediction_type='regression',
                system='twintron-blue',
                datastore=True,
                transformers=True)
        elif feat_type == 'ECFP':
            params = argparse.Namespace(dataset_dir=dataset_dir,
                                        dataset_file=dataset_file,
                                        y=task_name,
                                        bucket=bucket,
                                        splitter=split,
                                        id_col='compound_id',
                                        smiles_col='rdkit_smiles',
                                        featurizer='ECFP',
                                        prediction_type='regression',
                                        system='twintron-blue',
                                        datastore=True,
                                        ecfp_radius=2,
                                        ecfp_size=1024,
                                        transformers=True)
        else:
            log.error("Feature type %s not supported" % feat_type)
            return
        log.warning("Featurizing data with %s featurizer" % feat_type)
        model_dataset = md.MinimalDataset(params)
        model_dataset.get_featurized_data(dset_df)
        num_cmpds = model_dataset.dataset.X.shape[0]
        if num_cmpds > 50000:
            log.warning("Too many compounds to compute distance matrix: %d" %
                        num_cmpds)
            continue
        plot_dataset_dist_distr(model_dataset.dataset, feat_type, dist_metric,
                                task_name, **metric_kwargs)
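
The trailing **metric_kwargs are forwarded through plot_dataset_dist_distr to the distance computation, so metric-specific options can be supplied; a hedged sketch assuming the underlying routine accepts scipy-style keyword arguments such as Minkowski's p:

_liability_dset_diversity(bucket='public', feat_type='descriptors',
                          dist_metric='minkowski', p=3)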