Example #1
def get_model_collection_by_uuid(uuid, client_wrapper=None):
    """Retrieve model collection given a uuid.

    Retrieve model collection given a uuid.

    Args:
        uuid (str): model uuid
    Returns:
        Matching collection name
    """

    if client_wrapper is None:
        client_wrapper = mlmt_client_wrapper.MLMTClientWrapper(
            ds_client=dsf.config_client())
        client_wrapper.instantiate_mlmt_client()

    collection = 'Collection not found for uuid: ' + uuid
    colls = client_wrapper.get_collection_names({})
    for col in colls['matching_collection_names']:
        model_meta = list(
            get_full_metadata({"model_uuid": uuid},
                              client_wrapper=client_wrapper,
                              collection_name=col))
        if model_meta != []:
            collection = col
            break

    return collection
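A minimal usage sketch (the UUID is hypothetical; module-level imports are assumed as in the function above). Because the function returns a sentinel string rather than raising when nothing matches, a caller would typically check for it:

model_uuid = '1234-abcd-fake-uuid'  # hypothetical UUID
coll = get_model_collection_by_uuid(model_uuid)
if coll.startswith('Collection not found'):
    print('No collection contains model %s' % model_uuid)
else:
    print('Model %s found in collection %s' % (model_uuid, coll))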
Example #2
def get_metadata_by_uuid(uuid,
                         client_wrapper=None,
                         collection_name=None,
                         log=False):
    """Retrieve relevant model metadata by uuid.

    Retrieve metadata matching given uuid

    Args:
        uuid (str): model uuid
        collection(str): collection to search (optional, searches all collections if not specified)
    Returns:
        Matching metadata dictionary. Raises MongoQueryException if the query fails.
    """

    if client_wrapper is None:
        client_wrapper = mlmt_client_wrapper.MLMTClientWrapper(
            ds_client=dsf.config_client())
        client_wrapper.instantiate_mlmt_client()

    if not client_wrapper.mlmt_client:
        raise Exception('mlmt_client failed to instantiate')

    if collection_name is None:
        collection_name = get_model_collection_by_uuid(
            uuid, client_wrapper=client_wrapper)

    model_meta = list(
        get_full_metadata({"model_uuid": uuid},
                          client_wrapper=client_wrapper,
                          collection_name=collection_name))

    return model_meta[0]
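A hedged usage sketch: when making several queries it may be cheaper to build one client wrapper and pass it to both functions above, so the MLMT client is instantiated only once (the UUID is hypothetical; mlmt_client_wrapper and dsf come from the same module-level imports the functions rely on):

client_wrapper = mlmt_client_wrapper.MLMTClientWrapper(ds_client=dsf.config_client())
client_wrapper.instantiate_mlmt_client()
model_uuid = '1234-abcd-fake-uuid'  # hypothetical UUID
collection = get_model_collection_by_uuid(model_uuid, client_wrapper=client_wrapper)
meta = get_metadata_by_uuid(model_uuid, client_wrapper=client_wrapper,
                            collection_name=collection)
print(meta.get('model_parameters'))  # present for new-style metadata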
Example #3
def get_metadata(filter_dict,
                 client_wrapper=None,
                 collection_name='model_tracker',
                 log=False):
    """Retrieve relevant metadata.

    Retrieve metadata matching given criteria.

    Args:
        filter_dict (dict): dictionary to filter on

    Returns:
        A list of matching metadata dictionaries. Raises MongoQueryException if
        the query fails.
    """

    if filter_dict is None:
        raise Exception('filter_dict cannot be None.')
    if client_wrapper is None:
        client_wrapper = mlmt_client_wrapper.MLMTClientWrapper(
            ds_client=dsf.config_client())
        client_wrapper.instantiate_mlmt_client()

    # Temporarily add collection_name key. The model tracker will use this key
    # internally and pop it from the dict.
    filter_dict['collection_name'] = collection_name
    gen = client_wrapper.get_metadata_generator(filter_dict=filter_dict,
                                                log=log)
    if log:
        print('Successfully constructed metadata generator.')
    return gen
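Since get_metadata returns a generator rather than a list, callers iterate it or materialize it with list(). A minimal sketch (the filter key and value are hypothetical, modeled on the 'model_uuid' filter used in the examples above):

filter_dict = {'model_uuid': '1234-abcd-fake-uuid'}  # hypothetical filter
for meta in get_metadata(filter_dict, collection_name='model_tracker'):
    print(meta.get('model_uuid'))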
Example #4
def extract_datastore_model_tarball(model_uuid, model_bucket, output_dir,
                                    model_dir):
    """
    Load a model tarball saved in the datastore and check the format. If it is a new style tarball (containing
    the model metadata and transformers along with the model state), unpack it into output_dir. Otherwise
    it contains the model state only; unpack it into model_dir.

    Args:
        model_uuid (str): UUID of model to be retrieved

        model_bucket (str): Datastore bucket containing model tarball file

        output_dir (str): Output directory to unpack tarball into if it's in the new format

        model_dir (str): Output directory to unpack tarball into if it's in the old format

    Returns:
        extract_dir (str): The directory (output_dir or model_dir) the tarball was extracted into.
    """

    ds_client = dsf.config_client()
    model_dataset_key = 'model_%s_tarball' % model_uuid

    # Look at the tarball contents and figure out which format it's in. If it already has the metadata.json
    # and transformers, extract it into output_dir; otherwise into model_dir.
    with ds_client.open_bucket_dataset(model_bucket,
                                       model_dataset_key,
                                       mode='b') as dstore_fp:
        with tarfile.open(fileobj=dstore_fp, mode='r:gz') as tfile:
            tar_contents = tfile.getnames()
    if './model_metadata.json' in tar_contents:
        extract_dir = output_dir
    else:
        extract_dir = model_dir
    os.makedirs(extract_dir, exist_ok=True)

    with ds_client.open_bucket_dataset(model_bucket,
                                       model_dataset_key,
                                       mode='b') as dstore_fp:
        with tarfile.open(fileobj=dstore_fp, mode='r:gz') as tfile:
            tfile.extractall(path=extract_dir)
    logger.info(f"Extracted model tarball contents to {extract_dir}")
    return extract_dir
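A usage sketch (bucket name, UUID and output paths are hypothetical):

extract_dir = extract_datastore_model_tarball(
    '1234-abcd-fake-uuid',         # hypothetical model UUID
    'some_model_bucket',           # hypothetical datastore bucket
    output_dir='/tmp/model_full',  # used for new-style tarballs
    model_dir='/tmp/model_state')  # used for old-style (state-only) tarballs
print('Tarball extracted into', extract_dir)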
Example #5
def _get_descriptors(smiles_arr):
    """
    DEPRECATED. This function is guaranteed not to work, since it refers to datasets that no longer exist.
    """
    ds_client = dsf.config_client()

    full_feature_matrix_key = '/ds/projdata/gsk_data/GSK_datasets/eXP_Panel_Min_100_Cmpds/scaled_descriptors/' \
                              'subset_all_GSK_Compound_2D_3D_MOE_Descriptors_Scaled_With_Smiles_And_Inchi_HTR2A_5_' \
                              'HT2A_Human_Antagonist_HEK_Luminescence_f_PIC50.csv'
    full_feature_matrix = dsf.retrieve_dataset_by_datasetkey(full_feature_matrix_key, 'gskdata', ds_client)
    smiles_df = pd.DataFrame(smiles_arr)
    #df = full_feature_matrix.merge(
    #    smiles_df, how='inner', left_on='smiles', right_on=smiles_df.columns[0])
    df = full_feature_matrix.head(20)
    del full_feature_matrix
    descriptor_features = [x for x in df.columns.values.tolist() if x not in
                               ['compound_id', 'inchi_key', 'smiles', 'smiles_out',
                                'lost_frags', 'inchi_string', 'pxc50', 'rdkit_smiles',
                                'HTR2A_5_HT2A_Human_Antagonist_HEK_Luminescence_f_PIC50']]
    #TODO this probably doesn't work
    return df[descriptor_features]
Example #6
def export_model(model_uuid, collection, model_dir, alt_bucket='CRADA'):
    """
    Export the metadata (parameters) and other files needed to recreate a model
    from the model tracker database to a gzipped tar archive.

    Args:
        model_uuid (str): Model unique identifier

        collection (str): Name of the collection holding the model in the database.

        model_dir (str): Path to directory where the model metadata and parameter files will be written. The directory will
        be created if it doesn't already exist. Subsequently, the directory contents will be packed into a gzipped tar archive
        named model_dir.tar.gz.

        alt_bucket (str): Alternate datastore bucket to search for model tarball and transformer objects.

    Returns:
        none
    """
    if not mlmt_supported:
        print(
            "Model tracker not supported in your environment; can load models from filesystem only."
        )
        return

    ds_client = dsf.config_client()
    metadata_dict = get_metadata_by_uuid(model_uuid,
                                         collection_name=collection)

    # Get the tarball containing the saved model from the datastore, and extract it into model_dir.
    if 'ModelMetadata' in metadata_dict:
        # Convert old style metadata
        metadata_dict = convert_metadata(metadata_dict)

    if 'model_parameters' in metadata_dict:
        model_parameters = metadata_dict['model_parameters']
    else:
        raise Exception("Bad metadata for model UUID %s" % model_uuid)

    os.makedirs(model_dir, exist_ok=True)

    model_params = parse.wrapper(metadata_dict)

    # Override selected model training parameters

    # Check that buckets where model tarball and transformers were saved still exist. If not, try alt_bucket.
    trans_bucket_differs = (model_params.transformer_bucket !=
                            model_params.model_bucket)
    model_bucket_meta = ds_client.ds_buckets.get_buckets(
        buckets=[model_params.model_bucket]).result()
    if len(model_bucket_meta) == 0:
        model_params.model_bucket = alt_bucket
    if trans_bucket_differs:
        trans_bucket_meta = ds_client.ds_buckets.get_buckets(
            buckets=[model_params.transformer_bucket]).result()
        if len(trans_bucket_meta) == 0:
            model_params.transformer_bucket = alt_bucket
    else:
        if len(model_bucket_meta) == 0:
            model_params.transformer_bucket = alt_bucket

    # Unpack the model state tarball into a subdirectory of the new archive
    model_dataset_key = 'model_%s_tarball' % model_uuid
    extract_dir = dsf.retrieve_dataset_by_datasetkey(model_dataset_key,
                                                     model_params.model_bucket,
                                                     client=ds_client,
                                                     return_metadata=False,
                                                     nrows=None,
                                                     print_metadata=False,
                                                     sep=False,
                                                     tarpath='%s/best_model' %
                                                     model_dir)

    # Download the transformers pickle file if there is one
    if trans.transformers_needed(model_params):
        try:
            if model_params.transformer_key is None:
                transformer_key = 'transformers_%s.pkl' % model_uuid
            else:
                transformer_key = model_params.transformer_key
            trans_fp = ds_client.open_bucket_dataset(
                model_params.transformer_bucket, transformer_key, mode='b')
            trans_data = trans_fp.read()
            trans_fp.close()
            trans_path = "%s/transformers.pkl" % model_dir
            trans_out = open(trans_path, mode='wb')
            trans_out.write(trans_data)
            trans_out.close()
            # transformer_oid may be absent if the transformer was stored under a key instead
            model_parameters.pop('transformer_oid', None)
            model_parameters['transformer_key'] = 'transformers.pkl'

        except:
            print(
                "Transformers expected but not found in datastore in bucket %s with key\n%s"
                % (model_params.transformer_bucket, transformer_key))
            raise

    # Save the metadata params
    model_parameters['save_results'] = False
    meta_path = "%s/model_metadata.json" % model_dir
    with open(meta_path, 'w') as meta_out:
        json.dump(metadata_dict, meta_out, indent=4)

    # Create a new tarball containing both the metadata and the parameters from the retrieved model tarball
    new_tarpath = "%s.tar.gz" % model_dir
    tarball = tarfile.open(new_tarpath, mode='w:gz')
    tarball.add(model_dir, arcname='.')
    tarball.close()
    print("Wrote model files to %s" % new_tarpath)
Example #7
def liability_dset_diversity(bucket='gsk_ml', feat_type='descriptors', dist_metric='cosine', **metric_kwargs):
    """
    Load datasets from datastore, featurize them, and plot distributions of their inter-compound
    distances.
    """
    log = logging.getLogger('ATOM')
    ds_client = dsf.config_client()
    ds_table = dsf.search_datasets_by_key_value(key='param', value=['PIC50','PEC50'], operator='in', 
                                                bucket=bucket, client=ds_client)
    dset_keys = ds_table.dataset_key.values
    metadata = ds_table.metadata.values
    split = 'random'
    task_names = []
    num_cmpds = []
    for i, dset_key in enumerate(dset_keys):
        md_dict = dsf.metadata_to_dict(metadata[i])
        task_name = md_dict['task_name']
        num_cmpds = md_dict['CMPD_COUNT'][0]
        log.warning("Loading dataset for %s, %d compounds" % (task_name, num_cmpds))
        dset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket, ds_client)
        dataset_dir = os.path.dirname(dset_key)
        dataset_file = os.path.basename(dset_key)
        if feat_type == 'descriptors':
            params = argparse.Namespace(dataset_dir=dataset_dir,
                            dataset_file=dataset_file,
                            y=task_name,
                            bucket=bucket,
                            descriptor_key='all_GSK_Compound_2D_3D_MOE_Descriptors_Scaled_With_Smiles_And_Inchi',
                            descriptor_type='MOE',
                            splitter=split,
                            id_col='compound_id',
                            smiles_col='rdkit_smiles',
                            featurizer='descriptors',
                            prediction_type='regression', 
                            system='twintron-blue',
                            datastore=True,
                            transformers=True)
        elif feat_type == 'ECFP':
            params = argparse.Namespace(dataset_dir=dataset_dir,
                            dataset_file=dataset_file,
                            y=task_name,
                            bucket=bucket,
                            splitter=split,
                            id_col='compound_id',
                            smiles_col='rdkit_smiles',
                            featurizer='ECFP',
                            prediction_type='regression', 
                            system='twintron-blue',
                            datastore=True,
                            ecfp_radius=2, ecfp_size=1024, 
                            transformers=True)
        else:
            log.error("Feature type %s not supported" % feat_type)
            return
        log.warning("Featurizing data with %s featurizer" % feat_type)
        model_dataset = md.MinimalDataset(params)
        model_dataset.get_featurized_data(dset_df)
        num_cmpds = model_dataset.dataset.X.shape[0]
        if num_cmpds > 50000:
            log.warning("Too many compounds to compute distance matrix: %d" % num_cmpds)
            continue
        plot_dataset_dist_distr(model_dataset.dataset, feat_type, dist_metric, task_name, **metric_kwargs)

# ------------------------------------------------------------------------------------------------------------------
def get_dset_diversity(dset_key, ds_client, bucket='gsk_ml', feat_type='descriptors', dist_metric='cosine',
                       **metric_kwargs):
    """
    Load a single dataset from the datastore, featurize it, compute its inter-compound distance
    matrix, plot a clustered heatmap of the distances, and return the distance matrix.
    """
    log = logging.getLogger('ATOM')

    dset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket, ds_client)

    if feat_type == 'descriptors':
        params = parse.wrapper(dict(
            dataset_key=dset_key,
            bucket=bucket,
            descriptor_key='/ds/projdata/gsk_data/GSK_Descriptors/GSK_2D_3D_MOE_Descriptors_By_Variant_ID_With_Base_RDKit_SMILES.feather',
            descriptor_type='moe',
            featurizer='descriptors',
            system='twintron-blue',
            datastore=True,
            transformers=True))
    elif feat_type == 'ECFP':
        params = parse.wrapper(dict(
            dataset_key=dset_key,
            bucket=bucket,
            featurizer='ECFP',
            system='twintron-blue',
            datastore=True,
            ecfp_radius=2,
            ecfp_size=1024,
            transformers=True))
    else:
        log.error("Feature type %s not supported" % feat_type)
        return
    metadata = dsf.get_keyval(dataset_key=dset_key, bucket=bucket)
    if 'id_col' in metadata.keys():
        params.id_col = metadata['id_col']
    if 'param' in metadata.keys():
        params.response_cols = [metadata['param']]
    elif 'response_col' in metadata.keys():
        params.response_cols = [metadata['response_col']]
    elif 'response_cols' in metadata.keys():
        params.response_cols = metadata['response_cols']

    if 'smiles_col' in metadata.keys():
        params.smiles_col = metadata['smiles_col']

    if 'class_number' in metadata.keys():
        params.class_number = metadata['class_number']
    # Use splitext rather than rstrip('.csv'), which would strip any trailing 'c', 's' or 'v' characters
    params.dataset_name = os.path.splitext(os.path.basename(dset_key))[0]

    log.warning("Featurizing data with %s featurizer" % feat_type)
    featurization = feat.create_featurization(params)
    model_dataset = md.MinimalDataset(params, featurization)
    model_dataset.get_featurized_data(dset_df)
    num_cmpds = model_dataset.dataset.X.shape[0]
    if num_cmpds > 50000:
        log.warning("Too many compounds to compute distance matrix: %d" % num_cmpds)
        return
    # plot_dataset_dist_distr(model_dataset.dataset, feat_type, dist_metric, params.response_cols, **metric_kwargs)
    dists = cd.calc_dist_diskdataset('descriptors', dist_metric, model_dataset.dataset, calc_type='all')
    from scipy.spatial.distance import squareform
    dists = squareform(dists)
    res_dir = '/ds/projdata/gsk_data/model_analysis/'
    plt_dir = '%s/Plots' % res_dir
    file_prefix = os.path.splitext(os.path.basename(dset_key))[0]
    mcs_linkage = linkage(dists, method='complete')
    pdf_path = '%s/%s_mcs_clustermap.pdf' % (plt_dir, file_prefix)
    pdf = PdfPages(pdf_path)
    g = sns.clustermap(dists, row_linkage=mcs_linkage, col_linkage=mcs_linkage, figsize=(12, 12), cmap='plasma')
    if plt_dir is not None:
        pdf.savefig(g.fig)
        pdf.close()
    return dists
Example #8
def export_model(model_uuid, collection, model_dir):
    """
    Export the metadata (parameters) and other files needed to recreate a model
    from the model tracker database to a gzipped tar archive.

    Args:
        model_uuid (str): Model unique identifier

        collection (str): Name of the collection holding the model in the database.

        model_dir (str): Path to directory where the model metadata and parameter files will be written. The directory will
        be created if it doesn't already exist. Subsequently, the directory contents will be packed into a gzipped tar archive
        named model_dir.tar.gz.

    Returns:
        none
    """
    if not mlmt_supported:
        print(
            "Model tracker not supported in your environment; can load models from filesystem only."
        )
        return

    ds_client = dsf.config_client()
    metadata_dict = get_metadata_by_uuid(model_uuid,
                                         collection_name=collection)

    # Get the tarball containing the saved model from the datastore, and extract it into model_dir.
    if 'ModelMetadata' in metadata_dict:
        # Convert old style metadata
        metadata_dict = convert_metadata(metadata_dict)

    if 'model_parameters' in metadata_dict:
        model_parameters = metadata_dict['model_parameters']
        model_dataset_oid = model_parameters['model_dataset_oid']
    else:
        raise Exception("Bad metadata for model UUID %s" % model_uuid)

    os.makedirs(model_dir, exist_ok=True)

    # Unpack the model state tarball into a subdirectory of the new archive
    extract_dir = dsf.retrieve_dataset_by_dataset_oid(model_dataset_oid,
                                                      client=ds_client,
                                                      return_metadata=False,
                                                      nrows=None,
                                                      print_metadata=False,
                                                      sep=False,
                                                      tarpath='%s/best_model' %
                                                      model_dir)

    # Download the transformers pickle file if there is one
    try:
        transformer_oid = model_parameters["transformer_oid"]
        trans_fp = ds_client.open_dataset(transformer_oid, mode='b')
        trans_data = trans_fp.read()
        trans_fp.close()
        trans_path = "%s/transformers.pkl" % model_dir
        trans_out = open(trans_path, mode='wb')
        trans_out.write(trans_data)
        trans_out.close()
        del model_parameters['transformer_oid']
        model_parameters['transformer_key'] = 'transformers.pkl'

    except KeyError:
        # OK if there are no transformers
        pass

    # Save the metadata params
    meta_path = "%s/model_metadata.json" % model_dir
    with open(meta_path, 'w') as meta_out:
        json.dump(metadata_dict, meta_out, indent=4)

    # Create a new tarball containing both the metadata and the parameters from the retrieved model tarball
    new_tarpath = "%s.tar.gz" % model_dir
    tarball = tarfile.open(new_tarpath, mode='w:gz')
    tarball.add(model_dir, arcname='.')
    tarball.close()
    print("Wrote model files to %s" % new_tarpath)
Example #9
def analyze_split(params,
                  id_col='compound_id',
                  smiles_col='rdkit_smiles',
                  active_col='active'):
    """
    Evaluate the AVE bias for the training/validation and training/test set splits of the given dataset.

    Also show the active frequencies in each subset and for the dataset as a whole.
    id_col, smiles_col and active_col are fallback values, used only when the corresponding keys aren't found in the
    dataset metadata; when the metadata provides them, the metadata values are used instead.

    Args:
        params (argparse.Namespace): Pipeline parameters.

        id_col (str): Dataset column containing compound IDs.

        smiles_col (str): Dataset column containing SMILES strings.

        active_col (str): Dataset column containing binary classifications.

    Returns:
        :obj:`pandas.DataFrame`: Table of split subsets showing sizes, numbers and fractions of active compounds

    """
    dset_key = params.dataset_key
    bucket = params.bucket
    split_uuid = params.split_uuid

    ds_client = dsf.config_client()
    try:
        split_metadata = dsf.search_datasets_by_key_value('split_dataset_uuid',
                                                          split_uuid,
                                                          ds_client,
                                                          operator='in',
                                                          bucket=bucket)
        split_oid = split_metadata['dataset_oid'].values[0]
        split_df = dsf.retrieve_dataset_by_dataset_oid(split_oid,
                                                       client=ds_client)
    except Exception as e:
        print("Error when loading split file:\n%s" % str(e))
        raise

    try:
        dataset_df = dsf.retrieve_dataset_by_datasetkey(dset_key,
                                                        bucket,
                                                        client=ds_client)
        dataset_meta = dsf.retrieve_dataset_by_datasetkey(dset_key,
                                                          bucket,
                                                          client=ds_client,
                                                          return_metadata=True)
    except Exception as e:
        print("Error when loading dataset:\n%s" % str(e))
        raise
    kv_dict = dsf.get_key_val(dataset_meta['metadata'])
    id_col = kv_dict.get('id_col', id_col)
    smiles_col = kv_dict.get('smiles_col', smiles_col)
    active_col = kv_dict.get('response_col', active_col)

    try:
        print('Dataset has %d unique compound IDs' %
              len(set(dataset_df[id_col].values)))
        print('Split table has %d unique compound IDs' %
              len(set(split_df.cmpd_id.values)))

        dset_df = dataset_df.merge(split_df,
                                   how='inner',
                                   left_on=id_col,
                                   right_on='cmpd_id').drop('cmpd_id', axis=1)
    except Exception as e:
        print("Error when joining dataset with split dataset:\n%s" % str(e))
        raise

    featurization = feat.create_featurization(params)
    data = md.create_model_dataset(params, featurization, ds_client)
    data.get_featurized_data()
    feat_arr = data.dataset.X
    # TODO: impute missing values if necessary
    y = data.dataset.y.flatten()
    if len(set(y) - set([0, 1])) > 0:
        raise ValueError(
            'AVEMinSplitter only works on binary classification datasets')
    ids = data.dataset.ids
    active_ind = np.where(y == 1)[0]
    inactive_ind = np.where(y == 0)[0]
    active_feat = feat_arr[active_ind, :]
    inactive_feat = feat_arr[inactive_ind, :]
    num_active = len(active_ind)
    num_inactive = len(inactive_ind)
    active_ids = ids[active_ind]
    inactive_ids = ids[inactive_ind]
    active_id_ind = dict(zip(active_ids, range(len(active_ids))))
    inactive_id_ind = dict(zip(inactive_ids, range(len(inactive_ids))))
    if params.featurizer == 'ecfp':
        metric = 'jaccard'
    elif params.featurizer == 'graphconv':
        raise ValueError(
            "ave_min splitter dopesn't support graphconv features")
    else:
        metric = 'euclidean'

    # Calculate distance thresholds at which the nearest-neighbor function should be evaluated
    if metric == 'jaccard':
        max_nn_dist = 1.0
    else:
        nan_mat = np.isnan(feat_arr)
        nnan = np.sum(nan_mat)
        if nnan > 0:
            log.info('Input feature matrix has %d NaN elements' % nnan)
            not_nan = ~nan_mat
            for i in range(feat_arr.shape[1]):
                feat_arr[nan_mat[:, i], i] = np.mean(feat_arr[not_nan[:, i],
                                                              i])
        nn_dist = np.sort(squareform(pdist(feat_arr, metric)))[:, 1]
        med_nn_dist = np.median(nn_dist)
        max_nn_dist = 3.0 * med_nn_dist
    ndist = 100
    dist_thresh = np.linspace(0.0, max_nn_dist, ndist)

    # Compute distance matrices between subsets
    num_workers = 1
    aa_dist = _calc_dist_mat(active_feat, active_feat, metric, None,
                             num_workers)
    ii_dist = _calc_dist_mat(inactive_feat, inactive_feat, metric, None,
                             num_workers)
    ai_dist = _calc_dist_mat(active_feat, inactive_feat, metric, None,
                             num_workers)
    ia_dist = ai_dist.transpose()

    subsets = sorted(set(dset_df.subset.values))
    subset_active_ind = {}
    subset_inactive_ind = {}

    if 'train' in subsets:
        # this is a TVT split
        subsets = ['train', 'valid', 'test']
        for subset in subsets:
            subset_df = dset_df[dset_df.subset == subset]
            active_df = subset_df[subset_df[active_col] == 1]
            inactive_df = subset_df[subset_df[active_col] == 0]
            subset_active_ids = active_df[id_col].values
            subset_inactive_ids = inactive_df[id_col].values
            subset_active_ind[subset] = [
                active_id_ind[id] for id in subset_active_ids
            ]
            subset_inactive_ind[subset] = [
                inactive_id_ind[id] for id in subset_inactive_ids
            ]

        taI = subset_active_ind['train']
        tiI = subset_inactive_ind['train']
        print("Results for %s split with %s %s features:" %
              (params.splitter, params.descriptor_type, params.featurizer))
        for valid_set in ['valid', 'test']:
            vaI = subset_active_ind[valid_set]
            viI = subset_inactive_ind[valid_set]
            split_params = ((vaI, viI, taI, tiI), aa_dist, ii_dist, ai_dist,
                            ia_dist, dist_thresh)
            _plot_nn_dist_distr(split_params)
            bias = _plot_bias(split_params, niter=0)
            print("For train/%s split: AVE bias = %.5f" % (valid_set, bias))
    else:
        # TODO: deal with k-fold splits later
        print('k-fold CV splits not supported yet')
        return

    # Tabulate the fractions of actives in the full dataset and each subset
    subset_list = []
    size_list = []
    frac_list = []
    active_frac_list = []

    dset_size = data.dataset.X.shape[0]
    dset_active = sum(data.dataset.y)
    subset_list.append('full dataset')
    size_list.append(dset_size)
    frac_list.append(1.0)
    active_frac_list.append(dset_active / dset_size)

    for subset in subsets:
        active_size = len(subset_active_ind[subset])
        inactive_size = len(subset_inactive_ind[subset])
        subset_size = active_size + inactive_size
        active_frac = active_size / subset_size
        subset_list.append(subset)
        size_list.append(subset_size)
        frac_list.append(subset_size / dset_size)
        active_frac_list.append(active_frac)
    frac_df = pd.DataFrame(
        dict(subset=subset_list,
             size=size_list,
             fraction=frac_list,
             active_frac=active_frac_list))
    print('\nSplit subsets:')
    print(frac_df)

    return frac_df
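A usage sketch for analyze_split (all parameter values are hypothetical); params can be built with parse.wrapper as in the other examples:

params = parse.wrapper(dict(
    dataset_key='/path/to/dataset.csv',  # hypothetical dataset key
    bucket='gsk_ml',                     # hypothetical bucket
    split_uuid='abcd-fake-split-uuid',   # hypothetical split UUID
    splitter='scaffold',
    featurizer='ecfp',
    descriptor_type='moe',
    datastore=True))
frac_df = analyze_split(params, id_col='compound_id',
                        smiles_col='rdkit_smiles', active_col='active')
print(frac_df)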
Example #10
def _liability_dset_diversity(bucket='public',
                              feat_type='descriptors',
                              dist_metric='cosine',
                              **metric_kwargs):
    """
    Load datasets from datastore, featurize them, and plot distributions of their inter-compound
    distances.
    """
    log = logging.getLogger('ATOM')
    ds_client = dsf.config_client()
    ds_table = dsf.search_datasets_by_key_value(key='param',
                                                value=['PIC50', 'PEC50'],
                                                operator='in',
                                                bucket=bucket,
                                                client=ds_client)
    dset_keys = ds_table.dataset_key.values
    metadata = ds_table.metadata.values
    split = 'random'
    task_names = []
    num_cmpds = []
    for i, dset_key in enumerate(dset_keys):
        md_dict = dsf.metadata_to_dict(metadata[i])
        task_name = md_dict['task_name']
        num_cmpds = md_dict['CMPD_COUNT'][0]
        log.warning("Loading dataset for %s, %d compounds" %
                    (task_name, num_cmpds))
        dset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket,
                                                     ds_client)
        dataset_dir = os.path.dirname(dset_key)
        dataset_file = os.path.basename(dset_key)
        if feat_type == 'descriptors':
            params = argparse.Namespace(
                dataset_dir=dataset_dir,
                dataset_file=dataset_file,
                y=task_name,
                bucket=bucket,
                descriptor_key=
                'all_GSK_Compound_2D_3D_MOE_Descriptors_Scaled_With_Smiles_And_Inchi',
                descriptor_type='MOE',
                splitter=split,
                id_col='compound_id',
                smiles_col='rdkit_smiles',
                featurizer='descriptors',
                prediction_type='regression',
                system='twintron-blue',
                datastore=True,
                transformers=True)
        elif feat_type == 'ECFP':
            params = argparse.Namespace(dataset_dir=dataset_dir,
                                        dataset_file=dataset_file,
                                        y=task_name,
                                        bucket=bucket,
                                        splitter=split,
                                        id_col='compound_id',
                                        smiles_col='rdkit_smiles',
                                        featurizer='ECFP',
                                        prediction_type='regression',
                                        system='twintron-blue',
                                        datastore=True,
                                        ecfp_radius=2,
                                        ecfp_size=1024,
                                        transformers=True)
        else:
            log.error("Feature type %s not supported" % feat_type)
            return
        log.warning("Featurizing data with %s featurizer" % feat_type)
        model_dataset = md.MinimalDataset(params)
        model_dataset.get_featurized_data(dset_df)
        num_cmpds = model_dataset.dataset.X.shape[0]
        if num_cmpds > 50000:
            log.warning("Too many compounds to compute distance matrix: %d" %
                        num_cmpds)
            continue
        plot_dataset_dist_distr(model_dataset.dataset, feat_type, dist_metric,
                                task_name, **metric_kwargs)