Пример #1
0
def liability_dset_diversity(bucket='gsk_ml', feat_type='descriptors', dist_metric='cosine', **metric_kwargs):
    """
    Load datasets from datastore, featurize them, and plot distributions of their inter-compound
    distances.
    """
    log = logging.getLogger('ATOM')
    ds_client = dsf.config_client()
    ds_table = dsf.search_datasets_by_key_value(key='param', value=['PIC50','PEC50'], operator='in', 
                                                bucket=bucket, client=ds_client)
    dset_keys = ds_table.dataset_key.values
    metadata = ds_table.metadata.values
    split = 'random'
    task_names = []
    num_cmpds = []
    for i, dset_key in enumerate(dset_keys):
        md_dict = dsf.metadata_to_dict(metadata[i])
        task_name = md_dict['task_name']
        num_cmpds = md_dict['CMPD_COUNT'][0]
        log.warning("Loading dataset for %s, %d compounds" % (task_name, num_cmpds))
        dset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket, ds_client)
        dataset_dir = os.path.dirname(dset_key)
        dataset_file = os.path.basename(dset_key)
        if feat_type == 'descriptors':
            params = argparse.Namespace(dataset_dir=dataset_dir,
                            dataset_file=dataset_file,
                            y=task_name,
                            bucket=bucket,
                            descriptor_key='all_GSK_Compound_2D_3D_MOE_Descriptors_Scaled_With_Smiles_And_Inchi',
                            descriptor_type='MOE',
                            splitter=split,
                            id_col='compound_id',
                            smiles_col='rdkit_smiles',
                            featurizer='descriptors',
                            prediction_type='regression', 
                            system='twintron-blue',
                            datastore=True,
                            transformers=True)
        elif feat_type == 'ECFP':
            params = argparse.Namespace(dataset_dir=dataset_dir,
                            dataset_file=dataset_file,
                            y=task_name,
                            bucket=bucket,
                            splitter=split,
                            id_col='compound_id',
                            smiles_col='rdkit_smiles',
                            featurizer='ECFP',
                            prediction_type='regression', 
                            system='twintron-blue',
                            datastore=True,
                            ecfp_radius=2, ecfp_size=1024, 
                            transformers=True)
        else:
            log.error("Feature type %s not supported" % feat_type)
            return
        log.warning("Featurizing data with %s featurizer" % feat_type)
        model_dataset = md.MinimalDataset(params)
        model_dataset.get_featurized_data(dset_df)
        num_cmpds = model_dataset.dataset.X.shape[0]
        if num_cmpds > 50000:
            log.warning("Too many compounds to compute distance matrix: %d" % num_cmpds)
            continue
        plot_dataset_dist_distr(model_dataset.dataset, feat_type, dist_metric, task_name, **metric_kwargs)

    # ------------------------------------------------------------------------------------------------------------------
    def get_dset_diversity(dset_key, ds_client, bucket='gsk_ml', feat_type='descriptors', dist_metric='cosine',
                           **metric_kwargs):
        """
        Load datasets from datastore, featurize them, and plot distributions of their inter-compound
        distances.
        """
        log = logging.getLogger('ATOM')
    
        dset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket, ds_client)
    
        if feat_type == 'descriptors':
            params = parse.wrapper(dict(
                dataset_key=dset_key,
                bucket=bucket,
                descriptor_key='/ds/projdata/gsk_data/GSK_Descriptors/GSK_2D_3D_MOE_Descriptors_By_Variant_ID_With_Base_RDKit_SMILES.feather',
                descriptor_type='moe',
                featurizer='descriptors',
                system='twintron-blue',
                datastore=True,
                transformers=True))
        elif feat_type == 'ECFP':
            params = parse.wrapper(dict(
                dataset_key=dset_key,
                bucket=bucket,
                featurizer='ECFP',
                system='twintron-blue',
                datastore=True,
                ecfp_radius=2,
                ecfp_size=1024,
                transformers=True))
        else:
            log.error("Feature type %s not supported" % feat_type)
            return
        metadata = dsf.get_keyval(dataset_key=dset_key, bucket=bucket)
        if 'id_col' in metadata.keys():
            params.id_col = metadata['id_col']
        if 'param' in metadata.keys():
            params.response_cols = [metadata['param']]
        elif 'response_col' in metadata.keys():
            params.response_cols = [metadata['response_col']]
        elif 'response_cols' in metadata.keys():
            params.response_cols = metadata['response_cols']
    
        if 'smiles_col' in metadata.keys():
            params.smiles_col = metadata['smiles_col']
    
        if 'class_number' in metadata.keys():
            params.class_number = metadata['class_number']
        params.dataset_name = dset_key.split('/')[-1].rstrip('.csv')
    
        log.warning("Featurizing data with %s featurizer" % feat_type)
        featurization = feat.create_featurization(params)
        model_dataset = md.MinimalDataset(params, featurization)
        model_dataset.get_featurized_data(dset_df)
        num_cmpds = model_dataset.dataset.X.shape[0]
        if num_cmpds > 50000:
            log.warning("Too many compounds to compute distance matrix: %d" % num_cmpds)
            return
        # plot_dataset_dist_distr(model_dataset.dataset, feat_type, dist_metric, params.response_cols, **metric_kwargs)
        dists = cd.calc_dist_diskdataset('descriptors', dist_metric, model_dataset.dataset, calc_type='all')
        import scipy
        dists = scipy.spatial.distance.squareform(dists)
        res_dir = '/ds/projdata/gsk_data/model_analysis/'
        plt_dir = '%s/Plots' % res_dir
        file_prefix = dset_key.split('/')[-1].rstrip('.csv')
        mcs_linkage = linkage(dists, method='complete')
        pdf_path = '%s/%s_mcs_clustermap.pdf' % (plt_dir, file_prefix)
        pdf = PdfPages(pdf_path)
        g = sns.clustermap(dists, row_linkage=mcs_linkage, col_linkage=mcs_linkage, figsize=(12, 12), cmap='plasma')
        if plt_dir is not None:
            pdf.savefig(g.fig)
            pdf.close()
        return dists
Пример #2
0
 def get_dset_diversity(dset_key, ds_client, bucket='gsk_ml', feat_type='descriptors', dist_metric='cosine',
                        **metric_kwargs):
     """
     Load datasets from datastore, featurize them, and plot distributions of their inter-compound
     distances.
     """
     log = logging.getLogger('ATOM')
 
     dset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket, ds_client)
 
     if feat_type == 'descriptors':
         params = parse.wrapper(dict(
             dataset_key=dset_key,
             bucket=bucket,
             descriptor_key='/ds/projdata/gsk_data/GSK_Descriptors/GSK_2D_3D_MOE_Descriptors_By_Variant_ID_With_Base_RDKit_SMILES.feather',
             descriptor_type='moe',
             featurizer='descriptors',
             system='twintron-blue',
             datastore=True,
             transformers=True))
     elif feat_type == 'ECFP':
         params = parse.wrapper(dict(
             dataset_key=dset_key,
             bucket=bucket,
             featurizer='ECFP',
             system='twintron-blue',
             datastore=True,
             ecfp_radius=2,
             ecfp_size=1024,
             transformers=True))
     else:
         log.error("Feature type %s not supported" % feat_type)
         return
     metadata = dsf.get_keyval(dataset_key=dset_key, bucket=bucket)
     if 'id_col' in metadata.keys():
         params.id_col = metadata['id_col']
     if 'param' in metadata.keys():
         params.response_cols = [metadata['param']]
     elif 'response_col' in metadata.keys():
         params.response_cols = [metadata['response_col']]
     elif 'response_cols' in metadata.keys():
         params.response_cols = metadata['response_cols']
 
     if 'smiles_col' in metadata.keys():
         params.smiles_col = metadata['smiles_col']
 
     if 'class_number' in metadata.keys():
         params.class_number = metadata['class_number']
     params.dataset_name = dset_key.split('/')[-1].rstrip('.csv')
 
     log.warning("Featurizing data with %s featurizer" % feat_type)
     featurization = feat.create_featurization(params)
     model_dataset = md.MinimalDataset(params, featurization)
     model_dataset.get_featurized_data(dset_df)
     num_cmpds = model_dataset.dataset.X.shape[0]
     if num_cmpds > 50000:
         log.warning("Too many compounds to compute distance matrix: %d" % num_cmpds)
         return
     # plot_dataset_dist_distr(model_dataset.dataset, feat_type, dist_metric, params.response_cols, **metric_kwargs)
     dists = cd.calc_dist_diskdataset('descriptors', dist_metric, model_dataset.dataset, calc_type='all')
     import scipy
     dists = scipy.spatial.distance.squareform(dists)
     res_dir = '/ds/projdata/gsk_data/model_analysis/'
     plt_dir = '%s/Plots' % res_dir
     file_prefix = dset_key.split('/')[-1].rstrip('.csv')
     mcs_linkage = linkage(dists, method='complete')
     pdf_path = '%s/%s_mcs_clustermap.pdf' % (plt_dir, file_prefix)
     pdf = PdfPages(pdf_path)
     g = sns.clustermap(dists, row_linkage=mcs_linkage, col_linkage=mcs_linkage, figsize=(12, 12), cmap='plasma')
     if plt_dir is not None:
         pdf.savefig(g.fig)
         pdf.close()
     return dists
Пример #3
0
def _liability_dset_diversity(bucket='public',
                              feat_type='descriptors',
                              dist_metric='cosine',
                              **metric_kwargs):
    """
    Load datasets from datastore, featurize them, and plot distributions of their inter-compound
    distances.
    """
    log = logging.getLogger('ATOM')
    ds_client = dsf.config_client()
    ds_table = dsf.search_datasets_by_key_value(key='param',
                                                value=['PIC50', 'PEC50'],
                                                operator='in',
                                                bucket=bucket,
                                                client=ds_client)
    dset_keys = ds_table.dataset_key.values
    metadata = ds_table.metadata.values
    split = 'random'
    task_names = []
    num_cmpds = []
    for i, dset_key in enumerate(dset_keys):
        md_dict = dsf.metadata_to_dict(metadata[i])
        task_name = md_dict['task_name']
        num_cmpds = md_dict['CMPD_COUNT'][0]
        log.warning("Loading dataset for %s, %d compounds" %
                    (task_name, num_cmpds))
        dset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket,
                                                     ds_client)
        dataset_dir = os.path.dirname(dset_key)
        dataset_file = os.path.basename(dset_key)
        if feat_type == 'descriptors':
            params = argparse.Namespace(
                dataset_dir=dataset_dir,
                dataset_file=dataset_file,
                y=task_name,
                bucket=bucket,
                descriptor_key=
                'all_GSK_Compound_2D_3D_MOE_Descriptors_Scaled_With_Smiles_And_Inchi',
                descriptor_type='MOE',
                splitter=split,
                id_col='compound_id',
                smiles_col='rdkit_smiles',
                featurizer='descriptors',
                prediction_type='regression',
                system='twintron-blue',
                datastore=True,
                transformers=True)
        elif feat_type == 'ECFP':
            params = argparse.Namespace(dataset_dir=dataset_dir,
                                        dataset_file=dataset_file,
                                        y=task_name,
                                        bucket=bucket,
                                        splitter=split,
                                        id_col='compound_id',
                                        smiles_col='rdkit_smiles',
                                        featurizer='ECFP',
                                        prediction_type='regression',
                                        system='twintron-blue',
                                        datastore=True,
                                        ecfp_radius=2,
                                        ecfp_size=1024,
                                        transformers=True)
        else:
            log.error("Feature type %s not supported" % feat_type)
            return
        log.warning("Featurizing data with %s featurizer" % feat_type)
        model_dataset = md.MinimalDataset(params)
        model_dataset.get_featurized_data(dset_df)
        num_cmpds = model_dataset.dataset.X.shape[0]
        if num_cmpds > 50000:
            log.warning("Too many compounds to compute distance matrix: %d" %
                        num_cmpds)
            continue
        plot_dataset_dist_distr(model_dataset.dataset, feat_type, dist_metric,
                                task_name, **metric_kwargs)