def load_from_cache(self, cache, params):
     """Restore this object's attributes from an on-disk cache if it is compatible.

     Args:
         cache: Path prefix of the cache; ``<cache>.params.json`` and
             ``<cache>.pkl`` are read.
         params: Parameters of the current request. They are compared with the
             saved parameters through ``dict_compare``, ignoring the keys
             ``cache``, ``partition_by`` and ``single``.

     Returns:
         True when the pickled object was loaded and its ``__dict__`` copied
         into ``self``; False when either cache file is missing or the
         parameter file cannot be decoded as JSON.

     Raises:
         ValueError: If the saved parameters differ from ``params``.
     """
     param_fname = '{}.params.json'.format(cache)
     if not os.path.isfile(param_fname):
         logger.warning('Cache parameter file does not exist: %s',
                        param_fname)
         return False

     with open(param_fname) as param_file:
         try:
             cached_params = json.load(param_file)
         except json.JSONDecodeError as e:
             logger.warning('Could not decode parameter file %s: %s',
                            param_fname, e)
             return False

     ignore_keys = ['cache', 'partition_by', 'single']
     equal, diffs = dict_compare(params, cached_params, ignore_keys)
     if not equal:
         logger.warning(
             'Cache parameter mismatch: %s\nSaved: %s\nAttempted to load: %s',
             diffs, cached_params, params)
         logger.warning('\nRemove %s to rebuild data cache.\n', param_fname)
         raise ValueError(
             'Could not load from a cache with incompatible keys:', diffs)

     fname = '{}.pkl'.format(cache)
     if not os.path.isfile(fname):
         logger.warning('Cache file does not exist: %s', fname)
         return False

     # NOTE(security): unpickling can execute arbitrary code. Only point
     # `cache` at files this program wrote itself, never at untrusted input.
     with open(fname, 'rb') as f:
         obj = pickle.load(f)

     # Adopt the cached object's state wholesale instead of replacing `self`.
     self.__dict__.update(obj.__dict__)
     logger.info('Loaded data from cache: %s', fname)
     return True
示例#2
0
def load_drug_data(ncols=None, scaling='std', imputing='mean', dropna=None, add_prefix=True):
    """Load combined drug descriptors and fingerprints for two drug sets.

    The 'Combined_PubChem' feature tables are joined with the drug info table
    so that their 'Drug' column holds the info table's 'ID' instead of its
    'PUBCHEM' value, then the 'NCI60' tables are appended.

    Args:
        ncols: Forwarded to the 'Combined_PubChem' loaders. When truthy, the
            'NCI60' loaders are restricted to the resulting columns.
        scaling: Scaling method applied to the descriptors via
            ``candle.drop_impute_and_scale_dataframe``. Fingerprints are never
            scaled (``scaling=None``).
        imputing: Imputation method forwarded to the same helper.
        dropna: Forwarded to the same helper.
        add_prefix: When true, feature columns are prefixed with 'dragon7.'.

    Returns:
        A ``(df_desc, df_fp)`` tuple of data frames, each with a 'Drug' column
        followed by the processed feature columns.
    """
    df_info = load_drug_info()
    df_info['Drug'] = df_info['PUBCHEM']
    df_ids = df_info[['ID', 'Drug']]

    def relabel_by_id(df_features):
        # Join on the PubChem-valued 'Drug' key, then let 'ID' take over as 'Drug'.
        merged = pd.merge(df_ids, df_features, on='Drug')
        # `axis` must be a keyword: pandas 2.0 no longer accepts it positionally.
        return merged.drop('Drug', axis=1).rename(columns={'ID': 'Drug'})

    def preprocess(df_features, feature_scaling):
        # Keep the identifier column out of imputation / scaling.
        df_drug = pd.DataFrame(df_features.loc[:, 'Drug'])
        df_values = df_features.drop('Drug', axis=1)
        df_values = candle.drop_impute_and_scale_dataframe(
            df_values, scaling=feature_scaling, imputing=imputing, dropna=dropna)
        if add_prefix:
            df_values = df_values.add_prefix('dragon7.')
        return pd.concat([df_drug, df_values], axis=1)

    df_desc = load_drug_set_descriptors(drug_set='Combined_PubChem', ncols=ncols)
    df_fp = load_drug_set_fingerprints(drug_set='Combined_PubChem', ncols=ncols)

    df_desc = relabel_by_id(df_desc)
    df_fp = relabel_by_id(df_fp)

    # When a column subset was drawn above, read the same columns for NCI60.
    df_desc2 = load_drug_set_descriptors(drug_set='NCI60', usecols=df_desc.columns.tolist() if ncols else None)
    df_fp2 = load_drug_set_fingerprints(drug_set='NCI60', usecols=df_fp.columns.tolist() if ncols else None)

    df_desc = preprocess(pd.concat([df_desc, df_desc2]).reset_index(drop=True), scaling)
    # Fingerprints are imputed but deliberately left unscaled.
    df_fp = preprocess(pd.concat([df_fp, df_fp2]).reset_index(drop=True), None)

    logger.info('Loaded combined dragon7 drug descriptors: %s', df_desc.shape)
    logger.info('Loaded combined dragon7 drug fingerprints: %s', df_fp.shape)

    return df_desc, df_fp
 def save_to_cache(self, cache, params):
     """Write this object and the parameters that produced it to a cache.

     Two files are written: ``<cache>.params.json`` with the parameters and
     ``<cache>.pkl`` with a pickle of ``self``.

     Args:
         cache: Path prefix for the two cache files.
         params: Mapping of parameters to record. The keys ``self``,
             ``cache`` and ``single`` are left out of the saved file. The
             mapping itself is not modified.
     """
     excluded_keys = ('self', 'cache', 'single')
     # Filter into a new dict so the caller's mapping is not mutated.
     saved_params = {
         key: value
         for key, value in params.items() if key not in excluded_keys
     }

     param_fname = '{}.params.json'.format(cache)
     with open(param_fname, 'w') as param_file:
         json.dump(saved_params, param_file, sort_keys=True)

     fname = '{}.pkl'.format(cache)
     with open(fname, 'wb') as f:
         pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
     logger.info('Saved data to cache: %s', fname)
def assign_partition_groups(df, partition_by='drug_pair'):
    """Compute a grouping label for every row of a response data frame.

    Args:
        df: Response data frame. Depending on ``partition_by`` it must have a
            'Sample' column ('cell') or 'Drug1' and 'Drug2' columns
            ('drug_pair').
        partition_by: One of 'cell', 'drug_pair' or 'index'.
            * 'cell': the group is the 'Sample' value.
            * 'drug_pair': the group is 'Drug1' for rows without 'Drug2',
              otherwise the two drug names joined with ',' in sorted order.
              Labels that appear as an 'ID' in the drug info table are
              replaced by that row's 'PUBCHEM' value.
            * 'index': every row is its own group, labelled by its index.

    Returns:
        A Series of group labels indexed like ``df``, so it can be attached
        with ``df.assign(Group=...)``.

    Raises:
        ValueError: If ``partition_by`` is not one of the supported values.
    """
    if partition_by == 'cell':
        group = df['Sample']
    elif partition_by == 'drug_pair':
        df_info = drug_data.load_drug_info()
        # Series mapping drug ID -> PUBCHEM (first occurrence per ID).
        id_dict = df_info[['ID', 'PUBCHEM']].drop_duplicates(
            ['ID']).set_index('ID').iloc[:, 0]
        has_second_drug = df['Drug2'].notnull()
        group = df['Drug1'].copy()
        # Order the pair so that (a, b) and (b, a) share one label.
        group[has_second_drug
              & (df['Drug1'] <= df['Drug2'])] = df['Drug1'] + ',' + df['Drug2']
        group[has_second_drug
              & (df['Drug1'] > df['Drug2'])] = df['Drug2'] + ',' + df['Drug1']
        group2 = group.map(id_dict)
        mapped = group2.notnull()
        group[mapped] = group2[mapped]
    elif partition_by == 'index':
        # Keep df's own index on the result: a freshly numbered Series would be
        # misaligned when assigned back to a filtered (non-contiguous) frame.
        group = df.index.to_series().rename('index')
    else:
        raise ValueError(
            'Unknown partition_by value: {!r} (expected cell, drug_pair or '
            'index)'.format(partition_by))
    logger.info('Grouped response data by %s: %d groups', partition_by,
                group.nunique())
    return group
示例#5
0
def load_aggregated_single_response(target='AUC', min_r2_fit=0.3, max_ec50_se=3, combo_format=False, rename=True):
    """Load dose-independent single-drug responses, filtered by fit quality.

    The raw table is read once and kept in ``global_cache`` under its path.

    Args:
        target: Name of the response column to keep (for example 'AUC').
        min_r2_fit: Rows with 'R2fit' below this value are dropped.
        max_ec50_se: Rows with 'EC50se' above this value are dropped.
        combo_format: When true, 'DRUG' becomes 'DRUG1' and an all-NaN
            object column 'DRUG2' is added.
        rename: When true, the upper-case column names are replaced with
            'Source', 'Sample', 'Drug'/'Drug1'/'Drug2' and 'Study'.

    Returns:
        The filtered data frame, without rows whose target value is null.
    """
    path = get_file(DATA_URL + 'combined_single_response_agg')

    raw = global_cache.get(path)
    if raw is None:
        text_cols = ('SOURCE', 'CELL', 'DRUG', 'STUDY')
        float_cols = ('AUC', 'IC50', 'EC50', 'EC50se', 'R2fit', 'Einf',
                      'HS', 'AAC1', 'AUC1', 'DSS1')
        dtypes = {name: str for name in text_cols}
        dtypes.update((name, np.float32) for name in float_cols)
        raw = pd.read_csv(path, engine='c', sep='\t', dtype=dtypes)
        global_cache[path] = raw

    total = len(raw)

    well_fitted = (raw['R2fit'] >= min_r2_fit) & (raw['EC50se'] <= max_ec50_se)
    df = raw[well_fitted]
    df = df[['SOURCE', 'CELL', 'DRUG', target, 'STUDY']]
    df = df[df[target].notnull()]

    logger.info('Loaded %d dose independent response samples (filtered by EC50se <= %f & R2fit >=%f from a total of %d).', len(df), max_ec50_se, min_r2_fit, total)

    new_names = {'SOURCE': 'Source', 'CELL': 'Sample', 'STUDY': 'Study'}
    if combo_format:
        # Mimic the drug-pair layout: the second drug is always missing.
        df = df.rename(columns={'DRUG': 'DRUG1'})
        df['DRUG2'] = np.nan
        df['DRUG2'] = df['DRUG2'].astype(object)
        df = df[['SOURCE', 'CELL', 'DRUG1', 'DRUG2', target, 'STUDY']]
        new_names.update({'DRUG1': 'Drug1', 'DRUG2': 'Drug2'})
    else:
        new_names['DRUG'] = 'Drug'

    if rename:
        df = df.rename(columns=new_names)

    return df
示例#6
0
def load_combined_dose_response(rename=True):
    """Concatenate single-drug and drug-pair dose response measurements.

    Args:
        rename: When true, the upper-case column names are replaced with
            their capitalised forms, and 'CELL' becomes 'Sample'.

    Returns:
        The combined data frame (single-drug rows first).
    """
    single = load_single_dose_response(combo_format=True)
    logger.info('Loaded {} single drug dose response measurements'.format(len(single)))

    pairs = load_combo_dose_response()
    logger.info('Loaded {} drug pair dose response measurements'.format(len(pairs)))

    combined = pd.concat([single, pairs])
    logger.info('Combined dose response data contains sources: {}'.format(combined['SOURCE'].unique()))

    if not rename:
        return combined

    # Every column except CELL is renamed to its capitalised form.
    capitalised = ('SOURCE', 'DRUG1', 'DRUG2', 'DOSE1', 'DOSE2', 'GROWTH', 'STUDY')
    new_names = {name: name.capitalize() for name in capitalised}
    new_names['CELL'] = 'Sample'
    return combined.rename(columns=new_names)
    def build_feature_list(self, single=False):
        """Derive the ordered input features and their shapes from loaded data.

        Sets ``self.input_features`` (feature name -> feature type),
        ``self.feature_shapes`` (feature type -> shape tuple) and
        ``self.input_dim`` (sum of the sizes of all input features), then
        logs them.

        Args:
            single: When true, only 'dose1' / 'drug1' inputs are listed;
                otherwise the second dose / drug is included as well.
        """
        name_to_type = collections.OrderedDict()
        type_to_shape = collections.OrderedDict()

        def register(name, kind, shape):
            name_to_type[name] = kind
            type_to_shape[kind] = shape

        def width_without_key(frame):
            # One column of each frame is excluded from the feature width.
            return (frame.shape[1] - 1, )

        if not self.agg_dose:
            for dose in (['dose1'] if single else ['dose1', 'dose2']):
                register(dose, 'dose', (1, ))

        if self.encode_response_source:
            register('response.source', 'response.source',
                     width_without_key(self.df_source))

        for fea in self.cell_features:
            cell_key = 'cell.' + fea
            cell_frame = getattr(self, self.cell_df_dict[fea])
            register(cell_key, cell_key, width_without_key(cell_frame))

        for drug in (['drug1'] if single else ['drug1', 'drug2']):
            for fea in self.drug_features:
                # Both drug slots share one feature type per drug feature.
                drug_frame = getattr(self, self.drug_df_dict[fea])
                register(drug + '.' + fea, 'drug.' + fea,
                         width_without_key(drug_frame))

        total_dim = sum(
            np.prod(type_to_shape[kind]) for kind in name_to_type.values())

        self.input_features = name_to_type
        self.feature_shapes = type_to_shape
        self.input_dim = total_dim

        logger.info('Input features shapes:')
        for name, kind in self.input_features.items():
            logger.info('  {}: {}'.format(name, self.feature_shapes[kind]))
        logger.info('Total input dimensions: {}'.format(self.input_dim))
示例#8
0
def load_cell_rnaseq(ncols=None,
                     scaling='std',
                     imputing='mean',
                     add_prefix=True,
                     use_landmark_genes=False,
                     use_filtered_genes=False,
                     feature_subset=None,
                     preprocess_rnaseq=None,
                     embed_feature_source=False,
                     sample_set=None,
                     index_by_sample=False):
    """Load the combined RNAseq table as a per-sample feature data frame.

    Args:
        ncols: When set and smaller than the number of feature columns, only
            a random sample of that many feature columns is read
            (``np.random.choice`` without replacement).
        scaling: Scaling method forwarded to
            ``candle.drop_impute_and_scale_dataframe``. Forced to None when a
            preprocessed file is selected through ``preprocess_rnaseq``.
        imputing: Imputation method forwarded to the same helper.
        add_prefix: When true, feature columns are prefixed with 'rnaseq.'.
        use_landmark_genes: Read the '..._lincs1000' file.
        use_filtered_genes: Read the '..._filtered' file (only consulted when
            ``use_landmark_genes`` is false).
        feature_subset: Collection of feature names to keep. Names are matched
            with the 'rnaseq.' prefix when ``add_prefix`` is true.
        preprocess_rnaseq: Unless falsy or 'none', appended to the file name
            as '_<value>'.
        embed_feature_source: When true, one-hot columns for the data source
            (the part of 'Sample' before the first '.') are added.
        sample_set: When set, only rows whose 'Sample' starts with this value
            are kept. Filtering happens after scaling.
        index_by_sample: When true, 'Sample' becomes the index.

    Returns:
        A data frame with 'Sample' (column or index), optional source
        columns, and the processed expression features.
    """
    if use_landmark_genes:
        filename = 'combined_rnaseq_data_lincs1000'
    elif use_filtered_genes:
        filename = 'combined_rnaseq_data_filtered'
    else:
        filename = 'combined_rnaseq_data'

    if preprocess_rnaseq and preprocess_rnaseq != 'none':
        scaling = None
        filename += ('_' + preprocess_rnaseq)  # 'source_scale' or 'combat'

    path = get_file(DATA_URL + filename)
    df_header = pd.read_csv(path, engine='c', sep='\t', nrows=0)
    all_columns = df_header.columns

    # Feature columns start after 'Sample' and, when the file has it,
    # 'Cancer_type_id'.
    # NOTE(review): assumes 'Cancer_type_id' is the second column when it is
    # present -- confirm against the data files.
    first_feature = 2 if 'Cancer_type_id' in all_columns else 1
    total = len(all_columns) - first_feature

    # `usecols` always holds positions in the file, never positions in an
    # already reduced header, so both filters below can be combined safely.
    usecols = None
    if ncols and ncols < total:
        picked = np.random.choice(total, size=ncols, replace=False)
        usecols = [0] + [int(i) + first_feature for i in sorted(picked)]
    if feature_subset:

        def with_prefix(name):
            return 'rnaseq.' + name if add_prefix else name

        # Column 0 ('Sample') is always kept and never listed twice.
        if usecols is None:
            candidates = range(1, len(all_columns))
        else:
            candidates = usecols[1:]
        usecols = [0] + [
            i for i in candidates
            if with_prefix(all_columns[i]) in feature_subset
        ]

    df_cols = df_header if usecols is None else df_header.iloc[:, usecols]
    dtype_dict = {name: np.float32 for name in df_cols.columns[1:]}
    df = pd.read_csv(path,
                     engine='c',
                     sep='\t',
                     usecols=usecols,
                     dtype=dtype_dict)
    if 'Cancer_type_id' in df.columns:
        df = df.drop('Cancer_type_id', axis=1)

    # The data source is the text of 'Sample' before the first '.'.
    prefixes = df['Sample'].str.extract('^([^.]*)',
                                        expand=False).rename('Source')
    sources = prefixes.drop_duplicates().reset_index(drop=True)
    df_source = pd.get_dummies(sources, prefix='rnaseq.source', prefix_sep='.')
    df_source = pd.concat([sources, df_source], axis=1)

    df1 = df['Sample']
    if embed_feature_source:
        df_sample_source = pd.concat([df1, prefixes], axis=1)
        df1 = df_sample_source.merge(df_source, on='Source',
                                     how='left').drop('Source', axis=1)
        logger.info(
            'Embedding RNAseq data source into features: %d additional columns',
            df1.shape[1] - 1)

    # `axis` must be a keyword: pandas 2.0 no longer accepts it positionally.
    df2 = df.drop('Sample', axis=1)
    if add_prefix:
        df2 = df2.add_prefix('rnaseq.')

    df2 = candle.drop_impute_and_scale_dataframe(df2, scaling, imputing)

    df = pd.concat([df1, df2], axis=1)

    # scaling needs to be done before subsampling
    if sample_set:
        chosen = df['Sample'].str.startswith(sample_set)
        df = df[chosen].reset_index(drop=True)

    if index_by_sample:
        df = df.set_index('Sample')

    logger.info('Loaded combined RNAseq data: %s', df.shape)

    return df
    def load(
            self,
            cache=None,
            ncols=None,
            scaling='std',
            dropna=None,
            agg_dose=None,
            embed_feature_source=True,
            encode_response_source=True,
            cell_features=['rnaseq'],
            drug_features=['descriptors', 'fingerprints'],
            cell_feature_subset_path=None,
            drug_feature_subset_path=None,
            drug_lower_response=1,
            drug_upper_response=-1,
            drug_response_span=0,
            drug_median_response_min=-1,
            drug_median_response_max=1,
            use_landmark_genes=False,
            use_filtered_genes=False,
            preprocess_rnaseq=None,
            single=False,
            # train_sources=['GDSC', 'CTRP', 'ALMANAC', 'NCI60'],
            train_sources=['GDSC', 'CTRP', 'ALMANAC'],
            # val_sources='train',
            # test_sources=['CCLE', 'gCSI'],
            test_sources=['train'],
            partition_by='drug_pair'):
        """Load response, cell and drug data and store the result on ``self``.

        When ``cache`` is given and a compatible cache exists, the state is
        restored from it instead; otherwise everything is loaded from scratch
        and, if ``cache`` is given, written to the cache afterwards.

        Args:
            cache: Path prefix of the cache files, or None to skip caching.
            ncols: Forwarded to the feature loaders.
            scaling: Forwarded to the feature loaders.
            dropna: Forwarded to the drug feature loaders.
            agg_dose: When truthy, name of the aggregated response target to
                load; otherwise per-dose responses are loaded.
            embed_feature_source: Forwarded to the RNAseq loader.
            encode_response_source: Stored on ``self``.
            cell_features: Cell feature names ('rnaseq' or 'expression'),
                matched case-insensitively. Empty or containing 'none'
                disables cell features.
            drug_features: Drug feature names ('descriptors',
                'fingerprints'), matched case-insensitively. Empty or
                containing 'none' disables drug features.
            cell_feature_subset_path: File read with ``read_set_from_file``
                to restrict cell features.
            drug_feature_subset_path: File read with ``read_set_from_file``
                to restrict drug features.
            drug_lower_response: Forwarded as ``lower`` to
                ``select_drugs_with_response_range`` (unused with agg_dose).
            drug_upper_response: Forwarded as ``upper`` (same helper).
            drug_response_span: Forwarded as ``span`` (same helper).
            drug_median_response_min: Forwarded as ``lower_median``.
            drug_median_response_max: Forwarded as ``upper_median``.
            use_landmark_genes: Forwarded to the RNAseq loader.
            use_filtered_genes: Forwarded to the RNAseq loader.
            preprocess_rnaseq: Forwarded to the RNAseq loader.
            single: Forwarded to ``build_feature_list``.
            train_sources: Source name prefixes used for training; 'all'
                selects every source found in the response data.
            test_sources: Source name prefixes used for testing; 'all'
                selects every source, 'train' reuses ``train_sources``.
            partition_by: Grouping mode for ``assign_partition_groups``.

        Note:
            The list defaults are never mutated here. They are kept as lists
            because the argument values are recorded verbatim in ``params``
            and compared with the parameters stored next to the cache.
        """
        # Must stay the first statement: it snapshots exactly the call
        # arguments, before any other local variable exists.
        params = locals().copy()
        del params['self']

        # Normalise feature names once so every later lookup agrees on case.
        if not cell_features or 'none' in [x.lower() for x in cell_features]:
            cell_features = []
        else:
            cell_features = [x.lower() for x in cell_features]

        if not drug_features or 'none' in [x.lower() for x in drug_features]:
            drug_features = []
        else:
            drug_features = [x.lower() for x in drug_features]

        if cache and self.load_from_cache(cache, params):
            self.build_feature_list(single=single)
            return

        logger.info('Loading data from scratch ...')

        if agg_dose:
            df_response = response_data.load_aggregated_single_response(
                target=agg_dose, combo_format=True)
        else:
            df_response = response_data.load_combined_dose_response()

        if logger.isEnabledFor(logging.INFO):
            logger.info('Summary of combined dose response by source:')
            logger.info(
                response_data.summarize_response_data(df_response,
                                                      target=agg_dose))

        all_sources = df_response['Source'].unique()
        df_source = encode_sources(all_sources)

        if 'all' in train_sources:
            train_sources = all_sources
        if 'all' in test_sources:
            test_sources = all_sources
        elif 'train' in test_sources:
            test_sources = train_sources

        # Requested names are prefixes of the source names in the data.
        train_sep_sources = [
            x for x in all_sources for y in train_sources if x.startswith(y)
        ]
        test_sep_sources = [
            x for x in all_sources for y in test_sources if x.startswith(y)
        ]

        ids1 = df_response[[
            'Drug1'
        ]].drop_duplicates().rename(columns={'Drug1': 'Drug'})
        ids2 = df_response[[
            'Drug2'
        ]].drop_duplicates().rename(columns={'Drug2': 'Drug'})
        df_drugs_with_response = pd.concat(
            [ids1, ids2]).drop_duplicates().dropna().reset_index(drop=True)
        df_cells_with_response = df_response[[
            'Sample'
        ]].drop_duplicates().reset_index(drop=True)
        logger.info(
            'Combined raw dose response data has %d unique samples and %d unique drugs',
            df_cells_with_response.shape[0], df_drugs_with_response.shape[0])

        if agg_dose:
            df_selected_drugs = None
        else:
            logger.info(
                'Limiting drugs to those with response min <= %g, max >= %g, span >= %g, median_min <= %g, median_max >= %g ...',
                drug_lower_response, drug_upper_response, drug_response_span,
                drug_median_response_min, drug_median_response_max)
            df_selected_drugs = response_data.select_drugs_with_response_range(
                df_response,
                span=drug_response_span,
                lower=drug_lower_response,
                upper=drug_upper_response,
                lower_median=drug_median_response_min,
                upper_median=drug_median_response_max)
            logger.info('Selected %d drugs from %d',
                        df_selected_drugs.shape[0],
                        df_response['Drug1'].nunique())

        cell_feature_subset = read_set_from_file(cell_feature_subset_path)
        drug_feature_subset = read_set_from_file(drug_feature_subset_path)

        # Feature name -> attribute name under which its frame is stored.
        # 'expression' is an alias for 'rnaseq'.
        cell_df_dict = {
            'rnaseq': 'df_cell_rnaseq',
            'expression': 'df_cell_rnaseq'
        }

        drug_df_dict = {
            'descriptors': 'df_drug_desc',
            'fingerprints': 'df_drug_fp'
        }

        # Loaded frames keyed by attribute name (replaces locals() lookups).
        feature_frames = {}

        for fea in cell_features:
            if fea == 'rnaseq' or fea == 'expression':
                feature_frames['df_cell_rnaseq'] = cellline_data.load_cell_rnaseq(
                    ncols=ncols,
                    scaling=scaling,
                    use_landmark_genes=use_landmark_genes,
                    use_filtered_genes=use_filtered_genes,
                    feature_subset=cell_feature_subset,
                    preprocess_rnaseq=preprocess_rnaseq,
                    embed_feature_source=embed_feature_source)

        for fea in drug_features:
            if fea == 'descriptors':
                feature_frames['df_drug_desc'] = drug_data.load_drug_descriptors(
                    ncols=ncols,
                    scaling=scaling,
                    dropna=dropna,
                    feature_subset=drug_feature_subset)
            elif fea == 'fingerprints':
                feature_frames['df_drug_fp'] = drug_data.load_drug_fingerprints(
                    ncols=ncols,
                    scaling=scaling,
                    dropna=dropna,
                    feature_subset=drug_feature_subset)

        logger.info('Filtering drug response data...')

        # Keep only samples / drugs present in every requested feature table.
        df_cell_ids = df_cells_with_response
        for fea in cell_features:
            df_cell = feature_frames[cell_df_dict[fea]]
            df_cell_ids = df_cell_ids.merge(
                df_cell[['Sample']]).drop_duplicates()
        logger.info('  %d molecular samples with feature and response data',
                    df_cell_ids.shape[0])

        df_drug_ids = df_drugs_with_response
        for fea in drug_features:
            df_drug = feature_frames[drug_df_dict[fea]]
            df_drug_ids = df_drug_ids.merge(
                df_drug[['Drug']]).drop_duplicates()

        if df_selected_drugs is not None:
            df_drug_ids = df_drug_ids.merge(
                df_selected_drugs).drop_duplicates()
        logger.info('  %d selected drugs with feature and response data',
                    df_drug_ids.shape[0])

        df_response = df_response[
            df_response['Sample'].isin(df_cell_ids['Sample'])
            & df_response['Drug1'].isin(df_drug_ids['Drug']) &
            (df_response['Drug2'].isin(df_drug_ids['Drug'])
             | df_response['Drug2'].isnull())]

        df_response = df_response[df_response['Source'].isin(
            train_sep_sources + test_sep_sources)]

        df_response.reset_index(drop=True, inplace=True)

        if logger.isEnabledFor(logging.INFO):
            logger.info('Summary of filtered dose response by source:')
            logger.info(
                response_data.summarize_response_data(df_response,
                                                      target=agg_dose))

        df_response = df_response.assign(
            Group=assign_partition_groups(df_response, partition_by))

        self.agg_dose = agg_dose
        self.cell_features = cell_features
        self.drug_features = drug_features
        self.cell_df_dict = cell_df_dict
        self.drug_df_dict = drug_df_dict
        self.df_source = df_source
        self.df_response = df_response
        self.embed_feature_source = embed_feature_source
        self.encode_response_source = encode_response_source
        self.all_sources = all_sources
        self.train_sources = train_sources
        self.test_sources = test_sources
        self.train_sep_sources = train_sep_sources
        self.test_sep_sources = test_sep_sources
        self.partition_by = partition_by

        # Expose every frame that was actually loaded as an attribute.
        for var, value in feature_frames.items():
            if value is not None:
                setattr(self, var, value)

        self.build_feature_list(single=single)

        if cache:
            self.save_to_cache(cache, params)
    def partition_data(self,
                       partition_by=None,
                       cv_folds=1,
                       train_split=0.7,
                       val_split=0.2,
                       cell_types=None,
                       by_cell=None,
                       by_drug=None,
                       cell_subset_path=None,
                       drug_subset_path=None,
                       exclude_cells=[],
                       exclude_drugs=[],
                       exclude_indices=[]):

        seed = self.seed
        train_sep_sources = self.train_sep_sources
        test_sep_sources = self.test_sep_sources
        df_response = self.df_response

        if not partition_by:
            if by_drug and by_cell:
                partition_by = 'index'
            elif by_drug:
                partition_by = 'cell'
            else:
                partition_by = 'drug_pair'

    # Exclude specified cells / drugs / indices
        if exclude_cells != []:
            df_response = df_response[~df_response['Sample'].isin(exclude_cells
                                                                  )]
        if exclude_drugs != []:
            if np.isin('Drug', df_response.columns.values):
                df_response = df_response[~df_response['Drug1'].
                                          isin(exclude_drugs)]
            else:
                df_response = df_response[
                    ~df_response['Drug1'].isin(exclude_drugs)
                    & ~df_response['Drug2'].isin(exclude_drugs)]
        if exclude_indices != []:
            df_response = df_response.drop(exclude_indices, axis=0)
            logger.info('Excluding indices specified')

        if partition_by != self.partition_by:
            df_response = df_response.assign(
                Group=assign_partition_groups(df_response, partition_by))

        mask = df_response['Source'].isin(train_sep_sources)
        test_mask = df_response['Source'].isin(test_sep_sources)

        if by_drug:
            drug_ids = drug_data.drug_name_to_ids(by_drug)
            logger.info('Mapped drug IDs for %s: %s', by_drug, drug_ids)
            mask &= (df_response['Drug1'].isin(drug_ids)) & (
                df_response['Drug2'].isnull())
            test_mask &= (df_response['Drug1'].isin(drug_ids)) & (
                df_response['Drug2'].isnull())

        if by_cell:
            cell_ids = cellline_data.cell_name_to_ids(by_cell)
            logger.info('Mapped sample IDs for %s: %s', by_cell, cell_ids)
            mask &= (df_response['Sample'].isin(cell_ids))
            test_mask &= (df_response['Sample'].isin(cell_ids))

        if cell_subset_path:
            cell_subset = read_set_from_file(cell_subset_path)
            mask &= (df_response['Sample'].isin(cell_subset))
            test_mask &= (df_response['Sample'].isin(cell_subset))

        if drug_subset_path:
            drug_subset = read_set_from_file(drug_subset_path)
            mask &= (df_response['Drug1'].isin(drug_subset)) & (
                (df_response['Drug2'].isnull()) |
                (df_response['Drug2'].isin(drug_subset)))
            test_mask &= (df_response['Drug1'].isin(drug_subset)) & (
                (df_response['Drug2'].isnull()) |
                (df_response['Drug2'].isin(drug_subset)))

        if cell_types:
            df_type = cellline_data.load_cell_metadata()
            cell_ids = set()
            for cell_type in cell_types:
                cells = df_type[~df_type['TUMOR_TYPE'].isnull()
                                & df_type['TUMOR_TYPE'].str.contains(
                                    cell_type, case=False)]
                cell_ids |= set(cells['ANL_ID'].tolist())
                logger.info('Mapped sample tissue types for %s: %s', cell_type,
                            set(cells['TUMOR_TYPE'].tolist()))