def load_from_cache(self, cache, params):
    """Restore this object's state from a pickle cache, if compatible.

    The cache consists of two files: ``<cache>.params.json`` holding the
    parameters the cache was built with, and ``<cache>.pkl`` holding the
    pickled object.  The cached parameters must match ``params`` (except
    for keys in ``ignore_keys``) or loading is refused.

    Args:
        cache: path prefix of the cache files.
        params: dict of the parameters for the current load request.

    Returns:
        True if the object's state was loaded from the cache, False if the
        cache files are missing or unreadable.

    Raises:
        ValueError: if a cache exists but was built with incompatible
            parameters (the user must delete it to rebuild).
    """
    param_fname = '{}.params.json'.format(cache)
    if not os.path.isfile(param_fname):
        logger.warning('Cache parameter file does not exist: %s', param_fname)
        return False
    with open(param_fname) as param_file:
        try:
            cached_params = json.load(param_file)
        except json.JSONDecodeError as e:
            logger.warning('Could not decode parameter file %s: %s', param_fname, e)
            return False
    # These keys may legitimately differ between runs without invalidating
    # the cached data.
    ignore_keys = ['cache', 'partition_by', 'single']
    equal, diffs = dict_compare(params, cached_params, ignore_keys)
    if not equal:
        # Fixed typo in the log message ('Attemptd' -> 'Attempted').
        logger.warning(
            'Cache parameter mismatch: %s\nSaved: %s\nAttempted to load: %s',
            diffs, cached_params, params)
        logger.warning('\nRemove %s to rebuild data cache.\n', param_fname)
        raise ValueError(
            'Could not load from a cache with incompatible keys:', diffs)
    fname = '{}.pkl'.format(cache)
    if not os.path.isfile(fname):
        logger.warning('Cache file does not exist: %s', fname)
        return False
    with open(fname, 'rb') as f:
        # NOTE: pickle.load is only safe because the cache is produced
        # locally by save_to_cache; never point this at untrusted files.
        obj = pickle.load(f)
    # Adopt the cached object's entire state.
    self.__dict__.update(obj.__dict__)
    logger.info('Loaded data from cache: %s', fname)
    return True
def load_drug_data(ncols=None, scaling='std', imputing='mean', dropna=None, add_prefix=True):
    """Load combined dragon7 drug descriptors and fingerprints.

    Merges the Combined_PubChem feature sets (re-keyed from PUBCHEM ids to
    canonical drug IDs via the drug info table) with the NCI60 feature
    sets, then imputes/scales the feature columns.

    Args:
        ncols: optional cap on the number of feature columns to load.
        scaling: scaling mode passed to candle for the descriptors.
        imputing: imputation mode for missing values.
        dropna: drop threshold passed through to candle.
        add_prefix: prefix feature columns with 'dragon7.' when True.

    Returns:
        Tuple ``(df_desc, df_fp)`` of descriptor and fingerprint frames,
        each with a leading 'Drug' column.
    """
    df_info = load_drug_info()
    df_info['Drug'] = df_info['PUBCHEM']

    df_desc = load_drug_set_descriptors(drug_set='Combined_PubChem', ncols=ncols)
    df_fp = load_drug_set_fingerprints(drug_set='Combined_PubChem', ncols=ncols)

    # Re-key from PUBCHEM id to canonical drug ID.
    # Use keyword axis=1: positional axis in DataFrame.drop was deprecated
    # and removed in pandas 2.0.
    df_desc = pd.merge(df_info[['ID', 'Drug']], df_desc,
                       on='Drug').drop('Drug', axis=1).rename(columns={'ID': 'Drug'})
    df_fp = pd.merge(df_info[['ID', 'Drug']], df_fp,
                     on='Drug').drop('Drug', axis=1).rename(columns={'ID': 'Drug'})

    # When ncols subsampled the columns above, restrict NCI60 to the same set.
    df_desc2 = load_drug_set_descriptors(drug_set='NCI60',
                                         usecols=df_desc.columns.tolist() if ncols else None)
    df_fp2 = load_drug_set_fingerprints(drug_set='NCI60',
                                        usecols=df_fp.columns.tolist() if ncols else None)

    df_desc = pd.concat([df_desc, df_desc2]).reset_index(drop=True)
    df1 = pd.DataFrame(df_desc.loc[:, 'Drug'])
    df2 = df_desc.drop('Drug', axis=1)
    df2 = candle.drop_impute_and_scale_dataframe(df2, scaling=scaling, imputing=imputing, dropna=dropna)
    if add_prefix:
        df2 = df2.add_prefix('dragon7.')
    df_desc = pd.concat([df1, df2], axis=1)

    df_fp = pd.concat([df_fp, df_fp2]).reset_index(drop=True)
    df1 = pd.DataFrame(df_fp.loc[:, 'Drug'])
    df2 = df_fp.drop('Drug', axis=1)
    # scaling=None for fingerprints (presumably binary features — confirm).
    df2 = candle.drop_impute_and_scale_dataframe(df2, scaling=None, imputing=imputing, dropna=dropna)
    if add_prefix:
        df2 = df2.add_prefix('dragon7.')
    df_fp = pd.concat([df1, df2], axis=1)

    logger.info('Loaded combined dragon7 drug descriptors: %s', df_desc.shape)
    logger.info('Loaded combined dragon7 drug fingerprints: %s', df_fp.shape)

    return df_desc, df_fp
def save_to_cache(self, cache, params):
    """Persist this object and its build parameters to a cache.

    Writes ``<cache>.params.json`` (the parameters, minus bookkeeping
    keys) and ``<cache>.pkl`` (the pickled object) so a later run can
    call load_from_cache with the same prefix.

    Args:
        cache: path prefix of the cache files to write.
        params: dict of the parameters the data was built with.
    """
    # Build a filtered copy instead of del-ing keys in place: the original
    # implementation mutated the caller's dict as a side effect.
    params = {k: v for k, v in params.items()
              if k not in ('self', 'cache', 'single')}
    param_fname = '{}.params.json'.format(cache)
    with open(param_fname, 'w') as param_file:
        # sort_keys makes the JSON deterministic across runs.
        json.dump(params, param_file, sort_keys=True)
    fname = '{}.pkl'.format(cache)
    with open(fname, 'wb') as f:
        pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
    logger.info('Saved data to cache: %s', fname)
def assign_partition_groups(df, partition_by='drug_pair'):
    """Assign a partition group label to each response row.

    Args:
        df: response DataFrame with 'Sample', 'Drug1', 'Drug2' columns.
        partition_by: one of 'cell' (group by sample), 'drug_pair'
            (group by canonicalized drug or drug-pair id), or 'index'
            (every row is its own group).

    Returns:
        A Series of group labels aligned with ``df``.

    Raises:
        ValueError: for an unrecognized ``partition_by`` value (previously
            this fell through to an opaque NameError on ``group``).
    """
    if partition_by == 'cell':
        group = df['Sample']
    elif partition_by == 'drug_pair':
        df_info = drug_data.load_drug_info()
        # Map from drug ID to PUBCHEM id (first occurrence per ID).
        id_dict = df_info[['ID', 'PUBCHEM'
                           ]].drop_duplicates(['ID']).set_index('ID').iloc[:, 0]
        group = df['Drug1'].copy()
        # Order the pair lexicographically so (A,B) and (B,A) share a group.
        group[(df['Drug2'].notnull()) & (df['Drug1'] <= df['Drug2'])] = df['Drug1'] + ',' + df['Drug2']
        group[(df['Drug2'].notnull()) & (df['Drug1'] > df['Drug2'])] = df['Drug2'] + ',' + df['Drug1']
        # Where a single-drug label maps to a PUBCHEM id, prefer that id.
        group2 = group.map(id_dict)
        mapped = group2.notnull()
        group[mapped] = group2[mapped]
    elif partition_by == 'index':
        group = df.reset_index()['index']
    else:
        raise ValueError('Unknown partition_by value: {}'.format(partition_by))
    logger.info('Grouped response data by %s: %d groups', partition_by,
                group.nunique())
    return group
def load_aggregated_single_response(target='AUC', min_r2_fit=0.3, max_ec50_se=3, combo_format=False, rename=True):
    """Load dose-independent (aggregated) single-drug response data.

    Rows are filtered by curve-fit quality (R2fit and EC50se thresholds)
    and reduced to source/cell/drug/target/study columns.

    Args:
        target: response column to keep (e.g. 'AUC').
        min_r2_fit: minimum acceptable R2 of the dose-response fit.
        max_ec50_se: maximum acceptable standard error of EC50.
        combo_format: emit DRUG1/DRUG2 columns (DRUG2 all-NaN) when True.
        rename: rename columns to title-case form when True.

    Returns:
        Filtered response DataFrame.
    """
    path = get_file(DATA_URL + 'combined_single_response_agg')

    df = global_cache.get(path)
    if df is None:
        str_cols = ('SOURCE', 'CELL', 'DRUG', 'STUDY')
        float_cols = ('AUC', 'IC50', 'EC50', 'EC50se', 'R2fit', 'Einf',
                      'HS', 'AAC1', 'AUC1', 'DSS1')
        col_types = {c: str for c in str_cols}
        col_types.update({c: np.float32 for c in float_cols})
        df = pd.read_csv(path, engine='c', sep='\t', dtype=col_types)
        global_cache[path] = df

    total = len(df)
    # Keep only well-fit response curves.
    good_fit = (df['R2fit'] >= min_r2_fit) & (df['EC50se'] <= max_ec50_se)
    df = df[good_fit]
    df = df[['SOURCE', 'CELL', 'DRUG', target, 'STUDY']]
    df = df[df[target].notnull()]
    logger.info('Loaded %d dose independent response samples (filtered by EC50se <= %f & R2fit >=%f from a total of %d).',
                len(df), max_ec50_se, min_r2_fit, total)

    if combo_format:
        df = df.rename(columns={'DRUG': 'DRUG1'})
        # Add an all-NaN second-drug column so the frame matches the
        # combination-response schema.
        df['DRUG2'] = np.nan
        df['DRUG2'] = df['DRUG2'].astype(object)
        df = df[['SOURCE', 'CELL', 'DRUG1', 'DRUG2', target, 'STUDY']]
        if rename:
            df = df.rename(columns={'SOURCE': 'Source',
                                    'CELL': 'Sample',
                                    'DRUG1': 'Drug1',
                                    'DRUG2': 'Drug2',
                                    'STUDY': 'Study'})
    elif rename:
        df = df.rename(columns={'SOURCE': 'Source',
                                'CELL': 'Sample',
                                'DRUG': 'Drug',
                                'STUDY': 'Study'})

    return df
def load_combined_dose_response(rename=True):
    """Load and concatenate single-drug and drug-pair dose response data.

    Args:
        rename: rename the upper-case source columns to title-case
            (Source, Sample, Drug1, ...) when True.

    Returns:
        Combined dose-response DataFrame.
    """
    df1 = load_single_dose_response(combo_format=True)
    # Use lazy %-style logging args so formatting only happens if the
    # record is actually emitted (was eager str.format before).
    logger.info('Loaded %d single drug dose response measurements', df1.shape[0])

    df2 = load_combo_dose_response()
    logger.info('Loaded %d drug pair dose response measurements', df2.shape[0])

    df = pd.concat([df1, df2])
    logger.info('Combined dose response data contains sources: %s', df['SOURCE'].unique())

    if rename:
        df = df.rename(columns={'SOURCE': 'Source',
                                'CELL': 'Sample',
                                'DRUG1': 'Drug1',
                                'DRUG2': 'Drug2',
                                'DOSE1': 'Dose1',
                                'DOSE2': 'Dose2',
                                'GROWTH': 'Growth',
                                'STUDY': 'Study'})
    return df
def build_feature_list(self, single=False):
    """Build the ordered input-feature map and shape table for this dataset.

    Populates ``self.input_features`` (feature name -> feature type),
    ``self.feature_shapes`` (feature type -> shape tuple) and
    ``self.input_dim`` (total flattened input width) from the loaded
    cell/drug feature frames.

    Args:
        single: when True, build features for single-drug mode only
            (one dose and one drug slot instead of two).
    """
    input_features = collections.OrderedDict()
    feature_shapes = collections.OrderedDict()

    # Dose inputs are only present for per-dose (non-aggregated) response.
    if not self.agg_dose:
        for dose in (['dose1'] if single else ['dose1', 'dose2']):
            input_features[dose] = 'dose'
            feature_shapes['dose'] = (1, )

    if self.encode_response_source:
        input_features['response.source'] = 'response.source'
        # First column of df_source is the source label itself.
        feature_shapes['response.source'] = (self.df_source.shape[1] - 1, )

    for fea in self.cell_features:
        # Cell feature name and type coincide ('cell.<fea>').
        cell_key = 'cell.' + fea
        df_cell = getattr(self, self.cell_df_dict[fea])
        input_features[cell_key] = cell_key
        # Subtract one for the leading 'Sample' id column.
        feature_shapes[cell_key] = (df_cell.shape[1] - 1, )

    for drug in (['drug1'] if single else ['drug1', 'drug2']):
        for fea in self.drug_features:
            drug_type = 'drug.' + fea
            df_drug = getattr(self, self.drug_df_dict[fea])
            input_features[drug + '.' + fea] = drug_type
            # Subtract one for the leading 'Drug' id column.
            feature_shapes[drug_type] = (df_drug.shape[1] - 1, )

    input_dim = sum(np.prod(feature_shapes[x]) for x in input_features.values())

    self.input_features = input_features
    self.feature_shapes = feature_shapes
    self.input_dim = input_dim

    logger.info('Input features shapes:')
    for k, v in self.input_features.items():
        logger.info(' {}: {}'.format(k, self.feature_shapes[v]))
    logger.info('Total input dimensions: {}'.format(self.input_dim))
def load_cell_rnaseq(ncols=None, scaling='std', imputing='mean', add_prefix=True, use_landmark_genes=False, use_filtered_genes=False, feature_subset=None, preprocess_rnaseq=None, embed_feature_source=False, sample_set=None, index_by_sample=False):
    """Load combined RNAseq expression data for cell lines.

    Args:
        ncols: optional number of randomly chosen feature columns to load.
        scaling: scaling mode for candle (ignored if preprocess_rnaseq set).
        imputing: imputation mode for missing values.
        add_prefix: prefix feature columns with 'rnaseq.' when True.
        use_landmark_genes: load the LINCS1000 landmark-gene subset.
        use_filtered_genes: load the pre-filtered gene subset.
        feature_subset: optional set of (prefixed) column names to keep.
        preprocess_rnaseq: 'source_scale', 'combat', or None/'none';
            selects a pre-normalized data file and disables scaling.
        embed_feature_source: append one-hot data-source columns when True.
        sample_set: optional sample-name prefix to subset rows by.
        index_by_sample: use 'Sample' as the index instead of a column.

    Returns:
        DataFrame with a 'Sample' column (or index) and float32 features.
    """
    if use_landmark_genes:
        filename = 'combined_rnaseq_data_lincs1000'
    elif use_filtered_genes:
        filename = 'combined_rnaseq_data_filtered'
    else:
        filename = 'combined_rnaseq_data'
    if preprocess_rnaseq and preprocess_rnaseq != 'none':
        scaling = None  # pre-normalized files must not be rescaled
        filename += ('_' + preprocess_rnaseq)  # 'source_scale' or 'combat'
    path = get_file(DATA_URL + filename)

    # Read only the header first to decide which columns to load.
    df_cols = pd.read_csv(path, engine='c', sep='\t', nrows=0)
    total = df_cols.shape[1] - 1  # remove Sample column
    if 'Cancer_type_id' in df_cols.columns:
        total -= 1
    usecols = None
    if ncols and ncols < total:
        usecols = np.random.choice(total, size=ncols, replace=False)
        # NOTE(review): the +2 offset skips the first two file columns;
        # this assumes both Sample and Cancer_type_id precede the feature
        # columns — confirm for files without Cancer_type_id.
        usecols = np.append([0], np.add(sorted(usecols), 2))
        df_cols = df_cols.iloc[:, usecols]
    if feature_subset:
        # Replaced lambda-assigned-to-name with a local def (PEP 8 / E731).
        def with_prefix(x):
            return 'rnaseq.' + x if add_prefix else x
        usecols = [0] + [
            i for i, c in enumerate(df_cols.columns)
            if with_prefix(c) in feature_subset
        ]
        df_cols = df_cols.iloc[:, usecols]

    dtype_dict = dict((x, np.float32) for x in df_cols.columns[1:])
    df = pd.read_csv(path, engine='c', sep='\t', usecols=usecols, dtype=dtype_dict)
    if 'Cancer_type_id' in df.columns:
        df.drop('Cancer_type_id', axis=1, inplace=True)

    # Data source is encoded as the sample-name prefix before the first '.'.
    prefixes = df['Sample'].str.extract('^([^.]*)', expand=False).rename('Source')
    sources = prefixes.drop_duplicates().reset_index(drop=True)
    df_source = pd.get_dummies(sources, prefix='rnaseq.source', prefix_sep='.')
    df_source = pd.concat([sources, df_source], axis=1)

    df1 = df['Sample']
    if embed_feature_source:
        df_sample_source = pd.concat([df1, prefixes], axis=1)
        df1 = df_sample_source.merge(df_source, on='Source',
                                     how='left').drop('Source', axis=1)
        logger.info('Embedding RNAseq data source into features: %d additional columns',
                    df1.shape[1] - 1)

    # Use keyword axis=1: positional axis in DataFrame.drop was deprecated
    # and removed in pandas 2.0.
    df2 = df.drop('Sample', axis=1)
    if add_prefix:
        df2 = df2.add_prefix('rnaseq.')

    df2 = candle.drop_impute_and_scale_dataframe(df2, scaling, imputing)

    df = pd.concat([df1, df2], axis=1)

    # scaling needs to be done before subsampling
    if sample_set:
        chosen = df['Sample'].str.startswith(sample_set)
        df = df[chosen].reset_index(drop=True)

    if index_by_sample:
        df = df.set_index('Sample')

    logger.info('Loaded combined RNAseq data: %s', df.shape)
    return df
def load(
        self,
        cache=None,
        ncols=None,
        scaling='std',
        dropna=None,
        agg_dose=None,
        embed_feature_source=True,
        encode_response_source=True,
        cell_features=['rnaseq'],
        drug_features=['descriptors', 'fingerprints'],
        cell_feature_subset_path=None,
        drug_feature_subset_path=None,
        drug_lower_response=1,
        drug_upper_response=-1,
        drug_response_span=0,
        drug_median_response_min=-1,
        drug_median_response_max=1,
        use_landmark_genes=False,
        use_filtered_genes=False,
        preprocess_rnaseq=None,
        single=False,
        # train_sources=['GDSC', 'CTRP', 'ALMANAC', 'NCI60'],
        train_sources=['GDSC', 'CTRP', 'ALMANAC'],
        # val_sources='train',
        # test_sources=['CCLE', 'gCSI'],
        test_sources=['train'],
        partition_by='drug_pair'):
    """Load response, cell-feature and drug-feature data onto this object.

    Either restores a previously built dataset from ``cache`` (when the
    cached parameters are compatible) or builds it from scratch: loads
    dose response (per-dose or aggregated by ``agg_dose``), loads the
    requested cell/drug feature frames, filters responses to rows with
    both feature and response data, assigns partition groups, and stores
    everything as attributes on ``self``.

    NOTE(review): ``cell_features`` and ``drug_features`` use mutable
    list defaults; they are only read/rebound here so this appears safe,
    but confirm no caller mutates them.
    """
    # Capture all arguments for cache-compatibility comparison; 'self'
    # must not be serialized.
    params = locals().copy()
    del params['self']

    # 'none' (any case) or an empty value disables that feature group.
    if not cell_features or 'none' in [x.lower() for x in cell_features]:
        cell_features = []

    if not drug_features or 'none' in [x.lower() for x in drug_features]:
        drug_features = []

    # Fast path: restore from cache when parameters match.
    if cache and self.load_from_cache(cache, params):
        self.build_feature_list(single=single)
        return

    logger.info('Loading data from scratch ...')

    if agg_dose:
        df_response = response_data.load_aggregated_single_response(
            target=agg_dose, combo_format=True)
    else:
        df_response = response_data.load_combined_dose_response()

    if logger.isEnabledFor(logging.INFO):
        logger.info('Summary of combined dose response by source:')
        logger.info(
            response_data.summarize_response_data(df_response,
                                                  target=agg_dose))

    all_sources = df_response['Source'].unique()
    df_source = encode_sources(all_sources)

    # Resolve 'all'/'train' placeholders to concrete source lists.
    if 'all' in train_sources:
        train_sources = all_sources
    if 'all' in test_sources:
        test_sources = all_sources
    elif 'train' in test_sources:
        test_sources = train_sources

    # Expand each requested source to every matching sub-source by prefix
    # (e.g. a source name that startswith the requested name).
    train_sep_sources = [
        x for x in all_sources for y in train_sources if x.startswith(y)
    ]
    test_sep_sources = [
        x for x in all_sources for y in test_sources if x.startswith(y)
    ]

    # Unique drugs appearing in either drug slot of the response data.
    ids1 = df_response[[
        'Drug1'
    ]].drop_duplicates().rename(columns={'Drug1': 'Drug'})
    ids2 = df_response[[
        'Drug2'
    ]].drop_duplicates().rename(columns={'Drug2': 'Drug'})
    df_drugs_with_response = pd.concat(
        [ids1, ids2]).drop_duplicates().dropna().reset_index(drop=True)
    df_cells_with_response = df_response[[
        'Sample'
    ]].drop_duplicates().reset_index(drop=True)
    logger.info(
        'Combined raw dose response data has %d unique samples and %d unique drugs',
        df_cells_with_response.shape[0], df_drugs_with_response.shape[0])

    if agg_dose:
        # Drug range filtering applies only to per-dose growth data.
        df_selected_drugs = None
    else:
        logger.info(
            'Limiting drugs to those with response min <= %g, max >= %g, span >= %g, median_min <= %g, median_max >= %g ...',
            drug_lower_response, drug_upper_response, drug_response_span,
            drug_median_response_min, drug_median_response_max)
        df_selected_drugs = response_data.select_drugs_with_response_range(
            df_response,
            span=drug_response_span,
            lower=drug_lower_response,
            upper=drug_upper_response,
            lower_median=drug_median_response_min,
            upper_median=drug_median_response_max)
        logger.info('Selected %d drugs from %d',
                    df_selected_drugs.shape[0],
                    df_response['Drug1'].nunique())

    cell_feature_subset = read_set_from_file(cell_feature_subset_path)
    drug_feature_subset = read_set_from_file(drug_feature_subset_path)

    # Load each requested feature frame into a conventionally named local;
    # these names are looked up via locals() below, so they must match
    # cell_df_dict / drug_df_dict exactly.
    for fea in cell_features:
        fea = fea.lower()
        if fea == 'rnaseq' or fea == 'expression':
            df_cell_rnaseq = cellline_data.load_cell_rnaseq(
                ncols=ncols,
                scaling=scaling,
                use_landmark_genes=use_landmark_genes,
                use_filtered_genes=use_filtered_genes,
                feature_subset=cell_feature_subset,
                preprocess_rnaseq=preprocess_rnaseq,
                embed_feature_source=embed_feature_source)

    for fea in drug_features:
        fea = fea.lower()
        if fea == 'descriptors':
            df_drug_desc = drug_data.load_drug_descriptors(
                ncols=ncols,
                scaling=scaling,
                dropna=dropna,
                feature_subset=drug_feature_subset)
        elif fea == 'fingerprints':
            df_drug_fp = drug_data.load_drug_fingerprints(
                ncols=ncols,
                scaling=scaling,
                dropna=dropna,
                feature_subset=drug_feature_subset)

    # df_drug_desc, df_drug_fp = drug_data.load_drug_data(ncols=ncols, scaling=scaling, dropna=dropna)

    # Map feature keys to the names of the locals holding their frames.
    # NOTE(review): 'expression' is accepted in the loop above but only
    # 'rnaseq' appears here — a cell_features of ['expression'] would
    # raise KeyError in the filtering loop below; confirm intended.
    cell_df_dict = {'rnaseq': 'df_cell_rnaseq'}

    drug_df_dict = {
        'descriptors': 'df_drug_desc',
        'fingerprints': 'df_drug_fp'
    }

    # df_cell_ids = df_cell_rnaseq[['Sample']].drop_duplicates()
    # df_drug_ids = pd.concat([df_drug_desc[['Drug']], df_drug_fp[['Drug']]]).drop_duplicates()

    logger.info('Filtering drug response data...')

    # Keep only samples/drugs that have every requested feature type.
    df_cell_ids = df_cells_with_response
    for fea in cell_features:
        df_cell = locals()[cell_df_dict[fea]]
        df_cell_ids = df_cell_ids.merge(df_cell[['Sample'
                                                 ]]).drop_duplicates()
    logger.info(' %d molecular samples with feature and response data',
                df_cell_ids.shape[0])

    df_drug_ids = df_drugs_with_response
    for fea in drug_features:
        df_drug = locals()[drug_df_dict[fea]]
        df_drug_ids = df_drug_ids.merge(df_drug[['Drug'
                                                 ]]).drop_duplicates()

    if df_selected_drugs is not None:
        df_drug_ids = df_drug_ids.merge(
            df_selected_drugs).drop_duplicates()
    logger.info(' %d selected drugs with feature and response data',
                df_drug_ids.shape[0])

    # Restrict responses to rows whose sample and drug(s) survived the
    # feature filters; a null Drug2 (single-drug row) is allowed.
    df_response = df_response[
        df_response['Sample'].isin(df_cell_ids['Sample'])
        & df_response['Drug1'].isin(df_drug_ids['Drug'])
        & (df_response['Drug2'].isin(df_drug_ids['Drug'])
           | df_response['Drug2'].isnull())]

    df_response = df_response[df_response['Source'].isin(
        train_sep_sources + test_sep_sources)]

    df_response.reset_index(drop=True, inplace=True)

    if logger.isEnabledFor(logging.INFO):
        logger.info('Summary of filtered dose response by source:')
        logger.info(
            response_data.summarize_response_data(df_response,
                                                  target=agg_dose))

    df_response = df_response.assign(
        Group=assign_partition_groups(df_response, partition_by))

    self.agg_dose = agg_dose
    self.cell_features = cell_features
    self.drug_features = drug_features
    self.cell_df_dict = cell_df_dict
    self.drug_df_dict = drug_df_dict
    self.df_source = df_source
    self.df_response = df_response
    self.embed_feature_source = embed_feature_source
    self.encode_response_source = encode_response_source
    self.all_sources = all_sources
    self.train_sources = train_sources
    self.test_sources = test_sources
    self.train_sep_sources = train_sep_sources
    self.test_sep_sources = test_sep_sources
    self.partition_by = partition_by

    # Attach whichever feature frames were actually loaded.
    for var in (list(drug_df_dict.values()) + list(cell_df_dict.values())):
        value = locals().get(var)
        if value is not None:
            setattr(self, var, value)

    self.build_feature_list(single=single)

    if cache:
        self.save_to_cache(cache, params)
def partition_data(self, partition_by=None, cv_folds=1, train_split=0.7, val_split=0.2, cell_types=None, by_cell=None, by_drug=None, cell_subset_path=None, drug_subset_path=None, exclude_cells=[], exclude_drugs=[], exclude_indices=[]): seed = self.seed train_sep_sources = self.train_sep_sources test_sep_sources = self.test_sep_sources df_response = self.df_response if not partition_by: if by_drug and by_cell: partition_by = 'index' elif by_drug: partition_by = 'cell' else: partition_by = 'drug_pair' # Exclude specified cells / drugs / indices if exclude_cells != []: df_response = df_response[~df_response['Sample'].isin(exclude_cells )] if exclude_drugs != []: if np.isin('Drug', df_response.columns.values): df_response = df_response[~df_response['Drug1']. isin(exclude_drugs)] else: df_response = df_response[ ~df_response['Drug1'].isin(exclude_drugs) & ~df_response['Drug2'].isin(exclude_drugs)] if exclude_indices != []: df_response = df_response.drop(exclude_indices, axis=0) logger.info('Excluding indices specified') if partition_by != self.partition_by: df_response = df_response.assign( Group=assign_partition_groups(df_response, partition_by)) mask = df_response['Source'].isin(train_sep_sources) test_mask = df_response['Source'].isin(test_sep_sources) if by_drug: drug_ids = drug_data.drug_name_to_ids(by_drug) logger.info('Mapped drug IDs for %s: %s', by_drug, drug_ids) mask &= (df_response['Drug1'].isin(drug_ids)) & ( df_response['Drug2'].isnull()) test_mask &= (df_response['Drug1'].isin(drug_ids)) & ( df_response['Drug2'].isnull()) if by_cell: cell_ids = cellline_data.cell_name_to_ids(by_cell) logger.info('Mapped sample IDs for %s: %s', by_cell, cell_ids) mask &= (df_response['Sample'].isin(cell_ids)) test_mask &= (df_response['Sample'].isin(cell_ids)) if cell_subset_path: cell_subset = read_set_from_file(cell_subset_path) mask &= (df_response['Sample'].isin(cell_subset)) test_mask &= (df_response['Sample'].isin(cell_subset)) if drug_subset_path: drug_subset = 
read_set_from_file(drug_subset_path) mask &= (df_response['Drug1'].isin(drug_subset)) & ( (df_response['Drug2'].isnull()) | (df_response['Drug2'].isin(drug_subset))) test_mask &= (df_response['Drug1'].isin(drug_subset)) & ( (df_response['Drug2'].isnull()) | (df_response['Drug2'].isin(drug_subset))) if cell_types: df_type = cellline_data.load_cell_metadata() cell_ids = set() for cell_type in cell_types: cells = df_type[~df_type['TUMOR_TYPE'].isnull() & df_type['TUMOR_TYPE'].str.contains( cell_type, case=False)] cell_ids |= set(cells['ANL_ID'].tolist()) logger.info('Mapped sample tissue types for %s: %s', cell_type, set(cells['TUMOR_TYPE'].tolist()))