Example #1
    def __init__(
        self,
        data_root: str,
        training: bool,
        rand_state: int = 0,
        summary: bool = True,

        # Data type settings (for storage and data loading)
        int_dtype: type = np.int8,
        float_dtype: type = np.float16,
        output_dtype: type = np.float32,

        # Pre-processing settings
        rnaseq_scaling: str = 'std',
        predict_target: str = 'class',

        # Partitioning (train/validation) and data usage settings
        rnaseq_feature_usage: str = 'source_scale',
        validation_ratio: float = 0.2,
    ):
        """dataset = CLClassDataset('./data/', True)

        Construct an RNA sequence dataset based on the parameters provided.
        The process includes:
            * Downloading source data files;
            * Pre-processing (scaling);
            * Public attributes and other preparations.

        Args:
            data_root (str): path to data root folder.
            training (bool): indicator for training.
            rand_state (int): random seed used for training/validation split
                and other processes that require randomness.
            summary (bool): set True to print a dataset summary.

            int_dtype (type): integer dtype for data storage in RAM.
            float_dtype (type): float dtype for data storage in RAM.
            output_dtype (type): output dtype for neural network.

            rnaseq_scaling (str): scaling method for RNA sequence data.
                Choose from 'none', 'std', and 'minmax'.
            predict_target (str): prediction target for RNA sequence data.
                Note that all labels except the target will be one-hot
                encoded, while the target itself will be encoded as
                integers. Choose from 'none', 'class', and 'source'.

            rnaseq_feature_usage (str): RNA sequence data usage. Choose
                from 'source_scale' and 'combat'.
            validation_ratio (float): portion of validation data out of all
                data samples.
        """

        # Initialization ######################################################
        self.__data_root = data_root

        # Class-wise variables
        self.training = training
        self.__rand_state = rand_state
        self.__output_dtype = output_dtype

        # Feature scaling
        if rnaseq_scaling is None or rnaseq_scaling == '':
            rnaseq_scaling = 'none'
        assert rnaseq_scaling.lower() in ['none', 'std', 'minmax']
        self.__rnaseq_scaling = rnaseq_scaling.lower()
        if predict_target is None or predict_target == '':
            predict_target = 'none'
        assert predict_target.lower() in ['none', 'class', 'source']
        self.__predict_target = predict_target.lower()

        self.__rnaseq_feature_usage = rnaseq_feature_usage
        self.__validation_ratio = validation_ratio

        # Load all dataframes #################################################
        self.__rnaseq_df = get_rna_seq_df(
            data_root=data_root,
            rnaseq_feature_usage=rnaseq_feature_usage,
            rnaseq_scaling=rnaseq_scaling,
            float_dtype=float_dtype)

        self.__cl_meta_df = get_cl_meta_df(data_root=data_root,
                                           int_dtype=int_dtype)

        # Gather each sample's full sequence into a single 'seq' column
        # (as a list) and cast it to float_dtype
        self.__rnaseq_df['seq'] = \
            list(map(float_dtype, self.__rnaseq_df.values.tolist()))

        # Join the RNA sequence data with meta data. cl_df will have columns:
        # ['data_src', 'site', 'type', 'category', 'seq']
        self.__cl_df = pd.concat(
            [self.__cl_meta_df, self.__rnaseq_df[['seq']]],
            axis=1,
            join='inner')

        # Exclude 'GDC' and 'NCI60' during data source prediction
        # GDC has too many samples, while NCI60 does not have enough
        if self.__predict_target == 'source':
            logger.warning('Taking out GDC and NCI60 samples to make dataset '
                           'balanced among all data sources ...')
            self.__cl_df = self.__cl_df[~self.__cl_df['data_src'].isin([2, 5])]
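            # The integer codes 2 and 5 are assumed to be the encoded values
            # of 'GDC' and 'NCI60' in data_src_dict.txt; this is not
            # confirmed by this listing.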

        # Encode all labels (except the prediction target) into one-hot format
        if self.__predict_target != 'source':
            enc_data_src = encode_int_to_onehot(
                self.__cl_df['data_src'].tolist(),
                len(get_label_dict(data_root, 'data_src_dict.txt')))
            self.__cl_df['data_src'] = list(map(int_dtype, enc_data_src))

        if self.__predict_target != 'class':
            for label in ['site', 'type', 'category']:
                enc_label = encode_int_to_onehot(
                    self.__cl_df[label].tolist(),
                    len(get_label_dict(data_root, '%s_dict.txt' % label)))
                self.__cl_df[label] = list(map(int_dtype, enc_label))
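        # For reference, encode_int_to_onehot presumably maps integer labels
        # to one-hot lists, e.g. encode_int_to_onehot([0, 2], 3) ->
        # [[1, 0, 0], [0, 0, 1]]; this is an assumption about the helper,
        # which is defined elsewhere.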

        # Train/validation split ##############################################
        self.__split_drug_resp()

        # Converting dataframes to arrays for rapid access ####################
        self.__cl_array = self.__cl_df.values

        # Public attributes ###################################################
        self.cells = self.__cl_df.index.tolist()
        self.num_cells = self.__cl_df.shape[0]
        self.rnaseq_dim = len(self.__cl_df.iloc[0]['seq'])

        # Clear the dataframes ################################################
        self.__rnaseq_df = None
        self.__cl_meta_df = None
        self.__cl_df = None

        # Dataset summary #####################################################
        if summary:
            print('=' * 80)
            print(('Training' if self.training else 'Validation') +
                  ' RNA Sequence Dataset Summary:')
            print('\t%i Unique Cell Lines (feature dim: %4i).' %
                  (self.num_cells, self.rnaseq_dim))
            print('=' * 80)
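A minimal usage sketch based on the docstring's own example line; the class name CLClassDataset and the './data/' layout are taken from that docstring, and the keyword values shown are illustrative only, not confirmed by this listing:

# Hypothetical usage sketch; CLClassDataset and './data/' come from the
# docstring above, and the argument values are illustrative only.
trn_set = CLClassDataset(data_root='./data/', training=True,
                         rnaseq_scaling='std', predict_target='class')
val_set = CLClassDataset(data_root='./data/', training=False)
print(trn_set.num_cells, trn_set.rnaseq_dim)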
Example #2
    def __split_drug_resp(self):
        """self.__split_drug_resp()

        This function splits the drug response data into training and
        validation sets based on the splitting specifications (disjoint
        drugs and/or disjoint cells).

        Upon the call, the function summarizes all the drugs and cells. If
        disjoint (drugs/cells) is set to True, it splits the list of
        drugs/cells into training and validation lists of drugs/cells.

        Otherwise, if disjoint (drugs/cells) is set to False, it makes sure
        that the training and validation sets contain the same drugs/cells.

        Then it trims all three dataframes so that all the data kept in RAM
        is relevant for training/validation.

        Note that the validation size is not guaranteed during splitting.
        What the function actually splits by the given ratio are the lists
        of drugs/cell lines. Also, if both drugs and cells are marked
        disjoint, the function splits the drug and cell lists with a ratio
        of (validation_ratio ** 0.7).

        A warning will be raised if the resulting validation ratio deviates
        too much from the target.

        Returns:
            None
        """

        # Trim dataframes based on data source and common drugs/cells
        # Now drug response dataframe contains training + validation
        # data samples from the same data source, like 'NCI60'
        self.__trim_dataframes()

        # Get lists of all drugs & cells corresponding to the data source
        cell_list = self.__drug_resp_df['CELLNAME'].unique().tolist()
        drug_list = self.__drug_resp_df['DRUG_ID'].unique().tolist()

        # Create an array to store all drugs' analysis results
        drug_anlys_dict = {idx: row.values for idx, row in
                           get_drug_anlys_df(self.__data_root).iterrows()}
        drug_anlys_array = np.array([drug_anlys_dict[d] for d in drug_list])

        # Create a list to store all cell line types
        cell_type_dict = {idx: row.values for idx, row in
                          get_cl_meta_df(self.__data_root)
                          [['type']].iterrows()}
        cell_type_list = [cell_type_dict[c] for c in cell_list]

        # Change validation size when both features are disjoint in splitting
        # Note that theoretically we should use validation_ratio ** 0.5,
        # but 0.7 simply works better in most cases.
        if self.__disjoint_cells and self.__disjoint_drugs:
            adjusted_val_ratio = self.__validation_ratio ** 0.7
        else:
            adjusted_val_ratio = self.__validation_ratio
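        # For example, with validation_ratio = 0.2:
        #   0.2 ** 0.5 ~= 0.447 (theoretical: 0.447 ** 2 == 0.2 of all
        #   disjoint cell-drug pairs), while 0.2 ** 0.7 ~= 0.324 is used
        #   here; 0.324 ** 2 ~= 0.105 of all pairs before the trimming below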

        split_kwargs = {
            'test_size': adjusted_val_ratio,
            'random_state': self.__rand_state,
            'shuffle': True, }

        # Try to split the cells stratified on type list
        try:
            training_cell_list, validation_cell_list = \
                train_test_split(cell_list, **split_kwargs,
                                 stratify=cell_type_list)
        except ValueError:
            logger.warning('Failed to split %s cells in a stratified '
                           'way. Splitting randomly ...' % self.data_source)
            training_cell_list, validation_cell_list = \
                train_test_split(cell_list, **split_kwargs)

        # Try to split the drugs stratified on the drug analysis results
        try:
            training_drug_list, validation_drug_list = \
                train_test_split(drug_list, **split_kwargs,
                                 stratify=drug_anlys_array)
        except ValueError:
            logger.warning('Failed to split %s drugs stratified on growth '
                           'and correlation. Splitting solely on avg growth'
                           ' ...' % self.data_source)

            try:
                training_drug_list, validation_drug_list = \
                    train_test_split(drug_list, **split_kwargs,
                                     stratify=drug_anlys_array[:, 0])
            except ValueError:
                logger.warning('Failed to split %s drugs on avg growth. '
                               'Splitting solely on avg correlation ...'
                               % self.data_source)

                try:
                    training_drug_list, validation_drug_list = \
                        train_test_split(drug_list, **split_kwargs,
                                         stratify=drug_anlys_array[:, 1])
                except ValueError:
                    logger.warning('Failed to split %s drugs on avg '
                                   'correlation. Splitting randomly ...'
                                   % self.data_source)
                    training_drug_list, validation_drug_list = \
                        train_test_split(drug_list, **split_kwargs)

        # Split data based on disjoint cell/drug strategy
        if self.__disjoint_cells and self.__disjoint_drugs:

            training_drug_resp_df = self.__drug_resp_df.loc[
                (self.__drug_resp_df['CELLNAME'].isin(training_cell_list)) &
                (self.__drug_resp_df['DRUG_ID'].isin(training_drug_list))]

            validation_drug_resp_df = self.__drug_resp_df.loc[
                (self.__drug_resp_df['CELLNAME'].isin(validation_cell_list)) &
                (self.__drug_resp_df['DRUG_ID'].isin(validation_drug_list))]

        elif self.__disjoint_cells and (not self.__disjoint_drugs):

            training_drug_resp_df = self.__drug_resp_df.loc[
                self.__drug_resp_df['CELLNAME'].isin(training_cell_list)]

            validation_drug_resp_df = self.__drug_resp_df.loc[
                self.__drug_resp_df['CELLNAME'].isin(validation_cell_list)]

        elif (not self.__disjoint_cells) and self.__disjoint_drugs:

            training_drug_resp_df = self.__drug_resp_df.loc[
                self.__drug_resp_df['DRUG_ID'].isin(training_drug_list)]

            validation_drug_resp_df = self.__drug_resp_df.loc[
                self.__drug_resp_df['DRUG_ID'].isin(validation_drug_list)]

        else:
            logger.warning('Stratified on drug + cell combo ...')

            combo_list = [(cell + drug) for cell, drug in
                          zip(self.__drug_resp_df['CELLNAME'].tolist(),
                              self.__drug_resp_df['DRUG_ID'].tolist())]

            training_drug_resp_df, validation_drug_resp_df = \
                train_test_split(self.__drug_resp_df,
                                 test_size=self.__validation_ratio,
                                 random_state=self.__rand_state,
                                 stratify=combo_list,
                                 shuffle=True)

        # If not disjoint, make sure that the training and validation sets
        # share the same drugs/cells
        if not self.__disjoint_cells:
            # Make sure that cell lines are common
            common_cells = set(training_drug_resp_df['CELLNAME'].unique()) & \
                           set(validation_drug_resp_df['CELLNAME'].unique())

            training_drug_resp_df = training_drug_resp_df.loc[
                training_drug_resp_df['CELLNAME'].isin(common_cells)]
            validation_drug_resp_df = validation_drug_resp_df.loc[
                validation_drug_resp_df['CELLNAME'].isin(common_cells)]

        if not self.__disjoint_drugs:
            # Make sure that drugs are common
            common_drugs = set(training_drug_resp_df['DRUG_ID'].unique()) & \
                           set(validation_drug_resp_df['DRUG_ID'].unique())

            training_drug_resp_df = training_drug_resp_df.loc[
                training_drug_resp_df['DRUG_ID'].isin(common_drugs)]
            validation_drug_resp_df = validation_drug_resp_df.loc[
                validation_drug_resp_df['DRUG_ID'].isin(common_drugs)]

        # Check if the validation ratio is in a reasonable range
        validation_ratio = len(validation_drug_resp_df) \
            / (len(training_drug_resp_df) + len(validation_drug_resp_df))
        if (validation_ratio < self.__validation_ratio * 0.8) \
                or (validation_ratio > self.__validation_ratio * 1.2):
            logger.warning('Bad validation ratio: %.3f' %
                           validation_ratio)

        # Keep only training_drug_resp_df or validation_drug_resp_df
        self.__drug_resp_df = training_drug_resp_df if self.training \
            else validation_drug_resp_df
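The final sanity check above flags splits whose realized validation ratio drifts more than 20% from the target. A tiny standalone sketch of that acceptance band, assuming a target ratio of 0.2 and hypothetical sample counts:

# Sketch of the +/-20% ratio sanity band used above; counts are made up.
target_ratio = 0.2
n_trn, n_val = 700, 150                      # hypothetical split sizes
realized = n_val / (n_trn + n_val)           # 0.1765...
ok = target_ratio * 0.8 <= realized <= target_ratio * 1.2
print('%.3f' % realized, ok)                 # 0.176 True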
Example #3
def plot_error_bar_over_cell(cl_class: str,

                             trn_src: str,
                             val_src: str,

                             results_dir: str,
                             epoch: int = None,
                             early_stop_patience: int = 5,

                             error_type: str = 'mse',
                             image_dir: str = '../../results/images/'):

    # Plot the average error (MSE/MAE) per cell line class as a bar plot,
    # with scaled standard deviations shown as error bars (UQ)
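    # STD_SCALE (error-bar scaling factor) and IMAGE_SIZE (figure size
    # tuple) are assumed to be module-level constants defined elsewhere.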
    cl_class = cl_class.lower()
    if cl_class not in ['site', 'type', 'category']:
        raise ValueError('Cell line class must be one of '
                         '\'site\', \'type\', or \'category\'.')

    if error_type.lower() not in ['mse', 'mae']:
        raise ValueError('Error type must be \'MSE\' or \'MAE\'')

    # Load result file
    epoch, results = load_result_file(
        trn_src, val_src, results_dir, epoch, early_stop_patience)
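    # `results` is assumed to be a DataFrame with a 'cell_id' column and
    # per-sample 'mse'/'mae' columns; this is an assumption about
    # load_result_file, which is defined elsewhere.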

    # Get the (un-encoded) cell line metadata for classification
    cl_meta_df = get_cl_meta_df('../../data/', encoding=False)
    cl_classes = cl_meta_df[cl_class].unique()

    bar_labels = []
    # class_indicators = []
    avg_error_in_class = []
    scaled_std_error_in_class = []

    for c in cl_classes:

        # Get all the cell lines that are in this class
        cl_in_class = cl_meta_df.loc[cl_meta_df[cl_class] == c].index.tolist()

        error_array_in_class = results.loc[results['cell_id'].isin(
            cl_in_class)][error_type.lower()].values.flatten()

        bar_labels.append('n=%i' % len(error_array_in_class))

        if len(error_array_in_class) != 0:
            avg_error_in_class.append(np.mean(error_array_in_class))
            scaled_std_error_in_class.append(
                np.std(error_array_in_class) * STD_SCALE)
        else:
            avg_error_in_class.append(0.)
            scaled_std_error_in_class.append(0.)

    plt.figure(figsize=IMAGE_SIZE)
    plt.xlabel('Cell Line Classes')
    plt.ylabel('Averaged %s' % error_type.upper())
    plt.title('Averaged Error over Cell Line %s '
              '(Trained on %s and Validated on %s, Epoch %i)'
              % (cl_class, trn_src, val_src, epoch))

    # Labeling each bar with the scaled std and the number of samples
    bars = plt.bar(cl_classes, avg_error_in_class,
                   yerr=scaled_std_error_in_class, align='center',
                   alpha=0.5, ecolor='black', capsize=4)
    plt.xticks(rotation=-75)
    for bar, label in zip(bars, bar_labels):
        plt.text(bar.get_x() + bar.get_width() / 2.0, bar.get_height(),
                 label, ha='center', va='bottom')

    # Save the plot into the image folder
    os.makedirs(image_dir, exist_ok=True)
    img_name = '%s_over_CL_[trn=%s][val=%s][epoch=%02i].png' \
               % (error_type.upper(), trn_src, val_src, epoch)
    img_path = os.path.join(image_dir, img_name)
    plt.savefig(img_path)

    plt.close('all')
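A hedged usage sketch for the plotting helper; the source names and directory paths below are hypothetical placeholders, not values taken from this listing:

# Hypothetical call; all argument values here are illustrative only.
plot_error_bar_over_cell(cl_class='type',
                         trn_src='CCLE', val_src='NCI60',
                         results_dir='../../results/',
                         error_type='mae')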