예제 #1
0
def get_drug_qed_df(data_root: str,
                    qed_scaling: str,
                    float_dtype: type = np.float32):
    """df = get_drug_qed_df('./data/', 'none')

    This function loads the drug property dataframe, processes it and
    returns the drug weighted QED dataframe. The processing includes:
        * removing all columns but 'QED';
        * dropping drugs/rows that have NaN as weighted QED;
        * scaling the QED accordingly;
        * converting data types for more compact structure;

    Args:
        data_root (str): path to the data root folder.
        qed_scaling (str): scaling strategy for weighted QED, forwarded
            to scale_dataframe.
        float_dtype (type): float dtype for storage in RAM.

    Returns:
        pd.DataFrame: drug weighted QED dataframe.
    """

    # Keep only the 'QED' column of the drug property dataframe
    qed_df = get_drug_prop_df(data_root=data_root)[['QED']]

    # Drop all the NaN values before scaling. A new frame is created here
    # on purpose: dropping in place on a column selection of another
    # dataframe is the pattern pandas flags with SettingWithCopyWarning.
    qed_df = qed_df.dropna(axis=0)

    # Note that weighted QED is by default already in the range of [0, 1]
    # Scaling the weighted QED with given scaling method
    qed_df = scale_dataframe(qed_df, qed_scaling)

    # Convert the dtypes for a more efficient, compact dataframe
    return qed_df.astype(float_dtype)
예제 #2
0
def get_drug_lat_df(data_root: str,
                    lat_scaling: str,
                    float_dtype: type = np.float32):
    """df = get_drug_lat_df('./data/', 'std')

    This function loads the drug latent file, processes it and returns it
    as a dataframe. The processing includes:
        * scaling all the latent features accordingly;
        * converting data types for more compact structure;

    Note that if the dataframe is already stored in the processed folder,
    the function simply reads it from file and returns after converting
    dtypes.

    Args:
        data_root (str): path to the data root folder.
        lat_scaling (str): scaling strategy for the latent features,
            forwarded to scale_dataframe.
        float_dtype (type): float dtype for storage in RAM. Defaults to
            np.float32; an integer dtype here would truncate the scaled
            values.

    Returns:
        pd.DataFrame: processed drug latent dataframe.
    """

    df_filename = 'drug_lat_df(scaling=%s).pkl' % lat_scaling
    proc_dir = os.path.join(data_root, PROC_FOLDER)
    df_path = os.path.join(proc_dir, df_filename)

    # If the dataframe already exists, load and continue ######################
    if os.path.exists(df_path):
        # NOTE(review): the pickle is a cache written by this function;
        # only point data_root at trusted locations.
        df = pd.read_pickle(df_path)

    # Otherwise load from raw files, process it and save ######################
    else:
        logger.debug('Processing drug latent dataframe ... ')

        raw_dir = os.path.join(data_root, RAW_FOLDER)

        # Download the raw file if not exist
        download_files(filenames=LAT_FILENAME, target_folder=raw_dir)

        df = pd.read_csv(os.path.join(raw_dir, LAT_FILENAME),
                         sep='\t',
                         header=0,
                         index_col=0)

        # Scaling the latent features with given scaling method
        df = scale_dataframe(df, lat_scaling)

        # Convert data type into generic python types
        df = df.astype(float)

        # save to disk for future usage
        os.makedirs(proc_dir, exist_ok=True)
        df.to_pickle(df_path)

    # Convert the dtypes for a more efficient, compact dataframe ##############
    return df.astype(float_dtype)
예제 #3
0
def get_rna_seq_df(data_root: str,
                   rnaseq_feature_usage: str,
                   rnaseq_scaling: str,
                   float_dtype: type = np.float32):
    """df = get_rna_seq_df('./data/', 'source_scale', 'std')

    This function loads the RNA sequence file, processes it and returns it
    as a dataframe. The processing includes:
        * remove the '-' in cell line names;
        * remove duplicate indices;
        * scaling all the sequence features accordingly;
        * convert data types for more compact structure;

    Note that if the dataframe is already stored in the processed folder,
    the function simply reads it from file and returns after converting
    dtypes.

    Args:
        data_root (str): path to the data root folder.
        rnaseq_feature_usage (str): feature usage indicator. Choose between
            'source_scale', 'combat' and 'livermore'.
        rnaseq_scaling (str): scaling strategy for RNA sequence, forwarded
            to scale_dataframe.
        float_dtype (type): float dtype for storage in RAM.

    Returns:
        pd.DataFrame: processed RNA sequence dataframe.

    Raises:
        ValueError: if no processed dataframe is cached and
            rnaseq_feature_usage is not one of the supported options.
    """

    df_filename = 'rnaseq_df(%s, scaling=%s).pkl' \
                  % (rnaseq_feature_usage, rnaseq_scaling)
    proc_dir = os.path.join(data_root, PROC_FOLDER)
    df_path = os.path.join(proc_dir, df_filename)

    # If the dataframe already exists, load and continue ######################
    if os.path.exists(df_path):
        df = pd.read_pickle(df_path)

    # Otherwise load from raw files, process it and save ######################
    else:
        logger.debug('Processing RNA sequence dataframe ... ')

        if rnaseq_feature_usage == 'source_scale':
            raw_data_filename = RNASEQ_SOURCE_SCALE_FILENAME
        elif rnaseq_feature_usage == 'combat':
            raw_data_filename = RNASEQ_COMBAT_FILENAME
        elif rnaseq_feature_usage == 'livermore':
            raw_data_filename = RNASEQ_LIVERMORE_FILENAME
        else:
            # No exception is active here, so exc_info is not requested
            # (it would only append 'NoneType: None' to the log record).
            logger.error('Unknown RNA feature %s.', rnaseq_feature_usage)
            raise ValueError('RNA feature usage must be one of '
                             '\'source_scale\', \'combat\' or '
                             '\'livermore\'.')

        raw_dir = os.path.join(data_root, RAW_FOLDER)

        # Download the raw file if not exist
        download_files(filenames=raw_data_filename, target_folder=raw_dir)

        df = pd.read_csv(os.path.join(raw_dir, raw_data_filename),
                         sep='\t',
                         header=0,
                         index_col=0)

        # Delete '-', which could be inconsistent between seq and meta.
        # regex=False makes this a literal replacement regardless of the
        # pandas version's default for the `regex` argument.
        df.index = df.index.str.replace('-', '', regex=False)

        # Note that after this name changing, some rows will have the same
        # name like 'GDSC.TT' and 'GDSC.T-T', but they are actually the same
        # Drop the duplicates for consistency (first occurrence is kept)
        df = df[~df.index.duplicated(keep='first')]

        # Scaling the sequence features with given scaling method
        df = scale_dataframe(df, rnaseq_scaling)

        # Convert data type into generic python types
        df = df.astype(float)

        # save to disk for future usage
        os.makedirs(proc_dir, exist_ok=True)
        df.to_pickle(df_path)

    # Convert the dtypes for a more efficient, compact dataframe ##############
    return df.astype(float_dtype)
def get_drug_resp_df(data_root: str,
                     grth_scaling: str,
                     int_dtype: type = np.int8,
                     float_dtype: type = np.float32):
    """df = get_drug_resp_df('./data/', 'std')

    This function loads the whole drug response file, processes it and
    returns it as a dataframe. The processing includes:
        * remove the '-' in cell line names;
        * encode str format data sources into integer;
        * scaling the growth accordingly;
        * convert data types for more compact structure;

    Note that if the dataframe is already stored in the processed folder,
    the function simply reads it from file and returns after converting
    dtypes.

    Args:
        data_root (str): path to the data root folder.
        grth_scaling (str): scaling strategy for growth in drug response.
        int_dtype (type): int dtype for storage in RAM (applied to the
            'SOURCE' column).
        float_dtype (type): float dtype for storage in RAM (applied to
            the 'LOG_CONCENTRATION' and 'GROWTH' columns).

    Returns:
        pd.DataFrame: processed drug response dataframe.
    """

    df_filename = 'drug_resp_df(scaling=%s).pkl' % grth_scaling
    proc_dir = os.path.join(data_root, PROC_FOLDER)
    df_path = os.path.join(proc_dir, df_filename)

    # If the dataframe already exists, load and continue ######################
    if os.path.exists(df_path):
        df = pd.read_pickle(df_path)

    # Otherwise load from raw files, process it and save ######################
    else:
        logger.debug('Processing drug response dataframe ... ')

        raw_dir = os.path.join(data_root, RAW_FOLDER)

        # Download the raw file if not exist
        download_files(filenames=DRUG_RESP_FILENAME, target_folder=raw_dir)

        # Only a subset of the raw columns is read (selected by position)
        df = pd.read_csv(os.path.join(raw_dir, DRUG_RESP_FILENAME),
                         sep='\t',
                         header=0,
                         index_col=None,
                         usecols=[0, 1, 2, 4, 6])

        # Delete '-', which could be inconsistent between seq and meta.
        # regex=False makes this a literal replacement regardless of the
        # pandas version's default for the `regex` argument.
        df['CELLNAME'] = df['CELLNAME'].str.replace('-', '', regex=False)

        # Encode data sources into numeric
        df['SOURCE'] = encode_label_to_int(data_root=data_root,
                                           dict_name='data_src_dict.txt',
                                           labels=df['SOURCE'].tolist())

        # Scaling the growth with given scaling method
        df['GROWTH'] = scale_dataframe(df['GROWTH'], grth_scaling)

        # Convert data type into generic python types
        df = df.astype({'SOURCE': int,
                        'LOG_CONCENTRATION': float,
                        'GROWTH': float})

        # save to disk for future usage
        os.makedirs(proc_dir, exist_ok=True)
        df.to_pickle(df_path)

    # Convert the dtypes for a more efficient, compact dataframe ##############
    return df.astype({'SOURCE': int_dtype,
                      'LOG_CONCENTRATION': float_dtype,
                      'GROWTH': float_dtype})
예제 #5
0
def get_drug_dscptr_df(data_root: str,
                       dscptr_scaling: str,
                       dscptr_nan_thresh: float,
                       float_dtype: type = np.float32):
    """df = get_drug_dscptr_df('./data/', 'std', 0.0)

    Load the drug descriptor table and return it as a processed dataframe.

    When no processed copy is cached, the raw tab-separated file is
    downloaded (if needed), read with 'na' treated as NaN, and then:
        * columns (features) with too few valid values are dropped, then
            rows (drugs) are filtered the same way; a column/row survives
            when it holds at least int(size * (1 - dscptr_nan_thresh))
            non-NaN entries;
        * remaining NaN values are replaced by the column means;
        * the features are scaled with the requested strategy;
        * the result is pickled into the processed folder.

    If the processed dataframe already exists on disk, it is simply read
    back. In both cases the dataframe is cast to float_dtype on return.

    Args:
        data_root (str): path to the data root folder.
        dscptr_scaling (str): scaling strategy for all descriptor features.
        dscptr_nan_thresh (float): threshold ratio of NaN values.
        float_dtype (type): float dtype for storage in RAM.

    Returns:
        pd.DataFrame: processed drug descriptor dataframe.
    """

    cache_name = 'drug_dscptr_df(scaling=%s, nan_thresh=%.2f).pkl' \
                 % (dscptr_scaling, dscptr_nan_thresh)
    cache_path = os.path.join(data_root, PROC_FOLDER, cache_name)

    if not os.path.exists(cache_path):
        # No cached copy: build the dataframe from the raw file ##############
        logger.debug('Processing drug descriptor dataframe ... ')

        # Make sure the raw file is present locally
        download_files(filenames=DSCPTR_FILENAME,
                       target_folder=os.path.join(data_root, RAW_FOLDER))

        raw_path = os.path.join(data_root, RAW_FOLDER, DSCPTR_FILENAME)
        dscptr_df = pd.read_csv(raw_path,
                                sep='\t',
                                header=0,
                                index_col=0,
                                na_values='na')

        # Minimum fraction of non-NaN entries a column/row must have
        keep_ratio = 1.0 - dscptr_nan_thresh

        # Features first; the row filter then uses the reduced column count
        min_valid_per_col = int(dscptr_df.shape[0] * keep_ratio)
        dscptr_df = dscptr_df.dropna(axis=1, thresh=min_valid_per_col)

        min_valid_per_row = int(dscptr_df.shape[1] * keep_ratio)
        dscptr_df = dscptr_df.dropna(axis=0, thresh=min_valid_per_row)

        # Impute whatever NaN is left with the mean of its column
        dscptr_df = dscptr_df.fillna(dscptr_df.mean())

        # Apply the requested scaling, then normalise to python floats
        dscptr_df = scale_dataframe(dscptr_df, dscptr_scaling)
        dscptr_df = dscptr_df.astype(float)

        # Persist for future calls; the folder may already be there
        try:
            os.makedirs(os.path.join(data_root, PROC_FOLDER))
        except FileExistsError:
            pass
        dscptr_df.to_pickle(cache_path)
    else:
        # Cached copy found: reuse it ########################################
        dscptr_df = pd.read_pickle(cache_path)

    # Compact the in-memory representation before handing it back
    return dscptr_df.astype(float_dtype)