Example #1
def get_drug_target_df(data_root: str, int_dtype: type = np.int8):
    """df = get_drug_target_df('./data/')

    This function loads the drug property dataframe, processes it and
    returns the drug target families dataframe. The processing includes:
        * remove all columns but 'TARGET';
        * drop drugs/rows whose target family is not in the TGT_FAMS list;
        * encode target families into integer labels;
        * convert data types for a more compact structure;

    Args:
        data_root (str): path to the data root folder.
        int_dtype (type): int dtype for storage in RAM.

    Returns:
        pd.DataFrame: drug target families dataframe.
    """

    df = get_drug_prop_df(data_root=data_root)[['TARGET']]

    # Only keep the rows whose target family is in TGT_FAMS for classification
    df = df[df['TARGET'].isin(TGT_FAMS)]

    # Encode str formatted target families into integers
    df['TARGET'] = encode_label_to_int(data_root=data_root,
                                       dict_name='drug_target_dict.txt',
                                       labels=df['TARGET'])

    # Convert the dtypes for a more efficient, compact dataframe
    # Note that it is safe to use int8 here for there are only 10 classes
    return df.astype(int_dtype)
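The example above relies on an encode_label_to_int helper that is not shown in this file. The sketch below is only an illustrative guess at its behavior, assuming it builds a label-to-integer dictionary, persists it (here as JSON) under PROC_FOLDER, and returns the encoded labels; the signature matches the calls above, but the file format and internals are assumptions, not the actual implementation.

import json


def encode_label_to_int_sketch(data_root: str, dict_name: str, labels):
    """Hypothetical stand-in for encode_label_to_int (illustration only)."""

    dict_path = os.path.join(data_root, PROC_FOLDER, dict_name)

    # Reuse a previously saved mapping so the integer codes stay stable
    if os.path.exists(dict_path):
        with open(dict_path, 'r') as f:
            label_dict = json.load(f)
    else:
        label_dict = {l: i for i, l in enumerate(sorted(set(labels)))}
        os.makedirs(os.path.dirname(dict_path), exist_ok=True)
        with open(dict_path, 'w') as f:
            json.dump(label_dict, f, indent=4)

    # Return the labels encoded as integers, preserving the original order
    return [label_dict[l] for l in labels]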
Example #2
def get_cl_meta_df(data_root: str,
                   encoding: bool = True,
                   int_dtype: type = np.int8):
    """df = get_cl_meta_df('./data/')

    This function loads the metadata for cell lines, processes it and
    returns it as a dataframe. The processing includes:
        * change column names to ['data_src', 'site', 'type', 'category'];
        * remove the '-' in cell line names;
        * convert data types for a more compact structure;

    Note that if the dataframe is already stored in the processed folder
    (and encoding is enabled), the function simply reads from file and
    returns after converting dtypes.

    Args:
        data_root (str): path to the data root folder.
        encoding (bool): indicator for encoding types/sites into integers.
        int_dtype (type): int dtype for storage in RAM.

    Returns:
        pd.DataFrame: processed cell line metadata dataframe.
    """

    df_filename = 'cl_meta_df.pkl'
    df_path = os.path.join(data_root, PROC_FOLDER, df_filename)

    # If the dataframe already exists, load and continue ######################
    if os.path.exists(df_path) and encoding:
        df = pd.read_pickle(df_path)

    # Otherwise load from raw files, process it and save ######################
    else:
        logger.debug('Processing cell line meta dataframe ... ')

        # Download the raw file if it does not exist
        download_files(filenames=CL_METADATA_FILENAME,
                       target_folder=os.path.join(data_root, RAW_FOLDER))

        df = pd.read_csv(os.path.join(data_root, RAW_FOLDER,
                                      CL_METADATA_FILENAME),
                         sep='\t',
                         header=0,
                         index_col=0,
                         usecols=[
                             'sample_name', 'dataset', 'simplified_tumor_site',
                             'simplified_tumor_type', 'sample_category'
                         ],
                         dtype=str)

        # Rename columns to shorter, clearer names
        df.index.names = ['sample']
        df.columns = ['data_src', 'site', 'type', 'category']

        # Delete '-', which could be inconsistent between seq and meta
        df.index = df.index.str.replace('-', '')

        # Convert all the categorical data from text to numeric
        if encoding:
            columns = df.columns
            dict_names = [i + '_dict.txt' for i in columns]
            for col, dict_name in zip(columns, dict_names):
                df[col] = encode_label_to_int(data_root=data_root,
                                              dict_name=dict_name,
                                              labels=df[col])

            # Convert data types into generic Python types
            df = df.astype(int)

        else:
            return df

        # Save to disk for future usage
        os.makedirs(os.path.join(data_root, PROC_FOLDER), exist_ok=True)
        df.to_pickle(df_path)

    df = df.astype(int_dtype)
    return df
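When encoding is enabled, all four metadata columns ('data_src', 'site', 'type', 'category') are integer-encoded via per-column dict files ('data_src_dict.txt', 'site_dict.txt', etc.). The decoder below is a hypothetical complement for reading those codes back as strings; it assumes the dict files hold a JSON label-to-integer mapping, matching the encoding sketch after Example #1, which may not match the real file format.

import json


def decode_cl_meta_df_sketch(df: pd.DataFrame, data_root: str) -> pd.DataFrame:
    """Hypothetical decoder for integer-encoded metadata (illustration only)."""

    decoded = df.copy()

    for col in decoded.columns:
        dict_path = os.path.join(data_root, PROC_FOLDER, col + '_dict.txt')
        with open(dict_path, 'r') as f:
            label_dict = json.load(f)

        # Invert the label -> int mapping and map the codes back to strings
        int_to_label = {v: k for k, v in label_dict.items()}
        decoded[col] = decoded[col].map(int_to_label)

    return decoded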
def get_drug_resp_df(data_root: str,
                     grth_scaling: str,
                     int_dtype: type = np.int8,
                     float_dtype: type = np.float32):
    """df = get_drug_resp_df('./data/', 'std')

    This function loads the whole drug response file, processes it and
    returns it as a dataframe. The processing includes:
        * remove the '-' in cell line names;
        * encode str formatted data sources into integers;
        * scale the growth with the given scaling strategy;
        * convert data types for a more compact structure;

    Note that if the dataframe is already stored in the processed folder,
    the function simply reads from file and returns after converting dtypes.

    Args:
        data_root (str): path to the data root folder.
        grth_scaling (str): scaling strategy for growth in drug response.
        int_dtype (type): int dtype for storage in RAM.
        float_dtype (type): float dtype for storage in RAM.

    Returns:
        pd.DataFrame: processed drug response dataframe.
    """

    df_filename = 'drug_resp_df(scaling=%s).pkl' % grth_scaling
    df_path = os.path.join(data_root, PROC_FOLDER, df_filename)

    # If the dataframe already exists, load and continue ######################
    if os.path.exists(df_path):
        df = pd.read_pickle(df_path)

    # Otherwise load from raw files, process it and save ######################
    else:
        logger.debug('Processing drug response dataframe ... ')

        # Download the raw file if it does not exist
        download_files(filenames=DRUG_RESP_FILENAME,
                       target_folder=os.path.join(data_root, RAW_FOLDER))

        df = pd.read_csv(os.path.join(data_root, RAW_FOLDER,
                                      DRUG_RESP_FILENAME),
                         sep='\t',
                         header=0,
                         index_col=None,
                         usecols=[
                             0,
                             1,
                             2,
                             4,
                             6,
                         ])

        # Delete '-', which could be inconsistent between seq and meta
        df['CELLNAME'] = df['CELLNAME'].str.replace('-', '')

        # Encode data sources into numeric
        df['SOURCE'] = encode_label_to_int(data_root=data_root,
                                           dict_name='data_src_dict.txt',
                                           labels=df['SOURCE'].tolist())

        # Scaling the growth with given scaling method
        df['GROWTH'] = scale_dataframe(df['GROWTH'], grth_scaling)

        # Convert data types into generic Python types
        df[['SOURCE']] = df[['SOURCE']].astype(int)
        df[['LOG_CONCENTRATION', 'GROWTH']] = \
            df[['LOG_CONCENTRATION', 'GROWTH']].astype(float)

        # Save to disk for future usage
        os.makedirs(os.path.join(data_root, PROC_FOLDER), exist_ok=True)
        df.to_pickle(df_path)

    # Convert the dtypes for a more efficient, compact dataframe ##############
    df[['SOURCE']] = df[['SOURCE']].astype(int_dtype)
    df[['LOG_CONCENTRATION', 'GROWTH']] = \
        df[['LOG_CONCENTRATION', 'GROWTH']].astype(float_dtype)
    return df
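The growth scaling above is delegated to a scale_dataframe helper that is not included in these examples. The sketch below is an assumed minimal version supporting a 'std' (z-score) and a 'minmax' strategy; the accepted strategy names and exact formulas are guesses for illustration, not the actual implementation.

def scale_dataframe_sketch(data, scaling: str):
    """Hypothetical sketch of scale_dataframe (illustration only)."""

    if scaling is None or scaling.lower() == 'none':
        return data

    if scaling.lower() == 'std':
        # Standardize to zero mean and unit variance
        return (data - data.mean()) / data.std()

    if scaling.lower() == 'minmax':
        # Rescale linearly into the [0, 1] range
        return (data - data.min()) / (data.max() - data.min())

    raise ValueError('Unknown scaling strategy: %s' % scaling)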