Example #1
    def __init__(self, no_internet, version):
        """Load all of the bcmbrca dataframes as values in the self._data dict variable, with names as keys, and format them properly.

        Parameters:
        version (str, optional): The version number to load, or the string "latest" to just load the latest build. Default is "latest".
        no_internet (bool, optional): Whether to skip the index update step because it requires an internet connection. This will be skipped automatically if there is no internet at all, but you may want to manually skip it if you have a spotty internet connection. Default is False.
        """
        #ignore logging messages
        logger = logging.getLogger()
        logger.setLevel(logging.CRITICAL)

        # Set some needed variables, and pass them to the parent Dataset class __init__ function

        # This keeps a record of all versions that the code is equipped to handle. That way, if there's a new data release but they didn't update their package, it won't try to parse the new data version it isn't equipped to handle.
        valid_versions = ["1.0"]

        data_files = {
            "1.0": [
                "HNSCC_discovery.dnp.annotated.exonic.maf.gz",
                "HNSCC_NAT_RNA-Seq_Expr_WashU_FPKM.tsv.gz",
                "HNSCC_tumor_RNA-Seq_Expr_WashU_FPKM.tsv.gz",
                "HNSCC_mature_miRNA_combined.tsv",
                "HNSCC_precursor_miRNA_combined.tsv",
                "HNSCC_total_miRNA_combined.tsv", "HNSCC_xCell.txt",
                "CIBERSORT.Output_Abs_HNSCC.txt",
                "HNSCC.gene_level.from_seg.filtered.tsv",
                "gencode.v22.annotation.gtf.gz",
                "CPTAC_pancan_RNA_tumor_purity_ESTIMATE_WashU.tsv.gz",
                "README_miRNA", "README_CIBERSORT", "README_xCell",
                "README_somatic_mutation_WXS", "README_gene_expression",
                "README.boxnote", "README_ESTIMATE_WashU"
            ]
        }

        # Call the parent class __init__ function
        super().__init__(cancer_type="washuhnscc",
                         version=version,
                         valid_versions=valid_versions,
                         data_files=data_files,
                         no_internet=no_internet)

        # get clinical df (used to slice out cancer specific patient_IDs in tumor_purity file)
        mssmclin = MssmClinical(
            no_internet=no_internet,
            version=version,
            filter_type='pancanhnscc')  #_get_version - pancandataset
        clinical_df = mssmclin.get_clinical()

        # Load the data into dataframes in the self._data dict
        loading_msg = f"Loading {self.get_cancer_type()} v{self.version()}"
        for file_path in self._data_files_paths:  # Loops through files variable

            # Print a loading message. We add a dot every time, so the user knows it's not frozen.
            loading_msg = loading_msg + "."
            print(loading_msg, end='\r')

            path_elements = file_path.split(
                os.sep)  # Get a list of the levels of the path
            file_name = path_elements[
                -1]  # The last element will be the name of the file. We'll use this to identify files for parsing in the if/elif statements below

            if file_name == "HNSCC_discovery.dnp.annotated.exonic.maf.gz":  # Note that we use the "file_name" variable to identify files. That way we don't have to use the whole path.
                df = pd.read_csv(file_path, sep='\t')
                # Rename the columns we want to keep to the appropriate names
                df['Patient_ID'] = df.loc[:, 'Tumor_Sample_Barcode']
                df = df.rename(
                    columns={
                        "Hugo_Symbol": "Gene",
                        "Gene": "Gene_Database_ID",
                        "Variant_Classification": "Mutation",
                        "HGVSp_Short": "Location"
                    })

                df = df.set_index("Patient_ID")
                df = df[["Gene", "Mutation", "Location"] + [
                    col for col in df.columns
                    if col not in ["Gene", "Mutation", "Location"]
                ]]
                df.index = df.index.str.replace(r"_T", "", regex=True)
                self._data["somatic_mutation"] = df

            if file_name == "HNSCC_NAT_RNA-Seq_Expr_WashU_FPKM.tsv.gz":
                df_norm = pd.read_csv(file_path, sep='\t')
                #change names to universal package names
                df_norm = df_norm.rename(columns={
                    "gene_name": "Name",
                    "gene_id": "Database_ID"
                })
                df_norm = df_norm.set_index(["Name", "Database_ID"])
                df_norm = df_norm.sort_index()
                df_norm = df_norm.T  #transpose
                df_norm.index.name = "Patient_ID"
                df_norm.index = df_norm.index.str.replace(
                    r"-A", ".N", regex=True)  # mark normal samples with .N
                self._helper_tables["transcriptomics_norm"] = df_norm

            if file_name == "HNSCC_tumor_RNA-Seq_Expr_WashU_FPKM.tsv.gz":
                df = pd.read_csv(file_path, sep='\t')
                #change names to universal package names
                df = df.rename(columns={
                    "gene_name": "Name",
                    "gene_id": "Database_ID"
                })
                df = df.set_index(["Name", "Database_ID"])
                df = df.sort_index()
                df = df.T  #transpose
                df.index.name = "Patient_ID"
                df.index = df.index.str.replace(
                    r"-T", "", regex=True)  #remove label for tumor samples
                self._helper_tables["transcriptomics_tumor"] = df

            elif 'miRNA_combined' in file_name:
                miRNA_type = file_name.split('_')[
                    1]  # get type of miRNA data (precursor, mature, or total)
                if miRNA_type == 'mature':
                    df = pd.read_csv(
                        file_path,
                        delimiter='\t',
                        index_col=['Name', 'ID', 'Alias', 'Derives_from'])
                else:
                    df = pd.read_csv(file_path,
                                     delimiter='\t',
                                     index_col=['Name', 'ID', 'Alias'])
                df = df.transpose()
                df = average_replicates(
                    df, common=r'\.\d$'
                )  # average duplicates for C3L-02617 and C3N-02727
                df.index = df.index.str.replace(r'\.T$', '', regex=True)  # remove label for tumor samples
                df.index = df.index.str.replace(r'\.A$', '.N', regex=True)  # mark normal samples with .N
                df.index.name = 'Patient_ID'
                # Sort
                normal = df.loc[df.index.str.contains(r'\.N$', regex=True)]
                normal = normal.sort_values(by=["Patient_ID"])
                tumor = df.loc[~df.index.str.contains(r'\.N$', regex=True)]
                tumor = tumor.sort_values(by=["Patient_ID"])
                all_df = pd.concat([tumor, normal])  # DataFrame.append was removed in pandas 2.0
                self._data[miRNA_type + '_miRNA'] = all_df

            elif file_name == "HNSCC_xCell.txt":
                df = pd.read_csv(file_path, sep='\t', index_col=0)
                df = df.transpose()
                df.columns.name = 'Name'
                df.index.name = 'Patient_ID'
                df.index = df.index.str.replace(
                    r'-T$', '', regex=True)  # remove label for tumor samples
                df.index = df.index.str.replace(
                    r'-A$', '.N',
                    regex=True)  # change label for normal samples
                self._data["xcell"] = df

            elif file_name == "CIBERSORT.Output_Abs_HNSCC.txt":
                df = pd.read_csv(file_path, sep='\t', index_col=0)
                df.index.name = 'Patient_ID'
                df.columns.name = 'Name'
                df.index = df.index.str.replace(r'-T$', '', regex=True)
                df.index = df.index.str.replace(r'-A$', '.N', regex=True)
                self._data["cibersort"] = df

            elif file_name == "HNSCC.gene_level.from_seg.filtered.tsv":
                df = pd.read_csv(file_path, sep="\t")
                df = df.rename(columns={"Gene": "Name"})
                df = df.set_index("Name")
                self._data["CNV"] = df

            elif file_name == "gencode.v22.annotation.gtf.gz":
                df = read_gtf(file_path)
                df = df[["gene_name", "gene_id"]]
                df = df.drop_duplicates()
                df = df.rename(columns={
                    "gene_name": "Name",
                    "gene_id": "Database_ID"
                })
                df = df.set_index("Name")
                self._helper_tables["CNV_gene_ids"] = df

            elif file_name == "CPTAC_pancan_RNA_tumor_purity_ESTIMATE_WashU.tsv.gz":
                df = pd.read_csv(file_path, sep="\t", na_values='NA')
                df.Sample_ID = df.Sample_ID.str.replace(
                    r'-T', '', regex=True)  # only tumor samples in file
                df = df.set_index('Sample_ID')
                df.index.name = 'Patient_ID'
                # Use list of patient_ids to slice out cancers
                patient_ids = clinical_df.index.to_list()
                df = df.loc[df.index.isin(patient_ids)]
                self._data["tumor_purity"] = df

            elif file_name == "README_miRNA":
                with open(file_path, 'r') as reader:
                    self._readme_files["readme_miRNA"] = reader.read()

            elif file_name == "README_CIBERSORT":
                with open(file_path, 'r') as reader:
                    self._readme_files["readme_cibersort"] = reader.read()

            elif file_name == "README_xCell":
                with open(file_path, 'r') as reader:
                    self._readme_files["readme_xcell"] = reader.read()

            elif file_name == "README_somatic_mutation_WXS":
                with open(file_path, 'r') as reader:
                    self._readme_files[
                        "readme_somatic_mutation"] = reader.read()

            elif file_name == "README_gene_expression":
                with open(file_path, 'r') as reader:
                    self._readme_files["readme_transcriptomics"] = reader.read(
                    )

            elif file_name == "README.boxnote":
                self._readme_files["readme_cnv"] = get_boxnote_text(file_path)

            elif file_name == "README_ESTIMATE_WashU":
                with open(file_path, 'r') as reader:
                    self._readme_files["readme_tumor_purity"] = reader.read()

        print(' ' * len(loading_msg), end='\r')  # Erase the loading message
        formatting_msg = f"Formatting {self.get_cancer_type()} dataframes..."
        print(formatting_msg, end='\r')

        # combine and create transcriptomic dataframe
        rna_tumor = self._helper_tables.get("transcriptomics_tumor")
        rna_normal = self._helper_tables.get(
            "transcriptomics_norm"
        )  # Normal entries are already marked with 'N' on the end of the ID
        rna_combined = pd.concat([rna_tumor, rna_normal])  # DataFrame.append was removed in pandas 2.0
        self._data["transcriptomics"] = rna_combined

        # CNV
        cnv = self._data["CNV"]
        gene_ids = self._helper_tables["CNV_gene_ids"]
        df = cnv.join(gene_ids, how="left")  #merge in gene_ids
        df = df.reset_index()
        df = df.set_index(["Name", "Database_ID"])  #create multi-index
        df = df.T
        df.index.name = 'Patient_ID'
        self._data["CNV"] = df

        self._data = sort_all_rows_pancan(
            self._data)  # Sort IDs (tumor first then normal)

        # Use the master index to reindex the clinical dataframe, so the clinical dataframe has a record of every sample in the dataset. Rows that didn't exist before (such as the rows for normal samples) are filled with NaN.
        #        new_clinical = self._data["clinical"]
        #        new_clinical = new_clinical.reindex(master_index)

        # Add a column called Sample_Tumor_Normal to the clinical dataframe indicating whether each sample was a tumor or normal sample. Use a function from dataframe_tools to generate it.

        ###FILL: Your dataset should have some way that it marks the Patient IDs
        ### of normal samples. The example code below is for a dataset that
        ### marks them by putting an 'N' at the beginning of each one. You will
        ### need to write a lambda function that takes a given Patient_ID string
        ### and returns a bool indicating whether it corresponds to a normal
        ### sample. Pass that lambda function to the 'normal_test' parameter of
        ### the  generate_sample_status_col function when you call it. See
        ### cptac/dataframe_tools.py for further function documentation.
        ###START EXAMPLE CODE###################################################
        #        sample_status_col = generate_sample_status_col(new_clinical, normal_test=lambda sample: sample[0] == 'N')
        ###END EXAMPLE CODE#####################################################

        #        new_clinical.insert(0, "Sample_Tumor_Normal", sample_status_col)

        # Replace the clinical dataframe in the data dictionary with our new and improved version!
        #        self._data['clinical'] = new_clinical

        # Edit the format of the Patient_IDs to have normal samples marked the same way as in other datasets.

        ###FILL: You will need to pass the proper parameters to correctly
        ### reformat the patient IDs in your dataset. The standard format is to
        ### have the string '.N' appended to the end of the normal patient IDs,
        ### e.g. the  normal patient ID corresponding to C3L-00378 would be
        ### C3L-00378.N (this way we can easily match two samples from the same
        ### patient). The example code below is for a dataset where all the
        ### normal samples have  an "N" prepended to the patient IDs. The
        ### reformat_normal_patient_ids function erases that and puts a ".N" at
        ### the end. See cptac/dataframe_tools.py for further function
        ### documentation.
        ###START EXAMPLE CODE###################################################
        #        self._data = reformat_normal_patient_ids(self._data, existing_identifier="N", existing_identifier_location="start")
        ###END EXAMPLE CODE#####################################################

        # Call function from dataframe_tools.py to sort all tables first by sample status, and then by the index
        #        self._data = sort_all_rows(self._data)

        # Call function from dataframe_tools.py to standardize the names of the index and column axes
        #        self._data = standardize_axes_names(self._data)

        print(" " * len(formatting_msg),
              end='\r')  # Erase the formatting message
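
A recurring idiom in these WashU loaders is the sample-ID cleanup: the source files tag tumor samples with a "-T" (or ".T") suffix and normal/adjacent-tissue samples with "-A" (or ".A"), while the package convention is a bare ID for tumors and a ".N" suffix for normals. A minimal, self-contained sketch of that relabeling (the IDs are toy values, not from any real data file):

import pandas as pd

ids = pd.Index(["C3L-00001-T", "C3L-00001-A", "C3N-00002-T"], name="Patient_ID")
ids = ids.str.replace(r"-T$", "", regex=True)    # tumor samples: drop the suffix
ids = ids.str.replace(r"-A$", ".N", regex=True)  # normal samples: mark with .N
print(list(ids))  # ['C3L-00001', 'C3L-00001.N', 'C3N-00002']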
Example #2
    def __init__(self, no_internet, version):
        """Load all of the umichucec dataframes as values in the self._data dict variable, with names as keys, and format them properly.

        Parameters:
        version (str, optional): The version number to load, or the string "latest" to just load the latest build. Default is "latest".
        no_internet (bool, optional): Whether to skip the index update step because it requires an internet connection. This will be skipped automatically if there is no internet at all, but you may want to manually skip it if you have a spotty internet connection. Default is False.
        """

        # Set some needed variables, and pass them to the parent Dataset class __init__ function

        # This keeps a record of all versions that the code is equipped to handle. That way, if there's a new data release but they didn't update their package, it won't try to parse the new data version it isn't equipped to handle.
        valid_versions = ["1.0"]

        data_files = {
            "1.0": ["Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv",                    
                    "Report_abundance_groupby=multi-site_protNorm=MD_gu=2.tsv",
                    "aliquot_to_patient_ID.tsv",
                    "README_v3.boxnote", # proteomics 
                    "README.boxnote" # phosphoproteomics
             
            ]
        }

        # Call the parent class __init__ function
        super().__init__(cancer_type="umichucec", version=version, valid_versions=valid_versions, data_files=data_files, no_internet=no_internet)

        # Load the data into dataframes in the self._data dict
        loading_msg = f"Loading {self.get_cancer_type()} v{self.version()}"
        for file_path in self._data_files_paths: # Loops through files variable

            # Print a loading message. We add a dot every time, so the user knows it's not frozen.
            loading_msg = loading_msg + "."
            print(loading_msg, end='\r')

            path_elements = file_path.split(os.sep) # Get a list of the levels of the path
            file_name = path_elements[-1] # The last element will be the name of the file. We'll use this to identify files for parsing in the if/elif statements below
            
            
            if file_name == "Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv":
                df = pd.read_csv(file_path, sep = "\t") 
                df['Database_ID'] = df.Index.apply(lambda x: x.split('|')[0]) # get protein identifier 
                df['Name'] = df.Index.apply(lambda x: x.split('|')[6]) # get protein name 
                df = df.set_index(['Name', 'Database_ID']) # set multiindex
                df = df.drop(columns = ['Index', 'MaxPepProb', 'NumberPSM', 'Gene']) # drop unnecessary columns
                df = df.transpose()
                ref_intensities = df.loc["ReferenceIntensity"] # get reference intensities to use to calculate ratios 
                df = df.subtract(ref_intensities, axis="columns") # subtract reference intensities from all the values
                df = df.iloc[1:,:] # drop ReferenceIntensity row 
                df.index.name = 'Patient_ID'
                self._data["proteomics"] = df
               
                
            elif file_name == "Report_abundance_groupby=multi-site_protNorm=MD_gu=2.tsv":
                df = pd.read_csv(file_path, sep = "\t") 
                # Parse a few columns out of the "Index" column that we'll need for our multiindex
                df[['Database_ID','Transcript_ID',"Gene_ID","Havana_gene","Havana_transcript","Transcript","Name","Site"]] = df.Index.str.split("\\|",expand=True)
                df[['num1','start',"end","detected_phos","localized_phos","Site"]] = df.Site.str.split("_",expand=True) 

                # Some rows have at least one localized phosphorylation site, but also have other 
                # phosphorylations that aren't localized. We'll drop those rows, if their localized sites 
                # are duplicated in another row, to avoid creating duplicates, because we only preserve information 
                # about the localized sites in a given row. However, if the localized sites aren't duplicated in 
                # another row, we'll keep the row.
                unlocalized_to_drop = df.index[~df["detected_phos"].eq(df["localized_phos"]) & df.duplicated(["Name", "Site", "Peptide", "Database_ID"], keep=False)]# dectected_phos of the split "Index" column is number of phosphorylations detected, and localized_phos is number of phosphorylations localized, so if the two values aren't equal, the row has at least one unlocalized site
                df = df.drop(index=unlocalized_to_drop)
                df = df[df['Site'].notna()] # only keep rows with a phospho site
                df = df.set_index(['Name', 'Site', 'Peptide', 'Database_ID']) # This will create a multiindex from these columns
                #drop columns not needed in df 
                df.drop(["Gene", "Index", "num1", "start", "end", "detected_phos", "localized_phos", "Havana_gene", 
                         "Havana_transcript", "MaxPepProb", "Gene_ID", "Transcript_ID", "Transcript"], axis=1, inplace=True)
                df = df.T # transpose 
                ref_intensities = df.loc["ReferenceIntensity"]# Get reference intensities to use to calculate ratios 
                df = df.subtract(ref_intensities, axis="columns") # Subtract ref intensities from all the values, to get ratios
                df = df.iloc[1:,:] # drop ReferenceIntensity row 
                self._data["phosphoproteomics"] = df
             
            
            # aliquot_to_patient_ID.tsv contains only unique aliquots (no duplicates), 
            # so there is no need to slice out cancer specific aliquots
            # This file can be found on Box under CPTAC/cptac/pancan/helper_files
            elif file_name == "aliquot_to_patient_ID.tsv":
                df = pd.read_csv(file_path, sep = "\t", index_col = 'aliquot_ID', usecols = ['aliquot_ID', 'patient_ID'])
                map_dict = df.to_dict()['patient_ID'] # create dictionary with aliquot_ID as keys and patient_ID as values
                self._helper_tables["map_ids"] = map_dict
                
            elif file_name == "README_v3.boxnote":
                self._readme_files["readme_proteomics"] = get_boxnote_text(file_path)
                
            elif file_name == "README.boxnote":
                self._readme_files["readme_phosphoproteomics"] = get_boxnote_text(file_path)

        
        print(' ' * len(loading_msg), end='\r') # Erase the loading message
        formatting_msg = f"Formatting {self.get_cancer_type()} dataframes..."
        print(formatting_msg, end='\r')
        
        # There was 1 duplicate ID (C3N-01825) in the proteomic and phosphoproteomic data. 
        # I used the Payne lab mapping file "aliquot_to_patient_ID.tsv" to determine the tissue type 
        # for these duplicates, and they were both tumor samples. Next, I ran a pearson correlation 
        # to check how well the values from each duplicate correlated to its tumor flagship sample. 
        # The first occurrence in the file had a higher correlation with the flagship sample 
        # than the second occurrence. I also created scatterplots comparing each duplicate to its flagship sample.  
        # We dropped the second occurrence of the duplicate because it didn't correlate very well to its flagship sample.
        # A file containing the correlations can be downloaded at: 
        # https://byu.box.com/shared/static/jzsq69bd079oq0zbicw4w616hyicd5ev.xlsx
           
        # Drop quality control and ref intensity cols
        drop_cols = ['RefInt_pool01', 'RefInt_pool02', 'RefInt_pool03', 'RefInt_pool04',
                   'RefInt_pool05', 'RefInt_pool06', 'RefInt_pool07', 'RefInt_pool08',
                   'RefInt_pool09', 'RefInt_pool10', 'RefInt_pool11', 'RefInt_pool12',
                   'RefInt_pool13', 'RefInt_pool14', 'RefInt_pool15', 'RefInt_pool16',
                   'RefInt_pool17']
                
        # Get dictionary with aliquots as keys and patient IDs as values
        mapping_dict = self._helper_tables["map_ids"]
            
        # Proteomics
        prot = self._data["proteomics"]
        prot = prot.drop(drop_cols, axis = 'index') # drop quality control and ref intensity cols
        prot = prot.reset_index()
        prot['Patient_ID'] = prot['Patient_ID'].replace(mapping_dict) # replace aliquots with patient IDs
        prot['Patient_ID'] = prot['Patient_ID'].apply(lambda x: x+'.N' if 'NX' in x else x) # 'NX' are enriched normals 
        prot = prot.set_index('Patient_ID')
        prot = rename_duplicate_labels(prot, 'index') # add ".1" to the second occurrence of the duplicated ID
        prot = prot.drop('C3N-01825.1', axis = 'index') # drop the duplicate that didn't correlate well with flagship
        self._data["proteomics"] = prot       
        
        # Phosphoproteomics         
        phos = self._data["phosphoproteomics"]
        phos = phos.drop(drop_cols, axis = 'index') # drop quality control and ref intensity cols
        phos = phos.rename(index = mapping_dict) # replace aliquots with patient IDs (normal samples have .N appended)
        # Add '.N' to enriched normal samples ('NX')
        phos.index.name = 'Patient_ID'
        phos = phos.reset_index()
        phos['Patient_ID'] = phos['Patient_ID'].apply(lambda x: x+'.N' if 'NX' in x else x) # 'NX' are enriched normals
        phos = phos.set_index('Patient_ID')         
        phos = rename_duplicate_labels(phos, 'index') # add ".1" to the second occurrence of the duplicated ID
        phos = phos.drop('C3N-01825.1', axis = 'index') # drop the duplicate that didn't correlate well with flagship       
        self._data["phosphoproteomics"] = phos
        
        
        # Sort rows (tumor first then normal) and columns by first level (protein/gene name)
        self._data = sort_all_rows_pancan(self._data) 


        print(" " * len(formatting_msg), end='\r') # Erase the formatting message
Example #3
    def __init__(self, no_internet, version):
        """Load all of the bcmbrca dataframes as values in the self._data dict variable, with names as keys, and format them properly.

        Parameters:
        version (str, optional): The version number to load, or the string "latest" to just load the latest build. Default is "latest".
        no_internet (bool, optional): Whether to skip the index update step because it requires an internet connection. This will be skipped automatically if there is no internet at all, but you may want to manually skip it if you have a spotty internet connection. Default is False.
        """
        #ignore logging messages
        logger = logging.getLogger()
        logger.setLevel(logging.CRITICAL)

        # Set some needed variables, and pass them to the parent Dataset class __init__ function

        # This keeps a record of all versions that the code is equipped to handle. That way, if there's a new data release but they didn't update their package, it won't try to parse the new data version it isn't equipped to handle.
        valid_versions = ["1.0"]

        data_files = {
            "1.0": [
                "OV_prospective.dnp.annotated.exonic.addrecovercases.maf.gz",
                "OV_tumor_RNA-Seq_Expr_WashU_FPKM.tsv.gz",
                #"OV_precursor_miRNA_combined.tsv", # waiting for data
                #"OV_mature_miRNA_combined.tsv",
                #"OV_total_miRNA_combined.tsv",
                "CIBERSORT.Output_Abs_OV.txt",
                "OV_xCell.txt",
                "gencode.v22.annotation.gtf.gz",
                "OV.gene_level.from_seg.filtered.tsv",
                "CPTAC_pancan_RNA_tumor_purity_ESTIMATE_WashU.tsv.gz",
                "README_miRNA",
                "README_CIBERSORT",
                "README_xCell",
                "README_somatic_mutation_WXS",
                "README_gene_expression",
                "README.boxnote",
                "README_ESTIMATE_WashU"
            ]
        }

        # Call the parent class __init__ function
        super().__init__(cancer_type="washuov",
                         version=version,
                         valid_versions=valid_versions,
                         data_files=data_files,
                         no_internet=no_internet)

        # get clinical df (used to slice out cancer specific patient_IDs in tumor_purity file)
        mssmclin = MssmClinical(
            no_internet=no_internet, version=version,
            filter_type='pancanov')  #_get_version - pancandataset
        clinical_df = mssmclin.get_clinical()

        # Load the data into dataframes in the self._data dict
        loading_msg = f"Loading {self.get_cancer_type()} v{self.version()}"
        for file_path in self._data_files_paths:  # Loops through files variable

            # Print a loading message. We add a dot every time, so the user knows it's not frozen.
            loading_msg = loading_msg + "."
            print(loading_msg, end='\r')

            path_elements = file_path.split(
                os.sep)  # Get a list of the levels of the path
            file_name = path_elements[
                -1]  # The last element will be the name of the file. We'll use this to identify files for parsing in the if/elif statements below

            if file_name == "OV_prospective.dnp.annotated.exonic.addrecovercases.maf.gz":  # Note that we use the "file_name" variable to identify files. That way we don't have to use the whole path.
                df = pd.read_csv(file_path, sep='\t')
                # Rename the columns we want to keep to the appropriate names
                df['Patient_ID'] = df.loc[:, 'Tumor_Sample_Barcode']
                df = df.rename(
                    columns={
                        "Hugo_Symbol": "Gene",
                        "Gene": "Gene_Database_ID",
                        "Variant_Classification": "Mutation",
                        "HGVSp_Short": "Location"
                    })

                df = df.set_index("Patient_ID")
                df = df[["Gene", "Mutation", "Location"] + [
                    col for col in df.columns
                    if col not in ["Gene", "Mutation", "Location"]
                ]]
                df.index = df.index.str.replace(r"_T", "", regex=True)
                self._data["somatic_mutation"] = df

            if file_name == "OV_tumor_RNA-Seq_Expr_WashU_FPKM.tsv.gz":
                df = pd.read_csv(file_path, sep="\t")
                df = df.rename(columns={
                    "gene_name": "Name",
                    "gene_id": "Database_ID"
                })
                df = df.set_index(["Name", "Database_ID"])
                df = df.sort_index()
                df = df.T
                df.index.name = "Patient_ID"
                #remove label for tumor samples. All samples are tumors
                df.index = df.index.str.replace(r"-T", "", regex=True)
                self._data["transcriptomics"] = df

            elif file_name == "OV_xCell.txt":
                df = pd.read_csv(file_path, sep='\t', index_col=0)
                df = df.transpose()
                df.columns.name = 'Name'
                df.index.name = 'Patient_ID'
                df.index = df.index.str.replace(
                    r'-T$', '', regex=True)  # remove label for tumor samples
                df.index = df.index.str.replace(
                    r'-A$', '.N',
                    regex=True)  # change label for normal samples
                self._data["xcell"] = df

            elif file_name == "CIBERSORT.Output_Abs_OV.txt":
                df = pd.read_csv(file_path, sep='\t', index_col=0)
                df.index.name = 'Patient_ID'
                df.columns.name = 'Name'
                df.index = df.index.str.replace(r'-T$', '', regex=True)
                df.index = df.index.str.replace(r'-A$', '.N', regex=True)
                self._data["cibersort"] = df

            elif file_name == "OV.gene_level.from_seg.filtered.tsv":
                df = pd.read_csv(file_path, sep="\t")
                df = df.rename(columns={"Gene": "Name"})
                df = df.set_index("Name")
                self._data["CNV"] = df

            elif file_name == "gencode.v22.annotation.gtf.gz":
                df = read_gtf(file_path)
                df = df[["gene_name", "gene_id"]]
                df = df.drop_duplicates()
                df = df.rename(columns={
                    "gene_name": "Name",
                    "gene_id": "Database_ID"
                })
                df = df.set_index("Name")
                self._helper_tables["CNV_gene_ids"] = df

            elif file_name == "CPTAC_pancan_RNA_tumor_purity_ESTIMATE_WashU.tsv.gz":
                df = pd.read_csv(file_path, sep="\t", na_values='NA')
                df.Sample_ID = df.Sample_ID.str.replace(
                    r'-T', '', regex=True)  # only tumor samples in file
                df = df.set_index('Sample_ID')
                df.index.name = 'Patient_ID'
                # Use list of patient_ids to slice out cancers
                patient_ids = clinical_df.index.to_list()
                df = df.loc[df.index.isin(patient_ids)]
                self._data["tumor_purity"] = df

            elif file_name == "README_miRNA":
                with open(file_path, 'r') as reader:
                    self._readme_files["readme_miRNA"] = reader.read()

            elif file_name == "README_CIBERSORT":
                with open(file_path, 'r') as reader:
                    self._readme_files["readme_cibersort"] = reader.read()

            elif file_name == "README_xCell":
                with open(file_path, 'r') as reader:
                    self._readme_files["readme_xcell"] = reader.read()

            elif file_name == "README_somatic_mutation_WXS":
                with open(file_path, 'r') as reader:
                    self._readme_files[
                        "readme_somatic_mutation"] = reader.read()

            elif file_name == "README_gene_expression":
                with open(file_path, 'r') as reader:
                    self._readme_files["readme_transcriptomics"] = reader.read(
                    )

            elif file_name == "README.boxnote":
                self._readme_files["readme_cnv"] = get_boxnote_text(file_path)

            elif file_name == "README_ESTIMATE_WashU":
                with open(file_path, 'r') as reader:
                    self._readme_files["readme_tumor_purity"] = reader.read()

        print(' ' * len(loading_msg), end='\r')  # Erase the loading message
        formatting_msg = f"Formatting {self.get_cancer_type()} dataframes..."
        print(formatting_msg, end='\r')

        # CNV
        cnv = self._data["CNV"]
        gene_ids = self._helper_tables["CNV_gene_ids"]
        df = cnv.join(gene_ids, how="left")  #merge in gene_ids
        df = df.reset_index()
        df = df.set_index(["Name", "Database_ID"])  #create multi-index
        df = df.T
        df.index.name = 'Patient_ID'
        self._data["CNV"] = df

        self._data = sort_all_rows_pancan(
            self._data)  # Sort IDs (tumor first then normal)

        print(" " * len(formatting_msg),
              end='\r')  # Erase the formatting message
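
The CNV formatting step above joins the gene-level copy-number table to the GENCODE name-to-ID lookup so that, after transposing, the columns carry a (Name, Database_ID) multiindex and the rows are patients. A condensed sketch of the same shape transformation on toy data (gene names and Ensembl IDs are examples only):

import pandas as pd

cnv = pd.DataFrame({"S1": [0.1, -0.4], "S2": [0.0, 0.3]},
                   index=pd.Index(["TP53", "EGFR"], name="Name"))
gene_ids = pd.DataFrame({"Database_ID": ["ENSG00000141510", "ENSG00000146648"]},
                        index=pd.Index(["TP53", "EGFR"], name="Name"))

df = cnv.join(gene_ids, how="left")  # merge in gene IDs on the Name index
df = df.reset_index().set_index(["Name", "Database_ID"]).T  # multiindex columns, patient rows
df.index.name = "Patient_ID"
print(df)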
Example #4
    def __init__(self, no_internet, version):
        """Load all of the umichcoad dataframes as values in the self._data dict variable, with names as keys, and format them properly.

        Parameters:
        version (str, optional): The version number to load, or the string "latest" to just load the latest build. Default is "latest".
        no_internet (bool, optional): Whether to skip the index update step because it requires an internet connection. This will be skipped automatically if there is no internet at all, but you may want to manually skip it if you have a spotty internet connection. Default is False.
        """

        # Set some needed variables, and pass them to the parent Dataset class __init__ function

        # This keeps a record of all versions that the code is equipped to handle. That way, if there's a new data release but they didn't update their package, it won't try to parse the new data version it isn't equipped to handle.
        valid_versions = ["1.0", "1.1"]

        data_files = {
            "1.0": [
                "Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv",
                "Report_abundance_groupby=multi-site_protNorm=MD_gu=2.tsv",
                "CRC_Prospective sample info.xlsx",
                "README_v3.boxnote",  # proteomics 
                "README.boxnote"  # phosphoproteomics
                #"S039_BCprospective_observed_0920.tsv.gz",
                #"S039_BCprospective_imputed_0920.tsv.gz"
            ],
            "1.1": [
                "Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv",
                "Report_abundance_groupby=multi-site_protNorm=MD_gu=2.tsv",
                "CRC_Prospective sample info.xlsx",
                "README_v3.boxnote",  # proteomics 
                "README.boxnote"  # phosphoproteomics
            ]
        }

        # Call the parent class __init__ function
        super().__init__(cancer_type="umichcoad",
                         version=version,
                         valid_versions=valid_versions,
                         data_files=data_files,
                         no_internet=no_internet)

        # Load the data into dataframes in the self._data dict
        loading_msg = f"Loading {self.get_cancer_type()} v{self.version()}"
        for file_path in self._data_files_paths:  # Loops through files variable

            # Print a loading message. We add a dot every time, so the user knows it's not frozen.
            loading_msg = loading_msg + "."
            print(loading_msg, end='\r')

            path_elements = file_path.split(
                os.sep)  # Get a list of the levels of the path
            file_name = path_elements[
                -1]  # The last element will be the name of the file. We'll use this to identify files for parsing in the if/elif statements below

            # Proteomics
            if file_name == "Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv":
                df = pd.read_csv(file_path, sep="\t")
                df['Database_ID'] = df.Index.apply(
                    lambda x: x.split('|')[0])  # get protein identifier
                df['Name'] = df.Index.apply(
                    lambda x: x.split('|')[6])  # get protein name
                df = df.set_index(['Name', 'Database_ID'])  # set multiindex
                df = df.drop(
                    columns=['Index', 'MaxPepProb', 'NumberPSM',
                             'Gene'])  # drop unnecessary columns
                df = df.transpose()
                ref_intensities = df.loc[
                    "ReferenceIntensity"]  # get reference intensities to use to calculate ratios
                df = df.subtract(
                    ref_intensities, axis="columns"
                )  # subtract reference intensities from all the values
                df = df.iloc[1:, :]  # drop ReferenceIntensity row
                df.index.name = 'Patient_ID'
                self._data["proteomics"] = df

            # Phosphoproteomics
            elif file_name == "Report_abundance_groupby=multi-site_protNorm=MD_gu=2.tsv":
                df = pd.read_csv(file_path, sep="\t")
                # Parse a few columns out of the "Index" column that we'll need for our multiindex
                df[[
                    'Database_ID', 'Transcript_ID', "Gene_ID", "Havana_gene",
                    "Havana_transcript", "Transcript", "Name", "Site"
                ]] = df.Index.str.split("\\|", expand=True)
                df[[
                    'num1', 'start', "end", "detected_phos", "localized_phos",
                    "Site"
                ]] = df.Site.str.split("_", expand=True)

                # Some rows have at least one localized phosphorylation site, but also have other
                # phosphorylations that aren't localized. We'll drop those rows, if their localized sites
                # are duplicated in another row, to avoid creating duplicates, because we only preserve information
                # about the localized sites in a given row. However, if the localized sites aren't duplicated in
                # another row, we'll keep the row.
                unlocalized_to_drop = df.index[~df["detected_phos"].eq(df["localized_phos"]) & \
                                               df.duplicated(["Name", "Site", "Peptide", "Database_ID"], keep=False)]
                # detected_phos of the split "Index" column is the number of phosphorylations detected, and
                # localized_phos is the number localized, so if the two values aren't equal,
                # the row has at least one unlocalized site
                df = df.drop(index=unlocalized_to_drop)

                df = df[
                    df['Site'].notna()]  # only keep rows with a phospho site
                df = df.set_index(['Name', 'Site', 'Peptide', 'Database_ID'
                                   ])  # create a multiindex in this order.
                #drop columns not needed in df
                df.drop([
                    'Gene', "Index", "num1", "start", "end", "detected_phos",
                    "localized_phos", "Havana_gene", "Havana_transcript",
                    "MaxPepProb", "Gene_ID", "Transcript_ID", "Transcript"
                ],
                        axis=1,
                        inplace=True)

                df = df.transpose()
                ref_intensities = df.loc[
                    "ReferenceIntensity"]  # Get reference intensities to use to calculate ratios
                df = df.subtract(
                    ref_intensities, axis="columns"
                )  # Subtract ref intensities from all the values, to get ratios
                df = df.iloc[1:, :]  # drop ReferenceIntensity row
                df.index.name = 'Patient_ID'
                self._data["phosphoproteomics"] = df

            # Mapping file to convert aliquots to patient_IDs for Colon
            # This file can be found on Box under CPTAC/cptac/pancan/helper_files
            elif file_name == "CRC_Prospective sample info.xlsx":
                df = pd.read_excel(file_path,
                                   index_col='Label',
                                   usecols=['Label', 'Sample Code'])
                map_dict = df.to_dict(
                )['Sample Code']  # create dictionary with aliquots as keys and patient IDs as values
                self._helper_tables["map_ids"] = map_dict

            elif file_name == "README_v3.boxnote":
                self._readme_files["readme_proteomics"] = get_boxnote_text(
                    file_path)

            elif file_name == "README.boxnote":
                self._readme_files[
                    "readme_phosphoproteomics"] = get_boxnote_text(file_path)
            '''
            if file_name == "S039_BCprospective_observed_0920.tsv.gz":
                df = pd.read_csv(file_path, sep="\t")
                df = df.transpose()
                df.index.name = 'Patient_ID'
                df.columns.name = 'Name'
                df = average_replicates(df)
                df = df.sort_values(by=["Patient_ID"])
                self._data["proteomics"] = df  
                
            if file_name == "S039_BCprospective_imputed_0920.tsv.gz":
                df = pd.read_csv(file_path, sep="\t")
                df = df.transpose()
                df.index.name = 'Patient_ID'
                df.columns.name = 'Name'
                df = average_replicates(df)
                df = df.sort_values(by=["Patient_ID"])
                self._data["proteomics_imputed"] = df'''

        print(' ' * len(loading_msg), end='\r')  # Erase the loading message
        formatting_msg = f"Formatting {self.get_cancer_type()} dataframes..."
        print(formatting_msg, end='\r')

        drop_cols = [
            'colonRef22-2', 'RefInt_ColonRef01', 'RefInt_ColonRef02',
            'RefInt_ColonRef03', 'RefInt_ColonRef04', 'RefInt_ColonRef05',
            'RefInt_ColonRef06', 'RefInt_ColonRef07', 'RefInt_ColonRef08',
            'RefInt_ColonRef09', 'RefInt_ColonRef10', 'RefInt_ColonRef11',
            'RefInt_ColonRef12', 'RefInt_ColonRef13', 'RefInt_ColonRef14',
            'RefInt_ColonRef15', 'RefInt_ColonRef16', 'RefInt_ColonRef17',
            'RefInt_ColonRef18', 'RefInt_ColonRef19', 'RefInt_ColonRef20',
            'RefInt_ColonRef21', 'RefInt_ColonRef22-1'
        ]

        # Proteomics
        prot = self._data['proteomics']
        prot = prot.drop(
            drop_cols,
            axis='index')  # drop quality control and ref intensity cols

        # Phosphoproteomics
        phos = self._data["phosphoproteomics"]
        phos = phos.drop(
            drop_cols,
            axis='index')  # drop quality control and ref intensity cols

        if self._version == "1.1":
            # Get dictionary to map aliquot to patient IDs
            mapping_dict = self._helper_tables["map_ids"]

            # Proteomics
            prot = prot.reset_index()
            prot['Patient_ID'] = prot['Patient_ID'].replace(
                mapping_dict)  # replace aliquots with Patient_IDs
            prot.Patient_ID = prot.Patient_ID.apply(
                lambda x: x[1:] + '.N'
                if x[0] == 'N' else x[1:])  # change normals to have .N
            prot = prot.set_index('Patient_ID')

            # Phosphoproteomics
            phos = phos.reset_index()
            phos['Patient_ID'] = phos['Patient_ID'].replace(
                mapping_dict)  # replace aliquots with Patient_IDs
            phos.Patient_ID = phos.Patient_ID.apply(
                lambda x: x[1:] + '.N'
                if x[0] == 'N' else x[1:])  # change normals to have .N
            phos = phos.set_index('Patient_ID')

        self._data['proteomics'] = prot
        self._data["phosphoproteomics"] = phos

        # Sort rows (tumor first then normal) and columns by first level (protein/gene name)
        self._data = sort_all_rows_pancan(self._data)

        print(" " * len(formatting_msg),
              end='\r')  # Erase the formatting message
Example #5
    def __init__(self, no_internet, version):
        """Load all of the umichucec dataframes as values in the self._data dict variable, with names as keys, and format them properly.

        Parameters:
        version (str, optional): The version number to load, or the string "latest" to just load the latest build. Default is "latest".
        no_internet (bool, optional): Whether to skip the index update step because it requires an internet connection. This will be skipped automatically if there is no internet at all, but you may want to manually skip it if you have a spotty internet connection. Default is False.
        """

        # Set some needed variables, and pass them to the parent Dataset class __init__ function

        # This keeps a record of all versions that the code is equipped to handle. That way, if there's a new data release but they didn't update their package, it won't try to parse the new data version it isn't equipped to handle.
        valid_versions = ["1.0"]

        data_files = {
            "1.0": ["Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv",
                    "Report_abundance_groupby=multi-site_protNorm=MD_gu=2.tsv",
                    "aliquot_to_patient_ID.tsv",
                    "README_v3.boxnote", # proteomics 
                    "README.boxnote" # phosphoproteomics
             
            ]
        }

        # Call the parent class __init__ function
        super().__init__(cancer_type="umichpdac", version=version, valid_versions=valid_versions, data_files=data_files, no_internet=no_internet)

        # Load the data into dataframes in the self._data dict
        loading_msg = f"Loading {self.get_cancer_type()} v{self.version()}"
        for file_path in self._data_files_paths: # Loops through files variable

            # Print a loading message. We add a dot every time, so the user knows it's not frozen.
            loading_msg = loading_msg + "."
            print(loading_msg, end='\r')

            path_elements = file_path.split(os.sep) # Get a list of the levels of the path
            file_name = path_elements[-1] # The last element will be the name of the file. We'll use this to identify files for parsing in the if/elif statements below
                   
            if file_name == 'Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv':
                df = pd.read_csv(file_path, sep = "\t") 
                df['Database_ID'] = df.Index.apply(lambda x: x.split('|')[0]) # get protein identifier 
                df['Name'] = df.Index.apply(lambda x: x.split('|')[6]) # get protein name 
                df = df.set_index(['Name', 'Database_ID']) # set multiindex
                df = df.drop(columns = ['Index', 'MaxPepProb', 'NumberPSM', 'Gene']) # drop unnecessary columns
                df = df.transpose()
                ref_intensities = df.loc["ReferenceIntensity"] # get reference intensities to use to calculate ratios 
                df = df.subtract(ref_intensities, axis="columns") # subtract reference intensities from all the values
                df.index.name = 'Patient_ID'
                # Drop quality control and ref intensity 
                drop_cols = ['ReferenceIntensity', 'QC1', 'QC2', 'QC3', 'QC4', 'QC5', 'QC6', 'KoreanReference1',
                   'KoreanReference2', 'KoreanReference3', 'Pool-24-2', 'WU-PDA1', 'WU-Pool-25']
                df = df.drop(drop_cols, axis = 'index')   
                self._data["proteomics"] = df

                
            elif file_name == "Report_abundance_groupby=multi-site_protNorm=MD_gu=2.tsv":
                df = pd.read_csv(file_path, sep = "\t") 
                # Parse a few columns out of the "Index" column that we'll need for our multiindex
                df[['Database_ID','Transcript_ID',"Gene_ID","Havana_gene","Havana_transcript","Transcript","Name","Site"]] = df.Index.str.split("\\|",expand=True)
                df[['num1','start',"end","detected_phos","localized_phos","Site"]] = df.Site.str.split("_",expand=True) 

                # Some rows have at least one localized phosphorylation site, but also have other phosphorylations
                # that aren't localized. We'll drop those rows, if their localized sites are duplicated in another row, 
                # to avoid creating duplicates, because we only preserve information about the localized sites in a 
                # given row. However, if the localized sites aren't duplicated in another row, we'll keep the row.
                unlocalized_to_drop = df.index[~df["detected_phos"].eq(df["localized_phos"]) & \
                                               df.duplicated(["Name", "Site", "Peptide", "Database_ID"], keep=False)]
                # detected_phos of the split "Index" column is the number of phosphorylations detected, and
                # localized_phos is the number localized, so if the two values aren't equal,
                # the row has at least one unlocalized site
                df = df.drop(index=unlocalized_to_drop)

                df = df[df['Site'].notna()] # only keep rows with a phospho site
                df = df.set_index(['Name', 'Site', 'Peptide', 'Database_ID']) # create a multiindex in this order
                #drop columns not needed in df 
                df.drop(['Gene',  "Index", "num1", "start", "end", "detected_phos", "localized_phos", "Havana_gene", "Havana_transcript", "MaxPepProb", "Gene_ID", "Transcript_ID", "Transcript"], axis=1, inplace=True)
                 
                df = df.T #transpose df 
                ref_intensities = df.loc["ReferenceIntensity"]# Get reference intensities to use to calculate ratios 
                df = df.subtract(ref_intensities, axis="columns") # Subtract ref intensities from all the values, to get ratios
                # Drop quality control and ref intensity
                drop_cols = ['ReferenceIntensity', 'QC1', 'QC2', 'QC3', 'QC4', 'QC5', 'QC6', 'KoreanReference1',
                           'KoreanReference2', 'KoreanReference3', 'Pool-24-2', 'WU-PDA1', 'WU-Pool-25','RefInt_pool-01',
                         'RefInt_pool-02','RefInt_pool-03','RefInt_pool-04','RefInt_pool-05','RefInt_pool-06','RefInt_pool-07',
                         'RefInt_pool-08','RefInt_pool-09', 'RefInt_pool-10','RefInt_pool-11','RefInt_pool-12','RefInt_pool-13',
                         'RefInt_pool-14','RefInt_pool-15','RefInt_pool-16','RefInt_pool-17','RefInt_pool-18','RefInt_pool-19',
                         'RefInt_pool-20','RefInt_pool-21','RefInt_pool-22','RefInt_pool-23','RefInt_pool-24','RefInt_pool-25']
                df = df.drop(drop_cols, axis = 'index')
                self._data["phosphoproteomics"] = df

                
            # aliquot_to_patient_ID.tsv contains only unique aliquots (no duplicates), 
            # so there is no need to slice out cancer specific aliquots
            # This file can be found on Box under CPTAC/cptac/pancan/helper_files
            elif file_name == "aliquot_to_patient_ID.tsv":
                df = pd.read_csv(file_path, sep = "\t", index_col = 'aliquot_ID', usecols = ['aliquot_ID', 'patient_ID'])
                map_dict = df.to_dict()['patient_ID'] # create dictionary with aliquots as keys and patient IDs as values
                self._helper_tables["map_ids"] = map_dict
                
            elif file_name == "README_v3.boxnote":
                self._readme_files["readme_proteomics"] = get_boxnote_text(file_path)
                
            elif file_name == "README.boxnote":
                self._readme_files["readme_phosphoproteomics"] = get_boxnote_text(file_path)
                
        
        print(' ' * len(loading_msg), end='\r') # Erase the loading message
        formatting_msg = f"Formatting {self.get_cancer_type()} dataframes..."
        print(formatting_msg, end='\r')
        
        
        # These 8 aliquots were not in the mapping file. Yize said they are all normal samples.
        manually_mapped = {'CPT0347760002': 'C3L-07032.N', 'CPT0347790002': 'C3L-07033.N',
            'CPT0347820002': 'C3L-07034.N', 'CPT0347850002': 'C3L-07035.N', 'CPT0347880002': 'C3L-07036.N',
            'CPT0355180003': 'C3L-03513.N', 'CPT0355190003': 'C3L-03515.N', 'CPT0355200003': 'C3L-03514.N'}
        
        
        # Get dictionary to map aliquots to patient IDs 
        mapping_dict = self._helper_tables["map_ids"]
        
        # Proteomics
        prot = self._data["proteomics"]  
        prot = prot.rename(index = mapping_dict) # replace aliquots with patient IDs (normals have .N)
        prot = prot.rename(index = manually_mapped) # map 8 aliquots that were not in the mapping file
        self._data["proteomics"] = prot
        
        # Phosphoproteomics 
        phos = self._data["phosphoproteomics"]
        phos = phos.rename(index = mapping_dict) # replace aliquots with patient IDs (normals have .N) 
        phos = phos.rename(index = manually_mapped) # map 8 aliquots that were not in the mapping file
        self._data["phosphoproteomics"] = phos
        
        
        # Sort rows (tumor first then normal) and columns by first level (protein/gene name)
        self._data = sort_all_rows_pancan(self._data) 
        
        print(" " * len(formatting_msg), end='\r') # Erase the formatting message
Example #6
    def __init__(self, no_internet, version):
        """Load all of the umichhnscc dataframes as values in the self._data dict variable, with names as keys, and format them properly.

        Parameters:
        version (str, optional): The version number to load, or the string "latest" to just load the latest build. Default is "latest".
        no_internet (bool, optional): Whether to skip the index update step because it requires an internet connection. This will be skipped automatically if there is no internet at all, but you may want to manually skip it if you have a spotty internet connection. Default is False.
        """

        # Set some needed variables, and pass them to the parent Dataset class __init__ function

        # This keeps a record of all versions that the code is equipped to handle. That way, if there's a new data release but they didn't update their package, it won't try to parse the new data version it isn't equipped to handle.
        valid_versions = ["1.0"]

        data_files = {
            "1.0": ["Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv",
                    "Report_abundance_groupby=multi-site_protNorm=MD_gu=2.tsv",
                    "README_v3.boxnote", # proteomics 
                    "README.boxnote" # phosphoproteomics
                #"S039_BCprospective_observed_0920.tsv.gz",
                #"S039_BCprospective_imputed_0920.tsv.gz"
            ]
        }

        # Call the parent class __init__ function
        super().__init__(cancer_type="umichhnscc", version=version, valid_versions=valid_versions, data_files=data_files, no_internet=no_internet)

        # Load the data into dataframes in the self._data dict
        loading_msg = f"Loading {self.get_cancer_type()} v{self.version()}"
        for file_path in self._data_files_paths: # Loops through files variable

            # Print a loading message. We add a dot every time, so the user knows it's not frozen.
            loading_msg = loading_msg + "."
            print(loading_msg, end='\r')

            path_elements = file_path.split(os.sep) # Get a list of the levels of the path
            file_name = path_elements[-1] # The last element will be the name of the file. We'll use this to identify files for parsing in the if/elif statements below
            
            
            if file_name == "Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv":
                df = pd.read_csv(file_path, sep = "\t") 
                df['Database_ID'] = df.Index.apply(lambda x: x.split('|')[0]) # get protein identifier 
                df['Name'] = df.Index.apply(lambda x: x.split('|')[6]) # get protein name 
                df = df.set_index(['Name', 'Database_ID']) # set multiindex
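                # A hypothetical Index string, to show what the two splits above extract:
                #   'ENSP0001.2|ENST0001.2|ENSG0001.2|OTTHUMG1|OTTHUMT1|GENE1-201|GENE1|protein_coding'
                # gives Database_ID='ENSP0001.2' (field 0) and Name='GENE1' (field 6).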
                df = df.drop(columns = ['Index', 'MaxPepProb', 'NumberPSM', 'Gene']) # drop unnecessary columns
                df = df.transpose()
                ref_intensities = df.loc["ReferenceIntensity"] # get reference intensities to use to calculate ratios 
                df = df.subtract(ref_intensities, axis="columns") # subtract reference intensities from all the values 
                df = df.iloc[1:,:] # drop ReferenceIntensity row 
                df.index.name = 'Patient_ID'
                self._data["proteomics"] = df              
                
            elif file_name == "Report_abundance_groupby=multi-site_protNorm=MD_gu=2.tsv":
                df = pd.read_csv(file_path, sep = "\t") 
                # Parse a few columns out of the "Index" column that we'll need for our multiindex
                df[['Database_ID','Transcript_ID',"Gene_ID","Havana_gene","Havana_transcript","Transcript","Name", "Site"]] = df.Index.str.split("\\|",expand=True)
                df[['num1','start',"end","detected_phos","localized_phos","Site"]] = df.Site.str.split("_",expand=True) 
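                # A hypothetical Site field, to show what this second split produces:
                #   '1_100_110_2_2_S103S107' -> num1='1', start='100', end='110',
                #   detected_phos='2', localized_phos='2', Site='S103S107'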

                # Some rows have at least one localized phosphorylation site, but also have other
                # phosphorylations that aren't localized. We'll drop those rows, if their localized 
                # sites are duplicated in another row, to avoid creating duplicates, because we only 
                # preserve information about the localized sites in a given row. However, if the localized 
                # sites aren't duplicated in another row, we'll keep the row.
                unlocalized_to_drop = df.index[~df["detected_phos"].eq(df["localized_phos"]) & \
                                               df.duplicated(["Name", "Site", "Peptide", "Database_ID"], keep=False)]
                # detected_phos, parsed from the split "Site" field, is the number of phosphorylations
                # detected, and localized_phos is the number localized, so if the two values aren't
                # equal, the row has at least one unlocalized site
                df = df.drop(index=unlocalized_to_drop)
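                # Note: duplicated(..., keep=False) flags every member of a duplicate group,
                # so a partially-localized row is only dropped when another row carries the
                # same localized sites; unique rows survive even if partially localized.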
                df = df[df['Site'].notna()] # only keep rows with a phospho site
                df = df.set_index(['Name', 'Site', 'Peptide', 'Database_ID']) # create a multiindex, in this order.
                #drop columns not needed in df 
                df.drop(['Gene', "Index", "num1", "start", "end", "detected_phos", "localized_phos", "Havana_gene", 
                         "Havana_transcript", "MaxPepProb", "Gene_ID", "Transcript_ID", "Transcript"], axis=1, inplace=True)
                df = df.T #transpose df 
                ref_intensities = df.loc["ReferenceIntensity"]# Get reference intensities to use to calculate ratios 
                df = df.subtract(ref_intensities, axis="columns") # Subtract ref intensities from all the values, to get ratios
                df = df.iloc[1:,:] # drop ReferenceIntensity row 
                self._data["phosphoproteomics"] = df
                
            elif file_name == "README_v3.boxnote":
                self._readme_files["readme_proteomics"] = get_boxnote_text(file_path)
                
            elif file_name == "README.boxnote":
                self._readme_files["readme_phosphoproteomics"] = get_boxnote_text(file_path)
            
            '''
            if file_name == "S039_BCprospective_observed_0920.tsv.gz":
                df = pd.read_csv(file_path, sep="\t")
                df = df.transpose()
                df.index.name = 'Patient_ID'
                df.columns.name = 'Name'
                df = average_replicates(df)
                df = df.sort_values(by=["Patient_ID"])
                self._data["proteomics"] = df  
                
            if file_name == "S039_BCprospective_imputed_0920.tsv.gz":
                df = pd.read_csv(file_path, sep="\t")
                df = df.transpose()
                df.index.name = 'Patient_ID'
                df.columns.name = 'Name'
                df = average_replicates(df)
                df = df.sort_values(by=["Patient_ID"])
                self._data["proteomics_imputed"] = df'''
                
        
        print(' ' * len(loading_msg), end='\r') # Erase the loading message
        formatting_msg = f"Formatting {self.get_cancer_type()} dataframes..."
        print(formatting_msg, end='\r')               
        
        # There were 4 labels with "-duplicate" appended in proteomics and phosphoproteomics files.
        # I ran a pearson correlation to check how well the values from each duplicate correlated to 
        # the other duplicates for the same case ID. Three of the duplicates correlated well with their 
        # respective case IDs. C3L-02617-N-duplicate2 did not correlate well with the other C3L-02617 duplicates, 
        # so we dropped it and averaged the other two. I also created a scatterplot to compare each duplicate to 
        # the first occurrence of its case ID. The linear scatterplots indicated similarity between the aliquots.
        # We averaged the duplicates that correlated well together and were the same tissue type.        
        # A file containing the correlations can be downloaded at: 
        # https://byu.box.com/shared/static/jzsq69bd079oq0zbicw4w616hyicd5ev.xlsx
        
        drop_cols = ['128C', 'QC2', 'QC3', 'QC4', '129N', 'LungTumor1', 'Pooled-sample14',
                   'LungTumor2', 'QC6', 'LungTumor3', 'Pooled-sample17', 'QC7',
                   'Pooled-sample19', 'QC9', 'RefInt_pool01', 'RefInt_pool02',
                   'RefInt_pool03', 'RefInt_pool04', 'RefInt_pool05', 'RefInt_pool06',
                   'RefInt_pool07', 'RefInt_pool08', 'RefInt_pool09', 'RefInt_pool10',
                   'RefInt_pool11', 'RefInt_pool12', 'RefInt_pool13', 'RefInt_pool14',
                   'RefInt_pool15', 'RefInt_pool16', 'RefInt_pool17', 'RefInt_pool18',
                   'RefInt_pool19', 'RefInt_pool20']    
        
        # Proteomics
        prot = self._data["proteomics"]        
        prot = prot.drop(drop_cols, axis = 'index') # drop quality-control and reference-intensity rows
        prot = prot.drop(['C3L-02617-N-duplicate2'], axis = 'index') # drop duplicate that did not correlate well  
        # These IDs had a high correlation with their respective duplicates, so we average them
        # duplicates: 'C3L-02617-T-duplicate', 'C3L-00994-N-duplicate', 'C3L-02617-N-duplicate'
        prot = average_replicates(prot, ['C3L-02617-T','C3L-02617-N','C3L-00994-N'], normal_identifier = '-N') 
        prot.index = prot.index.str.replace('-T$','', regex = True)
        prot.index = prot.index.str.replace('-N$','.N', regex = True)
        prot.index = prot.index.str.replace('-C$','.C', regex = True) # 6 cored normal samples in HNSCC
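        # Net effect of the three replacements: 'C3L-02617-T' -> 'C3L-02617',
        # 'C3L-02617-N' -> 'C3L-02617.N', and '-C' tails -> '.C' for the cored normals.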
        self._data["proteomics"] = prot
        
        # Phosphoproteomics
        phos = self._data["phosphoproteomics"]
        phos = phos.drop(drop_cols, axis = 'index') # drop quality-control and reference-intensity rows
        phos = phos.drop(['C3L-02617-N-duplicate2'], axis = 'index') # drop duplicate that did not correlate well
        # average IDs that correlated well to their respective duplicates
        phos = average_replicates(phos, ['C3L-02617-T','C3L-02617-N','C3L-00994-N'], normal_identifier = '-N') 
        phos.index = phos.index.str.replace('-T$','', regex = True)
        phos.index = phos.index.str.replace('-N$','.N', regex = True)
        phos.index = phos.index.str.replace('-C$','.C', regex = True) # 6 cored normal samples in HNSCC
        self._data["phosphoproteomics"] = phos

        
        # Sort rows (tumor first then normal) and columns by first level (protein/gene name)
        self._data = sort_all_rows_pancan(self._data) 
        

        print(" " * len(formatting_msg), end='\r') # Erase the formatting message