def main():
    """Command-line entry point.

    Optionally downloads all supported CPTAC datasets (--getData), then loads
    the dataset for the requested cancer type (--cancerType) and writes its
    phosphoproteomics and proteomics tables to TSV files in the working
    directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--cancerType', dest='type',
                        help='Cancer type to be collected')
    parser.add_argument('--getData', dest='get', action='store_true',
                        default=False, help='Set flag to get all data')
    opts = parser.parse_args()

    if opts.get:
        for ds in ['brca', 'ccrcc', 'colon', 'ovarian', 'endometrial', 'luad']:
            cptac.download(dataset=ds)

    # Map the user-supplied cancer type to its cptac dataset class.
    # Dict dispatch replaces the original six-branch if/elif chain.
    loaders = {
        'brca': cptac.Brca,
        'ccrcc': cptac.Ccrcc,
        'coad': cptac.Colon,
        'ovca': cptac.Ovarian,
        'luad': cptac.Luad,
        'endometrial': cptac.Endometrial,
    }

    # Bug fix: the original called opts.type.lower() unconditionally, which
    # raised AttributeError when --cancerType was omitted (opts.type is None).
    if opts.type is None or opts.type.lower() not in loaders:
        exit()
    dat = loaders[opts.type.lower()]()

    df = dat.get_phosphoproteomics()
    pdf = dat.get_proteomics()
    # df.columns = [' '.join(col).strip() for col in df.columns.values]
    df.to_csv(path_or_buf="phos_file.tsv", sep='\t')
    pdf.to_csv(path_or_buf='prot_file.tsv', sep='\t')
def download(dataset, version="latest", redownload=False):
    """Download a single dataset, a pancan super-dataset, or everything.

    Parameters:
    dataset (str): dataset name; "pdc*" routes to _pdc_download, "pancan*"
        expands to a list of per-source downloads, "all" expands to every
        pancan source list.
    version (str): dataset version to fetch (default "latest").
    redownload (bool): force re-download even if files are cached.

    Returns:
    bool: True if every requested download succeeded, False otherwise.

    Raises:
    InvalidParameterError: if a "pancan" dataset name is not recognized.
    """
    dataset = dataset.lower()

    if dataset.startswith("pdc"):
        # NOTE(review): the _pdc_download definition visible elsewhere in this
        # codebase also takes a box_token parameter; confirm this call site is
        # compatible with that signature.
        return _pdc_download(dataset, version=version, redownload=redownload)

    if dataset.startswith("pancan") or dataset == "all":
        box_token = get_box_token()

        # Dict dispatch replaces the original nine-branch if/elif chain.
        source_map = {
            "pancanbrca": BRCA_SOURCES,
            "pancanccrcc": CCRCC_SOURCES,
            "pancancoad": COAD_SOURCES,
            "pancangbm": GBM_SOURCES,
            "pancanhnscc": HNSCC_SOURCES,
            "pancanlscc": LSCC_SOURCES,
            "pancanluad": LUAD_SOURCES,
            "pancanov": OV_SOURCES,
            "pancanucec": UCEC_SOURCES,
        }

        if dataset == "all":
            sources = (BRCA_SOURCES + CCRCC_SOURCES + COAD_SOURCES
                       + GBM_SOURCES + HNSCC_SOURCES + LSCC_SOURCES
                       + LUAD_SOURCES + OV_SOURCES + UCEC_SOURCES)
        elif dataset in source_map:
            sources = source_map[dataset]
        else:
            raise InvalidParameterError(f"{dataset} is not a valid dataset.")

        # Download every source; report failure if any single one fails,
        # but keep going so the rest still get downloaded.
        overall_success = True
        for source in sources:
            if source.startswith("pdc"):
                single_success = download(source, version=version,
                                          redownload=redownload)
            else:
                single_success = cptac.download(source, version=version,
                                                redownload=redownload,
                                                box_auth=True,
                                                box_token=box_token)
            if not single_success:
                overall_success = False
        return overall_success

    # Plain (non-pdc, non-pancan) dataset: delegate directly to cptac.
    return cptac.download(dataset, version=version, redownload=redownload,
                          box_auth=True)
def download_datasets(get_datasets_lists):
    """Download every public dataset in the fixture's "public" list.

    Fails the test run (via pytest.fail) if any download raises.
    Returns True when all public downloads succeed.
    """
    # Download public datasets
    for cancer in get_datasets_lists["public"]:
        try:
            print(f"Downloading {cancer}...", end='\r')
            cptac.download(cancer, redownload=True)
        # Bug fix: the original bare `except:` also swallowed SystemExit and
        # KeyboardInterrupt, so Ctrl-C could not abort the test session.
        except Exception:
            pytest.fail(f"Unable to download data for {cancer} dataset.")

    # TODO: Download restricted datasets

    return True
def downloadCptac():
    """Fetch the four CPTAC datasets used by this project."""
    # To view available datasets, enter 'cptac.list_data()'.
    cptac.list_datasets()
    # Download each dataset in turn (order preserved from the original calls).
    for name in ("endometrial", "colon", "ovarian", "RenalCcrcc"):
        cptac.download(dataset=name)
def cptacData():
    '''We need to collect and load CPTAC data'''
    print("Loading cptac datasets")
    # Make sure all datasets are cached locally before loading.
    # These cancers are available without login information;
    # 'hnscc', 'gbm', and 'lscc' are intentionally left out.
    allcans = ['brca', 'ccrcc', 'colon', 'ovarian', 'luad', 'endometrial']
    print("Downloading cptac data")
    for ct in allcans:
        cptac.download(dataset=ct)
    # Instantiate each dataset class, keyed by cancer name
    # (same insertion/instantiation order as the original dict literal).
    loaders = {
        'brca': cptac.Brca,
        'ccrcc': cptac.Ccrcc,
        'colon': cptac.Colon,
        'ovarian': cptac.Ovarian,
        'endometrial': cptac.Endometrial,
        'luad': cptac.Luad,
    }
    return {name: cls() for name, cls in loaders.items()}
def __init__(self):
    """Download and load the CPTAC datasets this class operates on.

    Populates self.en, self.ovarian, self.ccrcc, self.colon, and the
    self.datasets list that aggregates them.
    """
    cptac.download(dataset="endometrial", version='latest')
    # cptac.download(dataset="brca", version='latest')
    # cptac.download(dataset="gbm", version='latest')
    # cptac.download(dataset="hsncc", version='latest')
    # cptac.download(dataset="luad", version='latest')
    cptac.download(dataset="ovarian", version='latest')
    cptac.download(dataset="ccrcc", version='latest')
    cptac.download(dataset="colon", version='latest')
    self.en = cptac.Endometrial()
    # self.brca = cptac.Brca()
    # self.gbm = cptac.Gbm()
    # self.hsncc = cptac.Hnscc()
    # self.luad = cptac.Luad()
    self.ovarian = cptac.Ovarian()
    self.ccrcc = cptac.Ccrcc()
    self.colon = cptac.Colon()
    # self.datasets = list(self.en,self.brca,self.gbm,self.hsncc,self.luad,self.ovarian,self.ccrcc)
    # Fix: the original wrapped a list literal in a redundant list() call
    # (and the commented-out multi-argument list(...) above would be a
    # TypeError if re-enabled — list() takes at most one iterable).
    self.datasets = [self.en, self.ovarian, self.ccrcc, self.colon]
#!/usr/bin/env python # coding: utf-8 ### Creation of BoxPlots for ESR1 Protien Expression ### import cptac # import cptac to download cptac protein and clinical data import pandas as pd # import pandas for import matplotlib.pyplot as plt # import matplotlib for boxplot import seaborn as sns # import seaborn for boxplot from scipy import stats from statannot import add_stat_annotation cptac.download(dataset="Brca") # download breast cancer dataset br = cptac.Brca() # save data in br variable protein_data = br.get_proteomics() # save proteomic data protein_data = protein_data.droplevel(1, axis=1) # remove multi index clinical_data = br.get_clinical() # save clinical data esr1 = protein_data["ESR1"] # save ESR1 protein expression column clinical_data["ER.IHC.Score"] = clinical_data["ER.IHC.Score"].fillna( "Not reported") # fill in null values er_mask = clinical_data[ "ER.IHC.Score"] == "3+" # 3+ is ER-positive, create mask of ER-positive patients patients = esr1[er_mask] # apply mask to protein expression data ages = clinical_data["Age.in.Month"][er_mask] / 12 # calculate ages in years er_positive_patients = pd.DataFrame(patients, columns=["ESR1"]) # create new dataframe er_positive_patients["Age"] = ages # apply ages to new column in dataframe category = [] # set categories list
import cptac
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# Alyssa's code
# Script chunk: loads CPTAC breast-cancer proteomic, transcriptomic, and
# clinical tables and derives patient age in years.
# NOTE(review): this excerpt is truncated — the order check described at the
# bottom presumably happens in code beyond this chunk.
cptac.download(dataset="Brca")
br = cptac.Brca()
protein_data = br.get_proteomics()
# Remove the "multi" part of the dataframe (the dataframes are MultIndex pandas dataframes)
protein_data = protein_data.droplevel(1, axis=1)
rna_data = br.get_transcriptomics()
clinical_data = br.get_clinical()
# Derive age in years from the age-in-months clinical column.
clinical_data["Age_in_years"] = clinical_data["Age.in.Month"]/12
# clinical_data.to_csv("~/Desktop/qbioresearch/qbio_data_analysis_alyssa/data/clinical_data.csv")
# protein_data.to_csv("~/Desktop/qbioresearch/qbio_data_analysis_alyssa/data/protein_data.csv")
# rna_data.to_csv("~/Desktop/qbioresearch/qbio_data_analysis_alyssa/data/rna_data.csv")
# clinical_data_readin = pd.read_csv("~/Desktop/qbioresearch/qbio_data_analysis_alyssa/data/clinical_data.csv", index_col=1)
# The index_col=0 creates the index (rownames) as the first column.

# Check that the patients are in the same order in rna_data and protein_data.
# We are looking at the gene expression and protein information of EACH patient, so the data needs to be in pairs.
# -*- coding: utf-8 -*- “””bioinfoProteomicsHW4.ipynb # Loading CPTAC Data in google colab using a Python package developed in the Payne lab at BYU. """ #install and import cptac and other python packages (just pandas in this case) !pip install -q cptac import cptac import pandas as pd #download endometrial data and creates a new endometrial class cptac.download(dataset="Endometrial") en = cptac.Endometrial() #obtain proteomic data (enProt) and patient information (enInfo) enProt = en.get_proteomics() enInfo = en.get_clinical() #i used google colab to do this, so need to mount my drive to be able to export the data from google.colab import drive drive.mount('drive') #save proteome and clinical information tables to export and work on locally/in another language enProt.to_csv("enProt.csv") !cp enProt.csv "drive/My Drive" enInfo.to_csv("enInfo.csv") !cp enInfo.csv "drive/My Drive"
def setup_class(cls):
    """Download all datasets, and do any other setup.

    Runs once for the whole test class. redownload=False means any
    already-cached dataset files are reused rather than fetched again.
    """
    cptac.download(dataset="all", version="latest", redownload=False)
def _pdc_download(dataset, version, redownload, box_token):
    """Download data for the specified cancer type from the PDC.

    Parameters:
    dataset (str): a "pdc*" dataset name, or "pdcall" for every PDC dataset.
    version (str): dataset version to fetch.
    redownload (bool): force re-download even if files exist on disk.
    box_token: Box auth token forwarded to cptac.download.

    Returns:
    bool: True on success, False if the mapping-file download failed.

    Raises:
    InvalidParameterError: for non-PDC or unknown dataset names.
    CptacDevError: if the mapping index file is missing after download.
    """
    dataset = dataset.lower()  # idiom fix: was str.lower(dataset)

    if dataset == "pdcall":
        # Recursively download every individual PDC dataset.
        overall_result = True
        for dataset in STUDY_IDS_MAP.keys():
            # Bug fix: the recursive call omitted the required box_token
            # argument, so "pdcall" raised TypeError; forward it.
            if not _pdc_download(dataset, version, redownload, box_token):
                overall_result = False
        return overall_result

    if not dataset.startswith("pdc"):
        raise InvalidParameterError(
            f"_pdc_download function can only be used for PDC datasets, which start with the prefix 'pdc'. You tried to download '{dataset}'."
        )

    if dataset not in STUDY_IDS_MAP.keys():
        raise InvalidParameterError(
            f"PDC dataset must be one of the following:\n{list(STUDY_IDS_MAP.keys())}\nYou passed '{dataset}'."
        )

    dataset_ids = STUDY_IDS_MAP[dataset]

    # Download the file for mapping aliquots to patient IDs
    if not cptac.download(dataset, version=version, redownload=redownload,
                          _box_auth=True, _box_token=box_token):
        return False

    path_here = os.path.abspath(os.path.dirname(__file__))
    cancer_dir = os.path.join(path_here, f"data_{dataset}")

    # Check that the index file exists. If not, there was an uncaught error
    # in the mapping file download.
    index_path = os.path.join(cancer_dir, "index.txt")
    if not os.path.isfile(index_path):
        raise CptacDevError(
            f"Index file not found at {index_path}. Mapping file download probably failed."
        )

    # See what data files we need to download
    data_dir = os.path.join(cancer_dir, f"{dataset}_v1.0")

    # If any of the files are missing, we're going to delete any remaining and
    # redownload all, in case the missing files are a sign of a previous data problem
    data_files = [f"{data_type}.tsv.gz" for data_type in dataset_ids.keys()] + ["clinical.tsv.gz"]

    for data_file in data_files:
        data_file_path = os.path.join(data_dir, data_file)
        if not os.path.isfile(data_file_path):
            redownload = True
            break

    if redownload:
        for data_file in data_files:
            data_file_path = os.path.join(data_dir, data_file)
            if os.path.isfile(data_file_path):
                os.remove(data_file_path)
    else:
        # If all the files are there and the user didn't ask to redownload,
        # we're done.
        return True

    # Now download all the data files.
    # We'll combine all the clinical tables in case there are differences.
    master_clin = pd.DataFrame()

    for data_type in dataset_ids.keys():
        # Print an update
        download_msg = f"Downloading {dataset} {data_type} files..."
        print(download_msg, end="\r")

        # Get the clinical and quantitative tables for the study ID
        clin, quant = download_pdc_id(dataset_ids[data_type], _download_msg=False)

        # Print a new update
        print(" " * len(download_msg), end="\r")
        save_msg = f"Saving {dataset} {data_type} files..."
        print(save_msg, end="\r")

        # Append the clinical dataframe
        # (DataFrame.append was removed in pandas 2.x; pd.concat replaces it)
        # master_clin = master_clin.append(clin)
        master_clin = pd.concat([master_clin, clin], axis=0, join='outer')

        # Save the quantitative table
        quant.to_csv(os.path.join(data_dir, f"{data_type}.tsv.gz"), sep="\t")

        # Erase update
        print(" " * len(save_msg), end="\r")

    # Print an update
    save_msg = f"Saving {dataset} clinical file..."
    print(save_msg, end="\r")

    # Drop any duplicated rows in combined clinical table, then save it too
    master_clin = master_clin.drop_duplicates(keep="first")
    master_clin.to_csv(os.path.join(data_dir, "clinical.tsv.gz"), sep="\t")

    # Erase update
    print(" " * len(save_msg), end="\r")

    return True
def test_invalid_dataset(self):
    """An unrecognized dataset name must raise InvalidParameterError."""
    with pytest.raises(InvalidParameterError) as exc_info:
        cptac.download("abc")
    assert exc_info.type == InvalidParameterError
'''
Get all cptac data
downloads data into docker image
'''

import cptac

# Cache each publicly available dataset inside the image.
for dataset_name in ('brca', 'ccrcc', 'colon', 'ovarian'):
    cptac.download(dataset=dataset_name)