def main():
    """Export CPTAC phosphoproteomics and proteomics tables for one cancer type.

    Command-line options:
        --cancerType  Cancer type to export, matched case-insensitively
                      against brca, ccrcc, coad, ovca, luad and endometrial.
        --getData     Download the six CPTAC datasets before loading.

    Writes ``phos_file.tsv`` and ``prot_file.tsv`` (tab-separated) into the
    current working directory.

    A missing or unrecognised ``--cancerType`` ends the run through
    ``parser.error``: usage plus a message on stderr and exit status 2.
    """
    # Command-line name -> name of the cptac loader class.  The class is
    # looked up only after validation, so a bad argument is reported
    # before any dataset is constructed.
    loaders = {
        'brca': 'Brca',
        'ccrcc': 'Ccrcc',
        'coad': 'Colon',
        'ovca': 'Ovarian',
        'luad': 'Luad',
        'endometrial': 'Endometrial',
    }

    parser = argparse.ArgumentParser()
    parser.add_argument('--cancerType', dest='type',
                        help='Cancer type to be collected')
    parser.add_argument('--getData', dest='get', action='store_true',
                        default=False, help='Set flag to get all data')
    opts = parser.parse_args()

    # Downloads stay ahead of the cancer-type check, as before, so a run
    # that passes --getData still fetches the data.
    if opts.get:
        for ds in ['brca', 'ccrcc', 'colon', 'ovarian', 'endometrial', 'luad']:
            cptac.download(dataset=ds)

    # Without this guard a missing --cancerType surfaced as an
    # AttributeError on None.lower().
    if opts.type is None:
        parser.error('--cancerType is required')

    loader_name = loaders.get(opts.type.lower())
    if loader_name is None:
        parser.error('unknown cancer type {!r}; expected one of: {}'.format(
            opts.type, ', '.join(loaders)))
    dat = getattr(cptac, loader_name)()

    df = dat.get_phosphoproteomics()
    pdf = dat.get_proteomics()
    # df.columns = [' '.join(col).strip() for col in df.columns.values]
    df.to_csv(path_or_buf="phos_file.tsv", sep='\t')
    pdf.to_csv(path_or_buf='prot_file.tsv', sep='\t')
def getDataForCancer(ctype):
    """Load the CPTAC dataset object for one cancer type.

    Args:
        ctype: Cancer type name, matched case-insensitively against
            ``brca``, ``ccrcc``, ``coad``, ``ovca``, ``luad`` and
            ``endometrial``.

    Returns:
        The object produced by calling the matching cptac loader class
        (``cptac.Brca()``, ``cptac.Ccrcc()``, ``cptac.Colon()``,
        ``cptac.Ovarian()``, ``cptac.Luad()`` or ``cptac.Endometrial()``).

    Raises:
        SystemExit: If ``ctype`` is not a supported name.  The exception
            carries a message naming the rejected value, so the process
            no longer exits silently with a success status.
    """
    # Cancer type name -> name of the cptac loader class; resolved lazily so
    # that validation happens before cptac is touched.
    loaders = {
        'brca': 'Brca',
        'ccrcc': 'Ccrcc',
        'coad': 'Colon',
        'ovca': 'Ovarian',
        'luad': 'Luad',
        'endometrial': 'Endometrial',
    }

    key = ctype.lower()
    if key not in loaders:
        raise SystemExit('Unknown cancer type {!r}; expected one of: {}'.format(
            ctype, ', '.join(loaders)))
    return getattr(cptac, loaders[key])()
def cptacData():
    """Download the CPTAC datasets and load them into a dictionary.

    Returns:
        dict: cptac dataset objects keyed by ``'brca'``, ``'ccrcc'``,
        ``'colon'``, ``'ovarian'``, ``'endometrial'`` and ``'luad'``.
    """
    print("Loading cptac datasets")

    # Every dataset has to be downloaded before it can be loaded.  Only the
    # cancers available without login information are listed; 'hnscc',
    # 'gbm' and 'lscc' are deliberately left out.
    public_cancers = ('brca', 'ccrcc', 'colon', 'ovarian', 'luad',
                      'endometrial')
    print("Downloading cptac data")
    for cancer in public_cancers:
        cptac.download(dataset=cancer)

    # Dictionary key -> cptac loader class name, in the order the datasets
    # are constructed.  (Hnscc, Gbm and Lscc stay disabled here as well.)
    loader_names = (
        ('brca', 'Brca'),
        ('ccrcc', 'Ccrcc'),
        ('colon', 'Colon'),
        ('ovarian', 'Ovarian'),
        ('endometrial', 'Endometrial'),
        ('luad', 'Luad'),
    )
    fdict = {}
    for key, loader in loader_names:
        fdict[key] = getattr(cptac, loader)()
    return fdict
#!/usr/bin/env python
# coding: utf-8

### Creation of BoxPlots for ESR1 Protein Expression ###

import cptac  # CPTAC protein and clinical data
import pandas as pd  # dataframe construction
import matplotlib.pyplot as plt  # boxplot
import seaborn as sns  # boxplot
from scipy import stats
from statannot import add_stat_annotation

# Download the breast cancer dataset, then load it.
cptac.download(dataset="Brca")
br = cptac.Brca()

# Proteomic table; level 1 of the column index is dropped to remove the
# multi index.
protein_data = br.get_proteomics().droplevel(1, axis=1)
clinical_data = br.get_clinical()

# ESR1 protein expression column.
esr1 = protein_data["ESR1"]

# Replace null ER scores with an explicit label.
clinical_data["ER.IHC.Score"] = clinical_data["ER.IHC.Score"].fillna(
    "Not reported")

# "3+" is ER-positive: build the mask of ER-positive patients and apply it
# to the protein expression data.
er_mask = clinical_data["ER.IHC.Score"] == "3+"
patients = esr1[er_mask]

# Ages are recorded in months; convert to years.
ages = clinical_data["Age.in.Month"][er_mask] / 12

# New dataframe holding ESR1 expression plus age for ER-positive patients.
er_positive_patients = pd.DataFrame(patients, columns=["ESR1"])
er_positive_patients["Age"] = ages

# Categories list, filled in further down.
category = []
import cptac
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# Alyssa's code
# Download the breast cancer dataset, then load it.
cptac.download(dataset="Brca")
br = cptac.Brca()

# Remove the "multi" part of the proteomics dataframe (the dataframes are
# MultiIndex pandas dataframes) by dropping level 1 of the columns.
protein_data = br.get_proteomics().droplevel(1, axis=1)
rna_data = br.get_transcriptomics()
clinical_data = br.get_clinical()

# Age is recorded in months; add a column expressed in years.
clinical_data["Age_in_years"] = clinical_data["Age.in.Month"] / 12

# Optional local caching of the tables (left disabled):
# clinical_data.to_csv("~/Desktop/qbioresearch/qbio_data_analysis_alyssa/data/clinical_data.csv")
# protein_data.to_csv("~/Desktop/qbioresearch/qbio_data_analysis_alyssa/data/protein_data.csv")
# rna_data.to_csv("~/Desktop/qbioresearch/qbio_data_analysis_alyssa/data/rna_data.csv")
# clinical_data_readin = pd.read_csv("~/Desktop/qbioresearch/qbio_data_analysis_alyssa/data/clinical_data.csv", index_col=1)
# index_col=0 makes the first column the index (row names).
# NOTE(review): the disabled read_csv call above passes index_col=1, not 0 —
# confirm which one is intended before re-enabling it.

# Check that the patients are in the same order in rna_data and protein_data.
# We are looking at the gene expression and protein information of EACH
# patient, so the data needs to be in pairs.
# To view the available datasets, call 'cptac.list_datasets()'.
cptac.list_datasets()

# Datasets downloaded directly by this script.
for _dataset_name in ("endometrial", "colon", "ovarian", "RenalCcrcc"):
    cptac.download(dataset=_dataset_name)
#cptac.download(dataset ='luad')
#cptac.download(dataset ='brca')
# downloadCptac is defined outside this section; presumably it fetches the
# datasets not downloaded above (luad, brca) — TODO confirm.
downloadCptac()

# Load one dataset object per cancer type.
endometrialData = cptac.Endometrial()
colorectalData = cptac.Colon()
ovarianData = cptac.Ovarian()
renalData = cptac.RenalCcrcc()
lungData = cptac.Luad()
breastData = cptac.Brca()


def listDataForEachCancer():
    """Print a heading and call ``list_data()`` for four loaded datasets.

    Covers the endometrial, colorectal, ovarian and renal datasets; the
    lung and breast datasets are not listed.
    """
    # Headings after the first carry two leading newlines to separate the
    # listings.
    sections = (
        ("endometrial", endometrialData),
        ("\n\ncolorectal", colorectalData),
        ("\n\novarian", ovarianData),
        ("\n\nrenal", renalData),
    )
    for heading, dataset in sections:
        print(heading)
        dataset.list_data()


listDataForEachCancer()

#################################################################
# Correlation: Proteomics vs Transcriptomics in Endometrial Cancer
#