Пример #1
0
def main():
    """Export phosphoproteomics and proteomics tables for one CPTAC cancer type.

    Command-line options:
        --cancerType  Cancer type to export. Accepted values
                      (case-insensitive): brca, ccrcc, coad, ovca, luad,
                      endometrial.
        --getData     When set, download the brca, ccrcc, colon, ovarian,
                      endometrial and luad datasets before loading.

    Writes ``phos_file.tsv`` and ``prot_file.tsv`` (tab-separated) to the
    current working directory.

    Exits through ``parser.error`` (usage message on stderr, exit status 2)
    when ``--cancerType`` is missing or is not one of the accepted values.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--cancerType', dest='type',
                        help='Cancer type to be collected')
    parser.add_argument('--getData', dest='get', action='store_true',
                        default=False, help='Set flag to get all data')
    opts = parser.parse_args()

    # Downloads run first, as before, so --getData still fetches everything
    # even if the cancer type turns out to be invalid.
    if opts.get:
        for ds in ['brca', 'ccrcc', 'colon', 'ovarian', 'endometrial', 'luad']:
            cptac.download(dataset=ds)

    # --cancerType has no default, so it is None when the flag is omitted;
    # report that as a usage error instead of failing on None.lower().
    if opts.type is None:
        parser.error('--cancerType is required')

    # Command-line name -> name of the cptac dataset class to instantiate.
    # Resolved lazily so only the requested class is looked up.
    loader_names = {
        'brca': 'Brca',
        'ccrcc': 'Ccrcc',
        'coad': 'Colon',
        'ovca': 'Ovarian',
        'luad': 'Luad',
        'endometrial': 'Endometrial',
    }
    cancer_type = opts.type.lower()
    if cancer_type not in loader_names:
        parser.error(
            f"unknown cancer type {opts.type!r}; "
            f"expected one of: {', '.join(loader_names)}"
        )
    dat = getattr(cptac, loader_names[cancer_type])()

    df = dat.get_phosphoproteomics()
    pdf = dat.get_proteomics()

    df.to_csv(path_or_buf="phos_file.tsv", sep='\t')
    pdf.to_csv(path_or_buf='prot_file.tsv', sep='\t')
Пример #2
0
def download(dataset, version="latest", redownload=False):
    """Download a dataset, dispatching on the (case-insensitive) dataset name.

    - Names starting with "pdc" are handed to ``_pdc_download``.
    - "pancan*" names and "all" expand to a list of sources; every source is
      attempted, and the result is True only if each one reported success.
    - Any other name is forwarded to ``cptac.download`` with ``box_auth=True``.

    Parameters:
        dataset: Name of the dataset to download.
        version: Version to request; forwarded unchanged.
        redownload: Forwarded unchanged to the underlying download call.

    Returns:
        The result of the underlying download call, or for multi-source
        datasets a bool that is True only when every source succeeded.

    Raises:
        InvalidParameterError: For a "pancan*" name that is not recognised.
    """
    dataset = dataset.lower()

    if dataset.startswith("pdc"):
        return _pdc_download(dataset, version=version, redownload=redownload)

    is_multi_source = dataset.startswith("pancan") or dataset == "all"
    if not is_multi_source:
        return cptac.download(dataset,
                              version=version,
                              redownload=redownload,
                              box_auth=True)

    # The token is obtained before the name is validated, and is shared by
    # every non-PDC source below.
    box_token = get_box_token()

    sources_by_dataset = {
        "pancanbrca": BRCA_SOURCES,
        "pancanccrcc": CCRCC_SOURCES,
        "pancancoad": COAD_SOURCES,
        "pancangbm": GBM_SOURCES,
        "pancanhnscc": HNSCC_SOURCES,
        "pancanlscc": LSCC_SOURCES,
        "pancanluad": LUAD_SOURCES,
        "pancanov": OV_SOURCES,
        "pancanucec": UCEC_SOURCES,
    }

    if dataset in sources_by_dataset:
        sources = sources_by_dataset[dataset]
    elif dataset == "all":
        sources = BRCA_SOURCES + CCRCC_SOURCES + COAD_SOURCES + GBM_SOURCES + HNSCC_SOURCES + LSCC_SOURCES + LUAD_SOURCES + OV_SOURCES + UCEC_SOURCES
    else:
        raise InvalidParameterError(f"{dataset} is not a valid dataset.")

    # Collect every outcome first so a failure does not stop later sources
    # from being attempted.
    outcomes = []
    for source in sources:
        if source.startswith("pdc"):
            outcome = download(source,
                               version=version,
                               redownload=redownload)
        else:
            outcome = cptac.download(source,
                                     version=version,
                                     redownload=redownload,
                                     box_auth=True,
                                     box_token=box_token)
        outcomes.append(outcome)

    return all(outcomes)
Пример #3
0
def download_datasets(get_datasets_lists):
    """Force a fresh download of every public dataset, failing the test on error.

    Parameters:
        get_datasets_lists: Mapping whose "public" entry is an iterable of
            dataset names to download.

    Returns:
        True once every download call has completed without raising.

    If a download raises, ``pytest.fail`` is called with the name of the
    dataset that could not be downloaded.
    """
    # Download public datasets
    for cancer in get_datasets_lists["public"]:
        print(f"Downloading {cancer}...", end='\r')
        try:
            cptac.download(cancer, redownload=True)
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit are not converted into a test failure.
            pytest.fail(f"Unable to download data for {cancer} dataset.")

    # TODO: Download restricted datasets

    return True
Пример #4
0
def downloadCptac():
    """Call cptac.list_datasets(), then download four datasets in order.

    The datasets requested are "endometrial", "colon", "ovarian" and
    "RenalCcrcc".
    """
    # cptac.list_datasets() is the call that reports the available datasets.
    cptac.list_datasets()

    for dataset_name in ("endometrial", "colon", "ovarian", "RenalCcrcc"):
        cptac.download(dataset=dataset_name)
Пример #5
0
def cptacData():
    '''
    Download the CPTAC datasets used here and load each one.

    Returns a dict mapping dataset name ('brca', 'ccrcc', 'colon',
    'ovarian', 'endometrial', 'luad') to the loaded cptac dataset object.
    '''
    print("Loading cptac datasets")

    # Only the cancers available without login information are fetched;
    # 'hnscc', 'gbm' and 'lscc' are intentionally left out.
    download_order = ('brca', 'ccrcc', 'colon', 'ovarian', 'luad', 'endometrial')
    print("Downloading cptac data")
    for dataset_name in download_order:
        cptac.download(dataset=dataset_name)

    # Dataset name -> name of the cptac class that loads it. Each class is
    # looked up and instantiated in turn, in this order.
    loader_names = (
        ('brca', 'Brca'),
        ('ccrcc', 'Ccrcc'),
        ('colon', 'Colon'),
        ('ovarian', 'Ovarian'),
        ('endometrial', 'Endometrial'),
        ('luad', 'Luad'),
    )
    return {
        dataset_name: getattr(cptac, class_name)()
        for dataset_name, class_name in loader_names
    }
Пример #6
0
 def __init__(self):
     """Download and load the endometrial, ovarian, ccrcc and colon datasets.

     Each loaded dataset is stored on its own attribute (``en``, ``ovarian``,
     ``ccrcc``, ``colon``) and all four are collected in ``self.datasets``.
     """
     # brca, gbm, hnscc and luad are deliberately not downloaded or loaded.
     for dataset_name in ("endometrial", "ovarian", "ccrcc", "colon"):
         cptac.download(dataset=dataset_name, version='latest')

     self.en = cptac.Endometrial()
     self.ovarian = cptac.Ovarian()
     self.ccrcc = cptac.Ccrcc()
     self.colon = cptac.Colon()

     self.datasets = [self.en, self.ovarian, self.ccrcc, self.colon]
#!/usr/bin/env python
# coding: utf-8

### Creation of BoxPlots for ESR1 Protein Expression ###

import cptac  # import cptac to download cptac protein and clinical data
import pandas as pd  # import pandas for
import matplotlib.pyplot as plt  # import matplotlib for boxplot
import seaborn as sns  # import seaborn for boxplot
from scipy import stats
from statannot import add_stat_annotation

cptac.download(dataset="Brca")  # download the breast cancer dataset
br = cptac.Brca()  # load the breast cancer dataset object

protein_data = br.get_proteomics()  # proteomics table
protein_data = protein_data.droplevel(1, axis=1)  # drop column level 1 so the columns are no longer a MultiIndex
clinical_data = br.get_clinical()  # clinical table

esr1 = protein_data["ESR1"]  # ESR1 protein expression column
clinical_data["ER.IHC.Score"] = clinical_data["ER.IHC.Score"].fillna(
    "Not reported")  # replace missing ER IHC scores with the label "Not reported"

er_mask = clinical_data[
    "ER.IHC.Score"] == "3+"  # boolean mask of rows scored "3+", which this script treats as ER-positive
patients = esr1[er_mask]  # ESR1 expression for the masked patients only
ages = clinical_data["Age.in.Month"][er_mask] / 12  # ages of the masked patients, converted from months to years
er_positive_patients = pd.DataFrame(patients,
                                    columns=["ESR1"])  # one-column dataframe holding the ESR1 values
er_positive_patients["Age"] = ages  # add the ages as a second column
category = []  # starts empty; presumably filled by later code not shown in this excerpt
Пример #8
0
import cptac
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# Alyssa's code
# Download the breast cancer dataset and load the dataset object.
cptac.download(dataset="Brca")
br = cptac.Brca()

# Proteomics table.
protein_data = br.get_proteomics()

# Drop column level 1 so the proteomics columns are no longer a pandas MultiIndex.
protein_data = protein_data.droplevel(1, axis=1)

# Transcriptomics and clinical tables.
rna_data = br.get_transcriptomics()
clinical_data = br.get_clinical()

# Add an age-in-years column derived from the age in months.
clinical_data["Age_in_years"] = clinical_data["Age.in.Month"]/12


# clinical_data.to_csv("~/Desktop/qbioresearch/qbio_data_analysis_alyssa/data/clinical_data.csv")
# protein_data.to_csv("~/Desktop/qbioresearch/qbio_data_analysis_alyssa/data/protein_data.csv")
# rna_data.to_csv("~/Desktop/qbioresearch/qbio_data_analysis_alyssa/data/rna_data.csv")

# clinical_data_readin = pd.read_csv("~/Desktop/qbioresearch/qbio_data_analysis_alyssa/data/clinical_data.csv", index_col=1) #The index_col=0 creates the index (rownames) as the first column. 


#Check that the patients are in the same order in rna_data and protein_data.
#We are looking at the gene expression and protein information of EACH patient, so the data needs to be in pairs.
# -*- coding: utf-8 -*-
"""bioinfoProteomicsHW4.ipynb
# Loading CPTAC Data in google colab using a Python package developed in the Payne lab at BYU.
"""
# Install and import cptac, plus pandas (the only other package needed here).
# NOTE: lines starting with "!" are notebook shell commands, not plain Python.
!pip install -q cptac  
import cptac 
import pandas as pd

# Download the endometrial data and load the endometrial dataset object.
cptac.download(dataset="Endometrial")
en = cptac.Endometrial()

# Proteomics table (enProt) and clinical table (enInfo).
enProt = en.get_proteomics()
enInfo = en.get_clinical()

# This was run in Google Colab, so Google Drive is mounted at "drive" to
# give the exported files somewhere to be copied to.
from google.colab import drive
drive.mount('drive') 

# Write both tables to CSV, then copy each file into "drive/My Drive" so it
# can be used locally or from another language.
enProt.to_csv("enProt.csv")
!cp enProt.csv "drive/My Drive"
enInfo.to_csv("enInfo.csv")
!cp enInfo.csv "drive/My Drive"
Пример #10
0
 def setup_class(cls):
     """Class-level setup: request the latest version of all datasets.

     Downloads are requested with ``redownload=False``.
     """
     download_options = {
         "dataset": "all",
         "version": "latest",
         "redownload": False,
     }
     cptac.download(**download_options)
Пример #11
0
def _pdc_download(dataset, version, redownload, box_token):
    """Download data for the specified cancer type from the PDC.

    Parameters:
        dataset: PDC dataset name (case-insensitive). Must start with "pdc"
            and be a key of STUDY_IDS_MAP, or be "pdcall" to download every
            dataset in STUDY_IDS_MAP.
        version: Version passed to cptac.download for the mapping file.
        redownload: If True, existing data files are deleted and downloaded
            again. Also forced to True when any expected data file is missing.
        box_token: Passed to cptac.download as ``_box_token``.

    Returns:
        True if the data files are present or were downloaded; False if the
        mapping file download reported failure. For "pdcall", True only if
        every individual dataset returned True.

    Raises:
        InvalidParameterError: If the name does not start with "pdc" or is
            not a key of STUDY_IDS_MAP.
        CptacDevError: If the index file is missing after the mapping file
            download.
    """

    dataset = str.lower(dataset)

    if dataset == "pdcall":
        overall_result = True
        # Every dataset is attempted even after a failure. box_token must be
        # forwarded: it is a required parameter of this function.
        for pdc_dataset in STUDY_IDS_MAP:
            if not _pdc_download(pdc_dataset, version, redownload, box_token):
                overall_result = False

        return overall_result

    if not dataset.startswith("pdc"):
        raise InvalidParameterError(
            f"_pdc_download function can only be used for PDC datasets, which start with the prefix 'pdc'. You tried to download '{dataset}'."
        )

    if dataset not in STUDY_IDS_MAP:
        raise InvalidParameterError(
            f"PDC dataset must be one of the following:\n{list(STUDY_IDS_MAP)}\nYou passed '{dataset}'."
        )

    dataset_ids = STUDY_IDS_MAP[dataset]

    # Download the file for mapping aliquots to patient IDs
    if not cptac.download(dataset,
                          version=version,
                          redownload=redownload,
                          _box_auth=True,
                          _box_token=box_token):
        return False

    path_here = os.path.abspath(os.path.dirname(__file__))
    cancer_dir = os.path.join(path_here, f"data_{dataset}")

    # Check that the index file exists. If not, there was an uncaught error in the mapping file download.
    index_path = os.path.join(cancer_dir, "index.txt")
    if not os.path.isfile(index_path):
        raise CptacDevError(
            f"Index file not found at {index_path}. Mapping file download probably failed."
        )

    # See what data files we need to download
    data_dir = os.path.join(cancer_dir, f"{dataset}_v1.0")

    # If any of the files are missing, we're going to delete any remaining and redownload all, in case the missing files are a sign of a previous data problem
    data_files = [f"{data_type}.tsv.gz"
                  for data_type in dataset_ids] + ["clinical.tsv.gz"]
    data_file_paths = [os.path.join(data_dir, data_file)
                       for data_file in data_files]

    if not all(os.path.isfile(path) for path in data_file_paths):
        redownload = True

    if not redownload:
        return True  # If all the files are there and the user didn't ask to redownload, we're done.

    for data_file_path in data_file_paths:
        if os.path.isfile(data_file_path):
            os.remove(data_file_path)

    # Make sure the target directory exists before writing into it. This is a
    # no-op when the mapping file download already created it.
    os.makedirs(data_dir, exist_ok=True)

    # Now download all the data files

    # We'll combine all the clinical tables in case there are differences
    master_clin = pd.DataFrame()

    for data_type in dataset_ids:

        # Print an update
        download_msg = f"Downloading {dataset} {data_type} files..."
        print(download_msg, end="\r")

        # Get the clinical and quantitative tables for the study ID
        clin, quant = download_pdc_id(dataset_ids[data_type],
                                      _download_msg=False)

        # Print a new update
        print(" " * len(download_msg), end="\r")
        save_msg = f"Saving {dataset} {data_type} files..."
        print(save_msg, end="\r")

        # Append the clinical dataframe
        master_clin = pd.concat([master_clin, clin], axis=0, join='outer')

        # Save the quantitative table
        quant.to_csv(os.path.join(data_dir, f"{data_type}.tsv.gz"), sep="\t")

        # Erase update
        print(" " * len(save_msg), end="\r")

    # Print an update
    save_msg = f"Saving {dataset} clinical file..."
    print(save_msg, end="\r")

    # Drop any duplicated rows in combined clinical table, then save it too
    master_clin = master_clin.drop_duplicates(keep="first")

    master_clin.to_csv(os.path.join(data_dir, "clinical.tsv.gz"), sep="\t")

    # Erase update
    print(" " * len(save_msg), end="\r")

    return True
Пример #12
0
 def test_invalid_dataset(self):
     """Downloading an unrecognised dataset name raises InvalidParameterError."""
     unknown_name = "abc"
     with pytest.raises(InvalidParameterError) as exc_info:
         cptac.download(unknown_name)
     raised_type = exc_info.type
     assert raised_type == InvalidParameterError
Пример #13
0
'''
Get all cptac data
downloads data into docker image
'''

import cptac
# Datasets to bake into the image, downloaded one after another.
datasets = ('brca', 'ccrcc', 'colon', 'ovarian')
for ds in datasets:
    cptac.download(dataset=ds)