예제 #1
0
    def load_dataframe(self, file_resources, npartitions=None):
        """Build the ID-mapping table annotated with GO terms and Rfams.

        Args:
            file_resources (dict): maps resource names to something
                `read_table` can open. Must contain the key
                "rnacentral_rfam_annotations.tsv"; every key containing
                "database_mappings" is read as an ID-mapping table.
            npartitions: if truthy, the mapping tables are read and
                concatenated with dask (`dd`) instead of pandas.

        Returns:
            The concatenated mapping table, restricted to `self.species`
            when that is not None, with added "GO terms" and "Rfams" columns
            ("|"-joined unique values per "RNAcentral id"). Rows lacking both
            annotations are dropped.
        """
        mapping_columns = [
            "RNAcentral id", "database", "external id", "species", "RNA type",
            "gene symbol"
        ]

        go_terms = pd.read_table(
            file_resources["rnacentral_rfam_annotations.tsv"],
            low_memory=True,
            header=None,
            names=["RNAcentral id", "GO terms", "Rfams"])
        # Keep only the part of the identifier before the first underscore.
        go_terms["RNAcentral id"] = go_terms["RNAcentral id"].str.split(
            "_", expand=True, n=2)[0]

        # Pick the reader/concatenator once instead of branching per file.
        if npartitions:
            read_table, concat, extra = dd.read_table, dd.concat, {}
        else:
            read_table, concat = pd.read_table, pd.concat
            extra = {"low_memory": True}

        mapping_tables = [
            read_table(file_resources[name],
                       header=None,
                       names=mapping_columns,
                       **extra)
            for name in file_resources
            if "database_mappings" in name
        ]
        gene_ids = concat(mapping_tables, join="inner")

        gene_ids["species"] = gene_ids["species"].astype("O")
        if self.species is not None:
            gene_ids = gene_ids[gene_ids["species"] == self.species]

        # Filter and group the annotations a single time; both aggregated
        # columns are derived from the same groups.
        matched = go_terms[go_terms["RNAcentral id"].isin(
            gene_ids["RNAcentral id"])]
        by_id = matched.groupby("RNAcentral id")
        lnc_go_terms = by_id["GO terms"].apply(
            lambda x: "|".join(x.unique()))
        lnc_rfams = by_id["Rfams"].apply(lambda x: "|".join(x.unique()))

        gene_ids["GO terms"] = gene_ids["RNAcentral id"].map(lnc_go_terms)
        gene_ids["Rfams"] = gene_ids["RNAcentral id"].map(lnc_rfams)

        # Keep only rows that received at least one annotation.
        has_annotation = (gene_ids["GO terms"].notnull()
                          | gene_ids["Rfams"].notnull())
        return gene_ids[has_annotation]
예제 #2
0
    def add_drug_response_data(
            self,
            file_path="nationwidechildrens.org_clinical_drug.txt",
            patient_column="bcr_patient_barcode",
            columns=[
                'bcr_patient_barcode', 'pharmaceutical_therapy_drug_name',
                'pharmaceutical_therapy_type', 'treatment_best_response'
            ],
            drug_name_col=None,
            response_column=None):
        """Load the clinical drug table into `self.drugs`, indexed by patient.

        Args:
            file_path (str): path to the tab-separated drug file. The two
                lines following the header are skipped.
            patient_column (str): column used as the index of `self.drugs`.
            columns (list): columns to load (passed as `usecols`). The
                patient column is added automatically when it is missing.
                This list is never modified.
            drug_name_col: stored on `self.drug_name_col`.
            response_column: stored on `self.response_column`.

        Raises:
            FileNotFoundError: if `file_path` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(file_path)

        self.drug_name_col = drug_name_col
        self.response_column = response_column

        # set_index() below needs the patient column, so make sure it is
        # loaded. Build a new list rather than touching the caller's (or the
        # shared default) list.
        usecols = columns
        if (columns is not None and not callable(columns)
                and patient_column not in columns):
            usecols = list(columns) + [patient_column]

        self.drugs = pd.read_table(
            file_path,
            sep="\t",
            skiprows=[1, 2],
            na_values=["[Not Available]", "[Unknown]", "[Not Applicable]"],
            usecols=usecols)
        self.drugs.set_index(patient_column, inplace=True)
예제 #3
0
    def load_dataframe(self, file_resources, npartitions=None):
        """Load the functional annotation table and attach transcript IDs and gene names.

        Args:
            file_resources (dict): must provide the keys "NONCODEv5_source",
                "NONCODEv5_Transcript2Gene" and "NONCODEv5_human.func", each
                readable by `read_table` as a header-less table.
            npartitions: if truthy, the functional annotation file is read
                with dask (`dd.read_table`) instead of pandas.

        Returns:
            `self.noncode_func_df`: indexed by "NONCODE Gene ID", with the
            columns "GO terms", "NONCODE Transcript ID" and "Gene Name".
            Entries without a match in the lookup tables are left missing.
        """
        source_df = pd.read_table(file_resources["NONCODEv5_source"],
                                  header=None)
        source_df.columns = ["NONCODE Transcript ID", "name type", "Gene ID"]

        transcript2gene_df = pd.read_table(
            file_resources["NONCODEv5_Transcript2Gene"], header=None)
        transcript2gene_df.columns = [
            "NONCODE Transcript ID", "NONCODE Gene ID"
        ]

        if npartitions:
            self.noncode_func_df = dd.read_table(
                file_resources["NONCODEv5_human.func"], header=None)
        else:
            self.noncode_func_df = pd.read_table(
                file_resources["NONCODEv5_human.func"], header=None)
        self.noncode_func_df.columns = ["NONCODE Gene ID", "GO terms"]
        # Reassign instead of mutating in place; the result is the same for
        # pandas and does not rely on `inplace` support.
        self.noncode_func_df = self.noncode_func_df.set_index(
            "NONCODE Gene ID")

        # Convert to NONCODE transcript ID for the functional annotation data.
        # When a gene ID occurs several times in the lookup, the last
        # transcript listed wins.
        gene_to_transcript = pd.Series(
            transcript2gene_df['NONCODE Transcript ID'].values,
            index=transcript2gene_df['NONCODE Gene ID']).to_dict()
        self.noncode_func_df[
            "NONCODE Transcript ID"] = self.noncode_func_df.index.map(
                gene_to_transcript)

        # Convert NONCODE transcript ID to gene names, using only the source
        # rows whose "name type" is "NAME".
        source_gene_names_df = source_df[source_df["name type"] ==
                                         "NAME"].copy()
        transcript_to_name = pd.Series(
            source_gene_names_df['Gene ID'].values,
            index=source_gene_names_df['NONCODE Transcript ID']).to_dict()
        self.noncode_func_df["Gene Name"] = self.noncode_func_df[
            "NONCODE Transcript ID"].map(transcript_to_name)

        # Return the table like the other load_dataframe implementations do,
        # so callers that use the return value do not receive None.
        return self.noncode_func_df
예제 #4
0
    def load_dataframe(self, file_resources, npartitions=None):
        """Read the "proteinatlas.tsv" resource into a dataframe.

        Args:
            file_resources (dict): must provide the key "proteinatlas.tsv".
            npartitions: if truthy, the table is read with dask
                (`dd.read_table`); otherwise with pandas.

        Returns:
            The table exactly as parsed by the selected reader.
        """
        # Only the selected branch is evaluated, so `dd` is touched solely
        # when dask loading was requested.
        reader = dd.read_table if npartitions else pd.read_table
        return reader(file_resources["proteinatlas.tsv"])
예제 #5
0
    def get_expressions(self, genes_index):
        """Load the "TCGA-LUAD-rnaexpr.tsv" expression table as samples x genes.

        Per the original notes, this preprocesses the LNCRNA expression file
        obtained from TANRIC (MD Anderson), whose values are log2 transformed,
        and overrides GenomicData.process_expression_table(), which handles
        TCGA-Assembler data.

        Args:
            genes_index (str): name of the gene identifier column. It is
                expected to be the first column of the file, because the
                first row of the transposed table is discarded.

        Returns:
            A dataframe with one row per sample column of the file and one
            column per gene. Version suffixes (".<n>") are removed from the
            gene identifiers, only the first row of each duplicated
            identifier is kept, and rows containing NA are dropped.
        """
        df = pd.read_table(self.file_resources["TCGA-LUAD-rnaexpr.tsv"])

        # Strip everything from the first "." on (the gene version number).
        # regex=True must be explicit: with pandas >= 2.0 the default is a
        # literal match, which would leave the identifiers untouched.
        df[genes_index] = df[genes_index].str.replace(r"[.].*",
                                                      "",
                                                      regex=True)
        # Remove duplicate genes, keeping the first occurrence
        df = df[~df[genes_index].duplicated(keep='first')]

        # Drop NA gene rows
        df.dropna(axis=0, inplace=True)

        # Transpose matrix to patients rows and genes columns; the first
        # transposed row is the gene identifier column itself, so skip it.
        df.index = df[genes_index]
        df = df.T.iloc[1:, :]

        # Change index string to bcr_sample_barcode standard
        def change_patient_barcode(s):
            if "Normal" in s:
                return s[s.find('TCGA'):] + "-11A"
            elif "Tumor" in s:
                return s[s.find('TCGA'):] + "-01A"
            else:
                return s

        df.index = df.index.map(change_patient_barcode)
        # NOTE(review): the index now holds sample barcodes but is labelled
        # "gene_id"; kept unchanged because callers may rely on the name.
        df.index.name = "gene_id"

        return df
예제 #6
0
    def add_biospecimen_data(
            self,
            file_path="genome.wustl.edu_biospecimen_sample.txt",
            patient_col_name="bcr_patient_barcode",
            columns=['bcr_sample_barcode', 'sample_type']):
        """Load the biospecimen sample table into `self.biospecimen`.

        Args:
            file_path (str): path to the tab-separated biospecimen file. The
                line following the header is skipped.
            patient_col_name (str): column used as the index of
                `self.biospecimen`; its values are also stored, in file
                order, in `self.sample_barcodes`.
            columns (list): columns to load (passed as `usecols`). The
                patient column is added automatically when it is missing.
                This list is never modified.

        Raises:
            FileNotFoundError: if `file_path` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(file_path)

        # The default `columns` does not contain the patient column, yet the
        # code below reads and indexes by it. Always load it, using a new
        # list so the shared default is left untouched.
        usecols = columns
        if (columns is not None and not callable(columns)
                and patient_col_name not in columns):
            usecols = list(columns) + [patient_col_name]

        self.biospecimen = pd.read_table(
            file_path,
            sep="\t",
            skiprows=[
                1,
            ],
            na_values=["[Not Available]", "[Unknown]", "[Not Applicable]"],
            usecols=usecols)
        # NOTE(review): named "sample_barcodes" but filled from the patient
        # column; kept as in the original, confirm the intended column.
        self.sample_barcodes = self.biospecimen[patient_col_name].tolist()
        self.biospecimen.set_index(patient_col_name, inplace=True)
예제 #7
0
    def __init__(self,
                 patients_file,
                 patient_id_col="bcr_patient_barcode",
                 columns=None):
        """This class manages the clinical data tables to handle the phenotype,
        treatment, and sample data associated to a patient.

        Args:
            patients_file (str or io.StringIO): path to the patients clinical
                data file, or an in-memory buffer holding its contents.
            patient_id_col (str): the patient's ID column name. It becomes
                the index of `self.patient`.
            columns (list): default None. Specifies the columns to import, if
                None, then import all columns. The patient ID column is
                added when missing; the passed list itself is not modified.
                Example: ['bcr_patient_barcode', 'gender', 'race',
                'histologic_diagnosis', 'tumor_status', 'death_days_to',
                'ajcc_pathologic_tumor_stage']

        Raises:
            IOError: if `patients_file` is neither an io.StringIO nor the
                path of an existing file.
        """
        self.patient_column = patient_id_col

        # Extend a copy so the caller's list is left untouched.
        if columns and patient_id_col not in columns:
            columns = list(columns) + [patient_id_col]

        if isinstance(patients_file, io.StringIO):
            # Needed since the file was previously read to extract columns
            # information
            patients_file.seek(0)
        elif not (isinstance(patients_file, str)
                  and os.path.exists(patients_file)):
            raise IOError(patients_file)

        # Both accepted input kinds are parsed identically; the two lines
        # after the header are skipped.
        self.patient = pd.read_table(patients_file,
                                     skiprows=[1, 2],
                                     na_values=[
                                         "[Not Available]", "[Unknown]",
                                         "[Not Applicable]",
                                         "[Discrepancy]"
                                     ],
                                     usecols=columns)

        self.patient_barcodes = self.patient[patient_id_col].tolist()
        self.patient.set_index(patient_id_col, inplace=True)

        # Rename columns
        self.patient.rename(
            {
                "ajcc_pathologic_tumor_stage": PATHOLOGIC_STAGE,
                "histological_type": HISTOLOGIC_SUBTYPE,
                "histologic_diagnosis.1": HISTOLOGIC_SUBTYPE
            },
            axis=1,
            inplace=True)
        # Recode the stage column through the class-level mapping.
        self.patient.replace(
            {PATHOLOGIC_STAGE: ClinicalData.pathologic_stage_map},
            inplace=True)