def load_dataframe(self, file_resources, npartitions=None):
    """Build the RNAcentral ID-mapping table annotated with GO terms and Rfams.

    Args:
        file_resources: mapping of resource name to a path or buffer. Must
            contain "rnacentral_rfam_annotations.tsv"; every entry whose name
            contains "database_mappings" is read as an ID-mapping table.
        npartitions: when truthy, the ID-mapping tables are read and
            concatenated with dask instead of pandas.

    Returns:
        The concatenated ID-mapping table (restricted to ``self.species``
        when that attribute is not None) with "GO terms" and "Rfams" columns
        added. Rows that have neither annotation are dropped.
    """
    mapping_columns = [
        "RNAcentral id", "database", "external id", "species", "RNA type",
        "gene symbol"
    ]

    annotations = pd.read_table(
        file_resources["rnacentral_rfam_annotations.tsv"],
        low_memory=True,
        header=None,
        names=["RNAcentral id", "GO terms", "Rfams"])
    # Keep only the part of the identifier before the first underscore.
    id_parts = annotations["RNAcentral id"].str.split("_", expand=True, n=2)
    annotations["RNAcentral id"] = id_parts[0]

    mapping_tables = []
    for resource_name in file_resources:
        if "database_mappings" not in resource_name:
            continue
        if npartitions:
            table = dd.read_table(file_resources[resource_name],
                                  header=None,
                                  names=mapping_columns)
        else:
            table = pd.read_table(file_resources[resource_name],
                                  low_memory=True,
                                  header=None,
                                  names=mapping_columns)
        mapping_tables.append(table)

    if npartitions:
        gene_ids = dd.concat(mapping_tables, join="inner")
    else:
        gene_ids = pd.concat(mapping_tables, join="inner")

    gene_ids["species"] = gene_ids["species"].astype("O")
    if self.species is not None:
        gene_ids = gene_ids[gene_ids["species"] == self.species]

    def join_unique(values):
        return "|".join(values.unique())

    # Annotations for the IDs that survived the species filter, collapsed to
    # one "|"-separated string per RNAcentral id.
    is_mapped = annotations["RNAcentral id"].isin(gene_ids["RNAcentral id"])
    per_id = annotations[is_mapped].groupby("RNAcentral id")
    lnc_go_terms = per_id["GO terms"].apply(join_unique)
    lnc_rfams = per_id["Rfams"].apply(join_unique)

    gene_ids["GO terms"] = gene_ids["RNAcentral id"].map(lnc_go_terms)
    gene_ids["Rfams"] = gene_ids["RNAcentral id"].map(lnc_rfams)

    has_annotation = (gene_ids["GO terms"].notnull()
                      | gene_ids["Rfams"].notnull())
    return gene_ids[has_annotation]
def add_drug_response_data(
        self,
        file_path="nationwidechildrens.org_clinical_drug.txt",
        patient_column="bcr_patient_barcode",
        columns=[
            'bcr_patient_barcode', 'pharmaceutical_therapy_drug_name',
            'pharmaceutical_therapy_type', 'treatment_best_response'
        ],
        drug_name_col=None,
        response_column=None):
    """Load the clinical drug table and index it by patient.

    Stores the table on ``self.drugs`` and records ``drug_name_col`` and
    ``response_column`` on the instance.

    Args:
        file_path: path to the tab-separated clinical drug file. The two
            lines following the header line are skipped.
        patient_column: name of the column used as the index of
            ``self.drugs``.
        columns: columns to read from the file; None reads every column.
            ``patient_column`` is read as well even if it is not listed.
            The list is never modified.
        drug_name_col: stored on ``self.drug_name_col``.
        response_column: stored on ``self.response_column``.

    Raises:
        FileNotFoundError: if ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(file_path)

    self.drug_name_col = drug_name_col
    self.response_column = response_column

    # The index column must be among the columns read, otherwise set_index
    # fails with KeyError. Work on a copy so neither the caller's list nor
    # the shared default list is mutated.
    usecols = columns
    if (columns is not None and not callable(columns)
            and patient_column not in columns):
        usecols = list(columns) + [patient_column]

    drugs = pd.read_table(
        file_path,
        sep="\t",
        skiprows=[1, 2],
        na_values=["[Not Available]", "[Unknown]", "[Not Applicable]"],
        usecols=usecols)
    self.drugs = drugs.set_index(patient_column)
def load_dataframe(self, file_resources, npartitions=None):
    """Load NONCODE functional annotations keyed by NONCODE gene ID.

    Reads the "NONCODEv5_human.func" table (gene ID, GO terms) and adds a
    "NONCODE Transcript ID" column looked up from
    "NONCODEv5_Transcript2Gene" and a "Gene Name" column looked up from the
    "NAME" rows of "NONCODEv5_source".

    Args:
        file_resources: mapping holding the "NONCODEv5_source",
            "NONCODEv5_Transcript2Gene" and "NONCODEv5_human.func"
            resources (paths or buffers).
        npartitions: when truthy, the functional annotation table is read
            with dask instead of pandas.

    Returns:
        The annotated table, indexed by "NONCODE Gene ID". The same object
        is stored on ``self.noncode_func_df``.
    """
    source_df = pd.read_table(file_resources["NONCODEv5_source"], header=None)
    source_df.columns = ["NONCODE Transcript ID", "name type", "Gene ID"]

    transcript2gene_df = pd.read_table(
        file_resources["NONCODEv5_Transcript2Gene"], header=None)
    transcript2gene_df.columns = ["NONCODE Transcript ID", "NONCODE Gene ID"]

    read_table = dd.read_table if npartitions else pd.read_table
    func_df = read_table(file_resources["NONCODEv5_human.func"], header=None)
    func_df.columns = ["NONCODE Gene ID", "GO terms"]
    # Non-inplace set_index: dask frames do not support inplace=True, while
    # pandas gives the same result either way.
    func_df = func_df.set_index("NONCODE Gene ID")

    # Convert to NONCODE transcript ID for the functional annotation data.
    # When a gene ID maps to several transcripts, the last one listed wins.
    gene_to_transcript = dict(
        zip(transcript2gene_df["NONCODE Gene ID"],
            transcript2gene_df["NONCODE Transcript ID"]))
    func_df["NONCODE Transcript ID"] = func_df.index.map(gene_to_transcript)

    # Convert NONCODE transcript ID to gene names.
    name_rows = source_df[source_df["name type"] == "NAME"]
    transcript_to_name = dict(
        zip(name_rows["NONCODE Transcript ID"], name_rows["Gene ID"]))
    func_df["Gene Name"] = func_df["NONCODE Transcript ID"].map(
        transcript_to_name)

    self.noncode_func_df = func_df
    # Return the frame as well, like the other load_dataframe loaders do.
    return func_df
def load_dataframe(self, file_resources, npartitions=None):
    """Read the "proteinatlas.tsv" resource into a table.

    Args:
        file_resources: mapping that holds the "proteinatlas.tsv" resource
            (path or buffer).
        npartitions: when truthy, the table is read with dask instead of
            pandas.

    Returns:
        The table exactly as parsed by ``read_table``.
    """
    # Only the selected reader is looked up, so dask is untouched on the
    # pandas path.
    reader = dd.read_table if npartitions else pd.read_table
    return reader(file_resources["proteinatlas.tsv"])
def get_expressions(self, genes_index):
    """Load the TANRIC lncRNA expression table as a samples x genes matrix.

    Reads ``self.file_resources["TCGA-LUAD-rnaexpr.tsv"]``, strips the
    ".<version>" suffix from the gene identifiers, keeps the first row of
    every duplicated gene, drops rows containing missing values and
    transposes the table so that rows are samples and columns are genes.
    Per the original documentation, this overrides
    ``GenomicData.process_expression_table()`` and the TANRIC values are
    log2 transformed.

    Args:
        genes_index: name of the gene identifier column. It has to be the
            first column of the file, because the first row of the
            transposed table is discarded as the identifier row.

    Returns:
        The transposed table. Index labels containing "Normal" or "Tumor"
        are cut to start at "TCGA" and suffixed with "-11A" or "-01A"
        respectively; other labels are left as they are.
    """
    df = pd.read_table(self.file_resources["TCGA-LUAD-rnaexpr.tsv"])

    # Remove the ".#" gene version number at the end of the identifier.
    # regex=True must be explicit: pandas 2.x treats the pattern as a
    # literal string by default, which would leave the suffix in place.
    df[genes_index] = df[genes_index].str.replace(r"[.].*", "", regex=True)

    # Remove duplicate genes, keeping the first occurrence.
    df = df[~df[genes_index].duplicated(keep='first')]

    # Drop NA gene rows.
    df = df.dropna(axis=0)

    # Transpose matrix to patients rows and genes columns; the first
    # transposed row is the gene identifier column itself.
    df.index = df[genes_index]
    df = df.T.iloc[1:, :]

    def change_patient_barcode(s):
        """Rewrite a TANRIC sample label as a TCGA sample barcode."""
        if "Normal" in s:
            return s[s.find('TCGA'):] + "-11A"
        elif "Tumor" in s:
            return s[s.find('TCGA'):] + "-01A"
        else:
            return s

    # Change index string to bcr_sample_barcode standard.
    df.index = df.index.map(change_patient_barcode)
    # NOTE(review): the index now holds sample barcodes but keeps the name
    # "gene_id" from the original code, in case callers rely on it - confirm.
    df.index.name = "gene_id"
    return df
def add_biospecimen_data(
        self,
        file_path="genome.wustl.edu_biospecimen_sample.txt",
        patient_col_name="bcr_patient_barcode",
        columns=['bcr_sample_barcode', 'sample_type']):
    """Load the biospecimen sample table and index it by patient.

    Stores the table on ``self.biospecimen`` and the values of the patient
    column, in file order, on ``self.sample_barcodes``.

    Args:
        file_path: path to the tab-separated biospecimen file. The line
            following the header line is skipped.
        patient_col_name: name of the column used as the index of
            ``self.biospecimen``.
        columns: columns to read from the file; None reads every column.
            ``patient_col_name`` is read as well even if it is not listed
            (it is not part of the default list). The list is never
            modified.

    Raises:
        FileNotFoundError: if ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(file_path)

    # The patient column is needed both for sample_barcodes and as the
    # index, so it must be read even when `columns` does not mention it;
    # otherwise the lookups below fail with KeyError. Work on a copy so the
    # caller's list and the shared default list are left untouched.
    usecols = columns
    if (columns is not None and not callable(columns)
            and patient_col_name not in columns):
        usecols = list(columns) + [patient_col_name]

    biospecimen = pd.read_table(
        file_path,
        sep="\t",
        skiprows=[
            1,
        ],
        na_values=["[Not Available]", "[Unknown]", "[Not Applicable]"],
        usecols=usecols)

    # NOTE(review): despite its name, this attribute is filled from the
    # patient column, as in the original code - confirm this is intended.
    self.sample_barcodes = biospecimen[patient_col_name].tolist()
    self.biospecimen = biospecimen.set_index(patient_col_name)
def __init__(self,
             patients_file,
             patient_id_col="bcr_patient_barcode",
             columns=None):
    """This class manages the clinical data tables to handle the phenotype,
    treatment, and sample data associated to a patient.

    Reads the patients table into ``self.patient`` (indexed by
    ``patient_id_col``), keeps the patient IDs in file order on
    ``self.patient_barcodes``, renames the stage/histology columns to the
    module-level PATHOLOGIC_STAGE / HISTOLOGIC_SUBTYPE names and recodes the
    stage values through ``ClinicalData.pathologic_stage_map``.

    Args:
        patients_file (str, os.PathLike or io.StringIO): path to the
            patients clinical data file, or an in-memory buffer holding its
            content. The two lines following the header line are skipped.
        patient_id_col (str): the patient's ID column name
        columns (list): default None. Specifies the columns to import, if
            None, then import all columns. ``patient_id_col`` is imported
            as well even if it is not listed; the list itself is not
            modified. Example:
            ['bcr_patient_barcode', 'gender', 'race',
            'histologic_diagnosis', 'tumor_status', 'death_days_to',
            'ajcc_pathologic_tumor_stage']

    Raises:
        IOError: if ``patients_file`` is neither an io.StringIO nor a path
            to an existing file.
    """
    self.patient_column = patient_id_col

    # The ID column has to be read so it can become the index. Extend a
    # copy rather than appending to the caller's list.
    usecols = columns
    if columns and patient_id_col not in columns:
        usecols = list(columns) + [patient_id_col]

    if isinstance(patients_file, io.StringIO):
        # Needed since the file was previously read to extract columns
        # information.
        patients_file.seek(0)
    elif not (isinstance(patients_file, (str, os.PathLike))
              and os.path.exists(patients_file)):
        raise IOError(patients_file)

    self.patient = pd.read_table(patients_file,
                                 skiprows=[1, 2],
                                 na_values=[
                                     "[Not Available]", "[Unknown]",
                                     "[Not Applicable]", "[Discrepancy]"
                                 ],
                                 usecols=usecols)

    self.patient_barcodes = self.patient[patient_id_col].tolist()
    self.patient.set_index(patient_id_col, inplace=True)

    # Rename columns
    self.patient.rename(
        {
            "ajcc_pathologic_tumor_stage": PATHOLOGIC_STAGE,
            "histological_type": HISTOLOGIC_SUBTYPE,
            "histologic_diagnosis.1": HISTOLOGIC_SUBTYPE
        },
        axis=1,
        inplace=True)
    self.patient.replace(
        {PATHOLOGIC_STAGE: ClinicalData.pathologic_stage_map}, inplace=True)