def load_immune_landscape(fpath=IMMUNE_LANDSCAPE): df = (pd.read_csv(fpath).drop('TCGA Study', axis=1).rename( columns={ 'TCGA Participant Barcode': 'patient_id', 'Eosinophils.0': 'Eosinophils score', 'Neutrophils.0': 'Neutrophils score', 'Eosinophils.1': 'Eosinophils', 'Neutrophils.1': 'Neutrophils' }).dropna(subset=['Wound Healing']).melt(id_vars='patient_id', var_name='data_type', value_name='value')) df.loc[:, "unit"] = df["data_type"].map(lambda dt: IMMUNE_LANDSCAPE_UNITS[dt]) # Split into numeric values and text values df['is_numeric'] = df['value'].map(is_numeric) df_numeric = df[df['is_numeric']] df_numeric['value'] = df_numeric['value'].map(float) df_text = df[~df['is_numeric']] # Load to database in respective tables conn = db.engine.connect() df_numeric = (df_numeric[[ 'patient_id', 'data_type', 'unit', 'value' ]].drop_duplicates(subset=['patient_id', 'data_type']).dropna( subset=['value'])) db.copy_from_df(df_numeric, 'patient_value') df_text = (df_text[[ 'patient_id', 'data_type', 'unit', 'value' ]].drop_duplicates(subset=['patient_id', 'data_type']).dropna( subset=['value'])) db.copy_from_df(df_text, 'patient_text_value') conn.close()
def load_gtex_median_tpm(fpath=cfg['GTEX_MEDIAN_TPM'], env=cfg['ENV']): df = (pd.read_csv(fpath, sep='\t', skiprows=2).melt(id_vars=['gene_id', 'Description'], var_name='tissue', value_name='median_tpm')) df['ensembl_id'] = df['gene_id'].map(lambda s: s.split('.')[0]) if env == 'dev': df = df[df['Description'].isin(cfg['TEST_SYMBOLS'])] conn = db.engine.connect() conn.execute("INSERT INTO source (source_id) VALUES ('GTEx')") conn.execute(''' CREATE TABLE tmp_gtex (ensembl_id varchar, tissue varchar, median_tpm numeric) ''') selected = df[['ensembl_id', 'tissue', 'median_tpm']] try: db.copy_from_df(selected, 'tmp_gtex') conn.execute(''' INSERT INTO tissue_gene_value (source_id, tissue_id, gene_id, data_type, unit, value) SELECT 'GTEx', t.tissue_id, g.gene_id, 'expression', 'median_tpm', tmp.median_tpm FROM tmp_gtex tmp INNER JOIN tissue t ON t.gtex_id = tmp.tissue INNER JOIN gene g ON g.ensembl_id = tmp.ensembl_id ''') finally: conn.execute('DROP TABLE tmp_gtex') conn.close()
def load_tcia_patient(fpath=TCIA_PATIENT): df = (pd.read_csv(fpath, sep='\t', low_memory=False).drop( ['datasource', 'disease'], axis=1).melt( id_vars='barcode', var_name='data_type', value_name='value').rename(columns={'barcode': 'patient_id'})) df.loc[:, 'data_type'] = df['data_type'].str.replace('clinical_data_', '') df.loc[:, 'unit'] = 'clinical' # Split into numeric values and text values df.loc[:, 'is_numeric'] = df['value'].map(is_numeric) df_numeric = df[df['is_numeric']] df_numeric.loc[:, 'value'] = df_numeric['value'].map(float) df_text = df[~df['is_numeric']] # Load to database in respective tables df_numeric = (df_numeric[[ 'patient_id', 'data_type', 'unit', 'value' ]].drop_duplicates(subset=['patient_id', 'data_type']).dropna( subset=['value'])) df_text = (df_text[[ 'patient_id', 'data_type', 'unit', 'value' ]].drop_duplicates(subset=['patient_id', 'data_type']).dropna( subset=['value'])) conn = db.engine.connect() try: conn.execute(''' CREATE TABLE tmp_tcia_patient_text (patient_id varchar, data_type varchar, unit varchar, value varchar) ''') db.copy_from_df(df_text, 'tmp_tcia_patient_text') conn.execute(''' INSERT INTO patient_text_value (patient_id, data_type, unit, value) SELECT tmp.patient_id, tmp.data_type, tmp.unit, tmp.value FROM tmp_tcia_patient_text tmp ON CONFLICT DO NOTHING ''') conn.execute(''' CREATE TABLE tmp_tcia_patient_num (patient_id varchar, data_type varchar, unit varchar, value numeric) ''') db.copy_from_df(df_numeric, 'tmp_tcia_patient_num') conn.execute(''' INSERT INTO patient_value (patient_id, data_type, unit, value) SELECT tmp.patient_id, tmp.data_type, tmp.unit, tmp.value FROM tmp_tcia_patient_num tmp ON CONFLICT DO NOTHING ''') finally: conn.execute('DROP TABLE tmp_tcia_patient_text') conn.execute('DROP TABLE tmp_tcia_patient_num') conn.close()
def load_tcga_isoforms(fpath, env=cfg['ENV']): df = (pd.read_csv(fpath).rename(columns={ 'transcript_id': 'isoform_id', 'rsem_normalized': 'value' })) df['sample_id'] = df['barcode'].map(lambda s: s[:15]) df['unit'] = 'normalized_counts' df = df[['sample_id', 'isoform_id', 'unit', 'value']] df = df.drop_duplicates(subset=['sample_id', 'isoform_id']) if env == 'dev': df = df[df['isoform_id'].isin(cfg['TEST_ISOFORMS'])] db.copy_from_df(df, 'sample_isoform_value')
def load_tcia_pathways(up_fpath=TCIA_GSEA_ENRICHMENT, down_fpath=TCIA_GSEA_DEPLETION): for fpath in up_fpath, down_fpath: df = (pd.read_csv(fpath, sep='\t').drop(['disease'], axis=1).rename( columns={ 'patients': 'patient_id', 'cellType': 'data_type', 'qValue < 1%Enriched': 'qvalue', 'qValue < 1%Depleted': 'qvalue', 'NES > 0': 'value' })) df = df[df['qvalue'] < 0.05] df['unit'] = 'normalized enrichment score' # Load to database in respective tables df = (df[['patient_id', 'data_type', 'unit', 'value']].drop_duplicates( subset=['patient_id', 'data_type']).dropna(subset=['value'])) db.copy_from_df(df, 'patient_value')
def load_tcga_clinical(fpath): mat = pd.read_csv(fpath, sep='\t') df = mat.melt(id_vars='Hybridization REF', var_name='patient_id', value_name='value') df = df[df['Hybridization REF'] != 'Composite Element REF'] df['patient_id'] = df['patient_id'].str.upper() df['unit'] = 'clinical' df = df.rename(columns={'Hybridization REF': 'data_type'}) df = df.drop_duplicates(subset=['patient_id', 'data_type']) df = df.dropna(subset=['value']) df['is_numeric'] = df['value'].map(is_numeric) df_numeric = df[df['is_numeric']] df_text = df[~df['is_numeric']] df_numeric = df_numeric[['patient_id', 'data_type', 'unit', 'value']] db.copy_from_df(df_numeric, 'patient_value') df_text = df_text[['patient_id', 'data_type', 'unit', 'value']] db.copy_from_df(df_text, 'patient_text_value')
def load_hpa_protein(fpath=HPA_NORMAL_TISSUE, env=cfg['ENV']): cell_types = pd.read_csv(CELL_TYPES) df = pd.read_csv(fpath, sep='\t') # if env == 'test': # df = df[df['Gene name'].isin(TEST_GENES)] formatted = ( df[df['Reliability'] != 'Uncertain'] .merge( cell_types, left_on=['Tissue', 'Cell type'], right_on=['hpa_tissue_id', 'cell_type']) [['Gene', 'cell_type_id', 'Level']] .drop_duplicates(subset=['cell_type_id', 'Gene'])) conn = db.engine.connect() conn.execute("INSERT INTO source (source_id) VALUES ('HPA')") # Load to temp table conn.execute('DROP TABLE IF EXISTS tmp_hpa_prot') conn.execute(''' CREATE TABLE tmp_hpa_prot (Gene varchar, cell_type_id varchar, Level varchar) ''') db.copy_from_df(formatted, 'tmp_hpa_prot') # Insert from temp, joining to existing keys (cell type, gene) conn.execute(''' INSERT INTO cell_type_gene_text_value (source_id, cell_type_id, gene_id, data_type, unit, value) SELECT 'HPA', tmp.cell_type_id, g.gene_id, 'protein', 'detection level', tmp.Level FROM tmp_hpa_prot tmp INNER JOIN gene g ON g.ensembl_id = tmp.Gene ''') conn.close()
def load_hpa_expression(fpath=HPA_RNA_TISSUE, env=cfg['ENV']): tissues = pd.read_csv(TISSUES) df = pd.read_csv(fpath, sep='\t') # if env == 'test': # df = df[df['Gene name'].isin(TEST_GENES)] formatted = ( df .merge( tissues, left_on='Sample', right_on='hpa_id') [['Gene', 'tissue_id', 'Value']] .drop_duplicates(subset=['tissue_id', 'Gene'])) conn = db.engine.connect() # Load to temp table conn.execute('DROP TABLE IF EXISTS tmp_hpa_expr') conn.execute(''' CREATE TABLE tmp_hpa_expr (Gene varchar, tissue_id varchar, Value numeric) ''') db.copy_from_df(formatted, 'tmp_hpa_expr') # Insert from temp, joining to existing keys (cell type, gene) conn.execute(''' INSERT INTO tissue_gene_value (source_id, tissue_id, gene_id, data_type, unit, value) SELECT 'HPA', tmp.tissue_id, g.gene_id, 'expression', 'TPM', tmp.Value FROM tmp_hpa_expr tmp INNER JOIN gene g ON g.ensembl_id = tmp.Gene ''') conn.execute('DROP TABLE IF EXISTS tmp_hpa_expr') conn.close()
def load_tcga_sample_meta(cohort_fpath=TCGA_COHORT_META, patient_fpath=TCGA_PATIENT_META, sample_fpath=TCGA_SAMPLE_META): conn = db.engine.connect() conn.execute("INSERT INTO source (source_id) VALUES ('TCGA')") cohort = (pd.read_csv(cohort_fpath).assign(source_id='TCGA')[[ 'cohort_id', 'source_id', 'cohort_name' ]].drop_duplicates()) db.copy_from_df(cohort, 'cohort') patient = (pd.read_csv(patient_fpath)[[ 'patient_id', 'cohort_id' ]].drop_duplicates(subset=['patient_id'])) db.copy_from_df(patient, 'patient') sample = (pd.read_csv(sample_fpath)[[ 'sample_id', 'patient_id', 'sample_code', 'sample_type' ]].drop_duplicates()) db.copy_from_df(sample, 'sample') conn.close()
def load_tcga_mutation(fpath=TCGA_MUTATIONS, env=cfg['ENV']): maf = pd.read_csv(fpath, sep='\t') #if 'Protein_Change' in maf.columns: df = maf[[ 'Gene', 'Tumor_Sample_Barcode', 'Variant_Classification', 'Variant_Type', 'HGVSp_Short', ]] # elif 'AAChange' in maf.columns: # df = maf[ # [ # 'Entrez_Gene_Id', # 'Tumor_Sample_Barcode', # 'Variant_Classification', # 'Variant_Type', # 'AAChange', # ] # ] # elif 'amino_acid_change' in maf.columns: # df = maf[ # [ # 'Entrez_Gene_Id', # 'Tumor_Sample_Barcode', # 'Variant_Classification', # 'Variant_Type', # 'amino_acid_change', # ] # ] # else: # raise Exception( # 'MAF file should have either "Protein_Change", "AAChange", ' # 'or "amino_acid_change" column' # ) df.loc[:, 'sample_id'] = df['Tumor_Sample_Barcode'].map(lambda s: s[:15]) df = df.drop('Tumor_Sample_Barcode', axis=1) df = df.rename( columns={ 'Gene': 'ensembl_id', 'Variant_Type': 'variant type', 'Variant_Classification': 'variant classification', 'AAChange': 'AA change' }) # df = df[df['gene_id'] > 0] # if env == 'dev': # df = df[df['ensembl_id'].isin(cfg['TEST_ENSEMBL_GENES'])] df = df.melt(id_vars=['sample_id', 'ensembl_id'], var_name='data_type', value_name='value') df.loc[:, 'unit'] = 'mutation' df = df.drop_duplicates(subset=['ensembl_id', 'sample_id', 'data_type']) df = df.dropna(subset=['value']) df = df[['sample_id', 'ensembl_id', 'data_type', 'unit', 'value']] conn = db.engine.connect() try: conn.execute(''' CREATE TABLE tmp_tcga_maf (sample_id varchar, ensembl_id varchar, data_type varchar, unit varchar, value varchar) ''') db.copy_from_df(df, 'tmp_tcga_maf') conn.execute(''' INSERT INTO sample_gene_text_value (sample_id, gene_id, data_type, unit, value) SELECT tmp.sample_id, g.gene_id, tmp.data_type, tmp.unit, tmp.value FROM tmp_tcga_maf tmp INNER JOIN gene g ON g.ensembl_id = tmp.ensembl_id INNER JOIN sample s ON s.sample_id = tmp.sample_id ''') finally: conn.execute('DROP TABLE tmp_tcga_maf') conn.close()
def load_gtex_isoform(fpath=cfg['GTEX_MEDIAN_ISOFORM']): df = (pd.read_csv(fpath).rename(columns={'median_tpm': 'value'})) df.loc[:, 'unit']: 'TPM' df = df[['source_id', 'tissue_id', 'isoform_id', 'unit', 'value']] db.copy_from_df(df, 'tissue_isoform_value')