Пример #1
0
def load_immune_landscape(fpath=IMMUNE_LANDSCAPE):
    df = (pd.read_csv(fpath).drop('TCGA Study', axis=1).rename(
        columns={
            'TCGA Participant Barcode': 'patient_id',
            'Eosinophils.0': 'Eosinophils score',
            'Neutrophils.0': 'Neutrophils score',
            'Eosinophils.1': 'Eosinophils',
            'Neutrophils.1': 'Neutrophils'
        }).dropna(subset=['Wound Healing']).melt(id_vars='patient_id',
                                                 var_name='data_type',
                                                 value_name='value'))

    df.loc[:,
           "unit"] = df["data_type"].map(lambda dt: IMMUNE_LANDSCAPE_UNITS[dt])

    # Split into numeric values and text values
    df['is_numeric'] = df['value'].map(is_numeric)
    df_numeric = df[df['is_numeric']]
    df_numeric['value'] = df_numeric['value'].map(float)
    df_text = df[~df['is_numeric']]

    # Load to database in respective tables
    conn = db.engine.connect()
    df_numeric = (df_numeric[[
        'patient_id', 'data_type', 'unit', 'value'
    ]].drop_duplicates(subset=['patient_id', 'data_type']).dropna(
        subset=['value']))
    db.copy_from_df(df_numeric, 'patient_value')

    df_text = (df_text[[
        'patient_id', 'data_type', 'unit', 'value'
    ]].drop_duplicates(subset=['patient_id', 'data_type']).dropna(
        subset=['value']))
    db.copy_from_df(df_text, 'patient_text_value')
    conn.close()
Пример #2
0
def load_gtex_median_tpm(fpath=cfg['GTEX_MEDIAN_TPM'], env=cfg['ENV']):
    df = (pd.read_csv(fpath, sep='\t',
                      skiprows=2).melt(id_vars=['gene_id', 'Description'],
                                       var_name='tissue',
                                       value_name='median_tpm'))
    df['ensembl_id'] = df['gene_id'].map(lambda s: s.split('.')[0])

    if env == 'dev':
        df = df[df['Description'].isin(cfg['TEST_SYMBOLS'])]

    conn = db.engine.connect()
    conn.execute("INSERT INTO source (source_id) VALUES ('GTEx')")

    conn.execute('''
        CREATE TABLE tmp_gtex
        (ensembl_id varchar, tissue varchar, median_tpm numeric)
    ''')

    selected = df[['ensembl_id', 'tissue', 'median_tpm']]
    try:
        db.copy_from_df(selected, 'tmp_gtex')

        conn.execute('''
            INSERT INTO tissue_gene_value
            (source_id, tissue_id, gene_id, data_type, unit, value)
            SELECT 'GTEx', t.tissue_id, g.gene_id, 'expression',
                   'median_tpm', tmp.median_tpm
            FROM tmp_gtex tmp
            INNER JOIN tissue t ON t.gtex_id = tmp.tissue
            INNER JOIN gene g ON g.ensembl_id = tmp.ensembl_id
        ''')
    finally:
        conn.execute('DROP TABLE tmp_gtex')
        conn.close()
Пример #3
0
def load_tcia_patient(fpath=TCIA_PATIENT):
    df = (pd.read_csv(fpath, sep='\t', low_memory=False).drop(
        ['datasource', 'disease'], axis=1).melt(
            id_vars='barcode', var_name='data_type',
            value_name='value').rename(columns={'barcode': 'patient_id'}))
    df.loc[:, 'data_type'] = df['data_type'].str.replace('clinical_data_', '')
    df.loc[:, 'unit'] = 'clinical'

    # Split into numeric values and text values
    df.loc[:, 'is_numeric'] = df['value'].map(is_numeric)
    df_numeric = df[df['is_numeric']]
    df_numeric.loc[:, 'value'] = df_numeric['value'].map(float)
    df_text = df[~df['is_numeric']]

    # Load to database in respective tables
    df_numeric = (df_numeric[[
        'patient_id', 'data_type', 'unit', 'value'
    ]].drop_duplicates(subset=['patient_id', 'data_type']).dropna(
        subset=['value']))
    df_text = (df_text[[
        'patient_id', 'data_type', 'unit', 'value'
    ]].drop_duplicates(subset=['patient_id', 'data_type']).dropna(
        subset=['value']))

    conn = db.engine.connect()
    try:
        conn.execute('''
            CREATE TABLE tmp_tcia_patient_text
            (patient_id varchar, data_type varchar,
             unit varchar, value varchar)
        ''')
        db.copy_from_df(df_text, 'tmp_tcia_patient_text')
        conn.execute('''
            INSERT INTO patient_text_value
            (patient_id, data_type, unit, value)
            SELECT tmp.patient_id, tmp.data_type, tmp.unit, tmp.value
            FROM tmp_tcia_patient_text tmp
            ON CONFLICT DO NOTHING
        ''')

        conn.execute('''
            CREATE TABLE tmp_tcia_patient_num
            (patient_id varchar, data_type varchar,
             unit varchar, value numeric)
        ''')
        db.copy_from_df(df_numeric, 'tmp_tcia_patient_num')
        conn.execute('''
            INSERT INTO patient_value
            (patient_id, data_type, unit, value)
            SELECT tmp.patient_id, tmp.data_type, tmp.unit, tmp.value
            FROM tmp_tcia_patient_num tmp
            ON CONFLICT DO NOTHING
        ''')
    finally:
        conn.execute('DROP TABLE tmp_tcia_patient_text')
        conn.execute('DROP TABLE tmp_tcia_patient_num')
        conn.close()
Пример #4
0
def load_tcga_isoforms(fpath, env=cfg['ENV']):
    df = (pd.read_csv(fpath).rename(columns={
        'transcript_id': 'isoform_id',
        'rsem_normalized': 'value'
    }))
    df['sample_id'] = df['barcode'].map(lambda s: s[:15])
    df['unit'] = 'normalized_counts'
    df = df[['sample_id', 'isoform_id', 'unit', 'value']]
    df = df.drop_duplicates(subset=['sample_id', 'isoform_id'])

    if env == 'dev':
        df = df[df['isoform_id'].isin(cfg['TEST_ISOFORMS'])]

    db.copy_from_df(df, 'sample_isoform_value')
Пример #5
0
def load_tcia_pathways(up_fpath=TCIA_GSEA_ENRICHMENT,
                       down_fpath=TCIA_GSEA_DEPLETION):
    for fpath in up_fpath, down_fpath:
        df = (pd.read_csv(fpath, sep='\t').drop(['disease'], axis=1).rename(
            columns={
                'patients': 'patient_id',
                'cellType': 'data_type',
                'qValue < 1%Enriched': 'qvalue',
                'qValue < 1%Depleted': 'qvalue',
                'NES > 0': 'value'
            }))
        df = df[df['qvalue'] < 0.05]
        df['unit'] = 'normalized enrichment score'

        # Load to database in respective tables
        df = (df[['patient_id', 'data_type', 'unit', 'value']].drop_duplicates(
            subset=['patient_id', 'data_type']).dropna(subset=['value']))
        db.copy_from_df(df, 'patient_value')
Пример #6
0
def load_tcga_clinical(fpath):
    mat = pd.read_csv(fpath, sep='\t')
    df = mat.melt(id_vars='Hybridization REF',
                  var_name='patient_id',
                  value_name='value')
    df = df[df['Hybridization REF'] != 'Composite Element REF']
    df['patient_id'] = df['patient_id'].str.upper()
    df['unit'] = 'clinical'
    df = df.rename(columns={'Hybridization REF': 'data_type'})
    df = df.drop_duplicates(subset=['patient_id', 'data_type'])
    df = df.dropna(subset=['value'])

    df['is_numeric'] = df['value'].map(is_numeric)
    df_numeric = df[df['is_numeric']]
    df_text = df[~df['is_numeric']]

    df_numeric = df_numeric[['patient_id', 'data_type', 'unit', 'value']]
    db.copy_from_df(df_numeric, 'patient_value')
    df_text = df_text[['patient_id', 'data_type', 'unit', 'value']]
    db.copy_from_df(df_text, 'patient_text_value')
Пример #7
0
def load_hpa_protein(fpath=HPA_NORMAL_TISSUE, env=cfg['ENV']):
    cell_types = pd.read_csv(CELL_TYPES)
    df = pd.read_csv(fpath, sep='\t')

    # if env == 'test':
    #     df = df[df['Gene name'].isin(TEST_GENES)]

    formatted = (
        df[df['Reliability'] != 'Uncertain']
        .merge(
            cell_types,
            left_on=['Tissue', 'Cell type'],
            right_on=['hpa_tissue_id', 'cell_type'])
        [['Gene', 'cell_type_id', 'Level']]
        .drop_duplicates(subset=['cell_type_id', 'Gene']))

    conn = db.engine.connect()

    conn.execute("INSERT INTO source (source_id) VALUES ('HPA')")

    # Load to temp table
    conn.execute('DROP TABLE IF EXISTS tmp_hpa_prot')
    conn.execute('''
        CREATE TABLE tmp_hpa_prot
        (Gene varchar, cell_type_id varchar, Level varchar)
    ''')
    db.copy_from_df(formatted, 'tmp_hpa_prot')

    # Insert from temp, joining to existing keys (cell type, gene)
    conn.execute('''
        INSERT INTO cell_type_gene_text_value
        (source_id, cell_type_id, gene_id, data_type, unit, value)
        SELECT 'HPA', tmp.cell_type_id, g.gene_id, 'protein',
               'detection level', tmp.Level
        FROM tmp_hpa_prot tmp
        INNER JOIN gene g ON g.ensembl_id = tmp.Gene
    ''')

    conn.close()
Пример #8
0
def load_hpa_expression(fpath=HPA_RNA_TISSUE, env=cfg['ENV']):
    tissues = pd.read_csv(TISSUES)
    df = pd.read_csv(fpath, sep='\t')

    # if env == 'test':
    #     df = df[df['Gene name'].isin(TEST_GENES)]

    formatted = (
        df
        .merge(
            tissues,
            left_on='Sample', right_on='hpa_id')
        [['Gene', 'tissue_id', 'Value']]
        .drop_duplicates(subset=['tissue_id', 'Gene']))

    conn = db.engine.connect()

    # Load to temp table
    conn.execute('DROP TABLE IF EXISTS tmp_hpa_expr')
    conn.execute('''
        CREATE TABLE tmp_hpa_expr
        (Gene varchar, tissue_id varchar, Value numeric)
    ''')
    db.copy_from_df(formatted, 'tmp_hpa_expr')

    # Insert from temp, joining to existing keys (cell type, gene)
    conn.execute('''
        INSERT INTO tissue_gene_value
        (source_id, tissue_id, gene_id, data_type, unit, value)
        SELECT 'HPA', tmp.tissue_id, g.gene_id, 'expression',
               'TPM', tmp.Value
        FROM tmp_hpa_expr tmp
        INNER JOIN gene g ON g.ensembl_id = tmp.Gene
    ''')
    conn.execute('DROP TABLE IF EXISTS tmp_hpa_expr')
    conn.close()
Пример #9
0
def load_tcga_sample_meta(cohort_fpath=TCGA_COHORT_META,
                          patient_fpath=TCGA_PATIENT_META,
                          sample_fpath=TCGA_SAMPLE_META):

    conn = db.engine.connect()
    conn.execute("INSERT INTO source (source_id) VALUES ('TCGA')")

    cohort = (pd.read_csv(cohort_fpath).assign(source_id='TCGA')[[
        'cohort_id', 'source_id', 'cohort_name'
    ]].drop_duplicates())
    db.copy_from_df(cohort, 'cohort')

    patient = (pd.read_csv(patient_fpath)[[
        'patient_id', 'cohort_id'
    ]].drop_duplicates(subset=['patient_id']))
    db.copy_from_df(patient, 'patient')

    sample = (pd.read_csv(sample_fpath)[[
        'sample_id', 'patient_id', 'sample_code', 'sample_type'
    ]].drop_duplicates())
    db.copy_from_df(sample, 'sample')

    conn.close()
Пример #10
0
def load_tcga_mutation(fpath=TCGA_MUTATIONS, env=cfg['ENV']):
    maf = pd.read_csv(fpath, sep='\t')
    #if 'Protein_Change' in maf.columns:
    df = maf[[
        'Gene',
        'Tumor_Sample_Barcode',
        'Variant_Classification',
        'Variant_Type',
        'HGVSp_Short',
    ]]
    # elif 'AAChange' in maf.columns:
    #     df = maf[
    #         [
    #             'Entrez_Gene_Id',
    #             'Tumor_Sample_Barcode',
    #             'Variant_Classification',
    #             'Variant_Type',
    #             'AAChange',
    #         ]
    #     ]
    # elif 'amino_acid_change' in maf.columns:
    #     df = maf[
    #         [
    #             'Entrez_Gene_Id',
    #             'Tumor_Sample_Barcode',
    #             'Variant_Classification',
    #             'Variant_Type',
    #             'amino_acid_change',
    #         ]
    #     ]
    # else:
    #     raise Exception(
    #         'MAF file should have either "Protein_Change", "AAChange", '
    #         'or "amino_acid_change" column'
    #     )
    df.loc[:, 'sample_id'] = df['Tumor_Sample_Barcode'].map(lambda s: s[:15])
    df = df.drop('Tumor_Sample_Barcode', axis=1)
    df = df.rename(
        columns={
            'Gene': 'ensembl_id',
            'Variant_Type': 'variant type',
            'Variant_Classification': 'variant classification',
            'AAChange': 'AA change'
        })
    # df = df[df['gene_id'] > 0]

    # if env == 'dev':
    #     df = df[df['ensembl_id'].isin(cfg['TEST_ENSEMBL_GENES'])]

    df = df.melt(id_vars=['sample_id', 'ensembl_id'],
                 var_name='data_type',
                 value_name='value')
    df.loc[:, 'unit'] = 'mutation'
    df = df.drop_duplicates(subset=['ensembl_id', 'sample_id', 'data_type'])
    df = df.dropna(subset=['value'])

    df = df[['sample_id', 'ensembl_id', 'data_type', 'unit', 'value']]

    conn = db.engine.connect()

    try:
        conn.execute('''
            CREATE TABLE tmp_tcga_maf
            (sample_id varchar, ensembl_id varchar, data_type varchar,
             unit varchar, value varchar)
        ''')

        db.copy_from_df(df, 'tmp_tcga_maf')

        conn.execute('''
            INSERT INTO sample_gene_text_value
            (sample_id, gene_id, data_type, unit, value)
            SELECT tmp.sample_id, g.gene_id, tmp.data_type,
                   tmp.unit, tmp.value
            FROM tmp_tcga_maf tmp
            INNER JOIN gene g ON g.ensembl_id = tmp.ensembl_id
            INNER JOIN sample s ON s.sample_id = tmp.sample_id
        ''')
    finally:
        conn.execute('DROP TABLE tmp_tcga_maf')
        conn.close()
Пример #11
0
def load_gtex_isoform(fpath=cfg['GTEX_MEDIAN_ISOFORM']):
    df = (pd.read_csv(fpath).rename(columns={'median_tpm': 'value'}))
    df.loc[:, 'unit']: 'TPM'
    df = df[['source_id', 'tissue_id', 'isoform_id', 'unit', 'value']]
    db.copy_from_df(df, 'tissue_isoform_value')