示例#1
0
def main(args):
    """Prepare the GSE16032 expression matrix and its disease sample list.

    Loads the series through ``GEOparse.get_GEO`` (using
    ``<out_expr_dir>/raw`` as ``destdir``), pivots the ``VALUE`` column
    annotated with ``ENTREZ_GENE_ID`` from platform GPL570, natural-log
    transforms the values and writes ``expr.tsv`` and ``disease_gsms.txt``
    under ``<out_expr_dir>/processed``.

    Args:
        args: Object exposing ``out_expr_dir`` (output root directory);
            presumably the parsed command-line namespace.
    """
    logging.basicConfig(level=logging.INFO,
                        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
                        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    utils.create_dir_if_not_exist(args.out_expr_dir)
    utils.create_dir_if_not_exist(raw_dir)
    utils.create_dir_if_not_exist(processed_dir)

    gse = GEOparse.get_GEO(geo='GSE16032', destdir=raw_dir)

    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL570'], 'ENTREZ_GENE_ID')
    # Keep only probes that carry an Entrez annotation.
    annotated2 = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    # Drop probes whose expression values are all missing.  The annotation
    # column is excluded from the check: it is never null at this point, so
    # including it would make the "all null" test always fail.
    value_cols = annotated2.columns.drop('ENTREZ_GENE_ID')
    has_values = annotated2[value_cols].notnull().any(axis=1)
    # Explicit copy so the column assignment below does not write into a
    # slice of ``annotated`` (avoids pandas' SettingWithCopyWarning).
    annotated2 = annotated2.loc[has_values].copy()
    # Multi-gene probes are written as "id1 /// id2 ..."; keep the first id.
    annotated2['ENTREZ_GENE_ID'] = annotated2.ENTREZ_GENE_ID.str.split('///').str[0].astype(int)
    annotated2 = annotated2.set_index('ENTREZ_GENE_ID')
    annotated2 = np.log(annotated2)

    disease_cls = ['disease state: Acute', 'disease state: Convalescence']
    logging.info(disease_cls)
    # The disease state is read from the second ``characteristics_ch1`` entry.
    disease_gsm = [
        gsm for gsm in gse.gsms
        if gse.gsms[gsm].metadata['characteristics_ch1'][1] in disease_cls
    ]
    logging.info("Disease GSM: {}".format(len(disease_gsm)))

    utils.write_expr(join(processed_dir, 'expr.tsv'), annotated2)
    utils.write_text(join(processed_dir, 'disease_gsms.txt'), disease_gsm)
示例#2
0
def main(args):
    """Prepare the GSE64913 expression matrix and per-class sample lists.

    Loads the series through ``GEOparse.get_GEO`` (using
    ``<out_expr_dir>/raw`` as ``destdir``), pivots the ``VALUE`` column
    annotated with ``ENTREZ_GENE_ID`` from platform GPL570 and writes
    ``expr.tsv`` plus one ``<class>_gsms.txt`` file per class under
    ``<out_expr_dir>/processed``.  No log transform is applied here.

    Args:
        args: Object exposing ``out_expr_dir`` (output root directory);
            presumably the parsed command-line namespace.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    utils.create_dir_if_not_exist(args.out_expr_dir)
    utils.create_dir_if_not_exist(raw_dir)
    utils.create_dir_if_not_exist(processed_dir)

    gse = GEOparse.get_GEO(geo='GSE64913', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL570'],
                                       'ENTREZ_GENE_ID')
    # Keep only probes that carry an Entrez annotation.
    annotated2 = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    # Drop probes whose expression values are all missing.  The annotation
    # column is excluded from the check: it is never null at this point, so
    # including it would make the "all null" test always fail.
    value_cols = annotated2.columns.drop('ENTREZ_GENE_ID')
    has_values = annotated2[value_cols].notnull().any(axis=1)
    # Explicit copy so the column assignment below does not write into a
    # slice of ``annotated`` (avoids pandas' SettingWithCopyWarning).
    annotated2 = annotated2.loc[has_values].copy()
    # Multi-gene probes are written as "id1 /// id2 ..."; keep the first id.
    annotated2['ENTREZ_GENE_ID'] = annotated2.ENTREZ_GENE_ID.str.split(
        '///').str[0].astype(int)
    annotated2 = annotated2.set_index('ENTREZ_GENE_ID')

    # class name -> [diagnosis label, cell-type label]
    classes = {
        'healthy_cae': [
            'diagnosis: Healthy', 'cell type: Central airway epithelium'
        ],
        'healthy_pae': [
            'diagnosis: Healthy', 'cell type: Peripheral airway epithelium'
        ],
        'asthma_cae': [
            'diagnosis: Severe Asthmatic',
            'cell type: Central airway epithelium'
        ],
        'asthma_pae': [
            'diagnosis: Severe Asthmatic',
            'cell type: Peripheral airway epithelium'
        ],
    }
    logging.info(classes)

    # The diagnosis is compared with ``characteristics_ch1[1]`` and the cell
    # type with ``characteristics_ch1[5]``; the second lookup only happens
    # when the first one matched.
    gsms = {}
    for cls, (diagnosis, cell_type) in classes.items():
        gsms[cls] = [
            gsm for gsm in gse.gsms
            if gse.gsms[gsm].metadata['characteristics_ch1'][1] == diagnosis
            and gse.gsms[gsm].metadata['characteristics_ch1'][5] == cell_type
        ]

    logging.info(' '.join(
        ['{} GSM:{}'.format(cls, len(gsms[cls])) for cls in classes]))
    utils.write_expr(join(processed_dir, 'expr.tsv'), annotated2)
    for cls in classes:
        utils.write_text(join(processed_dir, '{}_gsms.txt'.format(cls)),
                         gsms[cls])
def main(args):
    """Restrict gene-disease associations to a network and select diseases.

    Reads a tab-separated gene-disease association table, a ``|``-separated
    disease mapping table and a network (via ``utils.read_network``).
    Associations whose ``geneId`` is not a network node are dropped, diseases
    are summarised (one row per ``diseaseId`` with its gene count) and kept
    when their gene count lies strictly between ``min_disease_size`` and
    ``max_disease_size``, ``diseaseType`` is ``'disease'`` and
    ``diseaseSemanticType`` is ``'Disease or Syndrome'``.  Each kept disease
    gets a ``DOID`` column with its ``DO`` vocabulary codes joined by ``|``.

    Args:
        args: Object exposing ``in_gda_path``, ``in_dismap_path``,
            ``in_net_path``, ``min_disease_size``, ``max_disease_size``,
            ``out_gda_file``, ``out_disease_file`` and ``out_doid_file``;
            presumably the parsed command-line namespace.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[
            logging.FileHandler("../logs/report.log"),
            logging.StreamHandler()
        ])
    logging.info(args)

    gda = pd.read_csv(args.in_gda_path, sep='\t')
    disease_maps = pd.read_csv(args.in_dismap_path, sep='|')
    net = utils.read_network(args.in_net_path)

    gda = gda[gda.geneId.isin(net.nodes())]

    # One summary row per disease; descriptive fields are taken from the
    # first association row of the group.
    diseases = gda.groupby('diseaseId').apply(lambda x: pd.Series(
        {
            'diseaseName': x.diseaseName.tolist()[0],
            'diseaseType': x.diseaseType.tolist()[0],
            'diseaseClass': x.diseaseClass.tolist()[0],
            'diseaseSemanticType': x.diseaseSemanticType.tolist()[0],
            'n_genes': x.shape[0]
        })).reset_index()
    print(diseases.shape)
    keep = ((diseases.n_genes > args.min_disease_size)
            & (diseases.n_genes < args.max_disease_size)
            & (diseases.diseaseType == 'disease')
            & (diseases.diseaseSemanticType == 'Disease or Syndrome'))
    # Explicit copy so adding the DOID column below does not write into a
    # slice of the unfiltered frame (avoids SettingWithCopyWarning).
    diseases = diseases[keep].copy()
    print(diseases.shape)

    # Filter the mapping table to the DO vocabulary once instead of once per
    # disease.
    do_maps = disease_maps[disease_maps.vocabulary == 'DO']

    def get_doids(diseaseId):
        """Return the DO codes mapped to ``diseaseId`` as 'DOID:x|DOID:y'."""
        codes = do_maps[do_maps.diseaseId == diseaseId].code.tolist()
        return '|'.join('DOID:{}'.format(doid) for doid in codes)

    diseases['DOID'] = diseases.diseaseId.apply(get_doids)
    # Unique DOIDs across all kept diseases, sorted so that the output file
    # is identical from run to run (plain set order is not stable for
    # strings across interpreter runs).
    doids = sorted({
        'DOID:{}'.format(doid)
        for doid in do_maps[do_maps.diseaseId.isin(
            diseases.diseaseId)].code.tolist()
    })

    gda.to_csv(args.out_gda_file, sep='\t', index=False)
    diseases.to_csv(args.out_disease_file, sep='\t', index=False)
    utils.write_text(args.out_doid_file, doids)
示例#4
0
def main(args):
    """Prepare the GSE54837 expression matrix and disease/healthy sample lists.

    Loads the series through ``GEOparse.get_GEO`` (using
    ``<out_expr_dir>/raw`` as ``destdir``), pivots the ``VALUE`` column
    annotated with ``ENTREZ_GENE_ID`` from platform GPL570, replaces
    non-positive values by 0.001, natural-log transforms the matrix and
    writes ``expr.tsv``, ``disease_gsms.txt`` and ``healthy_gsms.txt`` under
    ``<out_expr_dir>/processed``.

    Args:
        args: Object exposing ``out_expr_dir`` (output root directory);
            presumably the parsed command-line namespace.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    utils.create_dir_if_not_exist(args.out_expr_dir)
    utils.create_dir_if_not_exist(raw_dir)
    utils.create_dir_if_not_exist(processed_dir)

    gse = GEOparse.get_GEO(geo='GSE54837', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL570'],
                                       'ENTREZ_GENE_ID')
    # Keep only probes that carry an Entrez annotation.
    annotated2 = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    # Drop probes whose expression values are all missing.  The annotation
    # column is excluded from the check: it is never null at this point, so
    # including it would make the "all null" test always fail.
    value_cols = annotated2.columns.drop('ENTREZ_GENE_ID')
    has_values = annotated2[value_cols].notnull().any(axis=1)
    # Explicit copy so the column assignment below does not write into a
    # slice of ``annotated`` (avoids pandas' SettingWithCopyWarning).
    annotated2 = annotated2.loc[has_values].copy()
    # Multi-gene probes are written as "id1 /// id2 ..."; keep the first id.
    annotated2['ENTREZ_GENE_ID'] = annotated2.ENTREZ_GENE_ID.str.split(
        '///').str[0].astype(int)
    annotated2 = annotated2.set_index('ENTREZ_GENE_ID')
    # Clamp non-positive values so the logarithm below is defined.
    annotated2[annotated2 <= 0] = 0.001
    annotated2 = np.log(annotated2)

    disease_cls = ['subject type: COPD Subjects']
    healthy_cls = [
        'subject type: Non-smoker Controls', 'subject type: Smoker Controls'
    ]
    healthy_non_smoker_cls = ['subject type: Non-smoker Controls']
    logging.info(disease_cls)
    logging.info(healthy_cls)
    logging.info(healthy_non_smoker_cls)

    def samples_with(labels):
        """Return GSM ids whose sixth characteristics entry is in ``labels``."""
        return [
            gsm for gsm in gse.gsms
            if gse.gsms[gsm].metadata['characteristics_ch1'][5] in labels
        ]

    disease_gsm = samples_with(disease_cls)
    healthy_gsm = samples_with(healthy_cls)
    # Only counted for the log line below; this list is not written to disk.
    healthy_non_smoker_gsm = samples_with(healthy_non_smoker_cls)
    logging.info(
        "Disease GSM: {}, Healthy GSM: {}, Healthy non smoker GSM: {}".format(
            len(disease_gsm), len(healthy_gsm), len(healthy_non_smoker_gsm)))

    utils.write_expr(join(processed_dir, 'expr.tsv'), annotated2)
    utils.write_text(join(processed_dir, 'disease_gsms.txt'), disease_gsm)
    utils.write_text(join(processed_dir, 'healthy_gsms.txt'), healthy_gsm)
示例#5
0
def main(args):
    """Prepare the GSE473 (GPL96 samples only) expression matrix and sample lists.

    Loads the series through ``GEOparse.get_GEO`` (using
    ``<out_expr_dir>/raw`` as ``destdir``), pivots the ``VALUE`` column
    annotated with ``ENTREZ_GENE_ID`` from platform GPL96, keeps only the
    samples whose ``platform_id`` is GPL96, drops rows with any missing
    value, natural-log transforms the matrix and writes ``expr.tsv`` plus one
    ``<class>_gsms.txt`` file per class under ``<out_expr_dir>/processed``.

    Args:
        args: Object exposing ``out_expr_dir`` (output root directory);
            presumably the parsed command-line namespace.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    utils.create_dir_if_not_exist(args.out_expr_dir)
    utils.create_dir_if_not_exist(raw_dir)
    utils.create_dir_if_not_exist(processed_dir)

    gse = GEOparse.get_GEO(geo='GSE473', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL96'],
                                       'ENTREZ_GENE_ID')
    # Only samples measured on the GPL96 platform are used.
    gsm96 = [
        gsm for gsm in gse.gsms
        if gse.gsms[gsm].metadata['platform_id'][0] == 'GPL96'
    ]
    annotated2 = annotated[gsm96 + ['ENTREZ_GENE_ID']]
    print(annotated2.shape)
    # Keep only probes that carry an Entrez annotation.
    annotated2 = annotated2[~pd.isnull(annotated2.ENTREZ_GENE_ID)]
    print(annotated2.shape)
    # Drop probes with at least one missing value.  Explicit copy so the
    # column assignment below does not write into a slice of ``annotated``
    # (avoids pandas' SettingWithCopyWarning).
    annotated2 = annotated2.loc[~annotated2.isnull().values.any(axis=1)].copy()
    print(annotated2.shape)
    # Multi-gene probes are written as "id1 /// id2 ..."; keep the first id.
    annotated2['ENTREZ_GENE_ID'] = annotated2.ENTREZ_GENE_ID.str.split(
        '///').str[0].astype(int)
    annotated2 = annotated2.set_index('ENTREZ_GENE_ID')
    annotated2 = np.log(annotated2)

    # Original author's note on the labels found in sample titles:
    # {'astM_atop', 'astM_nonatop', 'astS_atop', 'ctr_atop', 'ctr_nonatop'}
    classes = {
        'asthma_med_nonatop': ['astM_nonatop'],
        'control_nonatop': ['ctr_nonatop'],
    }
    logging.info(classes)

    # A sample belongs to a class when the class label is a substring of its
    # title.
    gsms = {
        cls: [
            gsm for gsm in gsm96
            if labels[0] in gse.gsms[gsm].metadata['title'][0]
        ]
        for cls, labels in classes.items()
    }

    logging.info(' '.join(
        ['{} GSM:{}'.format(cls, len(gsms[cls])) for cls in classes]))
    utils.write_expr(join(processed_dir, 'expr.tsv'), annotated2)
    for cls in classes:
        utils.write_text(join(processed_dir, '{}_gsms.txt'.format(cls)),
                         gsms[cls])
示例#6
0
def main(args):
    """Prepare the GSE31773 expression matrix and per-class sample lists.

    Loads the series through ``GEOparse.get_GEO`` (using
    ``<out_expr_dir>/raw`` as ``destdir``), pivots the ``VALUE`` column
    annotated with ``ENTREZ_GENE_ID`` from platform GPL570 and writes
    ``expr.tsv`` plus one ``<class>_gsms.txt`` file per class under
    ``<out_expr_dir>/processed``.  No log transform is applied here.

    Args:
        args: Object exposing ``out_expr_dir`` (output root directory);
            presumably the parsed command-line namespace.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    utils.create_dir_if_not_exist(args.out_expr_dir)
    utils.create_dir_if_not_exist(raw_dir)
    utils.create_dir_if_not_exist(processed_dir)

    gse = GEOparse.get_GEO(geo='GSE31773', destdir=raw_dir)

    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL570'],
                                       'ENTREZ_GENE_ID')
    # Keep only probes that carry an Entrez annotation.
    annotated2 = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    # Drop probes whose expression values are all missing.  The annotation
    # column is excluded from the check: it is never null at this point, so
    # including it would make the "all null" test always fail.
    value_cols = annotated2.columns.drop('ENTREZ_GENE_ID')
    has_values = annotated2[value_cols].notnull().any(axis=1)
    # Explicit copy so the column assignment below does not write into a
    # slice of ``annotated`` (avoids pandas' SettingWithCopyWarning).
    annotated2 = annotated2.loc[has_values].copy()
    # Multi-gene probes are written as "id1 /// id2 ..."; keep the first id.
    annotated2['ENTREZ_GENE_ID'] = annotated2.ENTREZ_GENE_ID.str.split(
        '///').str[0].astype(int)
    annotated2 = annotated2.set_index('ENTREZ_GENE_ID')

    # class name -> accepted ``source_name_ch1`` values
    classes = {
        'cd4_severe': ['CD4_Tcells_severe_asthma'],
        'cd8_severe': ['CD8_Tcells_severe_asthma'],
        'cd4_healthy': ['CD4_Tcells_healthy_donor'],
        'cd8_healthy': ['CD8_Tcells_healthy_donor'],
        'cd8_non_severe': ['CD8_Tcells_non_severe_asthma'],
        'cd4_non_severe': ['CD4_Tcells_non_severe_asthma'],
        'asthma_severe': [
            'CD4_Tcells_severe_asthma', 'CD8_Tcells_severe_asthma'
        ],
        'healthy': ['CD4_Tcells_healthy_donor', 'CD8_Tcells_healthy_donor'],
    }
    logging.info(classes)
    gsms = {
        cls: [
            gsm for gsm in gse.gsms
            if gse.gsms[gsm].metadata['source_name_ch1'][0] in sources
        ]
        for cls, sources in classes.items()
    }

    utils.write_expr(join(processed_dir, 'expr.tsv'), annotated2)
    for cls in classes:
        logging.info("{} GSM: {}".format(cls, len(gsms[cls])))
        utils.write_text(join(processed_dir, '{}_gsms.txt'.format(cls)),
                         gsms[cls])
示例#7
0
def main(args):
    """Prepare the GSE16972 expression matrix and COPD/control sample lists.

    Loads the series through ``GEOparse.get_GEO`` (using
    ``<out_expr_dir>/raw`` as ``destdir``), pivots the ``VALUE`` column
    annotated with ``ENTREZ_GENE_ID`` from platform GPL96, drops rows with
    any missing value, natural-log transforms the matrix and writes
    ``expr.tsv`` plus one ``<class>_gsms.txt`` file per class under
    ``<out_expr_dir>/processed``.

    Args:
        args: Object exposing ``out_expr_dir`` (output root directory);
            presumably the parsed command-line namespace.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    utils.create_dir_if_not_exist(args.out_expr_dir)
    utils.create_dir_if_not_exist(raw_dir)
    utils.create_dir_if_not_exist(processed_dir)

    gse = GEOparse.get_GEO(geo='GSE16972', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL96'],
                                       'ENTREZ_GENE_ID')
    print(annotated.shape)
    # Keep only probes that carry an Entrez annotation.
    annotated2 = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    print(annotated2.shape)
    # Drop probes with at least one missing value.  Explicit copy so the
    # column assignment below does not write into a slice of ``annotated``
    # (avoids pandas' SettingWithCopyWarning).
    annotated2 = annotated2.loc[~annotated2.isnull().values.any(axis=1)].copy()
    print(annotated2.shape)
    # Multi-gene probes are written as "id1 /// id2 ..."; keep the first id.
    annotated2['ENTREZ_GENE_ID'] = annotated2.ENTREZ_GENE_ID.str.split(
        '///').str[0].astype(int)
    annotated2 = annotated2.set_index('ENTREZ_GENE_ID')
    annotated2 = np.log(annotated2)

    classes = {
        'copd': ['disease status: COPD patient'],
        'control': ['disease status: control patient'],
    }
    logging.info(classes)

    # Only alveolar macrophage samples are considered; the disease status is
    # looked up only for samples that pass the cell-type test.
    gsms = {
        cls: [
            gsm for gsm in gse.gsms
            if gse.gsms[gsm].metadata['characteristics_ch1'][0] ==
            'cell type: alveolar macrophage'
            and gse.gsms[gsm].metadata['characteristics_ch1'][1] == labels[0]
        ]
        for cls, labels in classes.items()
    }

    logging.info(' '.join(
        ['{} GSM:{}'.format(cls, len(gsms[cls])) for cls in classes]))
    utils.write_expr(join(processed_dir, 'expr.tsv'), annotated2)
    for cls in classes:
        utils.write_text(join(processed_dir, '{}_gsms.txt'.format(cls)),
                         gsms[cls])
示例#8
0
def main(args):
    """Prepare the GSE18965 expression matrix and disease/healthy sample lists.

    Loads the series through ``GEOparse.get_GEO`` (using
    ``<out_expr_dir>/raw`` as ``destdir``), pivots the ``VALUE`` column
    annotated with ``ENTREZ_GENE_ID`` from platform GPL96 and writes
    ``expr.tsv``, ``disease_gsms.txt`` and ``healthy_gsms.txt`` under
    ``<out_expr_dir>/processed``.  Samples are assigned by title: titles
    containing ``'AA'`` are disease, titles containing ``'HN'`` are healthy.
    No log transform is applied here.

    Args:
        args: Object exposing ``out_expr_dir`` (output root directory);
            presumably the parsed command-line namespace.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    utils.create_dir_if_not_exist(args.out_expr_dir)
    utils.create_dir_if_not_exist(raw_dir)
    utils.create_dir_if_not_exist(processed_dir)

    gse = GEOparse.get_GEO(geo='GSE18965', destdir=raw_dir)

    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL96'],
                                       'ENTREZ_GENE_ID')
    # Keep only probes that carry an Entrez annotation.
    annotated2 = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    # Drop probes whose expression values are all missing.  The annotation
    # column is excluded from the check: it is never null at this point, so
    # including it would make the "all null" test always fail.
    value_cols = annotated2.columns.drop('ENTREZ_GENE_ID')
    has_values = annotated2[value_cols].notnull().any(axis=1)
    # Explicit copy so the column assignment below does not write into a
    # slice of ``annotated`` (avoids pandas' SettingWithCopyWarning).
    annotated2 = annotated2.loc[has_values].copy()
    # Multi-gene probes are written as "id1 /// id2 ..."; keep the first id.
    annotated2['ENTREZ_GENE_ID'] = annotated2.ENTREZ_GENE_ID.str.split(
        '///').str[0].astype(int)
    annotated2 = annotated2.set_index('ENTREZ_GENE_ID')

    # Select samples directly by title substring.  This yields the same
    # lists as first collecting the matching titles and then testing each
    # sample's title for membership, without the quadratic list lookup.
    titles = {gsm: gse.gsms[gsm].metadata['title'][0] for gsm in gse.gsms}
    disease_gsm = [gsm for gsm, title in titles.items() if 'AA' in title]
    healthy_gsm = [gsm for gsm, title in titles.items() if 'HN' in title]
    # The matching titles themselves are kept for the log.
    disease_cls = [titles[gsm] for gsm in disease_gsm]
    healthy_cls = [titles[gsm] for gsm in healthy_gsm]
    logging.info(disease_cls)
    logging.info(healthy_cls)
    logging.info("Disease GSM: {}, Healthy GSM: {}".format(
        len(disease_gsm), len(healthy_gsm)))

    utils.write_expr(join(processed_dir, 'expr.tsv'), annotated2)
    utils.write_text(join(processed_dir, 'disease_gsms.txt'), disease_gsm)
    utils.write_text(join(processed_dir, 'healthy_gsms.txt'), healthy_gsm)
示例#9
0
def main(args):
    """Prepare the GSE89809 expression matrix and per-class sample lists.

    Loads the series through ``GEOparse.get_GEO`` (using
    ``<out_expr_dir>/raw`` as ``destdir``), pivots the ``VALUE`` column
    annotated with ``ENTREZ_GENE_ID`` from platform GPL13158 and writes
    ``expr.tsv`` plus one ``<class>_gsms.txt`` file per class under
    ``<out_expr_dir>/processed``.  Classes are derived from the sample
    titles.  No log transform is applied here.

    Args:
        args: Object exposing ``out_expr_dir`` (output root directory);
            presumably the parsed command-line namespace.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    utils.create_dir_if_not_exist(args.out_expr_dir)
    utils.create_dir_if_not_exist(raw_dir)
    utils.create_dir_if_not_exist(processed_dir)

    gse = GEOparse.get_GEO(geo='GSE89809', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL13158'],
                                       'ENTREZ_GENE_ID')
    # Keep only probes that carry an Entrez annotation.
    annotated2 = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    # Drop probes whose expression values are all missing.  The annotation
    # column is excluded from the check: it is never null at this point, so
    # including it would make the "all null" test always fail.
    value_cols = annotated2.columns.drop('ENTREZ_GENE_ID')
    has_values = annotated2[value_cols].notnull().any(axis=1)
    # Explicit copy so the column assignment below does not write into a
    # slice of ``annotated`` (avoids pandas' SettingWithCopyWarning).
    annotated2 = annotated2.loc[has_values].copy()
    # Multi-gene probes are written as "id1 /// id2 ..."; keep the first id.
    annotated2['ENTREZ_GENE_ID'] = annotated2.ENTREZ_GENE_ID.str.split(
        '///').str[0].astype(int)
    annotated2 = annotated2.set_index('ENTREZ_GENE_ID')

    # A class is the 3rd and 4th '_'-separated title tokens joined by '_'.
    # Original author's note on the resulting values:
    # ['Severe_Spm', 'Healthy_Spm', 'Healthy_Epithelial', 'Mild_Epithelial', 'Severe_Epithelial', 'Mild_Spm', 'Healthy_BAL',
    # 'Moderate_Epithelial', 'Severe_BAL', 'Mild_BAL', 'Moderate_Spm', 'Moderate_BAL']
    classes = {
        '_'.join(gse.gsms[gsm].metadata['title'][0].split('_')[2:4])
        for gsm in gse.gsms
    }
    logging.info(classes)

    # A sample belongs to a class when the class name is a substring of its
    # title.
    gsms = {
        cls:
        [gsm for gsm in gse.gsms if cls in gse.gsms[gsm].metadata['title'][0]]
        for cls in classes
    }

    logging.info(' '.join(
        ['{} GSM:{}'.format(cls, len(gsms[cls])) for cls in classes]))
    utils.write_expr(join(processed_dir, 'expr.tsv'), annotated2)
    for cls in classes:
        utils.write_text(join(processed_dir, '{}_gsms.txt'.format(cls)),
                         gsms[cls])
示例#10
0
def main(args):
    """Prepare the GSE37147 expression matrix and COPD/healthy sample lists.

    Loads the series through ``GEOparse.get_GEO`` (using
    ``<out_expr_dir>/raw`` as ``destdir``), pivots the ``VALUE`` column
    annotated with ``SPOT_ID`` from platform GPL13243 (renamed to
    ``ENTREZ_GENE_ID`` and used as index) and writes ``expr.tsv`` together
    with four sample lists under ``<out_expr_dir>/processed``: all disease
    and healthy samples, and the subsets with no history of asthma.

    Args:
        args: Object exposing ``out_expr_dir`` (output root directory);
            presumably the parsed command-line namespace.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    out_root = args.out_expr_dir
    raw_dir = join(out_root, 'raw')
    processed_dir = join(out_root, 'processed')
    for directory in (out_root, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(directory)

    gse = GEOparse.get_GEO(geo='GSE37147', destdir=raw_dir)
    pivoted = gse.pivot_and_annotate('VALUE', gse.gpls['GPL13243'], 'SPOT_ID')
    expr = pivoted.rename(columns={'SPOT_ID': 'ENTREZ_GENE_ID'})
    expr = expr.set_index('ENTREZ_GENE_ID')

    disease_cls = ['copd: yes']
    healthy_cls = ['copd: no']
    for labels in (disease_cls, healthy_cls):
        logging.info(labels)

    def characteristic(sample, position):
        """Return entry ``position`` of the sample's ``characteristics_ch1``."""
        return gse.gsms[sample].metadata['characteristics_ch1'][position]

    def without_asthma(samples):
        """Keep the samples whose tenth entry reads 'history of asthma: no'."""
        return [
            sample for sample in samples
            if characteristic(sample, 9) == 'history of asthma: no'
        ]

    # The COPD status is read from the fifth characteristics entry.
    disease_gsm = [
        sample for sample in gse.gsms
        if characteristic(sample, 4) in disease_cls
    ]
    healthy_gsm = [
        sample for sample in gse.gsms
        if characteristic(sample, 4) in healthy_cls
    ]
    # The asthma history is only looked up for samples already selected above.
    disease_no_asthma_gsm = without_asthma(disease_gsm)
    healthy_no_asthma_gsm = without_asthma(healthy_gsm)

    logging.info("Disease GSM: {}, Healthy GSM: {}".format(
        len(disease_gsm), len(healthy_gsm)))
    logging.info(
        "Disease (no history of asthma) GSM: {} Healthy (no history of asthma) GSM: {}"
        .format(len(disease_no_asthma_gsm), len(healthy_no_asthma_gsm)))

    utils.create_dir_if_not_exist(out_root)
    utils.write_expr(join(processed_dir, 'expr.tsv'), expr)
    sample_files = (
        ('disease_gsms.txt', disease_gsm),
        ('healthy_gsms.txt', healthy_gsm),
        ('disease_no_history_asthma_gsms.txt', disease_no_asthma_gsm),
        ('healthy_no_history_asthma_gsms.txt', healthy_no_asthma_gsm),
    )
    for file_name, samples in sample_files:
        utils.write_text(join(processed_dir, file_name), samples)
示例#11
0
def main(args):
    """Prepare the GSE57148 FPKM matrix and COPD/normal sample lists.

    Loads the series through ``GEOparse.get_GEO`` (using
    ``<out_expr_dir>/raw`` as ``destdir``), downloads the supplementary
    normalized FPKM table into the same directory, renames its sample columns
    to GSM ids, maps gene symbols to Entrez ids with the ``biomart`` table
    held by ``utils.gm`` and writes ``expr.tsv``, ``disease_gsms.txt`` and
    ``healthy_gsms.txt`` under ``<out_expr_dir>/processed``.

    Args:
        args: Object exposing ``out_expr_dir`` (output root directory);
            presumably the parsed command-line namespace.
    """
    logging.basicConfig(level=logging.INFO,
                        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
                        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    utils.create_dir_if_not_exist(args.out_expr_dir)
    utils.create_dir_if_not_exist(raw_dir)
    utils.create_dir_if_not_exist(processed_dir)

    gse = GEOparse.get_GEO(geo='GSE57148', destdir=raw_dir)
    fpkm_path = join(raw_dir, 'GSE57148_COPD_FPKM_Normalized.txt.gz')
    with closing(request.urlopen(
            'ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE57nnn/GSE57148/suppl/GSE57148_COPD_FPKM_Normalized.txt.gz')) as r:
        with open(fpkm_path, 'wb') as f:
            shutil.copyfileobj(r, f)
    counts = pd.read_csv(fpkm_path, sep='\t')
    # Table columns are matched to samples by the last space-separated token
    # of each sample title and renamed to the GSM id.
    annotated2 = counts.rename(
        columns={gse.gsms[gsm].metadata['title'][0].split(' ')[-1]: gsm for gsm in gse.gsms})
    print(annotated2.isnull().values.any(axis=1).sum())
    print(annotated2.isnull().values.all(axis=1).sum())

    convmatr = utils.gm._sources['biomart'].set_index('symbol')
    symbols = annotated2.GeneName.tolist()
    # ``reindex`` yields NaN for symbols absent from the conversion table,
    # whereas ``.loc`` with a list raises KeyError on any missing label in
    # current pandas; unmapped genes are dropped right below.  The table is
    # first narrowed to the requested symbols so unrelated duplicate symbols
    # cannot make ``reindex`` fail.
    entrez_by_symbol = convmatr['entrez'][convmatr.index.isin(symbols)]
    annotated2['ENTREZ_GENE_ID'] = entrez_by_symbol.reindex(symbols).tolist()
    print(annotated2.shape)
    # Explicit copy so the assignments below do not write into a slice
    # (avoids pandas' SettingWithCopyWarning).
    annotated2 = annotated2[~annotated2.ENTREZ_GENE_ID.isnull()].copy()
    print(annotated2.shape)
    annotated2['ENTREZ_GENE_ID'] = annotated2['ENTREZ_GENE_ID'].astype(int)
    del annotated2['GeneName']
    annotated2 = annotated2.set_index('ENTREZ_GENE_ID')

    disease_cls = ['disease state: COPD']
    healthy_cls = ['disease state: Normal']
    logging.info(disease_cls)
    logging.info(healthy_cls)
    # The disease state is read from the first ``characteristics_ch1`` entry.
    disease_gsm = [
        gsm for gsm in gse.gsms
        if gse.gsms[gsm].metadata['characteristics_ch1'][0] in disease_cls
    ]
    healthy_gsm = [
        gsm for gsm in gse.gsms
        if gse.gsms[gsm].metadata['characteristics_ch1'][0] in healthy_cls
    ]
    logging.info("Disease GSM: {}, Healthy GSM: {}".format(len(disease_gsm), len(healthy_gsm)))

    utils.write_expr(join(processed_dir, 'expr.tsv'), annotated2)
    utils.write_text(join(processed_dir, 'disease_gsms.txt'), disease_gsm)
    utils.write_text(join(processed_dir, 'healthy_gsms.txt'), healthy_gsm)
示例#12
0
def main(args):
    """Prepare the GSE104468 expression matrix and per-class sample lists.

    Loads the series through ``GEOparse.get_GEO`` (using
    ``<out_expr_dir>/raw`` as ``destdir``), pivots the ``VALUE`` column
    annotated with ``GENE_SYMBOL`` from platform GPL21185, maps the symbols
    to Entrez ids with the ``biomart`` table held by ``utils.gm`` and writes
    ``expr.tsv`` plus one ``<class>_gsms.txt`` file per class under
    ``<out_expr_dir>/processed``.  No log transform is applied here.

    Args:
        args: Object exposing ``out_expr_dir`` (output root directory);
            presumably the parsed command-line namespace.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    utils.create_dir_if_not_exist(args.out_expr_dir)
    utils.create_dir_if_not_exist(raw_dir)
    utils.create_dir_if_not_exist(processed_dir)

    gse = GEOparse.get_GEO(geo='GSE104468', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL21185'],
                                       'GENE_SYMBOL')
    convmatr = utils.gm._sources['biomart'].set_index('symbol')
    # Keep only probes that carry a gene symbol.
    annotated2 = annotated[~pd.isnull(annotated.GENE_SYMBOL)]
    # Drop probes whose expression values are all missing.  The annotation
    # column is excluded from the check: it is never null at this point, so
    # including it would make the "all null" test always fail.
    value_cols = annotated2.columns.drop('GENE_SYMBOL')
    has_values = annotated2[value_cols].notnull().any(axis=1)
    # Explicit copy so the column assignment below does not write into a
    # slice of ``annotated`` (avoids pandas' SettingWithCopyWarning).
    annotated2 = annotated2.loc[has_values].copy()

    symbols = annotated2.GENE_SYMBOL.tolist()
    # ``reindex`` yields NaN for symbols absent from the conversion table,
    # whereas ``.loc`` with a list raises KeyError on any missing label in
    # current pandas; unmapped genes are dropped right below.  The table is
    # first narrowed to the requested symbols so unrelated duplicate symbols
    # cannot make ``reindex`` fail.
    entrez_by_symbol = convmatr['entrez'][convmatr.index.isin(symbols)]
    annotated2['ENTREZ_GENE_ID'] = entrez_by_symbol.reindex(symbols).tolist()
    annotated2 = annotated2[~annotated2.ENTREZ_GENE_ID.isnull()].copy()
    annotated2['ENTREZ_GENE_ID'] = annotated2['ENTREZ_GENE_ID'].astype(int)
    del annotated2['GENE_SYMBOL']
    annotated2 = annotated2.set_index('ENTREZ_GENE_ID')

    # class name -> [disease-state label, cell-type label]
    classes = {
        'asthma_pbmc': ['disease state: Asthma', 'cell type: PBMC'],
        'asthma_bronch': [
            'disease state: Asthma', 'cell type: bronchial epithelia'
        ],
        'asthma_nasal': ['disease state: Asthma', 'cell type: nasal epithelia'],
        'normal_pbmc': ['disease state: Normal', 'cell type: PBMC'],
        'normal_bronch': [
            'disease state: Normal', 'cell type: bronchial epithelia'
        ],
        'normal_nasal': ['disease state: Normal', 'cell type: nasal epithelia'],
    }
    logging.info(classes)

    # The disease state is compared with ``characteristics_ch1[2]`` and the
    # cell type with ``characteristics_ch1[1]``; the second lookup only
    # happens when the first one matched.
    gsms = {}
    for cls, (disease_state, cell_type) in classes.items():
        gsms[cls] = [
            gsm for gsm in gse.gsms
            if gse.gsms[gsm].metadata['characteristics_ch1'][2] == disease_state
            and gse.gsms[gsm].metadata['characteristics_ch1'][1] == cell_type
        ]

    logging.info(' '.join(
        ['{} GSM:{}'.format(cls, len(gsms[cls])) for cls in classes]))
    utils.write_expr(join(processed_dir, 'expr.tsv'), annotated2)
    for cls in classes:
        utils.write_text(join(processed_dir, '{}_gsms.txt'.format(cls)),
                         gsms[cls])