Exemplo n.º 1
0
def load_entropy(config):
    """Load or compute the per-subject entropy vector and attach it to ``config``.

    Always sets ``config.entropy_list``, ``config.entropy_dict`` and
    ``config.entropy_missed_dict``.  ``config.entropy_data`` is read from the
    ``entropy<suffix>.npz`` cache file when it exists; otherwise it is computed
    from the data set named by ``config.experiment.data_params['data']``
    (``'betas'``, ``'betas_adj'`` or ``'residuals'``) and written to the cache.

    For each subject (column) the value is::

        sum(v * log2(v) + (1 - v) * log2(1 - v)) / (count * log2(0.5))

    taken over the rows listed in ``config.cpg_list`` whose value ``v``
    satisfies ``0 < v < 1``; NaN and out-of-range values are skipped.  A
    subject with no usable value gets NaN.

    Side effect: on a cache miss ``config.experiment.data_params`` is modified
    before the underlying loader is called (reset to ``{}`` for ``'betas'``,
    ``'data'`` key removed otherwise) and is not restored afterwards.

    Raises:
        ValueError: if ``config.experiment.data_params`` is empty or names an
            unsupported data set.
    """
    if not config.experiment.data_params:
        raise ValueError('Exog for entropy is empty.')

    # Keep a private copy: the loaders below are invoked with altered params.
    data_params = copy.deepcopy(config.experiment.data_params)
    suffix = '_' + config.experiment.get_data_params_str()

    fn_data = get_cache_path(config) + '/' + 'entropy' + suffix + '.npz'

    config.entropy_list = ['entropy']
    config.entropy_dict = {'entropy': 0}
    config.entropy_missed_dict = {'entropy': []}

    if os.path.isfile(fn_data):
        # NpzFile keeps the archive open; close it once the array is read.
        with np.load(fn_data) as cached:
            config.entropy_data = cached['data']
        return

    source = data_params['data']
    if source == 'betas':
        config.experiment.data_params = {}
        load_betas(config)
        data = config.betas_data
        data_dict = config.betas_dict
    elif source == 'betas_adj':
        config.experiment.data_params.pop('data')
        load_betas_adj(config)
        data = config.betas_adj_data
        data_dict = config.betas_adj_dict
    elif source == 'residuals':
        config.experiment.data_params.pop('data')
        load_residuals(config)
        data = config.residuals_data
        data_dict = config.residuals_dict
    else:
        raise ValueError('Unsupported data for entropy.')

    num_subjects = data.shape[1]
    config.entropy_data = np.zeros(num_subjects, dtype=np.float32)

    rows = [data_dict[item] for item in config.cpg_list if item in data_dict]
    log2_half = np.log2(0.5)

    for subj_id in tqdm(range(0, num_subjects), mininterval=60.0, desc='entropy_data creating'):
        # ravel() (unlike squeeze()) keeps a 1-D array even when exactly one
        # row is selected, so the column can always be iterated.
        values = np.asarray(data[np.ix_(rows, [subj_id])]).ravel()

        entropy = 0.0
        num_valid = 0
        for val in values:
            # NaN fails both comparisons, so it is skipped like any other
            # value outside the open interval (0, 1).
            if 0.0 < val < 1.0:
                entropy += val * np.log2(val) + (1.0 - val) * np.log2(1.0 - val)
                num_valid += 1

        if num_valid > 0:
            entropy /= num_valid * log2_half
        else:
            # Nothing to normalise by: store NaN explicitly instead of
            # relying on a 0/0 floating-point division.
            entropy = np.nan
        config.entropy_data[subj_id] = entropy

    np.savez_compressed(fn_data, data=config.entropy_data)
Exemplo n.º 2
0
def subset_annotations(config):
    """Build (or load from cache) the CpG / gene / BOP lookup tables on ``config``.

    The following attributes are set on ``config``:

    * ``cpg_list``      -- ids of the annotation rows accepted by
      ``check_conditions``;
    * ``cpg_gene_dict`` -- cpg -> list of unique gene names (the gene cell is
      split on ``';'``);
    * ``cpg_bop_dict``  -- cpg -> bop;
    * ``gene_cpg_dict`` -- gene -> cpgs, ordered by ``int(map_info)``;
    * ``gene_bop_dict`` -- gene -> list of bops (one entry per accepted cpg);
    * ``bop_cpg_dict``  -- non-empty bop -> cpgs, ordered by ``int(map_info)``;
    * ``bop_gene_dict`` -- bop -> gene list of the last accepted cpg with
      that bop.

    When ``aux_data.pkl`` exists in the cache directory the tables are read
    from it; otherwise they are computed from ``config.annotations_dict`` and
    pickled there.
    """
    aux_data_fn = get_cache_path(config) + '/' + 'aux_data.pkl'

    if os.path.isfile(aux_data_fn):
        # NOTE: pickle must only be used on trusted cache files.
        with open(aux_data_fn, 'rb') as f:
            aux_data = pickle.load(f)
        config.cpg_list = aux_data['cpg_list']
        config.cpg_gene_dict = aux_data['cpg_gene_dict']
        config.cpg_bop_dict = aux_data['cpg_bop_dict']
        config.gene_cpg_dict = aux_data['gene_cpg_dict']
        config.gene_bop_dict = aux_data['gene_bop_dict']
        config.bop_cpg_dict = aux_data['bop_cpg_dict']
        config.bop_gene_dict = aux_data['bop_gene_dict']
        return

    config.cpg_list = []
    config.cpg_gene_dict = {}
    config.cpg_bop_dict = {}
    config.gene_cpg_dict = {}
    config.gene_bop_dict = {}
    config.bop_cpg_dict = {}
    config.bop_gene_dict = {}

    cpgs = config.annotations_dict[AnnotationKey.cpg.value]
    genes = config.annotations_dict[AnnotationKey.gene.value]
    bops = config.annotations_dict[AnnotationKey.bop.value]
    map_infos = config.annotations_dict[AnnotationKey.map_info.value]

    for index, cpg in enumerate(cpgs):

        if index % 10000 == 0:
            print('id: ' + str(index))

        # One annotation row, presented as {column name: value}.
        curr_ann_dict = {
            key: config.annotations_dict[key][index]
            for key in config.annotations_dict
        }

        if not check_conditions(config, curr_ann_dict):
            continue

        curr_genes = list(set(genes[index].split(';')))
        bop = bops[index]

        config.cpg_list.append(cpg)
        config.cpg_gene_dict[cpg] = curr_genes
        config.cpg_bop_dict[cpg] = bop

        for gene in curr_genes:
            config.gene_cpg_dict.setdefault(gene, []).append(cpg)
            config.gene_bop_dict.setdefault(gene, []).append(bop)

        if len(bop) > 0:
            config.bop_cpg_dict.setdefault(bop, []).append(cpg)

        # Unlike bop_cpg_dict this entry is written for empty bops too and is
        # overwritten by every later cpg sharing the same bop.
        config.bop_gene_dict[bop] = curr_genes

    # Position of the first occurrence of every cpg id, built once so the
    # sorting below does not rescan the whole annotation list per cpg
    # (equivalent to ``cpgs.index(cpg)`` but O(1) per lookup).
    first_index = {}
    for index, cpg in enumerate(cpgs):
        first_index.setdefault(cpg, index)

    # Sorting cpgs by map_info in gene dict and in bop dict
    for grouping in (config.gene_cpg_dict, config.bop_cpg_dict):
        for key, grouped_cpgs in grouping.items():
            positions = [
                int(map_infos[first_index[cpg]]) for cpg in grouped_cpgs
            ]
            order = np.argsort(positions)
            grouping[key] = list(np.array(grouped_cpgs)[order])

    aux_data = {
        'cpg_list': config.cpg_list,
        'cpg_gene_dict': config.cpg_gene_dict,
        'cpg_bop_dict': config.cpg_bop_dict,
        'gene_cpg_dict': config.gene_cpg_dict,
        'gene_bop_dict': config.gene_bop_dict,
        'bop_cpg_dict': config.bop_cpg_dict,
        'bop_gene_dict': config.bop_gene_dict,
    }

    with open(aux_data_fn, 'wb') as f:
        pickle.dump(aux_data, f, pickle.HIGHEST_PROTOCOL)
Exemplo n.º 3
0
def load_genes(config):
    """Load or compute the gene-level data matrix and attach it to ``config``.

    Sets ``config.genes_list``, ``config.genes_dict`` (gene -> row index),
    ``config.genes_missed_dict`` and ``config.genes_data``.  Each row of
    ``genes_data`` is, per subject, the mean of the source rows of the gene's
    CpGs (``config.gene_cpg_dict``) that are present in the source data,
    ignoring the subjects listed as missed for a CpG.

    The source data set is chosen by
    ``config.experiment.data_params['source']`` (``'betas'``, ``'betas_adj'``
    or ``'residuals'``).  Results are cached in the cache directory and reused
    when every cache file is present.

    Side effect: the ``'source'`` key is removed from
    ``config.experiment.data_params``.

    Raises:
        ValueError: if ``config.experiment.data_params`` is empty or the
            source is not one of the supported names.
    """
    if not config.experiment.data_params:
        raise ValueError('Data params for genes are empty')

    # The suffix is built before 'source' is popped, so it still encodes it.
    suffix_gene = '_' + str(config.experiment.get_data_params_str())
    source = config.experiment.data_params.pop('source')

    cache_path = get_cache_path(config)
    fn_list_txt = cache_path + '/' + 'genes_list.txt'
    fn_list_pkl = cache_path + '/' + 'genes_list.pkl'
    fn_dict_pkl = cache_path + '/' + 'genes_dict.pkl'
    fn_missed_dict_pkl = cache_path + '/' + 'genes_missed_dict.pkl'
    fn_data_npz = cache_path + '/' + 'genes' + suffix_gene + '.npz'
    fn_data_txt = cache_path + '/' + 'genes' + suffix_gene + '.txt'

    # Every file read in the cached branch must exist, including the
    # missed-dict pickle; otherwise fall through and rebuild the cache.
    cache_files = (fn_list_pkl, fn_dict_pkl, fn_missed_dict_pkl, fn_data_npz)
    if all(os.path.isfile(fn) for fn in cache_files):
        # NOTE: pickle must only be used on trusted cache files.
        with open(fn_list_pkl, 'rb') as f:
            config.genes_list = pickle.load(f)
        with open(fn_dict_pkl, 'rb') as f:
            config.genes_dict = pickle.load(f)
        with open(fn_missed_dict_pkl, 'rb') as f:
            config.genes_missed_dict = pickle.load(f)
        with np.load(fn_data_npz) as data:
            config.genes_data = data['data']
        return

    if source == 'betas':
        load_betas(config)
        source_dict = config.betas_dict
        source_data = config.betas_data
        source_missed_dict = config.betas_missed_dict
    elif source == 'betas_adj':
        load_betas_adj(config)
        source_dict = config.betas_adj_dict
        source_data = config.betas_adj_data
        source_missed_dict = config.betas_adj_missed_dict
    elif source == 'residuals':
        load_residuals(config)
        source_dict = config.residuals_dict
        source_data = config.residuals_data
        source_missed_dict = config.residuals_missed_dict
    else:
        raise ValueError('Source for genes is not specified')

    # Use the selected source itself rather than assuming betas are loaded.
    num_subjects = source_data.shape[1]

    # Keep only genes that have at least one CpG in the source data.
    config.genes_list = [
        gene
        for gene in tqdm(config.gene_cpg_dict,
                         mininterval=60.0,
                         desc='genes_list creating')
        if any(cpg in source_dict for cpg in config.gene_cpg_dict[gene])
    ]

    config.genes_dict = {}
    config.genes_missed_dict = {'any': []}
    config.genes_data = np.zeros((len(config.genes_list), num_subjects),
                                 dtype=np.float32)

    subject_ids = np.arange(num_subjects)

    for gene_id, gene in tqdm(enumerate(config.genes_list),
                              mininterval=60.0,
                              desc='genes_data creating'):
        config.genes_dict[gene] = gene_id

        # Number of CpGs contributing to each subject's sum.
        denominators = np.zeros(num_subjects, dtype=np.float32)

        for cpg in config.gene_cpg_dict[gene]:
            if cpg not in source_dict:
                continue

            source_values_raw = source_data[source_dict[cpg], :]
            missed = source_missed_dict[cpg]

            if len(missed) > 0:
                # Subjects listed as missed contribute neither a value nor
                # a count for this CpG.
                present = ~np.isin(subject_ids, list(missed))
                source_values = np.zeros(num_subjects, dtype=np.float32)
                source_values[present] = source_values_raw[present]
                denominators += present
            else:
                source_values = source_values_raw
                denominators += 1.0

            config.genes_data[gene_id] += source_values

        # Subjects with no contributing CpG divide 0 by 0 and become NaN.
        config.genes_data[gene_id] /= denominators

    with open(fn_list_pkl, 'wb') as f:
        pickle.dump(config.genes_list, f, pickle.HIGHEST_PROTOCOL)

    with open(fn_dict_pkl, 'wb') as f:
        pickle.dump(config.genes_dict, f, pickle.HIGHEST_PROTOCOL)

    with open(fn_missed_dict_pkl, 'wb') as f:
        pickle.dump(config.genes_missed_dict, f, pickle.HIGHEST_PROTOCOL)

    np.savez_compressed(fn_data_npz, data=config.genes_data)
    np.savetxt(fn_data_txt, config.genes_data, delimiter='\t', fmt='%.8e')

    with open(fn_list_txt, 'w') as f:
        for item in config.genes_list:
            f.write("%s\n" % item)
Exemplo n.º 4
0
def subset_annotations(config):
    """Build (or load from cache) the annotation lookup tables on ``config``.

    Which tables are produced depends on ``config.annotations.type``:

    * ``'450k'``     -- ``cpg_list``, ``cpg_gene_dict``, ``cpg_bop_dict``,
      ``gene_cpg_dict``, ``gene_bop_dict``, ``bop_cpg_dict``,
      ``bop_gene_dict``, ``cpg_map_info_dict``;
    * ``'850k'``     -- ``cpg_list``, ``cpg_gene_dict``, ``gene_cpg_dict``,
      ``cpg_map_info_dict``, ``bops``;
    * ``'epityper'`` -- ``cpg_list`` only.

    Any other type leaves ``config`` untouched.  When ``aux_data.pkl`` exists
    in the cache directory the tables are read from it; otherwise they are
    computed from ``config.annotations_dict`` and pickled there.

    Raises:
        ValueError: (850k) if one BOP name is produced with two different
            classes.
    """
    aux_data_fn = get_cache_path(config) + '/' + 'aux_data.pkl'

    annotations_type = config.annotations.type
    if annotations_type == '450k':
        keys = ('cpg_list', 'cpg_gene_dict', 'cpg_bop_dict', 'gene_cpg_dict',
                'gene_bop_dict', 'bop_cpg_dict', 'bop_gene_dict',
                'cpg_map_info_dict')
        build = _aux_build_450k
    elif annotations_type == '850k':
        keys = ('cpg_list', 'cpg_gene_dict', 'gene_cpg_dict',
                'cpg_map_info_dict', 'bops')
        build = _aux_build_850k
    elif annotations_type == 'epityper':
        keys = ('cpg_list',)
        build = _aux_build_epityper
    else:
        return

    if os.path.isfile(aux_data_fn):
        aux_data = _aux_read_pickle(aux_data_fn)
        for key in keys:
            setattr(config, key, aux_data[key])
    else:
        # The builders fill the attributes on ``config`` in place.
        build(config)
        aux_data = {key: getattr(config, key) for key in keys}
        _aux_write_pickle(aux_data_fn, aux_data)


def _aux_read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: pickle must only be used on trusted cache files.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _aux_write_pickle(path, obj):
    """Pickle ``obj`` to ``path`` with the highest protocol."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def _aux_raw_map_info(cell):
    """Return the raw MAPINFO value of a cell: 'NA' -> '0', empty cell -> 0."""
    if len(cell) == 0:
        return 0
    raw = cell[0]
    return '0' if raw == 'NA' else raw


def _aux_sort_groups_by_map_info(groups, cpg_map_info_dict):
    """Reorder every cpg list in ``groups`` by the cpgs' integer map_info."""
    for key, cpgs in groups.items():
        positions = [int(cpg_map_info_dict[cpg]) for cpg in cpgs]
        order = np.argsort(positions)
        groups[key] = list(np.array(cpgs)[order])


def _aux_build_450k(config):
    """Fill the 450k lookup tables on ``config`` from ``annotations_dict``."""
    config.cpg_list = []
    config.cpg_gene_dict = {}
    config.cpg_bop_dict = {}
    config.gene_cpg_dict = {}
    config.gene_bop_dict = {}
    config.bop_cpg_dict = {}
    config.bop_gene_dict = {}
    config.cpg_map_info_dict = {}

    cpgs_all = config.annotations_dict[config.annotations.id_name]
    genes_all = config.annotations_dict['UCSC_REFGENE_NAME']
    bops_all = config.annotations_dict['BOP']
    map_infos_all = config.annotations_dict['MAPINFO']

    for index, cpg_cell in enumerate(cpgs_all):

        if not global_check(config, index):
            continue

        cpg = cpg_cell[0]
        config.cpg_list.append(cpg)

        # An empty MAPINFO cell is treated as position 0, as in the 850k case.
        config.cpg_map_info_dict[cpg] = int(
            _aux_raw_map_info(map_infos_all[index]))

        genes = genes_all[index]
        if len(genes) > 0:
            config.cpg_gene_dict[cpg] = genes
            for gene in genes:
                config.gene_cpg_dict.setdefault(gene, []).append(cpg)

        bops = bops_all[index]
        if len(bops) > 0:
            config.cpg_bop_dict[cpg] = bops
            for bop in bops:
                config.bop_cpg_dict.setdefault(bop, []).append(cpg)

        if len(genes) > 0 and len(bops) > 0:
            # Copies are stored on first sight so that later ``+=`` does not
            # grow the lists owned by ``annotations_dict``.
            for gene in genes:
                if gene in config.gene_bop_dict:
                    config.gene_bop_dict[gene] += bops
                else:
                    config.gene_bop_dict[gene] = copy.deepcopy(bops)
            for bop in bops:
                if bop in config.bop_gene_dict:
                    config.bop_gene_dict[bop] += genes
                else:
                    config.bop_gene_dict[bop] = copy.deepcopy(genes)

    # Sorting cpgs by map_info in gene dict and in bop dict
    _aux_sort_groups_by_map_info(config.gene_cpg_dict,
                                 config.cpg_map_info_dict)
    _aux_sort_groups_by_map_info(config.bop_cpg_dict,
                                 config.cpg_map_info_dict)


def _aux_build_850k(config):
    """Fill the 850k lookup tables and ``config.bops`` from ``annotations_dict``."""
    config.cpg_list = []
    config.cpg_gene_dict = {}
    config.gene_cpg_dict = {}
    config.cpg_map_info_dict = {}

    cpgs_all = config.annotations_dict[config.annotations.id_name]
    genes_all = config.annotations_dict['UCSC_RefGene_Name']
    map_infos_all = config.annotations_dict['MAPINFO']
    chr_all = config.annotations_dict['CHR']
    geo_all = config.annotations_dict['UCSC_CpG_Islands_Name']
    geo_type_all = config.annotations_dict['Relation_to_UCSC_CpG_Island']

    config.bops = {}

    for index, cpg_cell in enumerate(cpgs_all):

        if not global_check(config, index):
            continue

        cpg = cpg_cell[0]
        config.cpg_list.append(cpg)

        map_info = _aux_raw_map_info(map_infos_all[index])
        config.cpg_map_info_dict[cpg] = int(map_info)

        genes = genes_all[index]
        genes.sort()  # in place: the annotation's own list is reordered
        if len(genes) > 0:
            config.cpg_gene_dict[cpg] = genes
            for gene in genes:
                config.gene_cpg_dict.setdefault(gene, []).append(cpg)

        chrom = chr_all[index][0] if len(chr_all[index]) > 0 else ''
        geo = geo_all[index][0] if len(geo_all[index]) > 0 else ''
        if len(geo_type_all[index]) > 0:
            geo_type = geo_type_all[index][0]
        else:
            geo_type = ''

        # BOP class: A/B = island named (with / without genes),
        #            C/D = no island name (with / without genes).
        if geo == '':
            if len(genes) > 0:
                bop_class = 'C'
                bop_names = [f'{chrom}*{gene}' for gene in genes]
            else:
                bop_class = 'D'
                bop_names = [cpg]
        else:
            if len(genes) > 0:
                bop_class = 'A'
                bop_names = [f'{geo}*{geo_type}']
            else:
                bop_class = 'B'
                bop_names = [f'{geo}*{geo_type}*nogene']

        for bop_name in bop_names:
            if bop_name in config.bops:
                bop = config.bops[bop_name]
                if bop['class'] != bop_class:
                    raise ValueError(
                        f'Error: Different classes in BOP creation: {bop_name}'
                    )
                bop['cpg'].append(cpg)
                bop['map_info'].append(map_info)
                bop['gene'].update(set(genes))
            else:
                config.bops[bop_name] = {
                    'class': bop_class,
                    'cpg': [cpg],
                    'map_info': [map_info],
                    'gene': set(genes),
                }

    # Sorting cpgs by map_info in gene dict
    _aux_sort_groups_by_map_info(config.gene_cpg_dict,
                                 config.cpg_map_info_dict)

    # Sorting cpgs by map_info in bop
    for bop in config.bops.values():
        # The stored values are raw (usually strings); compare them as
        # integers so that e.g. '100' does not sort before '20'.
        positions = [int(value) for value in bop['map_info']]
        order = np.argsort(positions)
        bop['cpg'] = list(np.array(bop['cpg'])[order])
        # NOTE(review): bop['map_info'] is left in insertion order, so it is
        # not aligned with the reordered bop['cpg'] -- confirm with consumers.


def _aux_build_epityper(config):
    """Fill ``config.cpg_list`` with the id of every annotation row."""
    cpgs_all = config.annotations_dict[config.annotations.id_name]
    config.cpg_list = [cpg_cell[0] for cpg_cell in cpgs_all]