def process_intersections(data_dicts, save_path, item_key='item', is_rewrite=True):
    """Save every dataset table, all dataset intersections and a Venn diagram.

    Args:
        data_dicts: Mapping of dataset name to its table dict. The number of
            entries selects the Venn layout, so it must be 2, 3 or 4.
        save_path: Directory that receives the per-dataset tables, the
            ``intersection_full`` and ``intersection_diff`` sub-directories
            and the ``venn`` figure.
        item_key: Key forwarded to ``get_cpg_dicts``, ``get_sets`` and
            ``get_cpg_dataset_save_dicts``.
        is_rewrite: Flag forwarded unchanged to ``save_table_dict_xlsx``.

    Returns:
        Tuple ``(save_dicts, save_dicts_with_diff)`` as produced by
        ``get_cpg_dataset_save_dicts`` for the plain sets and for the sets
        with difference, respectively.

    Raises:
        ValueError: If ``data_dicts`` holds a number of datasets other than
            2, 3 or 4. All tables have already been written at that point.
    """
    cpg_dicts = get_cpg_dicts(data_dicts, item_key)

    for dataset, data_dict in data_dicts.items():
        save_table_dict_xlsx(f'{save_path}/{dataset}', data_dict, is_rewrite)

    sets, sets_with_difference = get_sets(data_dicts, item_key)

    save_dicts = get_cpg_dataset_save_dicts(sets, data_dicts, cpg_dicts, item_key)
    full_path = f'{save_path}/intersection_full'
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(full_path, exist_ok=True)
    for key, save_dict in save_dicts.items():
        save_table_dict_xlsx(f'{full_path}/{key}', save_dict, is_rewrite)

    save_dicts_with_diff = get_cpg_dataset_save_dicts(
        sets_with_difference, data_dicts, cpg_dicts, item_key)
    diff_path = f'{save_path}/intersection_diff'
    os.makedirs(diff_path, exist_ok=True)

    venn_labels = []
    for key, save_dict in save_dicts_with_diff.items():
        save_table_dict_xlsx(f'{diff_path}/{key}', save_dict, is_rewrite)
        # One label per region: the key parts and the region size,
        # one per line (<br> is the line break used in the figure).
        label_parts = key.split('_') + [str(len(sets_with_difference[key]))]
        venn_labels.append('<br>'.join(label_parts))

    # Layout/trace builders keyed by the number of datasets.
    venn_builders = {
        4: (get_layout_4, get_trace_4),
        3: (get_layout_3, get_trace_3),
        2: (get_layout_2, get_trace_2),
    }
    num_datasets = len(data_dicts)
    if num_datasets not in venn_builders:
        raise ValueError('Venn diagram is not supported')
    build_layout, build_trace = venn_builders[num_datasets]
    layout = build_layout()
    trace = build_trace(venn_labels)

    fig = {
        'data': [trace],
        'layout': layout,
    }
    save_figure(f'{save_path}/venn', fig)

    return save_dicts, save_dicts_with_diff
from paper.routines.infrastructure.load.table import load_table_dict_xlsx, load_table_dict_pkl
from paper.routines.infrastructure.save.table import save_table_dict_xlsx, save_table_dict_pkl
from statsmodels.stats.multitest import multipletests

# Script: load one manova result table, append multiple-testing corrected
# p-value columns for every target metric and save the extended table as
# both xlsx and pkl next to the source file.

path = 'E:/YandexDisk/Work/pydnameth/unn_epic/bop/table/manova/3c48cd40ad58b06cc3b1f27e3c72554c'
fn = 'ABC'
target_metrics = ['Sample_Group_p_value_roy_3c48cd40']
limit = 0.05

table = {'Number of': ['BoPs', 'Genes']}

curr_fn = f'{path}/{fn}.xlsx'
data = load_table_dict_xlsx(curr_fn)

# Each correction method yields one new column named '<metric>_<method>'.
correction_methods = ('fdr_bh', 'bonferroni')
for metric in target_metrics:
    for method in correction_methods:
        # multipletests returns (reject, pvals_corrected, alphacSidak,
        # alphacBonf); only the corrected p-values are stored.
        reject, pvals_corr, alphacSidak, alphacBonf = multipletests(
            data[metric], 0.05, method=method)
        data[f'{metric}_{method}'] = pvals_corr

output_fn = f'{path}/{fn}_mod'
save_table_dict_xlsx(output_fn, data)
save_table_dict_pkl(output_fn, data)
# Script fragment: extend a table with annotation columns and with one 0/1
# membership column per paper, then save it under a new name.
# NOTE(review): `name`, `annotations_keys`, `cpg_key` and the load_* helpers
# are defined outside this fragment.
papers_keys = ['inoshita', 'singmann', 'yousefi']
path = 'E:/YandexDisk/Work/pydnameth/draft/fixes/materials_and_methods/update_4_bonferroni'

data_dicts_passed = {}
cpgs_dicts_passed = {}
R2s = {}
R2_percentiles = {}

data_dict = load_table_dict_xlsx(f'{path}/{name}.xlsx')

# Start every added column empty; rows are appended in table order below.
for key in [*annotations_keys, *papers_keys]:
    data_dict[key] = []

annotations_dict = load_annotations_dict()
papers_dict = load_papers_dict()

for cpg in tqdm(data_dict[cpg_key], desc='intersection processing'):
    for key in annotations_keys:
        data_dict[key].append(annotations_dict[key][cpg])
    for paper_key in papers_keys:
        # 1 when the CpG is listed for this paper, 0 otherwise.
        data_dict[paper_key].append(1 if cpg in papers_dict[paper_key] else 0)

save_table_dict_xlsx(f'{path}/{name}_with_added_info', data_dict)
def _save_duplicated_genes(genes, gene_id, fn):
    """Save all (id, gene) pairs of genes that occur more than once in *genes*."""
    duplicates = {'id': [], 'gene': []}
    for gene, count in collections.Counter(genes).items():
        if count > 1:
            for protein_id in gene_id[gene]:
                duplicates['id'].append(protein_id)
                duplicates['gene'].append(gene)
    save_table_dict_xlsx(fn, duplicates)


def _report_gene_list(label, genes, gene_id, save_path):
    """Print total/unique counts for *genes* and save its duplicated genes."""
    print(f'Number of {label} genes in Lehallier, et. al.: {len(genes)}')
    print(f'Number of UNIQUE {label} genes in Lehallier, et. al.: {len(set(genes))}')
    _save_duplicated_genes(genes, gene_id, f'{save_path}/duplicates_{label}')


def get_human_plasma_proteome_dicts(save_path):
    """Build the ss / ar / ssar gene lists from the plasma proteome tables.

    Reads ``proteins_genes.xlsx`` and ``age_sex.xlsx`` from
    ``<get_data_path()>/human_plasma_proteome``. A gene goes into the ``ss``
    list when its ``q.Sex`` value is below 0.05, into the ``ar`` list when
    its ``q.Age`` value is below 0.05, and into the ``ssar`` list when both
    are.

    Side effects (all under *save_path*): ``suspect_rows.txt`` and
    ``suspect_ids.txt`` list rows whose ID was already seen or whose gene
    symbol is not a string; ``duplicates_ss``, ``duplicates_ar`` and
    ``duplicates_ssar`` tables list genes that occur more than once in the
    corresponding list. Counts are printed to stdout.

    Args:
        save_path: Existing directory for the diagnostic output files.

    Returns:
        Tuple ``(ss_genes, ar_genes, ssar_genes)`` of gene-symbol lists;
        the lists may contain repeated symbols.

    Raises:
        KeyError: If an ID from ``proteins_genes.xlsx`` is missing in
            ``age_sex.xlsx``.
    """
    lehallier_data_path = f'{get_data_path()}/human_plasma_proteome'

    proteins_genes = load_table_dict_xlsx(
        f'{lehallier_data_path}/proteins_genes.xlsx')
    protein_ids = proteins_genes['ID']
    gene_symbols = proteins_genes['EntrezGeneSymbol']

    id_gene = {}
    gene_id = {}
    suspect_rows = []
    suspect_ids = []
    for row_id, protein_id in enumerate(tqdm(protein_ids)):
        gene = gene_symbols[row_id]
        gene_id.setdefault(gene, []).append(protein_id)

        if protein_id in id_gene:
            # The same ID was already mapped by an earlier row.
            suspect_rows.append(row_id)
            suspect_ids.append(protein_id)

        if isinstance(gene, str):
            id_gene[protein_id] = gene
        else:
            # Non-string gene symbol: the row is reported and left unmapped.
            suspect_rows.append(row_id)
            suspect_ids.append(protein_id)

    # Presumably converts 0-based data rows to spreadsheet row numbers
    # (1-based plus a header line) - TODO confirm.
    suspect_rows = [row + 2 for row in suspect_rows]
    np.savetxt(f'{save_path}/suspect_rows.txt', suspect_rows, fmt='%d')
    np.savetxt(f'{save_path}/suspect_ids.txt', suspect_ids, fmt='%s')

    age_sex = load_table_dict_xlsx(f'{lehallier_data_path}/age_sex.xlsx')
    id_age_q = dict(zip(age_sex['ID'], age_sex['q.Age']))
    id_sex_q = dict(zip(age_sex['ID'], age_sex['q.Sex']))

    q_limit = 0.05
    ar_genes_lehallier = []
    ss_genes_lehallier = []
    ssar_genes_lehallier = []
    for protein_id, gene in id_gene.items():
        is_age_related = id_age_q[protein_id] < q_limit
        is_sex_specific = id_sex_q[protein_id] < q_limit
        if is_age_related:
            ar_genes_lehallier.append(gene)
        if is_sex_specific:
            ss_genes_lehallier.append(gene)
        if is_age_related and is_sex_specific:
            ssar_genes_lehallier.append(gene)

    # Report order (ss, ar, ssar) matches the original output order.
    _report_gene_list('ss', ss_genes_lehallier, gene_id, save_path)
    _report_gene_list('ar', ar_genes_lehallier, gene_id, save_path)
    _report_gene_list('ssar', ssar_genes_lehallier, gene_id, save_path)

    return ss_genes_lehallier, ar_genes_lehallier, ssar_genes_lehallier
# Script: for every dataset, copy the ancova interaction p-value and
# coefficient of each marker into the shared ssDMPs table. The table is
# re-loaded and re-saved on every pass, so columns accumulate across datasets.
datasets = ['GSE40279', 'GSE87571', 'EPIC', 'GSE55763']

for dataset in datasets:
    print(dataset)

    source_fn = f'E:/YandexDisk/Work/pydnameth/approaches/ancova/Treatment/{dataset}.xlsx'
    source_keys = ['x:category_pval', 'x:category']

    target_fn = 'E:/YandexDisk/Work/pydnameth/draft/fixes/materials_and_methods/update_4_bonferroni/ssDMPs_ext.xlsx'
    target_main_key = 'MarkerName'
    target_keys = [
        f'interaction p-value ({dataset})',
        f'interaction coeff ({dataset})',
    ]

    save_fn = 'E:/YandexDisk/Work/pydnameth/draft/fixes/materials_and_methods/update_4_bonferroni/ssDMPs_ext'

    source_dict = load_table_dict_by_key_xlsx(source_fn, 'item')
    target_dict = load_table_dict_xlsx(target_fn)

    # Register both new columns first so they keep their order in the table.
    for target_key in target_keys:
        target_dict[target_key] = []

    # source_keys and target_keys are parallel: the n-th source column feeds
    # the n-th target column; markers absent from the source become 'NA'.
    for source_key, target_key in zip(source_keys, target_keys):
        target_dict[target_key] = [
            source_dict[source_key][marker]
            if marker in source_dict[source_key] else 'NA'
            for marker in target_dict[target_main_key]
        ]

    save_table_dict_xlsx(save_fn, target_dict)
def process_human_plasma_proteome(target_dict, proteomic_genes, save_path, aux_key='aux'):
    """Intersect per-dataset gene lists with the proteomic genes and save results.

    For every dataset in *target_dict* the genes are extracted with
    ``get_genes``; *proteomic_genes* is added as an extra ``'Proteomic'``
    set. All intersections are saved under ``intersection_full`` (together
    with their expression tables via ``gtex_processing``) and
    ``intersection_diff``, and a Venn diagram is saved as ``venn``.

    Args:
        target_dict: Mapping of dataset name to its table dict.
        proteomic_genes: Genes stored as the ``'Proteomic'`` set.
        save_path: Output directory.
        aux_key: Fallback column name passed to ``get_genes`` for a dataset
            that has no column containing ``'aux_'``.

    Raises:
        ValueError: If the number of gene sets (datasets plus
            ``'Proteomic'``) is not 2, 3 or 4. Tables are already written
            at that point.
    """
    # NOTE(review): hard-coded, machine-specific location of the GTEx table.
    fn_exp = 'E:/YandexDisk/Work/pydnameth/human_plasma_proteome/GTEx'
    exp_dict = load_table_dict(fn_exp)
    for tissue in exp_dict:
        # Every column except the two identifier columns is log10-transformed.
        if tissue not in ['Name', 'Description']:
            exp_dict[tissue] = np.log10(np.asarray(exp_dict[tissue]))

    genes = {}
    for dataset in target_dict:
        dataset_table = target_dict[dataset]
        # Resolve the aux column per dataset: the first key containing
        # 'aux_', else the caller-supplied default. The key found for one
        # dataset must not leak into the next one.
        dataset_aux_key = next(
            (key for key in dataset_table if 'aux_' in key), aux_key)
        genes[dataset] = {'gene': get_genes(dataset_table, dataset_aux_key)}
    genes['Proteomic'] = {'gene': proteomic_genes}

    for dataset, gene_table in genes.items():
        print(f"num genes in {dataset}: {len(gene_table['gene'])}")

    sets, sets_with_difference = get_sets(genes, item_key='gene')

    full_path = f'{save_path}/intersection_full'
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(full_path, exist_ok=True)
    for set_key, set_genes in sets.items():
        save_table_dict_xlsx(f'{full_path}/{set_key}', {'gene': list(set_genes)})
        gtex_processing(exp_dict, set_genes, set_key, full_path)

    diff_path = f'{save_path}/intersection_diff'
    os.makedirs(diff_path, exist_ok=True)
    venn_labels = []
    for set_key, set_genes in sets_with_difference.items():
        save_table_dict_xlsx(f'{diff_path}/{set_key}', {'gene': list(set_genes)})
        # One label per region: the key parts and the region size,
        # one per line (<br> is the line break used in the figure).
        label_parts = set_key.split('_') + [str(len(set_genes))]
        venn_labels.append('<br>'.join(label_parts))

    # Layout/trace builders keyed by the number of gene sets.
    venn_builders = {
        4: (get_layout_4, get_trace_4),
        3: (get_layout_3, get_trace_3),
        2: (get_layout_2, get_trace_2),
    }
    num_sets = len(genes)
    if num_sets not in venn_builders:
        raise ValueError('Venn diagram is not supported')
    build_layout, build_trace = venn_builders[num_sets]
    layout = build_layout()
    trace = build_trace(venn_labels)

    fig = {
        'data': [trace],
        'layout': layout,
    }
    save_figure(f'{save_path}/venn', fig)
def _write_figure(figure, fn):
    """Write *figure* as '<fn>.html', '<fn>.png' and '<fn>.pdf'."""
    plotly.offline.plot(figure, filename=f'{fn}.html', auto_open=False, show_link=True)
    pio.write_image(figure, f'{fn}.png')
    pio.write_image(figure, f'{fn}.pdf')


def _plot_expression_density(result_dict, target_keys, fn):
    """Plot one probability-density line per tissue in *target_keys*."""
    plot_data = []
    for t_id, tissue in enumerate(target_keys):
        if len(result_dict[tissue]) > 0:
            xs, ys = get_pdf_x_and_y(result_dict[tissue], num_bins=50)
            color = cl.scales['8']['qual']['Set1'][t_id]
            # Turn 'rgb(r,g,b)' into 'rgba(r,g,b,0.8)'.
            color_border = f'rgba({color[4:-1]},0.8)'
            plot_data.append(
                go.Scatter(x=xs,
                           y=ys,
                           name=tissue,
                           mode='lines',
                           line=dict(width=4, color=color_border),
                           showlegend=True))

    # The axis label says log10 to agree with the combo plot of the same
    # data; the visible caller passes log10-transformed values.
    layout = get_layout('$log_{10}GTEX$', 'Probability density function')
    _write_figure(go.Figure(data=plot_data, layout=layout), fn)


def _plot_expression_bars(result_dict, target_keys, fn):
    """Plot horizontal per-gene bars for every tissue in *target_keys*."""
    color_scales = [
        px.colors.sequential.Reds[2:-2],
        px.colors.sequential.Blues[2:-2],
        px.colors.sequential.Greens[2:-2],
    ]

    traces = []
    base_order = []
    for t_id, tissue in enumerate(target_keys):
        if len(result_dict[tissue]) > 0:
            target_genes = result_dict['Description']
            target_exp = result_dict[tissue]
            # Genes are ordered by descending expression in the first
            # tissue; the other tissues reuse that order.
            if t_id == 0:
                base_order = np.argsort(target_exp)[::-1]
            genes_sorted = list(np.array(target_genes)[base_order])
            exp_sorted = list(np.array(target_exp)[base_order])
            # Bars start at base=-4, so their length is the value plus 4.
            bar_lengths = [value + 4 for value in exp_sorted]
            traces.append(
                go.Bar(orientation='h',
                       name=tissue,
                       y=genes_sorted,
                       x=bar_lengths,
                       base=-4,
                       marker=dict(
                           color=bar_lengths,
                           colorscale=color_scales[t_id],
                           colorbar=dict(
                               showticklabels=False,
                               len=1,
                               # Shift each tissue's colorbar to the right.
                               x=1 + 0.1 * t_id,
                               title=dict(
                                   text=tissue.replace(' ', '<br>'),
                                   font=dict(size=12,
                                             color=color_scales[t_id][-1]),
                                   side='right'),
                           ),
                           showscale=True)))

    layout = go.Layout(
        plot_bgcolor='rgba(233,233,233,0)',
        barmode='overlay',
        showlegend=False,
        autosize=False,
        margin=go.layout.Margin(l=10, r=10, b=10, t=10, pad=0),
        # 15 px per gene row plus one spare row.
        height=15 * (len(base_order) + 1),
        width=1000,
        xaxis=dict(
            gridcolor='rgb(100, 100, 100)',
            mirror=True,
            linecolor='black',
            title='$log_{10}GTEX$',
            autorange=False,
            range=[-4, 5],
            showgrid=False,
            showline=True,
            titlefont=dict(family='Arial', size=30, color='black'),
            showticklabels=True,
            tickangle=0,
            tickfont=dict(family='Arial', size=15, color='black'),
            exponentformat='e',
            showexponent='all',
        ),
        yaxis=dict(
            gridcolor='rgb(100, 100, 100)',
            mirror=True,
            linecolor='black',
            autorange=True,
            showgrid=False,
            showline=True,
            tickangle=0,
            titlefont=dict(family='Arial', size=10, color='black'),
            showticklabels=True,
            tickfont=dict(family='Arial', size=10, color='black'),
            exponentformat='e',
            showexponent='all',
        ),
    )

    fig = go.Figure(data=traces[::-1], layout=layout)
    # The final bar mode is 'group'; it overrides the layout's 'overlay'.
    fig.update_layout(barmode='group')
    _write_figure(fig, fn)


def gtex_processing(exp_dict, genes, main_key, save_path, is_plot=False):
    """Save the expression rows of *genes* and optionally plot them.

    Args:
        exp_dict: Table dict with a ``'Description'`` column holding gene
            names; every column is indexed by row. If a gene name occurs
            more than once, its last row is used.
        genes: Genes to extract; genes absent from ``'Description'`` are
            skipped.
        main_key: File-name stem of the outputs.
        save_path: Output directory.
        is_plot: When true, additionally write a density plot
            (``<main_key>``) and a bar plot (``<main_key>_combo``) for the
            'Whole Blood', 'Liver' and 'Brain - Frontal Cortex (BA9)'
            columns, each as html, png and pdf.
    """
    gene_id_dict = {
        gene: row_id for row_id, gene in enumerate(exp_dict['Description'])
    }
    row_ids = [gene_id_dict[gene] for gene in genes if gene in gene_id_dict]
    result_dict = {
        key: [exp_dict[key][row_id] for row_id in row_ids] for key in exp_dict
    }

    save_table_dict_xlsx(f'{save_path}/{main_key}_expression', result_dict)

    if is_plot:
        target_keys = ['Whole Blood', 'Liver', 'Brain - Frontal Cortex (BA9)']
        _plot_expression_density(result_dict, target_keys,
                                 f'{save_path}/{main_key}')
        _plot_expression_bars(result_dict, target_keys,
                              f'{save_path}/{main_key}_combo')
data_dict = load_table_dict_xlsx(f'{path}/{dataset}.xlsx') data_dict_passed = {} for key in data_dict: data_dict_passed[key] = [] num_cpgs = len(data_dict[cpg_key]) for cpg_id in tqdm(range(0, num_cpgs), desc=f'{dataset} processing'): is_passed = check_condition(data_dict[area_criteria_key][cpg_id], data_dict[slope_criteria_key][cpg_id]) if is_passed: for key in data_dict: data_dict_passed[key].append(data_dict[key][cpg_id]) save_table_dict_xlsx(f'{path}/{dataset}_passed', data_dict_passed) data_dicts_passed[dataset] = data_dict_passed cpgs_dicts_passed[dataset] = data_dict_passed[cpg_key] datasets_ids = list(range(0, len(datasets))) keys_ordered = copy.deepcopy(datasets) sets = {} checking = {} for dataset in datasets: sets[dataset] = set(cpgs_dicts_passed[dataset]) checking[dataset] = 0 for L in range(2, len(datasets) + 1): for subset in itertools.combinations(datasets_ids, L):
result['corr_coeff'].append(corr_coeff) result['p_value'].append(p_value) result['item'].append(cpg) aux = '' if cpg in config_unn.cpg_gene_dict: aux = ';'.join(config_unn.cpg_gene_dict[cpg]) result['aux_unn'].append(aux) aux = '' if cpg in config_other.cpg_gene_dict: aux = ';'.join(config_other.cpg_gene_dict[cpg]) result['aux_other'].append(aux) pvals = np.asarray(result['p_value']) reject, pvals_corr, alphacSidak, alphacBonf = multipletests( pvals, 0.05, method='fdr_bh' ) result['p_value_benjamini_hochberg'] = pvals_corr reject, pvals_corr, alphacSidak, alphacBonf = multipletests( pvals, 0.05, method='bonferroni' ) result['p_value_bonferroni'] = pvals_corr save_table_dict_xlsx(f'{save_path}/pbc_vs_GSE87571', result)
cpg_map_info_dict = subset['cpg_map_info_dict'] cpg_dict = {x: 0 for x in cpg_list} table_dict = load_table_dict_xlsx( f'{path}/{dataset}/{dataset}_{data_type}.xlsx') filtered = {x: [] for x in table_dict.keys()} for cpg_id, cpg in tqdm(enumerate(table_dict['CpG'])): if cpg in cpg_dict: for key in table_dict: filtered[key].append(table_dict[key][cpg_id]) filtered = add_info_to_dict(filtered) save_table_dict_xlsx( f'{path}/{dataset}/{dataset}_{data_type}_filtered', filtered) save_table_dict_pkl(f'{path}/{dataset}/{dataset}_{data_type}_filtered', filtered) all_data[dataset] = filtered transforms = {x: 'lin' for x in metrics} values[dataset] = {} xs[dataset] = {} ys[dataset] = {} for key in metrics: if 'P.Val' in key: values[dataset][key] = -np.log10( np.asarray(filtered[key])[np.nonzero(filtered[key])]) labels[key] = f'-lоg({key})'
else: if opt != '': count_global_mod[opt] = count_global[opt] count_target_mod[opt] = count_target[opt] if opt == '': count_global_mod['NA'] = count_global[opt] count_target_mod['NA'] = count_target[opt] opt = 'NA' odds_ratio, p_value = perform_fisher(count_target_mod[opt], count_global_mod[opt], target_num, global_num) odds_ratios[opt] = odds_ratio p_values[opt] = p_value res_table_dict = defaultdict(list) for opt in orders[var]: res_table_dict[var].append(opt) res_table_dict['number of probes'].append(count_target_mod[opt]) res_table_dict['total number of probes'].append(count_global_mod[opt]) res_table_dict['p-value'].append(p_values[opt]) res_table_dict['odds ratio'].append(odds_ratios[opt]) if not os.path.exists(save_path): os.makedirs(save_path) save_table_dict_xlsx(f'{save_path}/{var}', res_table_dict) x_data = res_table_dict[var] y_data = list(map(float, res_table_dict['odds ratio'])) odds_ratio_plot(x_data, y_data, f'{save_path}/{var}')
f'max_abs_slope_{get_polygon_hash(dataset)}', f'slope_{get_linreg_female_hash(dataset)}', f'slope_{get_linreg_male_hash(dataset)}' ] save_path = f'{get_data_path()}/approaches/sex_specific_not_age_related/{type}' if not os.path.exists(save_path): os.makedirs(save_path) data_dicts = get_data_dicts(datasets, 'aggregator', keys_load, keys_save, get_approach_1_hash, check_condition) cpg_dicts = get_cpg_dicts(data_dicts) for dataset, data_dict in data_dicts.items(): save_table_dict_xlsx(f'{save_path}/{dataset}', data_dict) sets, sets_with_difference = get_sets(datasets, data_dicts) save_dicts = get_cpg_dataset_save_dicts(sets, data_dicts, cpg_dicts) curr_save_path = f'{save_path}/intersection' if not os.path.exists(curr_save_path): os.makedirs(curr_save_path) for key, save_dict in save_dicts.items(): save_table_dict_xlsx(f'{curr_save_path}/{key}', save_dict) save_dicts = get_cpg_dataset_save_dicts(sets_with_difference, data_dicts, cpg_dicts) curr_save_path = f'{save_path}/intersection_with_difference' if not os.path.exists(curr_save_path): os.makedirs(curr_save_path)