def main(key, remade=True):
    table_annotated = key + get_ending("annotation")
    output = get_results_file(key, 'BAD')
    with open(get_merged_badmaps_dict_path(remade=remade), "r") as read_file:
        d = json.load(read_file)
        rev_d = make_reverse_dict(d)
    # Map the annotated table back to its BAD map file via the reverse dict.
    badmap_file_name = rev_d[key]
    print('Now doing {} \n with BAD map file {}'.format(table_annotated, badmap_file_name))
    badmap_file_path = create_badmaps_path_function(badmap_file_name, valid=remade)
    with open(badmap_file_path, 'r') as badmap_file, \
            open(output, 'w') as out, \
            open(table_annotated, 'r') as table_file:
        out.write(pack(['#chr', 'pos', 'ID', 'ref', 'alt', 'ref_read_counts', 'alt_read_counts',
                        'repeat_type'] + callers_names + ['BAD'] +
                       ["Q{:.2f}".format(x) for x in segmentation_states] +
                       ['SNP_count', 'sum_cover']))
        u = UnpackBadSegments(None)
        # Intersect SNPs with BAD segments; keep only dbSNP (rs) variants that fall inside a segment.
        for chr, pos, ID, ref, alt, ref_c, alt_c, repeat_type, in_callers, \
                in_intersection, segment_BAD, segment_snps, segment_snp_ids, \
                segment_sumcov, Qual in \
                Intersection(table_file, badmap_file, write_segment_args=True, write_intersect=True,
                             unpack_snp_function=lambda x: unpack(x, use_in='Pcounter'),
                             unpack_segments_function=lambda x: u.unpack_bad_segments(x, segmentation_states)):
            if in_intersection and ID.startswith('rs'):
                out.write(pack([chr, pos, ID, ref, alt, ref_c, alt_c, repeat_type] +
                               [in_callers[name] for name in callers_names] +
                               [segment_BAD] + [Qual[x] for x in Qual] +
                               [segment_snp_ids, segment_sumcov]))
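# `pack` is used throughout these scripts to serialise one output row. Its
# definition is not shown here; the sketch below only illustrates the assumed
# behaviour (tab-separated fields terminated by a newline) inferred from the
# header rows written above, and is not the project's actual implementation.
#
#     def pack(fields):
#         return '\t'.join(map(str, fields)) + '\n'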
def main(base_path):
    exp = dict()
    with gzip.open(base_path + get_ending('vcf'), 'rt') as f:
        make_dict_from_vcf(f, exp)
    sorted_lines = [[chromosome, pos, ID, REF, ALT, R, A]
                    for ((chromosome, pos, ID, REF, ALT), (R, A)) in exp.items()]
    sorted_lines = sorted(sorted_lines, key=lambda x: x[1])
    sorted_lines = sorted(sorted_lines, key=lambda x: x[0])
    if os.path.exists(repeats_path):
        with open(repeats_path, "r") as repeats_buffer:
            new_arr = []
            for chromosome, pos, ID, REF, ALT, R, A, in_repeats, repeat_type \
                    in Intersection(sorted_lines, repeats_buffer,
                                    write_intersect=True, write_segment_args=True):
                if in_repeats and ID == ".":
                    continue
                new_arr.append([chromosome, pos, ID, REF, ALT, R, A, repeat_type])
            sorted_lines = new_arr
    else:
        sorted_lines = [x + [''] for x in sorted_lines]
    for peak_type in callers_names:
        new_arr = []
        caller_path = make_sorted_caller_path(base_path, peak_type)
        if os.path.isfile(caller_path):
            peak_file = open(caller_path, "r")
        else:
            peak_file = []
        for chromosome, pos, ID, REF, ALT, R, A, repeat_type, *in_peaks in Intersection(
                sorted_lines, peak_file, write_intersect=True):
            new_arr.append([chromosome, pos, ID, REF, ALT, R, A, repeat_type] + in_peaks)
        sorted_lines = new_arr
    table_annotated_path = base_path + get_ending('annotation')
    with open(table_annotated_path, "w") as out:
        out.write(pack(['#chr', 'pos', 'ID', 'ref', 'alt', 'ref_read_counts',
                        'alt_read_counts', 'repeat_type'] + callers_names))
        for split_line in sorted_lines:
            out.write(pack(split_line))
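# The two consecutive sorts above rely on Python's sort being stable: sorting
# by position first and then by chromosome yields chromosome-major,
# position-minor order without building a tuple key. A small illustration:
#
#     rows = [['chr2', 5], ['chr1', 7], ['chr1', 3]]
#     rows = sorted(rows, key=lambda x: x[1])   # by position
#     rows = sorted(rows, key=lambda x: x[0])   # then by chromosome (stable)
#     # rows == [['chr1', 3], ['chr1', 7], ['chr2', 5]]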
def main(files, path_to_output):
    phenotypes_for_db_list = [
        parse_grasp(files['GRASP']),
        parse_ebi(files['EBI']),
        parse_clinvar(files['ClinVar']),
        parse_phewas(files['PheWas']),
        parse_finemapping(files['FineMapping'])
    ]
    phenotypes_ids_dict = {}
    ids_phenotypes_dict = {}
    phenotype_id = 1

    def remove_phen_name_punctuation(phenotype_name):
        return phenotype_name.lower().replace("'", '').replace('_', ' ')

    for db in phenotypes_for_db_list:
        for phenotype in db:
            if remove_phen_name_punctuation(phenotype) not in phenotypes_ids_dict:
                phenotypes_ids_dict[remove_phen_name_punctuation(phenotype)] = phenotype_id
                ids_phenotypes_dict[phenotype_id] = remove_phen_name_punctuation(phenotype)
                phenotype_id += 1
    all_phenotypes = {}
    for i in range(len(phenotypes_for_db_list)):
        for phenotype in phenotypes_for_db_list[i]:
            for s in phenotypes_for_db_list[i][phenotype]:
                if s not in all_phenotypes:
                    all_phenotypes[s] = {x: set() for x in phenotype_db_names}
                all_phenotypes[s][phenotype_db_names[i]].add(
                    phenotypes_ids_dict[remove_phen_name_punctuation(phenotype)])
    print('pheno sizes:', len(phenotypes_ids_dict), len(all_phenotypes))
    with open(path_to_output, 'w') as out:
        header = ['RSID', '#all', '#allbutgrasp', '#allsum', '#allsumbutgrasp'] + \
                 ['#' + x for x in phenotype_db_names] + \
                 phenotype_db_names
        out.write('\t'.join(header) + '\n')
        for s in all_phenotypes:
            abn, phenotypes_without_grasp = set(), set()
            for db in phenotype_db_names:
                if db != 'grasp':
                    phenotypes_without_grasp.update(all_phenotypes[s][db])
                abn.update(all_phenotypes[s][db])
            bb = [len(all_phenotypes[s][x]) for x in phenotype_db_names]
            bb = [
                sum(len(all_phenotypes[s][x]) for x in phenotype_db_names),
                sum(len(all_phenotypes[s][x]) for x in phenotype_db_names[1:])
            ] + bb
            bb = [len(abn), len(phenotypes_without_grasp)] + bb
            cc = [
                ';'.join(sorted([ids_phenotypes_dict[y] for y in all_phenotypes[s][x]]))
                for x in phenotype_db_names
            ]
            out.write(pack(['rs{}'.format(s), *bb, *cc]))
            for x in phenotype_db_names:
                all_phenotypes[s][x] = len(all_phenotypes[s][x])
    print('{} is successfully created'.format(path_to_output))
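# For reference, remove_phen_name_punctuation above lower-cases the phenotype
# name, strips apostrophes and turns underscores into spaces, so spelling
# variants of the same phenotype collapse to a single phenotype_id:
#
#     >>> "Alzheimer's_Disease".lower().replace("'", '').replace('_', ' ')
#     'alzheimers disease'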
def correlation_with_cosmic(SNP_objects, mode, method='normal', heatmap_data_file=None,
                            cell_line_name='', cosmic_names=None):
    if cosmic_names is None:
        cosmic_names = {}
    heatmap = None if heatmap_data_file is None else open(heatmap_data_file, 'w')
    cosmic_segments = []
    with open(cosmic_path, 'r') as cosmic_file:
        for line in cosmic_file:
            cosmic_cell_line_segments = unpack_cosmic_segments(line, mode=mode,
                                                               cell_line_name=cell_line_name,
                                                               cosmic_names=cosmic_names)
            if cosmic_cell_line_segments:
                cosmic_segments.append(cosmic_cell_line_segments)
    snp_BAD_list = []
    cosmic_BAD_list = []
    if method == 'normal':
        for chromosome, pos, snp_BAD, quals, in_intersect, cosmic_BAD \
                in Intersection(SNP_objects, cosmic_segments,
                                write_intersect=True, write_segment_args=True):
            if not in_intersect:
                continue
            snp_BAD_list.append(snp_BAD)
            cosmic_BAD_list.append(cosmic_BAD)
            if heatmap is not None:
                heatmap.write(pack([chromosome, pos, snp_BAD, cosmic_BAD]))
        if heatmap is not None:
            heatmap.close()
        if len(snp_BAD_list) != 0:
            kt = kendalltau(snp_BAD_list, cosmic_BAD_list)[0]
            if kt != kt:  # kendalltau returns float('nan'), not the string 'nan'
                return 'NaN'
            return kt
        return 'NaN'
    elif method == 'cover':
        for chromosome, pos, cov, snp_BAD, quals, in_intersect, cosmic_BAD \
                in Intersection(SNP_objects, cosmic_segments,
                                write_intersect=True, write_segment_args=True):
            if not in_intersect:
                continue
            snp_BAD_list.append(snp_BAD)
            cosmic_BAD_list.append(cosmic_BAD)
            if heatmap is not None:
                heatmap.write(pack([chromosome, pos, cov, snp_BAD, cosmic_BAD] +
                                   [quals[x] for x in quals]))
        if heatmap is not None:
            heatmap.close()
        if len(snp_BAD_list) != 0:
            kt = kendalltau(snp_BAD_list, cosmic_BAD_list)[0]
            if kt != kt:  # NaN check: NaN never equals itself
                return 'NaN'
            return kt
        return 'NaN'
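# Note on the correlation computed above: scipy.stats.kendalltau returns a
# (statistic, p-value) pair, so the trailing [0] keeps only the rank
# correlation coefficient. A minimal, self-contained example:
#
#     from scipy.stats import kendalltau
#     tau, p_value = kendalltau([1, 2, 2, 3], [1, 1, 2, 3])
#     # tau is a float in [-1, 1]; it is NaN when either sequence is constant,
#     # which is the case the NaN check above converts to the string 'NaN'.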
def main(what_for, key_name, remade=True):
    check_if_in_expected_args(what_for)
    table_path = get_result_table_path(what_for, key_name)
    with open(get_merged_badmaps_dict_path(remade=remade), "r") as read_file:
        old_rev_d = make_reverse_dict(json.load(read_file))
        rev_d = {get_results_file(k, 'p-value', False): v for k, v in old_rev_d.items()}
    tables = []
    if what_for == "CL":
        tables = cell_lines_dict[key_name]
    if what_for == "TF":
        tables = tf_dict[key_name]
    print('Reading datasets for {} {}'.format(what_for, key_name))
    common_snps = dict()
    for table in tables:
        if os.path.isfile(table) and is_valid(split_ext_recursive(table), rev_d, remade=remade):
            table_name = get_name(table)
            another_agr = get_another_agr(table, what_for)
            with open(table, 'r') as file:
                for line in file:
                    try:
                        (chromosome, pos, ID, ref, alt, ref_c, alt_c, repeat, in_callers,
                         BAD, Quals, seg_c, sum_cov, p_ref, p_alt,
                         es_ref, es_alt) = unpack(line, use_in="Aggregation")
                    except ValueError:
                        if line.startswith('#'):
                            continue
                        else:
                            raise
                    if np.isnan(p_ref) or ID == '.':
                        continue
                    cov = ref_c + alt_c
                    try:
                        common_snps[(chromosome, pos, ID, ref, alt, repeat)].append(
                            (cov, ref_c, alt_c, in_callers, BAD, Quals, seg_c, sum_cov,
                             p_ref, p_alt, es_ref, es_alt, table_name, another_agr))
                    except KeyError:
                        common_snps[(chromosome, pos, ID, ref, alt, repeat)] = [
                            (cov, ref_c, alt_c, in_callers, BAD, Quals, seg_c, sum_cov,
                             p_ref, p_alt, es_ref, es_alt, table_name, another_agr)
                        ]
        else:
            print("There is no {}".format(table))
    print('Writing {}'.format(key_name))
    with open(table_path, 'w') as out:
        out.write(pack([
            '#chr', 'pos', 'ID', 'ref', 'alt', 'repeat_type', 'n_peak_calls',
            'n_peak_callers', 'mean_BAD', 'mean_SNP_per_segment', 'n_aggregated',
            'refc_mostsig_ref', 'altc_mostsig_ref', 'BAD_mostsig_ref', 'es_mostsig_ref',
            'p_mostsig_ref', 'refc_mostsig_alt', 'altc_mostsig_alt', 'BAD_mostsig_alt',
            'es_mostsig_alt', 'p_mostsig_alt', 'min_cover', 'max_cover', 'median_cover',
            'total_cover', 'es_mean_ref', 'es_mean_alt', 'logitp_ref', 'logitp_alt'
        ]))
        SNP_counter = 0
        print('{} snps'.format(len(common_snps)))
        if len(common_snps) == 0:
            os.remove(table_path)
            sys.exit(0)
        origin_of_snp_dict = OrderedDict()
        keys = list(common_snps.keys())
        keys = sorted(keys, key=lambda chr_pos: chr_pos[1])
        keys = sorted(keys, key=lambda chr_pos: chr_pos[0])
        for key in keys:
            chromosome, pos, ID, ref, alt, repeat = key
            value = common_snps[key]
            SNP_counter += 1
            if SNP_counter % 10000 == 0:
                print('done {}'.format(SNP_counter))
            unique_callers_counter = dict(zip(callers_names, [False] * len(callers_names)))
            total_callers_counter = 0
            BAD_array = []
            SNPs_per_segment_array = []
            p_ref_array = []
            p_alt_array = []
            cover_array = []
            ref_effect_size_array = []
            alt_effect_size_array = []
            table_names_array = []
            another_agr_name = []
            ref_counts_array = []
            alt_counts_array = []
            for v in value:
                cov, ref_c, alt_c, in_callers, BAD, Quals, seg_c, sum_cov, p_ref, p_alt, \
                    es_ref, es_alt, table_name, another_agr = v
                table_names_array.append(table_name)
                another_agr_name.append(another_agr)
                for caller in callers_names:
                    unique_callers_counter[caller] = unique_callers_counter[caller] or in_callers[caller]
                    total_callers_counter += in_callers[caller]
                BAD_array.append(BAD)
                SNPs_per_segment_array.append(seg_c)
                p_ref_array.append(p_ref)
                p_alt_array.append(p_alt)
                if not np.isnan(es_ref):
                    ref_effect_size_array.append(es_ref / np.log(2))
                if not np.isnan(es_alt):
                    alt_effect_size_array.append(es_alt / np.log(2))
                cover_array.append(cov)
                ref_counts_array.append(ref_c)
                alt_counts_array.append(alt_c)
                p = 1 / (BAD + 1)  # not used further in this function
            min_cover = min(cover_array)
            max_cover = max(cover_array)
            med_cover = median_grouped(cover_array)
            total_cover = sum(cover_array)
            unique_callers = sum(unique_callers_counter[caller] for caller in callers_names)
            mean_BAD = np.round(np.mean(BAD_array), 2)
            mean_SNPs_per_segment = np.round(np.mean(SNPs_per_segment_array), 1)
            n_aggregated = len(value)
            logitp_ref = logit_combine_p_values(p_ref_array)
            logitp_palt = logit_combine_p_values(p_alt_array)
            if ref_effect_size_array:
                weights = [-1 * np.log10(x) for x in p_ref_array if x != 1]
                es_mean_ref = np.round(np.average(ref_effect_size_array, weights=weights), 3)
                es_mostsig_ref = ref_effect_size_array[int(np.argmax(weights))]
                idx = int(np.argmax([-x for x in p_ref_array]))
                p_mostsig_ref = p_ref_array[idx]
                ref_c_mostsig_ref = ref_counts_array[idx]
                alt_c_mostsig_ref = alt_counts_array[idx]
                BAD_mostsig_ref = BAD_array[idx]
            else:
                es_mean_ref = 'NaN'
                es_mostsig_ref = 'NaN'
                ref_c_mostsig_ref = 'NaN'
                p_mostsig_ref = 'NaN'
                alt_c_mostsig_ref = 'NaN'
                BAD_mostsig_ref = 'NaN'
            if alt_effect_size_array:
                weights = [-1 * np.log10(x) for x in p_alt_array if x != 1]
                es_mean_alt = np.round(np.average(alt_effect_size_array, weights=weights), 3)
                es_mostsig_alt = alt_effect_size_array[int(np.argmax(weights))]
                idx = int(np.argmax([-x for x in p_alt_array]))
                p_mostsig_alt = p_alt_array[idx]
                ref_c_mostsig_alt = ref_counts_array[idx]
                alt_c_mostsig_alt = alt_counts_array[idx]
                BAD_mostsig_alt = BAD_array[idx]
            else:
                es_mean_alt = 'NaN'
                es_mostsig_alt = 'NaN'
                ref_c_mostsig_alt = 'NaN'
                p_mostsig_alt = 'NaN'
                alt_c_mostsig_alt = 'NaN'
                BAD_mostsig_alt = 'NaN'
            out.write(pack([
                chromosome, pos, ID, ref, alt, repeat, total_callers_counter, unique_callers,
                mean_BAD, mean_SNPs_per_segment, n_aggregated, ref_c_mostsig_ref,
                alt_c_mostsig_ref, BAD_mostsig_ref, es_mostsig_ref, p_mostsig_ref,
                ref_c_mostsig_alt, alt_c_mostsig_alt, BAD_mostsig_alt, es_mostsig_alt,
                p_mostsig_alt, min_cover, max_cover, med_cover, total_cover,
                es_mean_ref, es_mean_alt, logitp_ref, logitp_palt
            ]))
            origin_of_snp_dict["\t".join(map(str, key))] = {
                'aligns': table_names_array,
                expected_args[what_for]: another_agr_name,
                'ref_counts': ref_counts_array,
                'alt_counts': alt_counts_array,
                'ref_ef': ref_effect_size_array,
                'alt_ef': alt_effect_size_array,
                'BAD': BAD_array,
                'ref_pvalues': p_ref_array,
                'alt_pvalues': p_alt_array,
            }
    print("Counting FDR")
    table = pd.read_table(table_path)
    if table.empty:
        os.remove(table_path)
        sys.exit(0)
    mc_filter_array = np.array(table['max_cover'] >= 20)
    if sum(mc_filter_array) != 0:
        bool_ar_ref, p_val_ref, _, _ = statsmodels.stats.multitest.multipletests(
            table[mc_filter_array]["logitp_ref"], alpha=0.05, method='fdr_bh')
        bool_ar_alt, p_val_alt, _, _ = statsmodels.stats.multitest.multipletests(
            table[mc_filter_array]["logitp_alt"], alpha=0.05, method='fdr_bh')
    else:
        p_val_ref = []
        p_val_alt = []
        bool_ar_ref = []
        bool_ar_alt = []
    fdr_by_ref = np.array(['NaN'] * len(table.index), dtype=np.float128)
    fdr_by_ref[mc_filter_array] = p_val_ref
    table["fdrp_bh_ref"] = fdr_by_ref
    fdr_by_alt = np.array(['NaN'] * len(table.index), dtype=np.float128)
    fdr_by_alt[mc_filter_array] = p_val_alt
    table["fdrp_bh_alt"] = fdr_by_alt
    table.to_csv(table_path, sep="\t", index=False)
    bool_ar = np.array([False] * len(table.index), dtype=bool)
    bool_ar[mc_filter_array] = bool_ar_alt + bool_ar_ref
    with open(os.path.join(results_path, what_for + '_DICTS/{}.json'.format(key_name)), 'w') as out:
        json.dump(origin_of_snp_dict, out)
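# The FDR step above uses the Benjamini-Hochberg procedure from statsmodels.
# A minimal, self-contained illustration of the call (the p-values are made
# up for the example):
#
#     from statsmodels.stats.multitest import multipletests
#     reject, p_adjusted, _, _ = multipletests([0.001, 0.02, 0.2, 0.8],
#                                              alpha=0.05, method='fdr_bh')
#     # reject     -> boolean mask of tests significant at FDR 0.05
#     # p_adjusted -> BH-adjusted p-values, in the same order as the input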
def main(remake=False):
    correlation_file_path = get_correlation_file_path(remake=remake)
    cor_df_test = find_test_datasets(correlation_file_path)
    test_dfs = open_dfs(cor_df_test, remake=remake, concat=False)
    print('Test concatenated')
    min_tr, max_tr = 20, 75
    results = []
    for dataset, dataset_df in test_dfs:
        cov_dfs_test = {}
        for cov in dataset_df['cov'].unique():
            cov_dfs_test[cov] = dataset_df[dataset_df['cov'] == cov].copy()
        print('Split test {}'.format(dataset))
        args, vals, covs = construct_total_dist(cov_dfs_test, min_tr=min_tr, max_tr=max_tr)
        results.append({
            'args': args,
            'vals': vals,
            'covs': covs,
            'dataset': dataset,
            'snps': len(dataset_df.index)
        })
    with open(os.path.expanduser('~/cov_res_debug.json'), 'w') as f:
        json.dump(results, f)
    cors = pd.read_table(correlation_file_path)
    if not remake:
        # collect_stats
        cell_line_data = {}
        for d in results:
            if d['args']:
                line, cells = d['dataset'].split('@')
                cor = cors[(cors['#cell_line'] == line) &
                           (cors['cells'] == cells)]['cor_by_snp_CAIC'].tolist()
                assert len(cor) == 1
                cor = cor[0]
                if not pd.isna(cor):
                    cell_line_data.setdefault(line, {
                        'correlations': [],
                        'cells': [],
                        'snps': []
                    })
                    cell_line_data[line]['correlations'].append(cor)
                    cell_line_data[line]['cells'].append(cells)
                    cell_line_data[line]['snps'].append(sum(d['vals']))
        # construct big cell lines
        big_cell_lines = set()
        cell_line_reference = {}
        for line, data in cell_line_data.items():
            if len(data['correlations']) < 4:
                continue
            cor_treshold = np.quantile(data['correlations'], 0.75)
            datasets = [(cells, cor, snps) for cells, cor, snps in
                        zip(data['cells'], data['correlations'], data['snps'])
                        if cor >= cor_treshold]
            snps = sum(x[2] for x in datasets)
            if snps >= 25000:
                cell_line_reference[line] = [x[0] for x in datasets]
                big_cell_lines.add(line)
    else:
        prev_excluded = pd.read_table(get_excluded_badmaps_list_path(remake=False))
        big_cell_lines = set()
        cell_line_reference = {}
        for index, row in prev_excluded.iterrows():
            if row['is_ref']:
                big_cell_lines.add(row['#cell_line'])
                cell_line_reference.setdefault(row['#cell_line'], []).append(row['sample'])
    big_cell_lines = list(big_cell_lines)
    ref_dists = {x: {} for x in big_cell_lines + ['Other']}
    ref_vars = {x: {} for x in big_cell_lines + ['Other']}
    all_vars = []
    all_metrics = []
    all_cells = []
    all_lines = []
    all_sizes = []
    all_is_ref = []
    for d in results:
        if d['args']:
            line, cells = d['dataset'].split('@')
            dist = dict(zip(d['args'], d['vals']))
            if line in big_cell_lines:
                if cells in cell_line_reference[line]:
                    ref_dists[line] = update_dist(ref_dists[line], dist)
                    ref_dists['Other'] = update_dist(ref_dists[line], dist)
    for key in ref_dists:
        ref_dists[key] = transform_dist_to_list(ref_dists[key])
        ref_vars[key] = np.nanstd(ref_dists[key])
    print(ref_vars)
    for d in results:
        if d['args']:
            dist = dict(zip(d['args'], d['vals']))
            line, cells = d['dataset'].split('@')
            snps = d['snps']
            flat_dist = transform_dist_to_list(dist)
            ref_dist = ref_dists[line if line in big_cell_lines else 'Other']
            if not flat_dist or len(flat_dist) == 0:
                continue
            if not ref_dist or len(ref_dist) == 0:
                print('Empty ref dist for {}'.format(line))
                exit(1)
            stat, p = levene(flat_dist, ref_dist)
            assert not pd.isna(p)
            all_vars.append((np.nanstd(flat_dist),
                             ref_vars[line if line in big_cell_lines else 'Other']))
            all_metrics.append(p)
            all_cells.append(cells)
            all_lines.append(line)
            all_sizes.append(snps)
            all_is_ref.append(True if line in big_cell_lines and cells in cell_line_reference[line]
                              else False)
    _, all_fdr, _, _ = statsmodels.stats.multitest.multipletests(
        all_metrics, alpha=0.05, method='fdr_bh')
    with open(get_excluded_badmaps_list_path(remake=remake), 'w') as out:
        out.write(pack([
            '#cell_line', 'sample', 'size', 'dataset_es_var', 'ref_es_var', 'fdr', 'is_ref'
        ]))
        for fdr, size, line, ce, var, ref in zip(all_fdr, all_sizes, all_lines,
                                                 all_cells, all_vars, all_is_ref):
            out.write(pack([line, ce, size, var[0] ** 2, var[1] ** 2, fdr, ref]))
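# The per-dataset comparison above is Levene's test for equality of variances
# (scipy.stats.levene) between each dataset's distribution and the pooled
# reference distribution of its cell line; the two variances are reported as
# dataset_es_var and ref_es_var in the output header. A small illustration
# with made-up samples:
#
#     from scipy.stats import levene
#     stat, p = levene([0.1, 0.2, 0.15, 0.3], [0.5, 1.2, 0.05, 0.9])
#     # a small p indicates the two samples are unlikely to share one variance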
def main(file_name, remake=False):
    correlation_path = get_correlation_path()
    with open(badmaps_dict_path, 'r') as file:
        aligns_by_cell_type = json.loads(file.readline().strip())
    modes = []
    for dir_name in sorted(os.listdir(get_badmaps_path_by_validity())):
        if os.path.isdir(os.path.join(get_badmaps_path_by_validity(), dir_name)):
            modes.append(dir_name)
    try:
        assert os.path.isfile(os.path.join(badmaps_path, 'merged_vcfs', file_name))
    except AssertionError:
        print(os.path.join(badmaps_path, 'merged_vcfs', file_name), file_name)
        exit(1)
    name = file_name.split('@')[0]
    lab = os.path.splitext(file_name.split('@')[1])[0]
    try:
        aligns = aligns_by_cell_type[file_name[:-4]]  # .tsv
        al_list = [os.path.basename(align) for align in aligns if os.path.isfile(align)]
        datasetsn = len(al_list)
    except KeyError:
        datasetsn = 'nan'
        al_list = []
        print(file_name)
    table_path = os.path.join(badmaps_path, 'merged_vcfs', file_name)
    for mode in modes:
        if re.match(r'^CAIC@.+@.+$', mode) is not None:
            states = get_states(mode.split('@')[1])
        else:
            states = get_states('')
        out_dir = os.path.join(correlation_path,
                               mode + '_tables{}'.format('_filtered' if remake else ''))
        if not os.path.isdir(out_dir):
            try:
                os.mkdir(out_dir)
            except OSError as exc:
                if exc.errno != errno.EEXIST:
                    raise
        badmaps_file_path = os.path.join(get_badmaps_path_by_validity(valid=remake), mode,
                                         name + '@' + lab + '.badmap.tsv')
        out_path = os.path.join(out_dir, name + '@' + lab + '.tsv')
        print(out_path)
        u = UnpackBadSegments(0)
        with open(table_path, 'r') as table, open(badmaps_file_path, 'r') as BADmap_file, \
                open(out_path, 'w') as out:
            out.write('#' + str(datasetsn) + '@' + lab + '@' + ','.join(al_list) + '\n')
            for chrom, pos, ref, alt, filename, in_intersection, segment_BAD, segment_snps, \
                    segment_snp_ids, segment_sumcov, segment_id, Qual \
                    in Intersection(table, BADmap_file,
                                    unpack_segments_function=lambda x: u.unpack_bad_segments(x, states),
                                    unpack_snp_function=unpack_snps,
                                    write_intersect=True, write_segment_args=True):
                if not in_intersection:
                    continue
                p_value = get_p_value(ref + alt, 1 / (segment_BAD + 1), min(ref, alt))
                out.write(pack([chrom, pos, ref, alt, segment_BAD] +
                               [Qual[x] for x in Qual] +
                               [segment_snp_ids, segment_sumcov] +
                               [filename, segment_id, p_value]))
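# get_p_value above is not defined in this file. From the call site it is
# assumed to score how surprising the observed allelic imbalance is under a
# binomial model with success probability 1 / (BAD + 1); the sketch below is
# a guess at that behaviour (a one-sided binomial CDF of the minor allele
# count), not the project's actual implementation:
#
#     from scipy.stats import binom
#     def get_p_value_sketch(n, p, x):
#         # probability of seeing at most x minor-allele reads out of n
#         return binom.cdf(x, n, p)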