def run():
    """Run the SignatureAnalyzer vs SigProfiler comparison for every drug.

    For each drug key of the SignatureAnalyzer mapping below,
    ``process_exposures_methods`` is called with both exposure tables, the
    drug name, both drug -> signature mappings and the ``figures/`` output
    directory.
    """
    # Kept from the original workflow; the returned metadata is not used here.
    dic_primary_full, _ = return_metadata()

    # Drug -> signature identifiers as named in the SignatureAnalyzer table.
    signature_2_treatment_signature_analyzer = {
        'CISPLATIN': ['21_SBS31_0.953955_1', '14_1'],
        'CARBOPLATIN': ['21_SBS31_0.953955_1', '25_1'],
        '5-FU_CAPE': ['31_SBS17b_0.968799_1'],
        'OXALIPLATIN': ['14_1', '37_1'],
    }

    # Drug -> signature identifiers as named in the SigProfiler table.
    signature_2_treatment_sigprofiler = {
        'CISPLATIN': ['1_SBS31_0.968153_0.98'],
        'CARBOPLATIN': ['1_SBS31_0.968153_0.98'],
        'OXALIPLATIN': ['20_0.92'],
        '5-FU_CAPE': ['19_SBS17b_0.961548_0.99'],
    }

    outpath = 'figures/'
    path_base = 'data/hartwig/signatures/extraction/results/{}/snvs/exposures/'

    siganalyzer_dir = path_base.format("SignatureAnalyzer")
    sigprofiler_dir = path_base.format("SigProfiler")
    path_siganalyzer = siganalyzer_dir + "Pan_full/Pan_full.exposures.tsv"
    path_sigprofiler = (
        sigprofiler_dir
        + "PanNoSkinNoUnknown.snvs/PanNoSkinNoUnknown.snvs.exposures.tsv"
    )

    for drug in signature_2_treatment_signature_analyzer:
        process_exposures_methods(
            path_siganalyzer,
            path_sigprofiler,
            drug,
            signature_2_treatment_signature_analyzer,
            signature_2_treatment_sigprofiler,
            outpath,
        )
def __init__(self, exposures_path, discriminate_path, matrix_treatments_path, ttype='Pan'):
    """Load the input tables and set the filtering thresholds.

    :param exposures_path: tab-separated exposures table; it is read with the
        first column as index and stored transposed in ``self.exposures``.
    :param discriminate_path: tab-separated table; only rows whose ``ttype``
        column equals *ttype* are kept, and the listed columns are cast to
        fixed dtypes.
    :param matrix_treatments_path: tab-separated table read with the first
        column as index and stored in ``self.treatments``.
    :param ttype: tumour-type label; ``'Pan'`` selects every sample returned
        by ``return_metadata()``.
    """
    raw_exposures = pd.read_csv(exposures_path, sep='\t', index_col=0)
    self.exposures = raw_exposures.transpose()

    column_types = {
        'signature': str,
        'effect_size': float,
        'pvals': float,
        'treatment': str,
        'ttype': str,
        'total_treated': float
    }
    discriminate = pd.read_csv(discriminate_path, sep='\t')
    discriminate = discriminate[discriminate['ttype'] == ttype]
    self.discriminate = discriminate.astype(column_types)

    self.treatments = pd.read_csv(matrix_treatments_path, sep='\t', index_col=0)
    self.ttype = ttype

    dic_ttypes, _ = return_metadata()
    if ttype != 'Pan':
        # Restrict to the samples annotated with the requested tumour type.
        self.samples = [
            sample for sample, label in dic_ttypes.items() if label == ttype
        ]
    else:
        self.samples = dic_ttypes.keys()

    self.effect_thresh = EFFECT_SIZE  # min effect size
    self.pvalue_thresh = PVALUE  # empirical_pvalue threshold
    self.overlap_thresh = OVERLAP  # min overlap to consider cotreatments
def _treated_signature_load(exposures, signatures, treated):
    """Sum the given signatures for every treated sample of an exposure table.

    :param exposures: DataFrame iterated row-wise (one row per sample).
    :param signatures: labels of the signatures to add up in each row.
    :param treated: container of treated sample names (membership test only).
    :return: ``(counts, fractions)`` defaultdicts keyed by sample, holding the
        summed exposure and that sum divided by the row total.
    """
    counts = defaultdict(float)
    fractions = defaultdict(float)
    for sample, row in exposures.iterrows():
        if sample in treated:
            # Computed once per sample instead of once per output dict.
            signature_total = row.loc[signatures].sum()
            counts[sample] = signature_total
            fractions[sample] = signature_total / row.sum()
    return counts, fractions


def process_exposures_methods(path_siganalyzer, path_sigprofiler, drug,
                              signature_2_treatment_signature_analyzer,
                              signature_2_treatment_sigprofiler, outpath):
    """Compare the drug-related exposure obtained by both extraction methods.

    Only samples listed under ``treated_samples[drug]['Pan']['YES']`` in the
    pickle referenced by ``Conf['treatment_specific_drug']`` are considered.
    For each method, the exposures of the signatures mapped to *drug* are
    summed per sample (absolute value and fraction of the sample total) and
    handed to the plotting helpers.

    Unlike the previous version, samples are no longer looked up in the
    metadata mapping: that lookup only fed dictionaries that were never read.

    :param path_siganalyzer: path of the SignatureAnalyzer exposures table.
    :param path_sigprofiler: path of the SigProfiler exposures table.
    :param drug: key into both signature mappings and the treatment pickle.
    :param signature_2_treatment_signature_analyzer: drug -> signature labels.
    :param signature_2_treatment_sigprofiler: drug -> signature labels.
    :param outpath: passed through to the plotting helpers.
    """
    df_sigpro = read_exposures(path_sigprofiler)
    df_sigan = read_exposures(path_siganalyzer)

    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    # The context manager closes the gzip handle, which was previously leaked.
    with gzip.open(Conf['treatment_specific_drug']) as fd:
        treated_samples = pickle.load(fd)
    treated = treated_samples[drug]['Pan']['YES']

    dsiganalyzer_simple, dsiganalyzer_simple_exposure = _treated_signature_load(
        df_sigan, signature_2_treatment_signature_analyzer[drug], treated)
    dsigprof_simple, dsigprof_simple_exposure = _treated_signature_load(
        df_sigpro, signature_2_treatment_sigprofiler[drug], treated)

    barplot_classes(dsigprof_simple_exposure, dsiganalyzer_simple_exposure,
                    drug, outpath, 'count')
    plot_single_distribution(dsigprof_simple, dsiganalyzer_simple, drug,
                             outpath, 'count')
    plot_single_correlation(dsigprof_simple, dsiganalyzer_simple, drug,
                            outpath, 'count')
    plot_single_distribution(dsigprof_simple_exposure,
                             dsiganalyzer_simple_exposure, drug, outpath,
                             'exposure')
    plot_single_correlation(dsigprof_simple_exposure,
                            dsiganalyzer_simple_exposure, drug, outpath,
                            'exposure')
    # The return value was never used; the call itself is kept.
    get_samples_similar_exposure(dsigprof_simple_exposure,
                                 dsiganalyzer_simple_exposure, drug, outpath,
                                 'exposure')
def merge_samples_timing(input_file, outpath):
    """Merge per-sample mutation tables into Pan-level gzipped TSV files.

    Every file matching the glob pattern *input_file* whose sample name is a
    key of the metadata mapping is read, stripped of the columns listed in
    ``to_remove`` and concatenated. The merged table gets a ``PRIMARY`` column
    (``SAMPLE`` mapped through the metadata), is sorted by ``CHROM``/``POS``
    and written to ``Pan.snvs.gz``, ``Pan.indels.gz``, ``Pan.dbs.gz`` and
    ``Pan.all.gz`` inside *outpath*.

    :param input_file: glob pattern of the per-sample files. The sample name
        is the second ``_``-separated token of the basename, up to its first
        ``.``.
    :param outpath: output directory, created if missing.
    :raises ValueError: if no matching file belongs to a known sample.
    """
    os.makedirs(outpath, exist_ok=True)
    dic_primary_full, _ = return_metadata()

    all_hartwig = glob(input_file)
    to_remove = [
        'EXTENDED', 'MAJOR_CN', 'MINOR_CN', 'TOTAL_CN', 'NORMAL_CN',
        'VAR_COUNTS', 'REF_COUNTS', 'GENDER', 'PURITY'
    ]

    all_files = []
    for f in tqdm(all_hartwig, total=len(all_hartwig)):
        # remove those that do not belong to a good origin
        sample_name = os.path.basename(f).split('_')[1].split('.')[0]
        if sample_name in dic_primary_full:
            df = pd.read_csv(f, sep='\t', low_memory=False)
            all_files.append(df.drop(to_remove, axis=1))

    if not all_files:
        # pd.concat would fail with a bare "No objects to concatenate".
        raise ValueError(
            'No sample files to merge for pattern {!r}'.format(input_file))

    hart_df = pd.concat(all_files)
    hart_df['PRIMARY'] = hart_df['SAMPLE'].map(dic_primary_full)
    hart_df.sort_values(by=['CHROM', 'POS'], inplace=True)

    # One file per mutation class.
    for mut_class, suffix in (('SNV', 'snvs'), ('INDEL', 'indels'),
                              ('DBS', 'dbs')):
        outfile = '{}/Pan.{}.gz'.format(outpath, suffix)
        hart_df[hart_df['CLASS'] == mut_class].to_csv(
            outfile, sep='\t', header=True, index=False, compression='gzip')

    # save ALL file
    outfile = '{}/Pan.all.gz'.format(outpath)
    hart_df.to_csv(outfile, sep='\t', header=True, index=False,
                   compression='gzip')
def create_matrix_treatments_plot():
    """Build a sample x treatment 0/1 matrix from the FDA-drug pickle.

    The pickle at ``Conf['treatment_FDA_drug']`` is indexed as
    ``treated[treatment][tumour_type]['YES' | 'NO']``. For every sample in
    the metadata whose tumour type is not in ``forbidden``, a cell is set to
    1 when the sample is listed under ``'YES'`` and to 0 when listed under
    ``'NO'``; rows with any missing cell are dropped.

    :return: ``(out, dic_t)`` where *out* is the resulting DataFrame and
        *dic_t* maps each tumour type to the samples kept in *out*.
    """
    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    # The context manager closes the gzip handle, which was previously leaked.
    with gzip.open(Conf['treatment_FDA_drug']) as fd:
        treated = pickle.load(fd)

    dic_primary, _ = return_metadata()
    matrix_treated = defaultdict(lambda: defaultdict(int))
    forbidden = ('CUP', 'Eye', 'Double-primary', 'Unknown')

    for sample, t in dic_primary.items():
        if t in forbidden:
            continue
        for k, d in treated.items():
            by_status = d[t]
            if sample in by_status['YES']:
                matrix_treated[sample][k] = 1
            elif sample in by_status['NO']:
                matrix_treated[sample][k] = 0

    out = pd.DataFrame.from_dict(matrix_treated, orient='index')
    out = out.dropna()

    # Group the retained samples by tumour type; only the index is needed.
    dic_t = defaultdict(list)
    for sample in out.index:
        dic_t[dic_primary[sample]].append(sample)

    return out, dic_t
# get_items_to_plot(samples_set, expos, sig)
#
# Reads the tab-separated table at `expos` (first column as index) and transposes it,
# so that samples are looked up in the index and `sig` is looked up as a column.
# For every sample in `samples_set` the metadata label from return_metadata() (called
# `ttype` in the code) is fetched first, so a sample missing from that mapping raises
# KeyError even when it is absent from the table. Samples absent from the table are skipped.
#
# Returns four defaultdicts keyed by that label:
#   vals_to_plot  - list of values read from column `sig`
#   colors        - list of colours taken from return_colors() for the label
#   count_exposed - number of samples whose value is greater than 1
#   count_total   - number of samples counted for the label
#
# NOTE(review): the original indentation of this function was lost. It is not possible to
# tell from this line whether the value/colour appends (and the count_total increment) sit
# inside the `sample_exp > 1` branch or directly under `if sample in df.index` - confirm
# against the formatted source before relying on either reading.
def get_items_to_plot(samples_set, expos, sig): df = pd.read_csv(expos, sep='\t', index_col=0) df = df.T dic_primary_full, _ = return_metadata() colors_dict = return_colors() vals_to_plot = defaultdict(list) colors = defaultdict(list) count_exposed = defaultdict(int) count_total = defaultdict(int) for sample in samples_set: ttype = dic_primary_full[sample] if sample in df.index: sample_exp = df.loc[sample][sig] if sample_exp > 1: count_exposed[ttype] += 1 vals_to_plot[ttype].append(sample_exp) colors[ttype].append(colors_dict[ttype]) count_total[ttype] += 1 return vals_to_plot, colors, count_exposed, count_total
def plot_heatmap_treatment(file):
    """Draw the treatment heatmap for all samples, grouped by tumour type.

    The gzipped pickle *file* is indexed as
    ``treated[treatment][tumour_type]['YES' | 'NO']``. A sample x treatment
    0/1 matrix is built (treatments in the first ``forbidden`` list are
    skipped), samples without any treatment are removed, and within each
    tumour type the samples are reordered with
    ``classic_mutual_exclusivity_visualization``. The 20 treatments with the
    most treated samples are drawn as a heatmap next to a stacked colour bar
    of tumour types, saved as ``figures/EDF1_<basename>.png``.

    :param file: path of the gzipped pickle; its basename up to the first
        ``.`` names the output figure.
    """
    outf = os.path.basename(file).split('.')[0]
    dic_primary_full, _ = return_metadata()
    color_ttype = return_colors()

    # Tumour types ordered by decreasing number of samples.
    total_count = defaultdict(int)
    for t in dic_primary_full.values():
        total_count[t] += 1
    sorted_ttyps = sorted(total_count, key=total_count.get, reverse=True)

    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    # The context manager closes the gzip handle, which was previously leaked.
    with gzip.open(file) as fd:
        treated = pickle.load(fd)

    forbidden = ['RADIATION', 'TOPOII']
    matrix_treated = defaultdict(lambda: defaultdict(int))
    for sample, t in dic_primary_full.items():
        for k, d in treated.items():
            if k not in forbidden:
                if sample in d[t]['YES']:
                    matrix_treated[sample][k] = 1
                elif sample in d[t]['NO']:
                    matrix_treated[sample][k] = 0

    # Treatments ordered by decreasing number of treated samples.
    d_treatments = defaultdict(int)
    for k, d in treated.items():
        if k not in forbidden:
            for ttype, l in d.items():
                d_treatments[k] += len(l['YES'])
    sorted_treatments = sorted(d_treatments, key=d_treatments.get,
                               reverse=True)

    out = pd.DataFrame.from_dict(matrix_treated, orient='index')
    # Count treatments per sample BEFORE adding the string TTYPE column:
    # summing across a string column raises on pandas >= 2.0.
    n_treatments = out.sum(axis=1)
    out['TTYPE'] = [dic_primary_full[t] for t in out.index.tolist()]
    out = out[n_treatments > 0]

    forbidden = ['Double-primary']
    order_sample_plot = []
    order = []
    dic_len = defaultdict(int)
    for ttype in tqdm(sorted_ttyps):
        if ttype not in forbidden:
            subs = out[out['TTYPE'] == ttype]
            mat = subs.drop('TTYPE', axis=1).dropna()[sorted_treatments[:30]]
            if len(mat) > 1:
                n = classic_mutual_exclusivity_visualization(
                    mat, sorted_treatments[:30])
                new_order = n.dendrogram_col.reordered_ind
                sample_list = mat.reset_index().loc[new_order][
                    'index'].tolist()
                order_sample_plot.extend(sample_list)
                order.append(ttype)
                dic_len[ttype] = len(sample_list)

    new_cmap = LinearSegmentedColormap.from_list(
        "", ["lightgrey", "grey", "darkred"])
    concat = out.loc[order_sample_plot].drop('TTYPE', axis=1)
    concat = concat[sorted_treatments[:20]]

    if 'specific' in outf:
        new_cols = [s.lower() for s in concat.columns]
        concat.columns = new_cols

    config_params(2)
    fig, ax = plt.subplots(1, 2, figsize=(1, 3),
                           gridspec_kw={'width_ratios': [1, 27]})
    sns.heatmap(concat, cmap=new_cmap, yticklabels=False, ax=ax[1],
                cbar=False)
    ax[1].xaxis.set_ticks_position('top')

    # Stacked bar of tumour types, built bottom-up in reverse plotting order.
    bot = 0
    for t in order[::-1]:
        ax[0].bar(0, dic_len[t], bottom=bot, color=color_ttype[t])
        bot += dic_len[t]
    ax[0].set_ylim(0, bot)

    for side in ('top', 'bottom', 'left', 'right'):
        ax[0].spines[side].set_visible(False)
    ax[0].get_yaxis().set_visible(False)
    ax[0].get_xaxis().set_visible(False)

    plt.xticks(rotation=90)
    plt.savefig('figures/EDF1_{}.png'.format(outf), dpi=600)
    plt.close()
def create_matrix_treatments(drug, treated_path, exposures_path, tumor_type,
                             keep_sigs=True):
    """Build the regression table of treated vs. untreated samples for a drug.

    :param drug: key into the treated-samples pickle.
    :param treated_path: gzipped pickle indexed as
        ``treated_samples[drug][tumor_type]['YES' | 'NO']``.
    :param exposures_path: tab-separated exposures table with signatures as
        rows and samples as columns.
    :param tumor_type: tumour-type key of the pickle (``'Pan'`` included).
    :param keep_sigs: when true, keep only the signatures with an exposure
        above 10 in more than one fifth of the treated samples.
    :return: ``(toregression, all_sigs, tumor_types, n_treated)``: the table
        (one row per sample; signature columns, ``resp``, ``_intercept`` and
        one 0/1 column per tumour type), the list of all signatures, the
        list of tumour types, and the number of samples listed as treated.
    """
    # load treated samples
    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    with gzip.open(treated_path, 'rb') as fd:
        treated_samples = pickle.load(fd)

    # return equivalence sample tumor type
    dic_primary_full, _ = return_metadata()

    # load mutational signature exposures at the Pan level
    d = pd.read_csv(exposures_path, sep='\t', index_col=0)

    # get all signatures
    all_sigs = d.index.tolist()

    # divide samples treated vs not treated; 'Pan' is an ordinary key, so no
    # special case is required
    by_status = treated_samples[drug][tumor_type]
    treated = list(by_status['YES'])
    notreated = list(by_status['NO'])

    # select samples if they exist in the dataframe (sets give O(1) lookups)
    treated_lookup = set(treated)
    notreated_lookup = set(notreated)
    s_treat = [s for s in d.columns if s in treated_lookup]
    s_nottreat = [s for s in d.columns if s in notreated_lookup]

    # get samples and transpose them
    affected_reg = d[s_treat].T.copy()
    notaffected_reg = d[s_nottreat].T.copy()

    # add variable response
    affected_reg['resp'] = 1
    notaffected_reg['resp'] = 0

    # concat all samples
    toregression = pd.concat([affected_reg, notaffected_reg])

    if keep_sigs:
        min_exposed = len(affected_reg) / 5
        keep_exposed_signatures = [
            signature for signature in affected_reg.columns
            if len(affected_reg[affected_reg[signature] > 10]) > min_exposed
        ]
        # copy so the column assignments below never target a slice
        toregression = toregression[keep_exposed_signatures + ['resp']].copy()

    # add intercept
    toregression['_intercept'] = 1

    # add one indicator column per tumor type
    tumor_types = list(set(dic_primary_full.values()))
    sample_ttypes = [dic_primary_full.get(s, s)
                     for s in toregression.index.tolist()]
    for ttype_l in tumor_types:
        toregression[ttype_l] = [
            1 if sample_ttype == ttype_l else 0
            for sample_ttype in sample_ttypes
        ]

    return toregression, all_sigs, tumor_types, len(treated)
def _write_mutation_tables(df, prefix, include_all=True):
    """Write ``<prefix>.<kind>.gz`` TSV files for *df*, one per mutation class.

    :param df: mutation table with a ``CLASS`` column.
    :param prefix: output path without the ``.<kind>.gz`` ending.
    :param include_all: also write the unfiltered ``<prefix>.all.gz`` first.
    """
    if include_all:
        df.to_csv('{}.all.gz'.format(prefix), sep='\t', header=True,
                  index=False, compression='gzip')
    class_suffixes = (
        ('SNV', 'snvs'),
        ('INDEL', 'indels'),
        ('DBS', 'dbs'),
        ('MNV', 'mnvs'),
        ('COMPLEX_INDELS', 'complex_indels'),
    )
    for mut_class, suffix in class_suffixes:
        df[df['CLASS'] == mut_class].to_csv(
            '{}.{}.gz'.format(prefix, suffix), sep='\t', header=True,
            index=False, compression='gzip')


def merge_individual_samples_hartwig(path_hartwig, outpath):
    """
    Merge the per-sample mutation files and write the extraction inputs.

    Files matching *path_hartwig* are kept when their sample name is a key of
    the primary-site metadata, concatenated, sorted by ``CHROM``/``POS`` and
    annotated with ``PRIMARY`` and ``PRIMARY_EXTENDED``. The following
    gzipped TSV sets are then written to *outpath*:

    * one set per ``PRIMARY`` value (``<ttype>.*.gz``);
    * ``PanNoSkinNoUnknown.*.gz`` (no ``all`` file), excluding the
      ``PRIMARY`` values ``Skin``, ``nan``, ``Unknown`` and ``CUP``;
    * ``PanHypermutated.*.gz`` / ``PanNotHypermutated.*.gz``, split on the
      per-sample SNV count;
    * ``Pan.*.gz`` with every mutation.

    :param path_hartwig: glob pattern of the per-sample files. The sample
        name is the second ``_``-separated token of the basename taken up to
        its first ``.``.
    :param outpath: output directory, created if missing.
    :raises ValueError: if no matching file belongs to a known sample.
    :return: None
    """
    os.makedirs(outpath, exist_ok=True)

    # metadata of biopsy
    dic_primary_full, dic_secondary_fixed = return_metadata()

    all_hartwig = glob(path_hartwig)
    all_files = []
    for f in tqdm(all_hartwig, total=len(all_hartwig)):
        # remove those that do not belong to a good origin
        sample_name = os.path.basename(f).split('.')[0].split('_')[1]
        if sample_name in dic_primary_full:
            all_files.append(pd.read_csv(f, sep='\t'))

    if not all_files:
        # pd.concat would fail with a bare "No objects to concatenate".
        raise ValueError(
            'No sample files to merge for pattern {!r}'.format(path_hartwig))

    hart_df = pd.concat(all_files)

    print('sorting...')
    hart_df.sort_values(by=['CHROM', 'POS'], inplace=True)
    hart_df['PRIMARY'] = hart_df['SAMPLE'].map(dic_primary_full)
    hart_df['PRIMARY_EXTENDED'] = hart_df['SAMPLE'].map(dic_secondary_fixed)

    # 1- one set of files per tumor type
    for ttype, data in tqdm(hart_df.groupby(by='PRIMARY')):
        _write_mutation_tables(data, '{}/{}'.format(outpath, ttype))

    # 2- Pan No Skin and no Unknown for SigProfiler extraction
    data = hart_df[
        (hart_df['PRIMARY'] != 'Skin')
        & (hart_df['PRIMARY'] != 'nan')
        & (hart_df['PRIMARY'] != 'Unknown')
        & (hart_df['PRIMARY'] != 'CUP')
    ]
    _write_mutation_tables(data, '{}/PanNoSkinNoUnknown'.format(outpath),
                           include_all=False)

    # define hypermutators based on SNV count. A hypermutated sample has more
    # SNVs than the median of the whole distribution plus 2.5 times its IQR
    sample_counts = hart_df[hart_df['CLASS'] == 'SNV'][
        'SAMPLE'].value_counts().to_dict()
    counts = list(sample_counts.values())
    cutoff = np.median(counts) + 2.5 * iqr(counts)

    hypersamples = [s for s, v in sample_counts.items() if v > cutoff]
    nothypersamples = [s for s, v in sample_counts.items() if v <= cutoff]

    # Split into hypermutants and no hypermutants for Signature Analyzer
    # extraction
    ttype = 'Pan'
    for samples, label in ((hypersamples, 'Hypermutated'),
                           (nothypersamples, 'NotHypermutated')):
        subsdata = hart_df[hart_df['SAMPLE'].isin(samples)]
        _write_mutation_tables(subsdata,
                               '{}/{}{}'.format(outpath, ttype, label))

    # 3- save the complete Pan files
    _write_mutation_tables(hart_df, '{}/Pan'.format(outpath))
def plot_piecharts_signatures(exposures_path, type_mut, type_extraction,
                              figsize, min_val):
    """Plot, per tumour type, the activity of every extracted signature.

    Each signature/tumour-type pair is drawn as a dot: its colour bins the
    median exposure (divided by 3234) among samples with exposure above
    *min_val*, and its size is 20 times the proportion of such samples. The
    figure is saved as ``figures/<type_extraction>/supp1_<type_mut>`` in
    ``.svg`` and ``.png``.

    :param exposures_path: exposures table, loaded with ``read_exposures``.
    :param type_mut: ``'SBS'``, ``'ID'`` or ``'DBS'``; selects the colour
        scale and marks signatures whose name contains it as matching a
        reference signature.
    :param type_extraction: sub-directory of ``figures/`` for the output.
    :param figsize: figure size passed to ``plt.subplots``.
    :param min_val: exposure above which a sample counts as exposed.
    """
    colors_full = {
        "SBS": {
            25: '#003d7c',
            10: '#224ba5',
            5: '#4459ce',
            2.5: '#6767f7',
            1: '#9968c8',
            0.5: '#cc6999',
            0.25: '#ff6b6b',
            0.1: '#ff8962',
            0.05: '#ffa759',
            0: '#ffc651'
        },
        "ID": {
            1: '#003d7c',
            0.5: '#224ba5',
            0.1: '#4459ce',
            0.05: '#6767f7',
            0.04: '#9968c8',
            0.03: '#cc6999',
            0.02: '#ff6b6b',
            0.01: '#ff8962',
            0.001: '#ffa759',
            0: '#ffc651'
        },
        "DBS": {
            0.5: '#003d7c',
            0.1: '#224ba5',
            0.05: '#4459ce',
            0.03: '#6767f7',
            0.02: '#9968c8',
            0.01: '#cc6999',
            0.008: '#ff6b6b',
            0.005: '#ff8962',
            0.001: '#ffa759',
            0: '#ffc651'
        }
    }
    colors = colors_full[type_mut]
    # Thresholds from highest to lowest; sorted once instead of per dot.
    thresholds = sorted(colors, reverse=True)

    result = read_exposures(exposures_path)
    config_params()
    dic_primary_full, _ = return_metadata()

    result = result.fillna(0)
    signatures = result.columns.tolist()

    # get list of similar found signatures in the extraction
    similar = [s for s in signatures if type_mut in s]
    notsimilar = [s for s in signatures if type_mut not in s]

    result['TTYPE'] = [dic_primary_full[t] for t in result.index.tolist()]

    dic_sig = defaultdict(lambda: defaultdict(float))
    dic_proportion = defaultdict(lambda: defaultdict(float))
    for ttype, data in result.groupby(by='TTYPE'):
        data2 = data.drop('TTYPE', axis=1)
        for col in data2:
            exposed = data2[data2[col] > min_val]
            # we normalize it by the number of MB in the human genome (3234)
            dic_sig[ttype][col] = exposed[col].median() / 3234
            # Signatures without a reference match are keyed by their number.
            key = col if type_mut in col else col.split('_')[0]
            dic_proportion[ttype][key] = len(exposed) / len(data2)

    medians = pd.DataFrame.from_dict(dic_sig)

    # sorting the already known signatures by reference signature number
    keep_order_similar = defaultdict(list)
    for s in similar:
        number = s.split('_')[1].split(type_mut)[1]
        try:
            keep_n = int(number)
        except ValueError:
            # Reference names with a one-letter suffix, e.g. '17b'.
            keep_n = int(number[:-1])
        keep_order_similar[keep_n].append(str(s))

    order_prev_labels = []
    order_prev = []
    for i in sorted(keep_order_similar, reverse=True):
        group = keep_order_similar[i]

        # Sort the group by "<reference>_<similarity>_<number>".
        d_equiv = {}
        for sig in group:
            parts = sig.split('_')
            d_equiv['{}_{}_{}'.format(parts[1], parts[2], parts[0])] = sig
        sorted_sigs_list = [d_equiv[ss] for ss in sorted(d_equiv)]

        for sim, sig in reversed(list(enumerate(sorted_sigs_list, start=1))):
            parts = sig.split('_')
            similarity = round(float(parts[2]), 3)
            if len(group) == 1:
                label = 'E-{}{} ({}-like, {})'.format(
                    type_mut, parts[0], parts[1], similarity)
            else:
                label = 'E-{}{} ({}-like {}, {})'.format(
                    type_mut, parts[0], parts[1], sim, similarity)
            order_prev_labels.append(label)
            order_prev.append(sig)

    no_similar_signatures = medians.loc[notsimilar]
    no_similar_signatures.index = [
        int(name.split('_')[0])
        for name in no_similar_signatures.index.tolist()
    ]
    no_similar_signatures.sort_index(inplace=True, ascending=True)
    names_notsimilar = [
        'E-{} {}'.format(type_mut, c)
        for c in no_similar_signatures.index.tolist()[::-1]
    ]

    # merge new and old
    merged = pd.concat([
        no_similar_signatures.sort_index(ascending=False),
        medians.loc[order_prev],
    ])
    merged_labels = names_notsimilar + order_prev_labels

    config_params(5)
    fig, ax = plt.subplots(1, 1, figsize=figsize)

    columns = merged.columns.tolist()
    for yval, (i, row) in enumerate(merged.iterrows()):
        for xval, t in enumerate(columns):
            val = row[t]
            if val > 0:
                # 0 is always a threshold, so a positive value always matches.
                color_scatter = next(
                    colors[number] for number in thresholds if val > number)
                # Proportions are keyed by strings; new signatures carry an
                # integer index here.
                plt.scatter(xval, yval, c=color_scatter,
                            s=dic_proportion[t][str(i)] * 20)

    ax.set_xticks(np.arange(len(merged.T)))
    ax.set_xticklabels(columns, rotation=90)
    ax.set_yticks(np.arange(len(merged)))
    ax.set_yticklabels(merged_labels)
    ax.xaxis.set_ticks_position('top')
    ax.set_axisbelow(True)

    ax.yaxis.grid(color='gray', linestyle='dashed', alpha=0.3)
    ax.xaxis.grid(color='gray', linestyle='dashed', alpha=0.3)

    for side in ('top', 'bottom', 'left', 'right'):
        ax.spines[side].set_visible(False)

    ax.xaxis.set_ticks_position('none')
    ax.yaxis.set_ticks_position('none')

    plt.ylim(-1, len(merged))
    plt.tight_layout()

    plt.savefig('figures/{}/supp1_{}.svg'.format(type_extraction, type_mut))
    plt.savefig('figures/{}/supp1_{}.png'.format(type_extraction, type_mut),
                dpi=600)
    plt.close()
def do_plot(samples_lowbraca, samples_braca, not_breaked, only_treated, exp):
    """Plot signature ``12_ID6_0.962941_1`` across four sample groups.

    The groups are the combinations of (*not_breaked*, *only_treated*) with
    (*samples_lowbraca*, *samples_braca*). A box plot with jittered points
    coloured by tumour type is saved to ``figures/radiation.svg`` and shown;
    Mann-Whitney p-values for three group comparisons are printed and saved
    as text to ``figures/radiation_pvals.svg``.

    The function ends by calling ``sys.exit()``, so it never returns.

    :param samples_lowbraca: samples of the "no BRCAness" group.
    :param samples_braca: samples of the "BRCAness" group.
    :param not_breaked: samples labelled "Not radiated" in the plot.
    :param only_treated: samples labelled "Radiated" in the plot.
    :param exp: exposures DataFrame with samples as rows and signatures as
        columns.
    """
    dic_primary_full, _ = return_metadata()
    color_ttype = return_colors()

    fig, ax = plt.subplots(1, 1, figsize=(2, 1.5))
    config_params(6.5)

    sigs = ['12_ID6_0.962941_1']
    # Computed once; it used to be recomputed for every plotted sample.
    total_exposure = exp[sigs].sum(axis=1)

    # Same order as the x positions 0..3 of the plot.
    groups = [
        [i for i in not_breaked if i in samples_lowbraca],
        [i for i in only_treated if i in samples_lowbraca],
        [i for i in not_breaked if i in samples_braca],
        [i for i in only_treated if i in samples_braca],
    ]
    (exposures_not_breaked_1, exposures_breaked_1, exposures_not_breaked_2,
     exposures_breaked_2) = [
        total_exposure.loc[group].dropna() for group in groups
    ]

    sns.boxplot(data=[
        exposures_not_breaked_1, exposures_breaked_1, exposures_not_breaked_2,
        exposures_breaked_2
    ], linewidth=0.6, showfliers=False, color='#cbcacbff')

    plt.ylabel('Indels DSB repair by\nnon-homologous end-joining')
    plt.xticks([0, 1, 2, 3], [
        'Not radiated no BRCAness ({})'.format(len(exposures_not_breaked_1)),
        'Radiated no BRCAness ({})'.format(len(exposures_breaked_1)),
        'Not radiated BRCAness ({})'.format(len(exposures_not_breaked_2)),
        'Radiated BRCAness ({})'.format(len(exposures_breaked_2))
    ], rotation=90)

    # Overlay the individual samples with a small horizontal jitter.
    for xpos, group in enumerate(groups):
        plotdot = [total_exposure.loc[sample] for sample in group]
        colors = [color_ttype[dic_primary_full[sample]] for sample in group]
        ax.scatter(
            [xpos + np.random.uniform(-0.2, 0.2, 1)[0] for _ in plotdot],
            plotdot, color=colors, s=1, alpha=0.2)

    plt.ylim(0, 700)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    plt.savefig('figures/radiation.svg')
    plt.show()

    ##################
    fig, ax = plt.subplots(1, 1)

    _, pval1 = mannwhitneyu(exposures_not_breaked_1, exposures_breaked_1)
    print("Not radiated no BRCAnes vs Radiated no BRCAness", pval1)

    _, pval2 = mannwhitneyu(exposures_not_breaked_2, exposures_breaked_2)
    print("Not radiated BRCAnes vs Radiated BRCAness", pval2)

    _, pval3 = mannwhitneyu(exposures_breaked_1, exposures_breaked_2)
    print("radiated no BRCAnes vs Radiated BRCAness", pval3)

    # Raw string: "\i" is an invalid escape sequence in a normal literal.
    p_label = r"$\it{P}$"
    ax.text(1, 1, p_label + " = {}".format(sci_notation(pval1)), fontsize=7)
    ax.text(1, 4, p_label + " = {}".format(sci_notation(pval2)), fontsize=7)
    ax.text(1, 8, p_label + " = {}".format(sci_notation(pval3)), fontsize=7)

    plt.xlim(0, 5)
    plt.ylim(0, 10)

    plt.savefig('figures/radiation_pvals.svg')
    sys.exit()