Example #1
def run():

    dic_primary_full, _ = return_metadata()
    signature_2_treatment_signature_analyzer = {
        'CISPLATIN': ['21_SBS31_0.953955_1', '14_1'],
        'CARBOPLATIN': ['21_SBS31_0.953955_1', '25_1'],
        '5-FU_CAPE': ['31_SBS17b_0.968799_1'],
        'OXALIPLATIN': ['14_1', '37_1']
    }
    signature_2_treatment_sigprofiler = {
        'CISPLATIN': ['1_SBS31_0.968153_0.98'],
        'CARBOPLATIN': ['1_SBS31_0.968153_0.98'],
        'OXALIPLATIN': ['20_0.92'],
        '5-FU_CAPE': ['19_SBS17b_0.961548_0.99'],
    }

    outpath = 'figures/'
    path_base = 'data/hartwig/signatures/extraction/results/{}/snvs/exposures/'
    path_siganalyzer = path_base.format(
        "SignatureAnalyzer") + "Pan_full/Pan_full.exposures.tsv"
    path_sigprofiler = path_base.format(
        "SigProfiler"
    ) + "PanNoSkinNoUnknown.snvs/PanNoSkinNoUnknown.snvs.exposures.tsv"

    for drug in signature_2_treatment_signature_analyzer:
        process_exposures_methods(path_siganalyzer, path_sigprofiler, drug,
                                  signature_2_treatment_signature_analyzer,
                                  signature_2_treatment_sigprofiler, outpath)
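
A minimal sketch of the return_metadata helper these examples rely on. This is an assumption rather than the repository's implementation: the metadata path and column names below are hypothetical, and only the return shape (two sample-keyed dicts) is inferred from the callers.

import pandas as pd

def return_metadata():
    # hypothetical path and column names; callers only need two dicts keyed
    # by sample ID: sample -> primary tumor type, sample -> extended label
    meta = pd.read_csv('data/hartwig/metadata.tsv', sep='\t')
    dic_primary_full = dict(zip(meta['sample'], meta['primary']))
    dic_secondary_fixed = dict(zip(meta['sample'], meta['primary_extended']))
    return dic_primary_full, dic_secondary_fixed
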
Example #2
    def __init__(self,
                 exposures_path,
                 discriminate_path,
                 matrix_treatments_path,
                 ttype='Pan'):

        self.exposures = pd.read_csv(exposures_path, sep='\t', index_col=0)
        self.exposures = self.exposures.transpose()
        self.discriminate = pd.read_csv(discriminate_path, sep='\t')
        self.discriminate = self.discriminate[self.discriminate['ttype'] ==
                                              ttype]
        self.discriminate = self.discriminate.astype({
            'signature': str,
            'effect_size': float,
            'pvals': float,
            'treatment': str,
            'ttype': str,
            'total_treated': float
        })
        self.treatments = pd.read_csv(matrix_treatments_path,
                                      sep='\t',
                                      index_col=0)

        self.ttype = ttype

        dic_ttypes, _ = return_metadata()

        if ttype == 'Pan':
            self.samples = dic_ttypes.keys()
        else:
            self.samples = [k for k, v in dic_ttypes.items() if v == ttype]

        self.effect_thresh = EFFECT_SIZE  # min effect size
        self.pvalue_thresh = PVALUE  # empirical_pvalue threshold
        self.overlap_thresh = OVERLAP  # min overlap to consider cotreatments
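
A hedged usage sketch for this constructor. The snippet omits the enclosing class statement, so the class name below is hypothetical, the file paths are placeholders, and EFFECT_SIZE, PVALUE and OVERLAP are module-level constants assumed to be defined alongside the class.

analysis = TreatmentDiscriminator(            # hypothetical class name
    exposures_path='exposures.tsv',           # placeholder path
    discriminate_path='discriminate.tsv',     # placeholder path
    matrix_treatments_path='treatments.tsv',  # placeholder path
    ttype='Colon-Rectum')
print(len(list(analysis.samples)), 'samples selected for', analysis.ttype)
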
Example #3
def process_exposures_methods(path_siganalyzer, path_sigprofiler, drug,
                              signature_2_treatment_signature_analyzer,
                              signature_2_treatment_sigprofiler, outpath):

    df_sigpro = read_exposures(path_sigprofiler)
    df_sigan = read_exposures(path_siganalyzer)
    all_treated_samples = set()

    treated_samples = pickle.load(gzip.open(Conf['treatment_specific_drug']))
    dic_primary_full, _ = return_metadata()

    dsiganalyzer = defaultdict(lambda: defaultdict(float))
    dsiganalyzer_simple = defaultdict(float)
    dsiganalyzer_simple_exposure = defaultdict(float)

    sigs_affected_signature_analyzer = signature_2_treatment_signature_analyzer[
        drug]
    for sample, d in df_sigan.iterrows():
        if sample in treated_samples[drug]['Pan']['YES']:
            dsiganalyzer[dic_primary_full[sample]][sample] = d.loc[
                sigs_affected_signature_analyzer].sum()
            dsiganalyzer_simple[sample] = d.loc[
                sigs_affected_signature_analyzer].sum()
            dsiganalyzer_simple_exposure[sample] = d.loc[
                sigs_affected_signature_analyzer].sum() / d.sum()

    dsigprof = defaultdict(lambda: defaultdict(float))
    dsigprof_simple = defaultdict(float)
    dsigprof_simple_exposure = defaultdict(float)
    sigs_affected_sigprofiler = signature_2_treatment_sigprofiler[drug]

    for sample, d in df_sigpro.iterrows():
        if sample in treated_samples[drug]['Pan']['YES']:
            all_treated_samples.add(sample)

            dsigprof[dic_primary_full[sample]][sample] = d.loc[
                sigs_affected_sigprofiler].sum()
            dsigprof_simple[sample] = d.loc[sigs_affected_sigprofiler].sum()
            dsigprof_simple_exposure[
                sample] = d.loc[sigs_affected_sigprofiler].sum() / d.sum()

    barplot_classes(dsigprof_simple_exposure, dsiganalyzer_simple_exposure,
                    drug, outpath, 'count')
    plot_single_distribution(dsigprof_simple, dsiganalyzer_simple, drug,
                             outpath, 'count')
    plot_single_correlation(dsigprof_simple, dsiganalyzer_simple, drug,
                            outpath, 'count')
    plot_single_distribution(dsigprof_simple_exposure,
                             dsiganalyzer_simple_exposure, drug, outpath,
                             'exposure')
    plot_single_correlation(dsigprof_simple_exposure,
                            dsiganalyzer_simple_exposure, drug, outpath,
                            'exposure')

    bad_samples_drug = get_samples_similar_exposure(
        dsigprof_simple_exposure, dsiganalyzer_simple_exposure, drug, outpath,
        'exposure')
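
A plausible sketch of the read_exposures helper used above, inferred from how it is consumed: iterrows() must yield one sample per row with signatures as columns, and the constructor in Example #2 reads the same files and transposes them. The body below is an assumption.

import pandas as pd

def read_exposures(path):
    # on disk the exposures table has signatures as rows and samples as
    # columns; transpose so iterrows() yields (sample, per-signature series)
    df = pd.read_csv(path, sep='\t', index_col=0)
    return df.T
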
Example #4
def merge_samples_timing(input_file, outpath):

    os.makedirs(outpath, exist_ok=True)
    dic_primary_full, _ = return_metadata()
    all_hartwig = glob(input_file)
    to_remove = [
        'EXTENDED', 'MAJOR_CN', 'MINOR_CN', 'TOTAL_CN', 'NORMAL_CN',
        'VAR_COUNTS', 'REF_COUNTS', 'GENDER', 'PURITY'
    ]

    all_files = []
    for f in tqdm(all_hartwig, total=len(all_hartwig)):

        # keep only samples with a known primary origin
        sample_name = os.path.basename(f).split('_')[1].split('.')[0]
        if sample_name in dic_primary_full:
            df = pd.read_csv(f, sep='\t', low_memory=False)
            df.drop(to_remove, axis=1, inplace=True)
            all_files.append(df)

    hart_df = pd.concat(all_files)

    hart_df['PRIMARY'] = hart_df['SAMPLE'].map(dic_primary_full)
    hart_df.sort_values(by=['CHROM', 'POS'], inplace=True)

    outfile = '{}/Pan.snvs.gz'.format(outpath)
    hart_df[hart_df['CLASS'] == 'SNV'].to_csv(outfile,
                                              sep='\t',
                                              header=True,
                                              index=False,
                                              compression='gzip')

    outfile = '{}/Pan.indels.gz'.format(outpath)

    hart_df[hart_df['CLASS'] == 'INDEL'].to_csv(outfile,
                                                sep='\t',
                                                header=True,
                                                index=False,
                                                compression='gzip')

    outfile = '{}/Pan.dbs.gz'.format(outpath)
    hart_df[hart_df['CLASS'] == 'DBS'].to_csv(outfile,
                                              sep='\t',
                                              header=True,
                                              index=False,
                                              compression='gzip')

    # save ALL file
    outfile = '{}/Pan.all.gz'.format(outpath)
    hart_df.to_csv(outfile,
                   sep='\t',
                   header=True,
                   index=False,
                   compression='gzip')
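
A hedged invocation sketch. The glob pattern is hypothetical; it only has to match per-sample timing files whose basename parses as <prefix>_<SAMPLE>.<suffix>, since the sample ID is taken from the token after the first underscore.

merge_samples_timing('data/hartwig/timing/timing_*.tsv.gz',  # hypothetical glob
                     'data/hartwig/timing/merged')           # output directory
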
Example #5
def create_matrix_treatments_plot():

    treated = pickle.load(gzip.open(Conf['treatment_FDA_drug']))
    dic_primary, _ = return_metadata()
    matrix_treated = defaultdict(lambda: defaultdict(int))

    forbidden = ['CUP', 'Eye', 'Double-primary', 'Unknown']

    for sample, t in dic_primary.items():
        if t not in forbidden:
            for k, d in treated.items():
                if sample in d[t]['YES']:
                    matrix_treated[sample][k] = 1
                elif sample in d[t]['NO']:
                    matrix_treated[sample][k] = 0

    out = pd.DataFrame.from_dict(matrix_treated, orient='index')
    out = out.dropna()
    dic_t = defaultdict(list)
    for i, row in out.iterrows():
        dic_t[dic_primary[i]].append(i)

    return out, dic_t
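
A hedged usage sketch: the returned matrix has one row per sample and one binary column per FDA drug, so per-drug treated counts follow directly.

out, dic_t = create_matrix_treatments_plot()
print(out.sum().sort_values(ascending=False).head())  # treated samples per drug
print({t: len(s) for t, s in dic_t.items()})          # retained samples per tumor type
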
Example #6
def get_items_to_plot(samples_set, expos, sig):

    df = pd.read_csv(expos, sep='\t', index_col=0)
    df = df.T

    dic_primary_full, _ = return_metadata()
    colors_dict = return_colors()

    vals_to_plot = defaultdict(list)
    colors = defaultdict(list)
    count_exposed = defaultdict(int)
    count_total = defaultdict(int)

    for sample in samples_set:
        ttype = dic_primary_full[sample]
        if sample in df.index:
            sample_exp = df.loc[sample][sig]
            if sample_exp > 1:
                count_exposed[ttype] += 1
                vals_to_plot[ttype].append(sample_exp)
                colors[ttype].append(colors_dict[ttype])
            count_total[ttype] += 1

    return vals_to_plot, colors, count_exposed, count_total
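
A hedged usage sketch; the exposures path is a placeholder and the signature label is taken from Example #1. The exposure threshold of 1 mutation is hard-coded in the function above.

dic_primary_full, _ = return_metadata()
vals, cols, exposed, total = get_items_to_plot(
    set(dic_primary_full),       # all samples with a known tumor type
    'Pan_full.exposures.tsv',    # placeholder exposures path
    '21_SBS31_0.953955_1')       # cisplatin-linked signature from Example #1
fraction_exposed = {t: exposed[t] / total[t] for t in total if total[t]}
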
Example #7
def plot_heatmap_treatment(file):

    outf = os.path.basename(file).split('.')[0]
    dic_primary_full, _ = return_metadata()
    color_ttype = return_colors()
    total_s = defaultdict(list)
    total_count = defaultdict(int)

    for sample, t in dic_primary_full.items():
        total_s[t].append(sample)
        total_count[t] += 1
    sorted_ttypes = sorted(total_count, key=total_count.get, reverse=True)

    treated = pickle.load(gzip.open(file))
    forbidden = ['RADIATION', 'TOPOII']
    matrix_treated = defaultdict(lambda: defaultdict(int))

    for sample, t in dic_primary_full.items():
        for k, d in treated.items():
            if k not in forbidden:
                if sample in d[t]['YES']:
                    matrix_treated[sample][k] = 1
                elif sample in d[t]['NO']:
                    matrix_treated[sample][k] = 0

    d_treatments = defaultdict(int)
    for k, d in treated.items():
        if k not in forbidden:
            for ttype, l in d.items():
                d_treatments[k] += len(l['YES'])

    sorted_treatments = sorted(d_treatments,
                               key=d_treatments.get,
                               reverse=True)
    out = pd.DataFrame.from_dict(matrix_treated, orient='index')
    out['TTYPE'] = [dic_primary_full[t] for t in out.index.tolist()]
    out['sum'] = out.sum(axis=1)
    out = out[out['sum'] > 0].drop('sum', axis=1)

    forbidden = ['Double-primary']
    order_sample_plot = []
    order = []
    dic_len = defaultdict(int)
    for ttype in tqdm(sorted_ttypes):
        if ttype not in forbidden:
            subs = out[out['TTYPE'] == ttype]
            mat = subs.drop('TTYPE', axis=1).dropna()[sorted_treatments[:30]]
            if len(mat) > 1:
                n = classic_mutual_exclusivity_visualization(
                    mat, sorted_treatments[:30])
                new_order = n.dendrogram_col.reordered_ind
                sample_list = mat.reset_index().loc[new_order]['index'].tolist()
                order_sample_plot.extend(sample_list)
                order.append(ttype)
                dic_len[ttype] = len(sample_list)

    new_cmap = LinearSegmentedColormap.from_list(
        "", ["lightgrey", "grey", "darkred"])
    concat = out.loc[order_sample_plot].drop('TTYPE', axis=1)
    concat = concat[sorted_treatments[:20]]

    if 'specific' in outf:
        new_cols = [s.lower() for s in concat.columns]
        concat.columns = new_cols

    config_params(2)
    fig, ax = plt.subplots(1,
                           2,
                           figsize=(1, 3),
                           gridspec_kw={'width_ratios': [1, 27]})
    ax2 = sns.heatmap(concat,
                      cmap=new_cmap,
                      yticklabels=False,
                      ax=ax[1],
                      cbar=False)
    ax[1].xaxis.set_ticks_position('top')
    bot = 0
    for t in order[::-1]:
        ax[0].bar(0, dic_len[t], bottom=bot, color=color_ttype[t])
        bot += dic_len[t]
    ax[0].set_ylim(0, bot)
    ax[0].spines['top'].set_visible(False)
    ax[0].spines['bottom'].set_visible(False)
    ax[0].spines['left'].set_visible(False)
    ax[0].spines['right'].set_visible(False)
    ax[0].get_yaxis().set_visible(False)
    ax[0].get_xaxis().set_visible(False)

    plt.xticks(rotation=90)
    plt.savefig('figures/EDF1_{}.png'.format(outf), dpi=600)
    plt.close()
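
A hedged sketch of classic_mutual_exclusivity_visualization, which is not shown in this section. The caller reads n.dendrogram_col.reordered_ind, an attribute of seaborn's ClusterGrid, so one plausible implementation clusters the matrix with samples as columns; treat this as an assumption, not the repository's code.

import seaborn as sns

def classic_mutual_exclusivity_visualization(mat, treatments):
    # put samples on the columns so that dendrogram_col.reordered_ind gives
    # the caller a clustered sample ordering
    return sns.clustermap(mat[treatments].T, cmap='Greys')
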
Example #8
def create_matrix_treatments(drug, treated_path, exposures_path, tumor_type, keep_sigs=True):

    # load treated samples
    with gzip.open(treated_path, 'rb') as fd:
        treated_samples = pickle.load(fd)

    # mapping from sample to tumor type
    dic_primary_full, _ = return_metadata()

    # load mutational signature exposures at the Pan level
    d = pd.read_csv(exposures_path, sep='\t', index_col=0)

    # get all signatures
    all_sigs = d.index.tolist()

    # divide samples into treated vs not treated for the requested tumor type
    # ('Pan' is just another key of the treated_samples dict)
    treated = list(treated_samples[drug][tumor_type]['YES'])
    notreated = list(treated_samples[drug][tumor_type]['NO'])

    # select samples treated if they exist in the dataframe.
    s_treat = [s for s in d.columns if s in treated]
    s_nottreat = [s for s in d.columns if s in notreated]

    # get samples and transpose them
    affected = d[list(s_treat)].T
    notaffected = d[list(s_nottreat)].T

    affected_reg = affected.copy()
    notaffected_reg = notaffected.copy()

    # add variable response
    affected_reg['resp'] = 1
    notaffected_reg['resp'] = 0

    # concat all samples
    toregression = pd.concat([affected_reg, notaffected_reg])

    # if the keep_sigs flag is True, select for the regression only the signatures
    # with an exposure above 10 mutations in at least one fifth of the treated samples
    if keep_sigs:

        keep_exposed_signatures = []
        for signature in affected_reg.columns:
            exposed_to_sig = len(affected_reg[affected_reg[signature] > 10])
            if exposed_to_sig > (len(affected_reg) / 5):
                keep_exposed_signatures.append(signature)

        keep_exposed_signatures = keep_exposed_signatures + ['resp']
        toregression = toregression[keep_exposed_signatures]

    # add intercept
    toregression['_intercept'] = 1

    # add tumor type in the dataframe
    for ttype_l in set(dic_primary_full.values()):
        # get whether the sample belongs to a specific tumor type
        val = [1 if dic_primary_full.get(s, s) == ttype_l else 0 for s in toregression.index.tolist()]
        toregression[ttype_l] = val

    return toregression, all_sigs, list(set(list(dic_primary_full.values()))), len(treated)
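
A hedged downstream sketch, not shown in the source: the returned matrix carries a binary 'resp' label, a manual '_intercept', signature exposures and one-hot tumor-type columns, which is the layout a statsmodels Logit consumes directly. Paths below are placeholders.

import statsmodels.api as sm

treated_path = 'treated_samples.pckl.gz'   # placeholder
exposures_path = 'Pan_full.exposures.tsv'  # placeholder

toregression, all_sigs, ttypes, n_treated = create_matrix_treatments(
    'CISPLATIN', treated_path, exposures_path, 'Pan')
fit = sm.Logit(toregression['resp'],
               toregression.drop('resp', axis=1)).fit(disp=0)
print(fit.params.sort_values(ascending=False).head())
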
Example #9
def merge_individual_samples_hartwig(path_hartwig, outpath):
    """
    This should merge all tumor types, remove those which belong to a primary,
    and then prepare for the extraction.
    :param path_hartwig:
    :return:
    """

    os.makedirs(outpath, exist_ok=True)

    # metadata of biopsy
    dic_primary_full, dic_secondary_fixed = return_metadata()

    all_hartwig = glob(path_hartwig)
    dic_muts = defaultdict(int)

    all_files = []
    for f in tqdm(all_hartwig, total=len(all_hartwig)):

        # keep only samples with a known primary origin
        sample_name = os.path.basename(f).split('.')[0].split('_')[1]
        if sample_name in dic_primary_full:
            df = pd.read_csv(f, sep='\t')
            all_files.append(df)
            dic_muts[sample_name] = len(df)

    hart_df = pd.concat(all_files)

    print('sorting...')
    hart_df.sort_values(by=['CHROM', 'POS'], inplace=True)

    hart_df['PRIMARY'] = hart_df['SAMPLE'].map(dic_primary_full)
    hart_df['PRIMARY_EXTENDED'] = hart_df['SAMPLE'].map(dic_secondary_fixed)

    # we will split indels and SNVs, without considering hypermutants
    for ttype, data in tqdm(hart_df.groupby(by='PRIMARY')):

        outfile = '{}/{}.all.gz'.format(outpath, ttype)
        data.to_csv(outfile, sep='\t', header=True, index=False, compression='gzip')

        outfile = '{}/{}.snvs.gz'.format(outpath, ttype)
        data[data['CLASS'] == 'SNV'].to_csv(
            outfile, sep='\t', header=True, index=False, compression='gzip'
        )

        outfile = '{}/{}.indels.gz'.format(outpath, ttype)
        data[data['CLASS'] == 'INDEL'].to_csv(
            outfile, sep='\t', header=True, index=False, compression='gzip'
        )

        outfile = '{}/{}.dbs.gz'.format(outpath, ttype)
        data[data['CLASS'] == 'DBS'].to_csv(
            outfile, sep='\t', header=True, index=False, compression='gzip'
        )

        outfile = '{}/{}.mnvs.gz'.format(outpath, ttype)
        data[data['CLASS'] == 'MNV'].to_csv(
            outfile, sep='\t', header=True, index=False, compression='gzip'
        )

        outfile = '{}/{}.complex_indels.gz'.format(outpath, ttype)
        data[data['CLASS'] == 'COMPLEX_INDELS'].to_csv(
            outfile, sep='\t', header=True, index=False, compression='gzip'
        )

    # 2- save ALL file

    # Pan No Skin and no Unknown for SigProfiler extraction
    data = hart_df[
        (hart_df['PRIMARY'] != 'Skin') &
        (hart_df['PRIMARY'] != 'nan') &
        (hart_df['PRIMARY'] != 'Unknown') &
        (hart_df['PRIMARY'] != 'CUP')
        ]

    outfile = '{}/PanNoSkinNoUnknown.snvs.gz'.format(outpath)
    data[data['CLASS'] == 'SNV'].to_csv(
        outfile, sep='\t', header=True, index=False, compression='gzip'
    )

    outfile = '{}/PanNoSkinNoUnknown.indels.gz'.format(outpath)
    data[data['CLASS'] == 'INDEL'].to_csv(
        outfile, sep='\t', header=True, index=False, compression='gzip'
    )

    outfile = '{}/PanNoSkinNoUnknown.dbs.gz'.format(outpath)
    data[data['CLASS'] == 'DBS'].to_csv(
        outfile, sep='\t', header=True, index=False, compression='gzip'
    )

    outfile = '{}/PanNoSkinNoUnknown.mnvs.gz'.format(outpath)
    data[data['CLASS'] == 'MNV'].to_csv(
        outfile, sep='\t', header=True, index=False, compression='gzip'
    )

    outfile = '{}/PanNoSkinNoUnknown.complex_indels.gz'.format(outpath)
    data[data['CLASS'] == 'COMPLEX_INDELS'].to_csv(
        outfile, sep='\t', header=True, index=False, compression='gzip'
    )

    # define hypermutators based on SNV count: a hypermutated sample is one whose
    # total number of SNVs exceeds the median of the distribution by more than 2.5 IQR
    sample_counts = hart_df[hart_df['CLASS'] == 'SNV']['SAMPLE'].value_counts().to_dict()
    IQR = iqr(list(sample_counts.values()))
    median = np.median(list(sample_counts.values()))

    cutoff = median + 2.5 * IQR

    hypersamples = [s for s, v in sample_counts.items() if v > cutoff]
    nothypersamples = [s for s, v in sample_counts.items() if v <= cutoff]

    hypers = [hypersamples, nothypersamples]
    labels = ['Hypermutated', 'NotHypermutated']

    ttype = 'Pan'

    # Split into hypermutants and no hypermutants for Signature Analyzer extraction
    for h, l in zip(hypers, labels):

        subsdata = hart_df[hart_df['SAMPLE'].isin(h)]

        outfile = '{}/{}{}.all.gz'.format(outpath, ttype, l)
        subsdata.to_csv(
            outfile, sep='\t', header=True, index=False, compression='gzip'
        )

        outfile = '{}/{}{}.snvs.gz'.format(outpath, ttype, l)
        subsdata[subsdata['CLASS'] == 'SNV'].to_csv(
            outfile, sep='\t', header=True, index=False, compression='gzip'
        )

        outfile = '{}/{}{}.indels.gz'.format(outpath, ttype, l)
        subsdata[subsdata['CLASS'] == 'INDEL'].to_csv(
            outfile, sep='\t', header=True, index=False, compression='gzip'
        )

        outfile = '{}/{}{}.dbs.gz'.format(outpath, ttype, l)
        subsdata[subsdata['CLASS'] == 'DBS'].to_csv(
            outfile, sep='\t', header=True, index=False, compression='gzip'
        )

        outfile = '{}/{}{}.mnvs.gz'.format(outpath, ttype, l)
        subsdata[subsdata['CLASS'] == 'MNV'].to_csv(
            outfile, sep='\t', header=True, index=False, compression='gzip'
        )

        outfile = '{}/{}{}.complex_indels.gz'.format(outpath, ttype, l)
        subsdata[subsdata['CLASS'] == 'COMPLEX_INDELS'].to_csv(
            outfile, sep='\t', header=True, index=False, compression='gzip'
        )

    outfile = '{}/Pan.all.gz'.format(outpath)
    hart_df.to_csv(
        outfile, sep='\t', header=True, index=False, compression='gzip'
    )

    outfile = '{}/Pan.snvs.gz'.format(outpath)
    hart_df[hart_df['CLASS'] == 'SNV'].to_csv(
        outfile, sep='\t', header=True, index=False, compression='gzip'
    )

    outfile = '{}/Pan.indels.gz'.format(outpath)

    hart_df[hart_df['CLASS'] == 'INDEL'].to_csv(
        outfile, sep='\t', header=True, index=False, compression='gzip'
    )

    outfile = '{}/Pan.dbs.gz'.format(outpath)
    hart_df[hart_df['CLASS'] == 'DBS'].to_csv(
        outfile, sep='\t', header=True, index=False, compression='gzip'
    )

    outfile = '{}/Pan.mnvs.gz'.format(outpath)
    hart_df[hart_df['CLASS'] == 'MNV'].to_csv(
        outfile, sep='\t', header=True, index=False, compression='gzip'
    )

    outfile = '{}/Pan.complex_indels.gz'.format(outpath)
    hart_df[hart_df['CLASS'] == 'COMPLEX_INDELS'].to_csv(
        outfile, sep='\t', header=True, index=False, compression='gzip'
    )
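
A worked micro-example of the hypermutation cutoff used in the function above (median + 2.5 IQR over per-sample SNV counts); the counts are made up for illustration.

import numpy as np
from scipy.stats import iqr

counts = [1200, 1500, 2100, 2600, 3100, 45000]  # made-up per-sample SNV counts
cutoff = np.median(counts) + 2.5 * iqr(counts)
print([c for c in counts if c > cutoff])        # only the 45000 outlier is flagged
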
Example #10
def plot_piecharts_signatures(exposures_path, type_mut, type_extraction,
                              figsize, min_val):

    colors_full = {
        "SBS": {
            25: '#003d7c',
            10: '#224ba5',
            5: '#4459ce',
            2.5: '#6767f7',
            1: '#9968c8',
            0.5: '#cc6999',
            0.25: '#ff6b6b',
            0.1: '#ff8962',
            0.05: '#ffa759',
            0: '#ffc651'
        },
        "ID": {
            1: '#003d7c',
            0.5: '#224ba5',
            0.1: '#4459ce',
            0.05: '#6767f7',
            0.04: '#9968c8',
            0.03: '#cc6999',
            0.02: '#ff6b6b',
            0.01: '#ff8962',
            0.001: '#ffa759',
            0: '#ffc651'
        },
        "DBS": {
            0.5: '#003d7c',
            0.1: '#224ba5',
            0.05: '#4459ce',
            0.03: '#6767f7',
            0.02: '#9968c8',
            0.01: '#cc6999',
            0.008: '#ff6b6b',
            0.005: '#ff8962',
            0.001: '#ffa759',
            0: '#ffc651'
        }
    }

    colors = colors_full[type_mut]
    result = read_exposures(exposures_path)
    config_params()
    dic_primary_full, _ = return_metadata()

    # result = pd.concat([df, df_mela])
    result = result.fillna(0)
    signatures = result.columns.tolist()

    # get list of similar found signatures in the extraction
    similar = [s for s in signatures if type_mut in s]
    notsimilar = [s for s in signatures if type_mut not in s]

    result['TTYPE'] = [dic_primary_full[t] for t in result.index.tolist()]

    dic_sig = defaultdict(lambda: defaultdict(float))
    dic_proportion = defaultdict(lambda: defaultdict(float))
    for ttype, data in result.groupby(by='TTYPE'):
        data2 = data.copy()
        data2.drop('TTYPE', axis=1, inplace=True)
        for col in data2:
            # divide by the size of the human genome in megabases (3,234 Mb)
            # to express the median exposure as mutations per Mb
            dic_sig[ttype][col] = data2[
                data2[col] > min_val][col].median() / 3234

            if type_mut not in col:
                dic_proportion[ttype][col.split('_')[0]] = len(
                    data2[data2[col] > min_val]) / len(data2)
            else:
                dic_proportion[ttype][col] = len(
                    data2[data2[col] > min_val]) / len(data2)

    medians = pd.DataFrame.from_dict(dic_sig)

    # sorting the already known signatures
    keep_order_similar = defaultdict(list)
    for s in similar:
        number = s.split('_')[1].split(type_mut)[1]
        try:
            keep_n = int(number)
        except ValueError:
            # labels such as 'SBS17b' end in a letter; strip it before parsing
            keep_n = int(number[:-1])

        keep_order_similar[keep_n].append(str(s))

    order_prev_labels = []
    order_prev = []
    for i in sorted(keep_order_similar, reverse=True):
        all_s = []
        d_equiv = defaultdict(str)
        for sig in keep_order_similar[i]:
            ID_sig = '{}_{}_{}'.format(
                sig.split('_')[1],
                sig.split('_')[2],
                sig.split('_')[0])
            d_equiv[ID_sig] = sig
        sorted_final_k = sorted(d_equiv)
        sorted_sigs_list = [d_equiv[ss] for ss in sorted_final_k]

        for sim, sig in reversed(list(enumerate(sorted_sigs_list, start=1))):
            if len(keep_order_similar[i]) == 1:
                order_prev_labels.append('E-{}{} ({}-like, {})'.format(
                    type_mut,
                    sig.split('_')[0],
                    sig.split('_')[1], round(float(sig.split('_')[2]), 3)))
                order_prev.append(sig)
            else:
                order_prev_labels.append('E-{}{} ({}-like {}, {})'.format(
                    type_mut,
                    sig.split('_')[0],
                    sig.split('_')[1], sim, round(float(sig.split('_')[2]),
                                                  3)))
                order_prev.append(sig)

    no_similar_signatures = medians.loc[notsimilar]
    new_index = [
        int(l.split('_')[0]) for l in no_similar_signatures.index.tolist()
    ]
    no_similar_signatures.index = new_index
    no_similar_signatures.sort_index(inplace=True, ascending=True)

    names_notsimilar = [
        'E-{} {}'.format(type_mut, c)
        for c in no_similar_signatures.index.tolist()[::-1]
    ]

    # merge new and old
    merged = pd.concat([
        no_similar_signatures.sort_index(ascending=False),
        medians.loc[order_prev],
    ])

    # merged = merged.loc[order_prev+small_newset.index.tolist()]
    merged_labels = names_notsimilar + order_prev_labels

    config_params(5)

    fig, ax = plt.subplots(1, 1, figsize=figsize)
    # plt.grid(b=True, which='major',)

    for yval, (i, row) in enumerate(merged.iterrows()):
        for xval, t in enumerate(merged.columns.tolist()):
            val = row[t]
            if val > 0:
                # pick the color of the largest threshold that val exceeds
                for number in sorted(colors.keys(), reverse=True):
                    if val > number:
                        color_scatter = colors[number]
                        break
                if type_mut in str(i):
                    plt.scatter(xval,
                                yval,
                                c=color_scatter,
                                s=dic_proportion[t][i] * 20)
                else:
                    plt.scatter(xval,
                                yval,
                                c=color_scatter,
                                s=dic_proportion[t][str(i)] * 20)

    ax.set_xticks(np.arange(len(merged.T)))
    ax.set_xticklabels(merged.columns.tolist(), rotation=90)
    ax.set_yticks(np.arange(len(merged)))

    ax.set_yticklabels(merged_labels)
    ax.xaxis.set_ticks_position('top')
    ax.set_axisbelow(True)

    ax.yaxis.grid(color='gray', linestyle='dashed', alpha=0.3)
    ax.xaxis.grid(color='gray', linestyle='dashed', alpha=0.3)

    ax.spines['top'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)

    ax.xaxis.set_ticks_position('none')
    ax.yaxis.set_ticks_position('none')

    plt.ylim(-1, len(merged))
    plt.tight_layout()
    plt.savefig('figures/{}/supp1_{}.svg'.format(type_extraction, type_mut))
    plt.savefig('figures/{}/supp1_{}.png'.format(type_extraction, type_mut),
                dpi=600)

    plt.close()
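
A hedged invocation sketch; the exposures path, extraction label and minimum-exposure threshold are placeholders, and the figures/<type_extraction>/ directory must already exist.

plot_piecharts_signatures(
    'Pan_full.exposures.tsv',  # placeholder exposures path
    'SBS',                     # mutation type: 'SBS', 'ID' or 'DBS'
    'SignatureAnalyzer',       # subdirectory of figures/ for the output
    (3, 6),                    # figure size in inches
    100)                       # min exposure for a sample to count as exposed
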
Example #11
def do_plot(samples_lowbraca, samples_braca, not_breaked, only_treated, exp):

    dic_primary_full, _ = return_metadata()
    color_ttype = return_colors()
    fig, ax = plt.subplots(1, 1, figsize=(2, 1.5))
    config_params(6.5)

    sigs = ['12_ID6_0.962941_1']

    exposures_not_breaked_1 = exp[sigs].sum(axis=1).loc[[
        i for i in not_breaked if i in samples_lowbraca
    ]].dropna()
    exposures_not_breaked_2 = exp[sigs].sum(axis=1).loc[[
        i for i in not_breaked if i in samples_braca
    ]].dropna()
    exposures_breaked_1 = exp[sigs].sum(axis=1).loc[[
        i for i in only_treated if i in samples_lowbraca
    ]].dropna()
    exposures_breaked_2 = exp[sigs].sum(axis=1).loc[[
        i for i in only_treated if i in samples_braca
    ]].dropna()

    sns.boxplot(data=[
        exposures_not_breaked_1, exposures_breaked_1, exposures_not_breaked_2,
        exposures_breaked_2
    ],
                linewidth=0.6,
                showfliers=False,
                color='#cbcacbff')

    plt.ylabel('Indels DSB repair by\nnon-homologous end-joining')
    plt.xticks([0, 1, 2, 3], [
        'Not radiated no BRCAness ({})'.format(len(exposures_not_breaked_1)),
        'Radiated no BRCAness ({})'.format(len(exposures_breaked_1)),
        'Not radiated BRCAness ({})'.format(len(exposures_not_breaked_2)),
        'Radiated BRCAness ({})'.format(len(exposures_breaked_2))
    ],
               rotation=90)

    plotdot = []
    colors = []
    for sample in [i for i in not_breaked if i in samples_lowbraca]:
        plotdot.append(exp[sigs].sum(axis=1).loc[sample])
        colors.append(color_ttype[dic_primary_full[sample]])

    ax.scatter(
        [0 + np.random.uniform(-0.2, 0.2, 1)[0] for i in range(len(plotdot))],
        plotdot,
        color=colors,
        s=1,
        alpha=0.2)

    plotdot = []
    colors = []
    for sample in [i for i in only_treated if i in samples_lowbraca]:
        plotdot.append(exp[sigs].sum(axis=1).loc[sample])
        colors.append(color_ttype[dic_primary_full[sample]])

    ax.scatter(
        [1 + np.random.uniform(-0.2, 0.2, 1)[0] for i in range(len(plotdot))],
        plotdot,
        color=colors,
        s=1,
        alpha=0.2)

    plotdot = []
    colors = []
    for sample in [i for i in not_breaked if i in samples_braca]:
        plotdot.append(exp[sigs].sum(axis=1).loc[sample])
        colors.append(color_ttype[dic_primary_full[sample]])

    ax.scatter(
        [2 + np.random.uniform(-0.2, 0.2, 1)[0] for i in range(len(plotdot))],
        plotdot,
        color=colors,
        s=1,
        alpha=0.2)

    plotdot = []
    colors = []
    for sample in [i for i in only_treated if i in samples_braca]:
        plotdot.append(exp[sigs].sum(axis=1).loc[sample])
        colors.append(color_ttype[dic_primary_full[sample]])

    ax.scatter(
        [3 + np.random.uniform(-0.2, 0.2, 1)[0] for i in range(len(plotdot))],
        plotdot,
        color=colors,
        s=1,
        alpha=0.2)

    plt.ylim(0, 700)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    plt.savefig('figures/radiation.svg')
    plt.show()

    ##################

    fig, ax = plt.subplots(1, 1)
    stat, pval1 = mannwhitneyu(exposures_not_breaked_1, exposures_breaked_1)
    print("Not radiated no BRCAnes vs Radiated no BRCAness", pval1)

    stat, pval2 = mannwhitneyu(exposures_not_breaked_2, exposures_breaked_2)
    print("Not radiated  BRCAnes vs Radiated  BRCAness", pval2)

    stat, pval3 = mannwhitneyu(exposures_breaked_1, exposures_breaked_2)
    print("radiated  no BRCAnes vs Radiated  BRCAness", pval3)

    ax.text(1, 1, r"$\it{P}$" + " = {}".format(sci_notation(pval1)), fontsize=7)
    ax.text(1, 4, r"$\it{P}$" + " = {}".format(sci_notation(pval2)), fontsize=7)
    ax.text(1, 8, r"$\it{P}$" + " = {}".format(sci_notation(pval3)), fontsize=7)

    plt.xlim(0, 5)
    plt.ylim(0, 10)
    plt.savefig('figures/radiation_pvals.svg')
    sys.exit()
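
A hedged sketch of the sci_notation helper used above, which is not shown in this section; presumably it renders a p-value in mathtext scientific notation, e.g. 3.2e-05 as $3.2 \times 10^{-5}$.

def sci_notation(value, digits=2):
    # hypothetical implementation: format in exponent notation, then rebuild
    # the string as matplotlib mathtext
    mantissa, exponent = '{:.{d}e}'.format(value, d=digits).split('e')
    return r'${} \times 10^{{{}}}$'.format(float(mantissa), int(exponent))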