Exemplo n.º 1
0
def combine_cohorts(**set_up_kwargs):
    """Combine the data tables of several cohorts into one integrated table.

    The tab-separated data files listed under ``files_to_combine_samples``
    are read and concatenated row-wise, keeping only the columns shared by
    all of them (inner join).  Heatmaps of the combined data are plotted and
    the combined table is written to ``integrated_data.csv`` in the output
    directory.  When ``sample_info_kwargs`` is non-empty, one sample info
    table per data file is loaded too; these are concatenated keeping all
    columns (outer join) and written to ``integrated_sample_info.csv``.

    Keyword Args:
        saveReport: if true, figures are saved and closed; otherwise they
            are displayed with ``plt.show()`` (default False).
        toPrint: if true, a few extra progress messages are logged
            (default False).
        reportName: name of the sub-directory created inside the output
            directory (default ``script_fname``).
        txt_label: title of the first heatmap.
        plot_kwargs: dict with the optional keys ``function_dict``,
            ``highRes`` (``.pdf`` instead of ``.png`` figures),
            ``cmap_custom``, ``vmin``, ``vmax``, ``mincol``, ``midcol`` and
            ``maxcol``.
        files_to_combine_samples: the data files to combine, resolved by
            ``_split_argument_to_list`` relative to the main data directory.
        output_directory: required; path below the main data directory,
            its components may be given comma-separated.
        sample_info_kwargs: optional dict with the keys
            ``sample_info_fpaths``, ``sample_info_read_csv_kwargs`` (a dict
            keyed by the dataset position as a string), ``sample_final_id``,
            ``sample_info_new_label``, ``sample_info_combine_labels``,
            ``sample_info_swap_class_label`` and ``new_sample_info_fpath``.
        gene_info_fpath: optional tab-separated gene position table used to
            order the genes in a second heatmap.
        chr_col: chromosome column of the gene table (default ``chr_int``).
        gene_id_col: gene id column of the gene table (default ``gene``).

    Raises:
        Exception: whatever loading a sample info table or a data file
            raised; the error is logged first and then re-raised.
    """
    # initialize script params
    saveReport = parse_arg_type(set_up_kwargs.get('saveReport', False), bool)
    toPrint = parse_arg_type(set_up_kwargs.get('toPrint', False), bool)
    reportName = set_up_kwargs.get('reportName', script_fname)
    txt_label = set_up_kwargs.get('txt_label', 'test_txt_label')

    # plotting params
    plot_kwargs = set_up_kwargs.get('plot_kwargs', {})
    function_dict = plot_kwargs.get('function_dict', None)
    highRes = parse_arg_type(plot_kwargs.get('highRes', False), bool)
    img_ext = '.pdf' if highRes else '.png'
    cmap_custom = plot_kwargs.get('cmap_custom', None)
    vmin = parse_arg_type(plot_kwargs.get('vmin', None), int)
    vmax = parse_arg_type(plot_kwargs.get('vmax', None), int)

    # Number of colour levels spanned by [vmin, vmax]; it stays None when the
    # range is not fully specified.  It is always bound because the colorbar
    # code further down reads it even when the user supplied cmap_custom.
    custom_div_cmap_arg = None
    if (vmin is not None) and (vmax is not None):
        custom_div_cmap_arg = abs(vmin) + abs(vmax)
        if (vmin <= 0) and (vmax >= 0):
            # the range includes zero, which needs a colour of its own
            custom_div_cmap_arg = custom_div_cmap_arg + 1

    if (cmap_custom is None) and (custom_div_cmap_arg is not None):
        colour_kwargs = {
            key: plot_kwargs.get(key, None)
            for key in ('mincol', 'midcol', 'maxcol')
        }
        # the custom colours are only used when all three are given
        if any(colour is None for colour in colour_kwargs.values()):
            colour_kwargs = {}
        cmap_custom = custom_div_cmap(numcolors=custom_div_cmap_arg,
                                      **colour_kwargs)

    # initialize directories
    MainDataDir = set_path(os.path.join(script_path, '..', 'data'))

    # data input
    data_fpaths = _split_argument_to_list(set_up_kwargs,
                                          'files_to_combine_samples',
                                          asPath=True,
                                          MainDataDir=MainDataDir)

    # data output
    _output_directory = set_up_kwargs.get('output_directory')
    if ',' in _output_directory:
        _output_directory = os.path.join(*_output_directory.rsplit(','))
    _output_directory = os.path.join(MainDataDir, _output_directory)
    output_directory = set_directory(
        os.path.join(_output_directory, reportName))

    # save the set_up_kwargs in the output dir for reproducibility
    fname = 'set_up_kwargs.json'
    f = os.path.join(output_directory, fname)
    if toPrint:
        logger.info('-save set_up_kwargs dictionary for reproducibility in: ' +
                    f)
    with open(f, 'w') as fp:
        json.dump(set_up_kwargs, fp, indent=4)

    # sample_info params
    sample_info_kwargs = set_up_kwargs.get('sample_info_kwargs', {})
    save_new_sample_info = False
    if sample_info_kwargs:
        save_new_sample_info = True
        # sample info input
        sample_info_fpaths = _split_argument_to_list(sample_info_kwargs,
                                                     'sample_info_fpaths',
                                                     asPath=True,
                                                     MainDataDir=MainDataDir)
        sample_info_read_csv_kwargs = sample_info_kwargs.get(
            'sample_info_read_csv_kwargs', {})
        sample_final_id = _split_argument_to_list(sample_info_kwargs,
                                                  'sample_final_id',
                                                  asPath=False)
        sample_info_new_label = _split_argument_to_list(
            sample_info_kwargs, 'sample_info_new_label', asPath=False)
        sample_info_combine_labels = _split_argument_to_list(
            sample_info_kwargs, 'sample_info_combine_labels', asPath=False)
        sample_info_swap_class_label = _split_argument_to_list(
            sample_info_kwargs, 'sample_info_swap_class_label', asPath=False)

        # new sample_info output dir
        new_sample_info_fpath = sample_info_kwargs.get('new_sample_info_fpath')
        if new_sample_info_fpath is None:
            new_sample_info_fpath = _output_directory
        else:
            # NOTE(review): a value without a comma is used as given and is
            # not joined to MainDataDir - confirm this is intended.
            if ',' in new_sample_info_fpath:
                new_sample_info_fpath = os.path.join(
                    *new_sample_info_fpath.rsplit(','))
                new_sample_info_fpath = os.path.join(MainDataDir,
                                                     new_sample_info_fpath)

    data_dfs = []
    sample_info_tables = []
    for i, fpath in enumerate(data_fpaths):
        # load info table of samples
        if save_new_sample_info:
            try:
                sample_info_read_csv_kwargs[str(i)]['col_as_index'] = \
                    sample_final_id[i]
                info_table = load_clinical(
                    sample_info_fpaths[i],
                    **sample_info_read_csv_kwargs[str(i)])
            except Exception as ex:
                logger.error('Load info table of samples FAILED!')
                logger.error(ex)
                raise

            swap_labels = sample_info_swap_class_label[i]
            if not isinstance(swap_labels, list):
                swap_labels = [swap_labels]

            for swap_label in swap_labels:
                if swap_label == '':
                    continue
                logger.warning('The user requested to swap the ' +
                               str(swap_label) +
                               ' label in the ' + str(i) + ' dataset')
                # flip the label: truthy values become 0, falsy become 1
                info_table[swap_label] = (
                    ~info_table[swap_label].astype(bool)).astype(int)

            # remember which cohort every sample came from
            info_table['dataset'] = i

        # load data
        try:
            df = pd.read_csv(fpath, sep='\t', header=0, index_col=0)
            logger.info('loaded data file with shape: ' + str(df.shape))
        except Exception as ex:
            logger.error('failed to read data file from: ' + str(fpath))
            logger.error(ex)
            raise

        data_dfs.append(df)
        if save_new_sample_info:
            sample_info_tables.append(info_table)

    # now we join the cohort samples from the multiple datasets
    # on the common features (inner join)
    data = pd.concat(data_dfs, axis=0, join='inner', sort=False)

    # gene info input
    xlabels, xpos = None, None
    gene_info_fpath = set_up_kwargs.get('gene_info_fpath')
    if gene_info_fpath is not None:
        if ',' in gene_info_fpath:
            gene_info_fpath = os.path.join(*gene_info_fpath.rsplit(','))
            gene_info_fpath = os.path.join(MainDataDir, gene_info_fpath)

        chr_col = set_up_kwargs.get('chr_col', 'chr_int')
        gene_id_col = set_up_kwargs.get('gene_id_col', 'gene')

        # load gene info; this is best effort, a failure only disables the
        # heatmap that is ordered by genomic position
        try:
            genes_positions_table = pd.read_csv(gene_info_fpath,
                                                sep='\t',
                                                header=0,
                                                index_col=0)
            # get gene chrom position
            # NOTE(review): id_col is hard-coded to 'gene' although
            # gene_id_col is configurable - confirm which one is intended.
            xlabels, xpos = get_chr_ticks(genes_positions_table,
                                          data,
                                          id_col='gene',
                                          chr_col=chr_col)
        except Exception as ex:
            logger.warning('could not get genes position info')
            logger.warning(ex)
            xlabels, xpos = None, None

    if save_new_sample_info:
        # do the same for the info_tables
        # but keep all columns (outer join)
        sample_info = pd.concat(sample_info_tables,
                                axis=0,
                                join='outer',
                                sort=False)
        sample_info.index.name = 'patientID'
        # create new label name by merging existing labels
        # (when no common label name between cohorts)
        if sample_info_new_label is not None:
            if isinstance(sample_info_new_label, list):
                new_labels = sample_info_new_label
                labels_to_combine = sample_info_combine_labels
            else:
                new_labels = [sample_info_new_label]
                labels_to_combine = [sample_info_combine_labels]
            # The loop lives inside the guard: without a requested new label
            # there is nothing to combine (and no list to iterate over).
            for idx, new_label in enumerate(new_labels):
                combine_labels = labels_to_combine[idx]
                sample_info[new_label] = \
                    sample_info[combine_labels].sum(axis=1)
                logger.info('combined labels: ' + str(combine_labels) +
                            'into the new label: ' + str(new_label))

    # sort the samples by name
    all_samples = natsorted(data.index.values)
    data = data.loc[all_samples, :]

    # heatmap of combined data (on samples)
    _figure_x_size, _figure_y_size, _show_gene_names, _show_sample_names = \
        set_heatmap_size(data)

    plt.figure(figsize=(_figure_x_size, _figure_y_size))
    ax = sns.heatmap(data,
                     vmin=vmin,
                     vmax=vmax,
                     yticklabels=_show_sample_names,
                     xticklabels=_show_gene_names,
                     cmap=cmap_custom,
                     cbar=False)
    plt.xticks(rotation=90)
    cbar = ax.figure.colorbar(ax.collections[0])
    # custom ticks need the number of colour levels, which is only known
    # when both vmin and vmax were given
    if custom_div_cmap_arg is not None:
        set_cbar_ticks(cbar, function_dict, custom_div_cmap_arg)
    plt.title(txt_label)

    if saveReport:
        logger.info('Save heatmap')
        fpath = os.path.join(output_directory, 'Fig_heatmap' + img_ext)
        # transparent=True already hides the figure background; the former
        # `frameon` keyword is no longer accepted by recent Matplotlib.
        plt.savefig(fpath,
                    transparent=True,
                    bbox_inches='tight',
                    pad_inches=0.1)
        plt.close("all")
    else:
        plt.show()

    #########################################
    if (xlabels is not None) and (xpos is not None):
        # ORDER genes
        if toPrint:
            logger.info('Order data according to genomic position')

        # extract the gene relative order
        gene_order = genes_positions_table.set_index(
            gene_id_col).loc[:, 'order'].copy()
        # keep only gene_order with data
        ids_tmp = set(gene_order.index.values).intersection(
            set(data.columns.values))
        # keep only the order of these genes
        # (recent pandas refuses a set as indexer, hence the list)
        gene_order = gene_order.loc[list(ids_tmp)].copy()
        gene_order = gene_order.sort_values()
        # then keep only these genes from the data
        data2plot = data.loc[:, gene_order.index].copy()

        # PLOT heatmap after gene ordering
        if toPrint:
            logger.info('Plot heatmap after gene ordering')
        _figure_x_size, _figure_y_size, \
            _show_gene_names, _show_sample_names = \
            set_heatmap_size(data2plot)
        plt.figure(figsize=(_figure_x_size, _figure_y_size))
        ax = sns.heatmap(data2plot,
                         vmin=vmin,
                         vmax=vmax,
                         xticklabels=_show_gene_names,
                         yticklabels=_show_sample_names,
                         cmap=cmap_custom,
                         cbar=False)
        ax.set_xticks(xpos)
        ax.set_xticklabels(xlabels, rotation=0)
        cbar = ax.figure.colorbar(ax.collections[0])
        if custom_div_cmap_arg is not None:
            set_cbar_ticks(cbar, function_dict, custom_div_cmap_arg)
        if saveReport:
            if toPrint:
                logger.info('Save heatmap')
            plt.savefig(os.path.join(output_directory,
                                     'Fig_heatmap_ordered' + img_ext),
                        transparent=True,
                        bbox_inches='tight',
                        pad_inches=0.1)
            plt.close("all")
        else:
            plt.show()
    #########################################

    # save the combined data
    fname = 'integrated_data.csv'
    fpath = os.path.join(output_directory, fname)
    logger.info('-save the combined data from different cohorts ' +
                'with shape:\n' + str(data.shape))
    data.to_csv(fpath, sep='\t')

    if save_new_sample_info:
        # save the sample_info
        fname = 'integrated_sample_info.csv'
        fpath = os.path.join(new_sample_info_fpath, fname)
        logger.info('-save the combined sample_info from different cohorts ' +
                    'with shape:\n' + str(sample_info.shape))
        sample_info.to_csv(fpath, sep='\t')
def nexus_express(**set_up_kwargs):
    # chose sample set from data
    # function: choose_samples()
    select_samples_from = set_up_kwargs.get('select_samples_from', None)
    select_samples_which = parse_arg_type(
        set_up_kwargs.get('select_samples_which', None), int)
    select_samples_sort_by = set_up_kwargs.get('select_samples_sort_by', None)
    if select_samples_sort_by is not None:
        select_samples_sort_by = select_samples_sort_by.rsplit(',')
    select_samples_title = set_up_kwargs.get('select_samples_title',
                                             'select_all')
    clinical_label = select_samples_sort_by[0]
    class_labels = set_up_kwargs.get('class_labels', None)
    if class_labels is not None:
        if ',' in class_labels:
            class_labels = class_labels.rsplit(',')
    class_values = set_up_kwargs.get('class_values', None)
    if class_values is not None:
        if ',' in class_values:
            class_values = class_values.rsplit(',')
            class_values = np.array(class_values).astype(int)

    # initialize script params
    saveReport = parse_arg_type(set_up_kwargs.get('saveReport', False), bool)
    toPrint = parse_arg_type(set_up_kwargs.get('toPrint', False), bool)
    toPlotFreq = parse_arg_type(set_up_kwargs.get('toPlotFreq', True), bool)
    reportName = set_up_kwargs.get('reportName', script_fname)
    txt_label = set_up_kwargs.get('txt_label', 'test_txt_label')
    input_fname = set_up_kwargs.get('input_fname', 'data_processed.csv')
    gene_info_fname = set_up_kwargs.get('gene_info_fname', None)
    chr_col = set_up_kwargs.get('chr_col', 'chr_int')
    gene_id_col = set_up_kwargs.get('gene_id_col', 'gene')
    sample_info_fname = set_up_kwargs.get('sample_info_fname', None)
    if ',' in sample_info_fname:
        sample_info_fname = os.path.join(*sample_info_fname.rsplit(','))
    sample_info_read_csv_kwargs = set_up_kwargs.get(
        'sample_info_read_csv_kwargs', {})
    data_uniq_fname = input_fname.rsplit('.')[0]+'__' + \
        select_samples_title+'__uniq'
    toRemoveDupl = parse_arg_type(set_up_kwargs.get('toRemoveDupl', True),
                                  bool)

    # params for diff analysis
    min_diff_thres = parse_arg_type(set_up_kwargs.get('min_diff_thres', 0.25),
                                    float)
    multtest_alpha = parse_arg_type(set_up_kwargs.get('multtest_alpha', 0.05),
                                    float)
    with_perc = parse_arg_type(set_up_kwargs.get('with_perc', 100), int)
    multtest_method = set_up_kwargs.get('multtest_method', 'fdr_bh')

    # plotting params
    plot_kwargs = set_up_kwargs.get('plot_kwargs', {})
    function_dict = plot_kwargs.get('function_dict', None)
    cmap_custom = plot_kwargs.get('cmap_custom', None)
    vmin = parse_arg_type(plot_kwargs.get('vmin', None), int)
    vmax = parse_arg_type(plot_kwargs.get('vmax', None), int)
    if (cmap_custom is None) and (vmin is not None) and (vmax is not None):
        custom_div_cmap_arg = abs(vmin) + abs(vmax)
        if (vmin <= 0) and (vmax >= 0):
            custom_div_cmap_arg = custom_div_cmap_arg + 1
        mincol = plot_kwargs.get('mincol', None)
        midcol = plot_kwargs.get('midcol', None)
        maxcol = plot_kwargs.get('maxcol', None)
        if ((mincol is not None) and (midcol is not None)
                and (maxcol is not None)):
            cmap_custom = custom_div_cmap(numcolors=custom_div_cmap_arg,
                                          mincol=mincol,
                                          midcol=midcol,
                                          maxcol=maxcol)
        else:
            cmap_custom = custom_div_cmap(numcolors=custom_div_cmap_arg)

    highRes = parse_arg_type(plot_kwargs.get('highRes', False), bool)
    if highRes:
        img_ext = '.pdf'
    else:
        img_ext = '.png'

    # initialize directories
    MainDataDir = set_path(os.path.join(script_path, '..', 'data'))

    # data input
    input_directory = set_up_kwargs.get('input_directory')
    if ',' in input_directory:
        input_directory = os.path.join(*input_directory.rsplit(','))
    input_directory = os.path.join(MainDataDir, input_directory)

    # sample info input
    sample_info_directory = set_up_kwargs.get('sample_info_directory')
    if ',' in sample_info_directory:
        sample_info_directory = os.path.join(
            *sample_info_directory.rsplit(','))
    sample_info_directory = os.path.join(MainDataDir, sample_info_directory)

    # gene info input
    gene_info_directory = set_up_kwargs.get('gene_info_directory')
    if gene_info_directory is None:
        gene_info_directory = input_directory
    else:
        if ',' in gene_info_directory:
            gene_info_directory = os.path.join(
                *gene_info_directory.rsplit(','))
            gene_info_directory = os.path.join(MainDataDir,
                                               gene_info_directory)

    # dupl_genes input
    dupl_genes_directory = set_up_kwargs.get('dupl_genes_directory')
    dupl_genes_directory = os.path.join(input_directory, dupl_genes_directory)

    # data output
    output_directory = set_up_kwargs.get('output_directory')
    if output_directory is None:
        output_directory = set_directory(
            os.path.join(input_directory, reportName))
    else:
        if ',' in output_directory:
            output_directory = os.path.join(*output_directory.rsplit(','))
        output_directory = set_directory(
            os.path.join(MainDataDir, output_directory, reportName))

    # save the set_up_kwargs in the output dir for reproducibility
    fname = 'set_up_kwargs.json'
    f = os.path.join(output_directory, fname)
    if toPrint:
        logger.info('-save set_up_kwargs dictionary for reproducibility in: ' +
                    f)
    with open(f, 'w') as fp:
        json.dump(set_up_kwargs, fp, indent=4)

    # load info table of samples
    if toPrint:
        logger.info('Load info table of samples')
    fpath = os.path.join(sample_info_directory, sample_info_fname)
    info_table = load_clinical(fpath, **sample_info_read_csv_kwargs)

    # load processed data
    fpath = os.path.join(input_directory, input_fname)
    data = pd.read_csv(fpath, sep='\t', header=0, index_col=0)
    empty_pat = data.sum(axis=1).isnull()
    if empty_pat.any():
        logger.info('Patients with missing values in all genes: ' +
                    str(data.index[empty_pat]))
    data = data.fillna(0)

    # load gene info
    if gene_info_fname is not None:
        fpath = os.path.join(gene_info_directory, gene_info_fname)
        genes_positions_table = pd.read_csv(fpath,
                                            sep='\t',
                                            header=0,
                                            index_col=0)
        # get gene chrom position
        xlabels, xpos = get_chr_ticks(genes_positions_table,
                                      data,
                                      id_col='gene',
                                      chr_col=chr_col)
    else:
        xlabels, xpos = None, None

    # select the samples for the chosen comparison (e.g. all, only TP53wt, etc)
    logger.info('select_samples_from: ' + str(select_samples_from) +
                ', select_samples_which: ' + str(select_samples_which) +
                ', select_samples_sort_by: ' + str(select_samples_sort_by) +
                ', select_samples_title: ' + str(select_samples_title))

    # keep only info_table with data
    temp = info_table.index.name
    info_table = info_table.loc[data.index].copy()
    info_table.index.name = temp
    ids_tmp = choose_samples(info_table.reset_index(),
                             info_table.index.name,
                             choose_from=select_samples_from,
                             choose_what=select_samples_which,
                             sortby=select_samples_sort_by,
                             ascending=False)
    # keep a subpart of the info_table (rows and columns)
    info_table = info_table.loc[ids_tmp, select_samples_sort_by].copy()
    # keep only these samples from the data
    data = data.loc[ids_tmp, :].copy()
    try:
        pat_labels_txt = info_table.astype(int).reset_index().values
    except:
        pat_labels_txt = info_table.reset_index().values
    pat_labels_title = str(info_table.reset_index().columns.values)

    # plot CNV frequencies of all samples
    data_ampl, data_del = _get_ampl_del_from_data(data)
    if toPlotFreq:
        _plot_oncoscan_frequency_plot(data_ampl, data_del,
                                      select_samples_title, '',
                                      gene_info_fname, xlabels, xpos,
                                      saveReport, img_ext, output_directory)

    extra_label = ''
    if toRemoveDupl:
        # keep a copy of the data with duplicate genes
        data_wDupl = data.copy()
        xlabels_wDupl = xlabels.copy()
        xpos_wDupl = xpos.copy()
        data_ampl_wDupl, data_del_wDupl = data_ampl.copy(), data_del.copy()

        # load data with uniq genes (this will be the default data from now on)
        fpath = os.path.join(dupl_genes_directory, data_uniq_fname + '.txt')
        if not os.path.exists(fpath):
            logger.warning('The data_uniq file does not exist, ' +
                           'the analysis will run on the processed data ' +
                           'only!\nfile path:\n' + fpath)
            toRemoveDupl = False
        else:
            extra_label = '_uniq'
            data = pd.read_csv(fpath, sep='\t', header=0, index_col=0)
            data = data.fillna(0)

            # keep the same samples as before
            data = data.loc[data_wDupl.index, :].copy()

            # get gene chrom position
            if gene_info_fname is not None:
                xlabels, xpos = get_chr_ticks(genes_positions_table,
                                              data,
                                              id_col='gene',
                                              chr_col=chr_col)

            # plot CNV frequencies of all samples with uniq genes
            data_ampl, data_del = _get_ampl_del_from_data(data)
            if toPlotFreq:
                _plot_oncoscan_frequency_plot(data_ampl, data_del,
                                              select_samples_title,
                                              extra_label, gene_info_fname,
                                              xlabels, xpos, saveReport,
                                              img_ext, output_directory)

            # load duplicate genes dictionary
            #  we will need that for the table we will save later
            fpath = os.path.join(dupl_genes_directory,
                                 data_uniq_fname + '.json')
            with open(fpath, 'r') as fp:
                dupl_genes_dict = json.load(fp)

    # separate patient groups and plot their CNV frequencies
    group0 = data.loc[info_table.index[info_table[clinical_label] ==
                                       class_values[0]]].copy()
    group1 = data.loc[info_table.index[info_table[clinical_label] ==
                                       class_values[1]]].copy()

    group0_ampl, group0_del = _get_ampl_del_from_data(group0)
    if toPlotFreq:
        _plot_oncoscan_frequency_plot(group0_ampl, group0_del,
                                      select_samples_title,
                                      class_labels[0] + extra_label,
                                      gene_info_fname, xlabels, xpos,
                                      saveReport, img_ext, output_directory)

    group1_ampl, group1_del = _get_ampl_del_from_data(group1)
    if toPlotFreq:
        _plot_oncoscan_frequency_plot(group1_ampl, group1_del,
                                      select_samples_title,
                                      class_labels[1] + extra_label,
                                      gene_info_fname, xlabels, xpos,
                                      saveReport, img_ext, output_directory)

    if toRemoveDupl:
        # plot with the duplicate genes too
        group0_wDupl = data_wDupl.loc[info_table.index[
            info_table[clinical_label] == class_values[0]]].copy()
        group1_wDupl = data_wDupl.loc[info_table.index[
            info_table[clinical_label] == class_values[1]]].copy()

        group0_ampl_wDupl, group0_del_wDupl = \
            _get_ampl_del_from_data(group0_wDupl)
        if toPlotFreq:
            _plot_oncoscan_frequency_plot(group0_ampl_wDupl, group0_del_wDupl,
                                          select_samples_title,
                                          class_labels[0], gene_info_fname,
                                          xlabels_wDupl, xpos_wDupl,
                                          saveReport, img_ext,
                                          output_directory)

        group1_ampl_wDupl, group1_del_wDupl = \
            _get_ampl_del_from_data(group1_wDupl)
        if toPlotFreq:
            _plot_oncoscan_frequency_plot(group1_ampl_wDupl, group1_del_wDupl,
                                          select_samples_title,
                                          class_labels[1], gene_info_fname,
                                          xlabels_wDupl, xpos_wDupl,
                                          saveReport, img_ext,
                                          output_directory)

    # run the Nexus Express diff analysis
    # select genes with significant p-value (multtest_alpha)
    # after mutliple test correction (multtest_method) and
    # absolute change higher than the defined threshold (min_diff_thres)
    mytitle = select_samples_title+': '+class_labels[0] +\
        '['+str(class_values[0])+'] vs. ' +\
        class_labels[1]+'['+str(class_values[1])+']'
    group0_ampl_new, group1_ampl_new, group0_del_new, group1_del_new, \
        pvals, pvals_corrected, pvals_reject, gained, deleted = \
        get_NexusExpress_diff_analysis(
            group0_ampl, group1_ampl, group0_del, group1_del,
            with_perc=with_perc, multtest_method=multtest_method,
            multtest_alpha=multtest_alpha, min_diff_thres=min_diff_thres,
            mytitle=mytitle
        )

    # create table with all genes
    if gene_info_fname is not None:
        diff_genes = genes_positions_table.set_index(
            ['gene']).loc[data.columns.values][['chr', 'start', 'end']].copy()
    else:
        diff_genes = pd.DataFrame(index=data.columns.values)
        diff_genes.index.name = 'gene'
    diff_genes[class_labels[0] + '_' + clinical_label +
               '_ampl'] = group0_ampl * with_perc
    diff_genes[class_labels[1] + '_' + clinical_label +
               '_ampl'] = group1_ampl * with_perc
    diff_genes[class_labels[0] + '_' + clinical_label +
               '_del'] = group0_del * with_perc
    diff_genes[class_labels[1] + '_' + clinical_label +
               '_del'] = group1_del * with_perc

    diff_genes['pvals'] = pvals
    diff_genes['pvals_corrected'] = pvals_corrected
    diff_genes['pvals_reject'] = pvals_reject
    diff_genes['gained'] = gained
    diff_genes['ampl_diff'] = np.abs(
        diff_genes[class_labels[0] + '_' + clinical_label + '_ampl'] -
        diff_genes[class_labels[1] + '_' + clinical_label + '_ampl'])
    diff_genes['deleted'] = deleted
    diff_genes['del_diff'] = np.abs(
        diff_genes[class_labels[0] + '_' + clinical_label + '_del'] -
        diff_genes[class_labels[1] + '_' + clinical_label + '_del'])

    # add the dupl_genes column only if there are duplicate genes
    if toRemoveDupl:
        diff_genes['dupl_genes'] = \
            diff_genes.reset_index()['gene'].map(dupl_genes_dict).values

        # save also the positions of these duplicate genes
        diff_genes['newGeneName'] = diff_genes.index.values
        diff_genes.loc[dupl_genes_dict.keys(), 'newGeneName'] += '__wDupl'
        if gene_info_fname is not None:
            diff_genes['aggChrGene'] = None
            diff_genes['aggPos'] = None
            diff_genes['aggChrStart'] = None
            diff_genes['aggChrEnd'] = None

            # for each duplicated gene, aggregate and save
            # the name, start, end, chr values in the table
            for agene in dupl_genes_dict.keys():
                l = [agene]
                # if agene in dupl_genes_dict.keys():
                l.extend(dupl_genes_dict[agene])
                diff_genes.loc[agene, 'aggChrEnd'] = str(
                    natsorted(
                        genes_positions_table.set_index('gene').loc[l].
                        reset_index().groupby(by=['chr'])['end'].apply(
                            lambda x: list(np.unique(np.append([], x))
                                           )).reset_index().values.tolist()))
                diff_genes.loc[agene, 'aggChrStart'] = str(
                    natsorted(
                        genes_positions_table.set_index('gene').loc[l].
                        reset_index().groupby(by=['chr'])['start'].apply(
                            lambda x: list(np.unique(np.append([], x))
                                           )).reset_index().values.tolist()))
                diff_genes.loc[agene, 'aggChrGene'] = str(
                    natsorted(
                        genes_positions_table.set_index('gene').loc[l].
                        reset_index().groupby(by=['chr'])['gene'].apply(
                            lambda x: list(np.unique(np.append([], x))
                                           )).reset_index().values.tolist()))
                aggPos = \
                    genes_positions_table.set_index('gene').loc[l].groupby(
                        by=['chr']).agg(
                            {'start': min, 'end': max}
                            ).reset_index().astype(str).apply(
                                lambda x: ':'.join(x), axis=1).values
                diff_genes.loc[agene, 'aggPos'] = np.apply_along_axis(
                    lambda x: '__'.join(x), 0, natsorted(aggPos))

    # from the above table: select only the selected genes
    # according to the Nexus Express diff analysis
    diff_genes_selected = diff_genes[(diff_genes['gained'] > 0) |
                                     (diff_genes['deleted'] > 0)].copy()

    # save tables
    if saveReport:
        fname = 'diff_genes_' + select_samples_title + '.csv'
        fpath = os.path.join(output_directory, fname)
        logger.info("-save all diff genes in :\n" + fpath)
        diff_genes.to_csv(fpath, sep='\t', header=True, index=True)

        if diff_genes_selected.shape[0] > 0:
            # keep only those genes in the data
            data = data.loc[:, diff_genes_selected.index]
            # change the name of the genes to indicate if they have duplicates
            if 'newGeneName' in diff_genes_selected.columns.values:
                newgeneNames = diff_genes_selected.loc[data.columns,
                                                       'newGeneName'].values
                data.columns = newgeneNames
            # save this data for future classification
            fname = 'data_features_class.csv'
            fpath = os.path.join(output_directory, fname)
            logger.info("-save data with selected diff features for " +
                        mytitle + " and samples class labels in :\n" + fpath)
            data.to_csv(fpath, sep='\t', header=True, index=True)

            # save as tab-delimited csv file
            fname = 'diff_genes_selected_' + select_samples_title + '.csv'
            fpath = os.path.join(output_directory, fname)
            logger.info("-save selected diff genes for " + mytitle +
                        " in :\n" + fpath)
            diff_genes_selected.to_csv(fpath,
                                       sep='\t',
                                       header=True,
                                       index=True)

            # save also as excel file
            fname = 'diff_genes_selected_' + select_samples_title + '.xlsx'
            fpath = os.path.join(output_directory, fname)
            logger.info('-save csv file as excel too')
            writer = pd.ExcelWriter(fpath)
            diff_genes_selected.to_excel(writer,
                                         sheet_name=select_samples_title)
            writer.save()

    # plot CNV frequencies OF SELECTED GENES for each group in comparison
    if toPlotFreq:
        if ((group0_ampl_new != 0).any() or (group0_del_new != 0).any()):
            _plot_oncoscan_frequency_plot(group0_ampl_new, group0_del_new,
                                          select_samples_title + '_DIFF',
                                          class_labels[0] + extra_label,
                                          gene_info_fname, xlabels, xpos,
                                          saveReport, img_ext,
                                          output_directory)
        if ((group1_ampl_new != 0).any() or (group1_del_new != 0).any()):
            _plot_oncoscan_frequency_plot(group1_ampl_new, group1_del_new,
                                          select_samples_title + '_DIFF',
                                          class_labels[1] + extra_label,
                                          gene_info_fname, xlabels, xpos,
                                          saveReport, img_ext,
                                          output_directory)

    if toRemoveDupl:
        group0_ampl_new_wDupl = group0_ampl_wDupl.copy()
        group0_ampl_new_wDupl[:] = 0
        group1_ampl_new_wDupl = group1_ampl_wDupl.copy()
        group1_ampl_new_wDupl[:] = 0
        group0_del_new_wDupl = group0_del_wDupl.copy()
        group0_del_new_wDupl[:] = 0
        group1_del_new_wDupl = group1_del_wDupl.copy()
        group1_del_new_wDupl[:] = 0

        list__diff_genes_selected_wDupl = []
        for i in range(diff_genes_selected.shape[0]):
            theGene = diff_genes_selected.index[i]
            genes2edit = [theGene]
            list__diff_genes_selected_wDupl.extend(genes2edit)
            duplgenes_ = diff_genes_selected.loc[theGene]['dupl_genes']
            if duplgenes_ is not np.nan:
                list__diff_genes_selected_wDupl.extend(duplgenes_)
                genes2edit.extend(duplgenes_)
            group0_ampl_new_wDupl.loc[genes2edit] = group0_ampl_new.loc[
                theGene]
            group1_ampl_new_wDupl.loc[genes2edit] = group1_ampl_new.loc[
                theGene]
            group0_del_new_wDupl.loc[genes2edit] = group0_del_new.loc[theGene]
            group1_del_new_wDupl.loc[genes2edit] = group1_del_new.loc[theGene]

        if len(list__diff_genes_selected_wDupl) > 0:
            # save this data for future classification
            fname = 'data_all_genes_class.csv'
            fpath = os.path.join(output_directory, fname)
            logger.info("-save data with selected diff genes for " + mytitle +
                        " and samples class labels in :\n" + fpath)
            data_wDupl[list__diff_genes_selected_wDupl].to_csv(fpath,
                                                               sep='\t',
                                                               header=True,
                                                               index=True)

        if toPlotFreq:
            # plot with the duplicate genes too
            if ((group0_ampl_new_wDupl != 0).any()
                    or (group0_del_new_wDupl != 0).any()):
                _plot_oncoscan_frequency_plot(
                    group0_ampl_new_wDupl, group0_del_new_wDupl,
                    select_samples_title + '_DIFF', class_labels[0],
                    gene_info_fname, xlabels_wDupl, xpos_wDupl, saveReport,
                    img_ext, output_directory)
            if ((group1_ampl_new_wDupl != 0).any()
                    or (group1_del_new_wDupl != 0).any()):
                _plot_oncoscan_frequency_plot(
                    group1_ampl_new_wDupl, group1_del_new_wDupl,
                    select_samples_title + '_DIFF', class_labels[1],
                    gene_info_fname, xlabels_wDupl, xpos_wDupl, saveReport,
                    img_ext, output_directory)

    # PLOT heatmaps of selected features
    if diff_genes_selected.shape[0] > 0:
        # get only the CNVs from the selected genes
        patientNames2plot = pat_labels_txt
        ds_y, ds_x = data.shape
        fs_x = 25 if ds_x > 45 else 15 if ds_x > 30 else 10 if ds_x > 3 else 5
        fs_y = 20 if ds_y > 40 else 15 if ds_y > 30 else 10
        plt.figure(figsize=(fs_x, fs_y))
        ax = sns.heatmap(data,
                         vmin=vmin,
                         vmax=vmax,
                         xticklabels=True,
                         yticklabels=patientNames2plot,
                         cmap=cmap_custom,
                         cbar=False)
        ax.set_ylabel(pat_labels_title)
        plt.xticks(rotation=90)
        cbar = ax.figure.colorbar(ax.collections[0])
        set_cbar_ticks(cbar, function_dict, custom_div_cmap_arg)
        plt.title(mytitle)
        if saveReport:
            fpath = os.path.join(
                output_directory,
                'Fig_Heatmap_' + select_samples_title + extra_label + img_ext)
            logger.info('Save Heatmap of selected features as ' + img_ext +
                        ' in:\n' + fpath)
            plt.savefig(fpath,
                        transparent=True,
                        bbox_inches='tight',
                        pad_inches=0.1,
                        frameon=False)
            plt.close("all")
        else:
            plt.show()

        if toRemoveDupl:
            data2plot = data_wDupl[list__diff_genes_selected_wDupl]
            patientNames2plot = pat_labels_txt
            ds_y, ds_x = data2plot.shape
            fs_x = 25 if ds_x > 45 else 15 if ds_x > 30 else 10
            fs_y = 20 if ds_y > 40 else 15 if ds_y > 30 else 10
            plt.figure(figsize=(fs_x, fs_y))
            ax = sns.heatmap(data2plot,
                             vmin=vmin,
                             vmax=vmax,
                             xticklabels=True,
                             yticklabels=patientNames2plot,
                             cmap=cmap_custom,
                             cbar=False)
            ax.set_ylabel(pat_labels_title)
            plt.xticks(rotation=90)
            cbar = ax.figure.colorbar(ax.collections[0])
            set_cbar_ticks(cbar, function_dict, custom_div_cmap_arg)
            plt.title(mytitle)
            if saveReport:
                fpath = os.path.join(
                    output_directory,
                    'Fig_Heatmap_' + select_samples_title + '_wDupl' + img_ext)
                logger.info('Save Heatmap of selected features as ' + img_ext +
                            ' in:\n' + fpath)
                plt.savefig(fpath,
                            transparent=True,
                            bbox_inches='tight',
                            pad_inches=0.1,
                            frameon=False)
                plt.close("all")
            else:
                plt.show()
Exemplo n.º 3
0
def run_pipeline(
        # load_data
        filepath="TCGA_PRAD/input/gistic-cn-processed.tsv",
        clinical_fpath="TCGA_PRAD/processed/clinical.txt",
        gaf_fpath="TCGA_PRAD/processed/gaf.json",
        output_directory="TCGA_PRAD/processed",
        data_type='cnv',
        # clean_samples
        sample_type=None,
        # split_data
        split_train_size=None,
        split_random_state=None,
        stratify_patients_by=None,
        # transform_data
        to_arcsinh=False,
        to_stand=True,
        # sort_data: genes
        to_sort_columns=True,
        gene_id_col='gene',
        gene_order_col='pos_order',
        # sort_data: samples
        to_sort_rows=True,
        sort_patients_by="grade_group",
        # remove_andSave_duplicates
        to_remove_duplicate_columns=True,
        to_compute_euclidean_distances=True,
        to_save_euclidean_distances=True,
        # final output main name
        output_filename=None):
    """Run the pre-processing pipeline on one data file and save the result.

    Steps, in order: load the data and the clinical table, clean the genes,
    optionally sort the genes, clean the samples, optionally split the
    samples into two sets and then, for every resulting set: clean the genes
    again, transform the values, optionally sort the samples, save the
    output and optionally remove duplicate gene profiles.

    Args:
        filepath: data file handed to ``load_data``; resolved with
            ``set_path`` against the main data directory.
        clinical_fpath: tab-separated clinical table (first row is the
            header, first column the index), resolved like ``filepath``.
        gaf_fpath: tab-separated gene table that holds ``gene_id_col`` and
            ``gene_order_col``; only read when ``to_sort_columns`` is set.
        output_directory: directory passed to ``save_output`` and
            ``remove_andSave_duplicates``, resolved like ``filepath``.
        data_type: forwarded to ``load_data`` and used in the output name.
        sample_type: forwarded to ``clean_samples``; when not None it is
            also appended to the output name.
        split_train_size: when not None the data is split with
            ``split_data``; values below 1 are labelled as a percentage in
            the output name, other values as a size.
        split_random_state: forwarded to ``split_data`` and written in the
            output name.
        stratify_patients_by: comma separated clinical column name(s) used
            to build the ``stratify_by`` argument of ``split_data``.
        to_arcsinh: forwarded to ``transform_data``.
        to_stand: forwarded to ``transform_data``.
        to_sort_columns: sort the genes with the order read from
            ``gaf_fpath``.
        gene_id_col: column of the gene table used as its index.
        gene_order_col: column of the gene table holding the gene order.
        to_sort_rows: sort the samples of every set with ``sort_data``.
        sort_patients_by: comma separated column name(s) (or a list/tuple
            of them) forwarded to ``sort_data`` together with the clinical
            table.
        to_remove_duplicate_columns: call ``remove_andSave_duplicates`` on
            every set.
        to_compute_euclidean_distances: forwarded to
            ``remove_andSave_duplicates``.
        to_save_euclidean_distances: forwarded to
            ``remove_andSave_duplicates``.
        output_filename: prefix of the output file names; None means no
            prefix.

    Raises:
        RuntimeError: when the data returned by
            ``remove_andSave_duplicates`` contains NaN values.
    """
    # set main data directory
    MainDataDir = set_path(os.path.join(script_path, '..', '..', 'data'))

    # make fpaths valid
    # data input
    filepath = set_path(filepath, parent_dir=MainDataDir, force=False)
    # gene order table
    gaf_fpath = set_path(gaf_fpath, parent_dir=MainDataDir, force=False)
    # clinical info
    clinical_fpath = set_path(clinical_fpath,
                              parent_dir=MainDataDir,
                              force=False)
    clinical = load_clinical(clinical_fpath, **{
        'sep': '\t',
        'header': 0,
        'index_col': 0
    })
    # output dir
    output_directory = set_path(output_directory,
                                parent_dir=MainDataDir,
                                force=True)
    if output_filename is None:
        output_filename = ''

    # load_data, clean_samples, split_data,
    # clean_genes, transform_data, sort_data
    data = load_data(filepath, data_type=data_type)
    output_filename = output_filename + data_type
    logger.debug("finished loading data")

    # clean genes
    data = clean_genes(data)
    logger.debug("finished cleaning genes in all samples")

    if to_sort_columns:
        genes_positions_table = pd.read_csv(gaf_fpath,
                                            sep='\t',
                                            header=0,
                                            index_col=0)
        gene_order = genes_positions_table.set_index(
            gene_id_col).loc[:, gene_order_col]
        logger.debug("finished loading genes order")
        data = sort_data(data,
                         to_sort_columns=True,
                         to_sort_rows=False,
                         gene_order=gene_order)
        logger.debug("finished sorting genes in all samples")

    data = clean_samples(data, sample_type=sample_type)
    logger.debug("finished cleaning samples")
    # in case multiple sample types exist,
    # the 'sample_type' will change inside clean_data()
    if sample_type is not None:
        output_filename = output_filename + '_' + sample_type

    if split_train_size is not None:
        # NOTE(review): stratify_patients_by defaults to None, which fails
        # on the next line; confirm whether split_data accepts
        # stratify_by=None before relaxing this.
        stratify_patients_by = stratify_patients_by.rsplit(',')
        stratify_by = \
            clinical.loc[data.index, stratify_patients_by]
        data_list = split_data(data,
                               stratify_by=stratify_by,
                               split_train_size=split_train_size,
                               split_random_state=split_random_state)
        logger.debug("finished splitting data sample sets")
        if split_train_size < 1:
            splitname = '_split_perc' + \
                        str(int(split_train_size*100)) + \
                        '_seed'+str(split_random_state)
        else:
            splitname = '_split_size' + \
                        str(int(split_train_size)) + \
                        '_seed'+str(split_random_state)
        output_filename = output_filename + '_' + splitname
        fname_list = [output_filename + '_part1', output_filename + '_part2']
    else:
        data_list = [data]
        fname_list = [output_filename]

    # Normalise the sorting columns ONCE, before the loop: splitting inside
    # the loop turned the string into a list on the first set and then
    # crashed on the second one (a list has no ``rsplit``).
    if to_sort_rows and not isinstance(sort_patients_by, (list, tuple)):
        sort_patients_by = sort_patients_by.rsplit(',')

    for _counter, (_data, _fname) in enumerate(zip(data_list, fname_list)):
        set_label = str(_counter)

        _data = clean_genes(_data)
        logger.debug("finished cleaning genes in set " + set_label)

        _data, transformation_settings = transform_data(_data,
                                                        to_arcsinh=to_arcsinh,
                                                        to_stand=to_stand)
        logger.debug("finished transforming data in set " + set_label)

        if to_sort_rows:
            logger.debug("sorting samples in set " + set_label)
            _data = sort_data(_data,
                              to_sort_columns=False,
                              to_sort_rows=True,
                              sort_patients_by=sort_patients_by,
                              clinical=clinical)
            logger.debug("finished sorting samples in set " + set_label)

        output_filename = _fname + '_processed'
        save_output(_data,
                    transformation_settings=transformation_settings,
                    output_directory=output_directory,
                    output_filename=output_filename)
        logger.debug("finished saving data output in set " + set_label)

        if to_remove_duplicate_columns:
            load_from_file = False
            to_save_output = True
            newdata, _, _, _ = remove_andSave_duplicates(
                _data,
                load_from_file=load_from_file,
                to_compute_euclidean_distances=to_compute_euclidean_distances,
                to_save_euclidean_distances=to_save_euclidean_distances,
                to_save_output=to_save_output,
                output_filename=output_filename,
                output_directory=output_directory)
            logger.debug("finished removing duplicate gene profiles in set " +
                         set_label)

            # sanity check (TODO: set imputation in case it fails)
            nan_exist = newdata.isnull().any().any()
            if nan_exist:
                nan_msg = ('NaN values exist in the data!\n' +
                           'No imputation is set!')
                logger.error(nan_msg)
                # A bare ``raise`` has nothing to re-raise here and only
                # produced "No active exception to reraise"; raise the same
                # exception type with the real reason instead.
                raise RuntimeError(nan_msg)
Exemplo n.º 4
0
def combine_features(**set_up_kwargs):
    """Join the features of several datasets on their common samples.

    Every tab-separated data file is loaded, optionally re-indexed from its
    own sample id to a common one taken from the sample info table, and its
    column names get the suffix ``"__" + <file short id>``. The datasets
    are then inner-joined on the samples, naturally sorted by sample name,
    plotted as a heatmap and saved as a tab-separated file.

    Keys read from ``set_up_kwargs``:
        saveReport, toPrint: flags parsed with ``parse_arg_type``.
        reportName: sub-directory of the output directory (defaults to
            ``script_fname``).
        txt_label: title of the heatmap.
        sample_final_id: name of the common sample id column.
        sample_data_ids: sample id column used by each dataset; a comma
            separated string, a single name, or a list/tuple.
        file_short_ids: short id of each dataset, same accepted forms as
            ``sample_data_ids``.
        files_to_combine_features: data files, resolved with
            ``_split_argument_to_list``.
        sample_info_directory, sample_info_fname: location of the sample
            info table; commas are treated as path separators.
        sample_info_read_csv_kwargs: extra keyword arguments for
            ``load_clinical`` (the caller's dict is not modified).
        output_directory: output location; commas are treated as path
            separators.
        plot_kwargs: dict with ``highRes``, ``cmap_custom``, ``vmin``,
            ``vmax``, ``mincol``, ``midcol`` and ``maxcol``.

    Raises:
        Exception: any error raised while loading the sample info table or
            a data file is logged and re-raised unchanged.
    """
    def _comma_separated_to_list(value):
        # one id per dataset: keep sequences, split "a,b", wrap a lone "a"
        if isinstance(value, (list, tuple)):
            return list(value)
        if ',' in value:
            return value.rsplit(',')
        return [value]

    # initialize script params
    saveReport = parse_arg_type(set_up_kwargs.get('saveReport', False), bool)
    toPrint = parse_arg_type(set_up_kwargs.get('toPrint', False), bool)
    reportName = set_up_kwargs.get('reportName', script_fname)
    txt_label = set_up_kwargs.get('txt_label', 'test_txt_label')

    sample_final_id = set_up_kwargs.get('sample_final_id')
    # A single id used to stay a plain string, so ``sample_data_ids[i]``
    # below returned one character instead of the id.
    sample_data_ids = _comma_separated_to_list(
        set_up_kwargs.get('sample_data_ids'))

    # plotting params
    plot_kwargs = set_up_kwargs.get('plot_kwargs', {})
    highRes = parse_arg_type(plot_kwargs.get('highRes', False), bool)
    img_ext = '.pdf' if highRes else '.png'
    cmap_custom = plot_kwargs.get('cmap_custom', None)
    vmin = parse_arg_type(plot_kwargs.get('vmin', None), int)
    vmax = parse_arg_type(plot_kwargs.get('vmax', None), int)
    if (cmap_custom is None) and (vmin is not None) and (vmax is not None):
        # number of colors: one per integer step between vmin and vmax,
        # plus one when zero lies inside the range
        custom_div_cmap_arg = abs(vmin) + abs(vmax)
        if (vmin <= 0) and (vmax >= 0):
            custom_div_cmap_arg = custom_div_cmap_arg + 1
        mincol = plot_kwargs.get('mincol', None)
        midcol = plot_kwargs.get('midcol', None)
        maxcol = plot_kwargs.get('maxcol', None)
        if ((mincol is not None) and (midcol is not None)
                and (maxcol is not None)):
            cmap_custom = custom_div_cmap(numcolors=custom_div_cmap_arg,
                                          mincol=mincol,
                                          midcol=midcol,
                                          maxcol=maxcol)
        else:
            cmap_custom = custom_div_cmap(numcolors=custom_div_cmap_arg)

    # initialize directories
    MainDataDir = set_path(os.path.join(script_path, '..', 'data'))

    # data input
    file_short_ids = _comma_separated_to_list(
        set_up_kwargs.get('file_short_ids', None))

    data_fpaths = _split_argument_to_list(set_up_kwargs,
                                          'files_to_combine_features',
                                          asPath=True,
                                          MainDataDir=MainDataDir)

    # sample info input
    sample_info_directory = set_up_kwargs.get('sample_info_directory')
    if ',' in sample_info_directory:
        sample_info_directory = os.path.join(
            *sample_info_directory.rsplit(','))
    sample_info_directory = os.path.join(MainDataDir, sample_info_directory)

    sample_info_fname = set_up_kwargs.get('sample_info_fname')
    if ',' in sample_info_fname:
        sample_info_fname = os.path.join(*sample_info_fname.rsplit(','))
    # Work on a copy: 'col_as_index' is added below and must not leak into
    # the caller's settings, which other steps read under the same key.
    sample_info_read_csv_kwargs = dict(
        set_up_kwargs.get('sample_info_read_csv_kwargs', {}))

    # data output
    output_directory = set_up_kwargs.get('output_directory')
    if ',' in output_directory:
        output_directory = os.path.join(*output_directory.rsplit(','))
    output_directory = set_directory(
        os.path.join(MainDataDir, output_directory, reportName))

    # save the set_up_kwargs in the output dir for reproducibility
    fname = 'set_up_kwargs.json'
    f = os.path.join(output_directory, fname)
    if toPrint:
        logger.info('-save set_up_kwargs dictionary for reproducibility in: ' +
                    f)
    with open(f, 'w') as fp:
        json.dump(set_up_kwargs, fp, indent=4)

    # load info table of samples
    try:
        fpath = os.path.join(sample_info_directory, sample_info_fname)
        sample_info_read_csv_kwargs['col_as_index'] = sample_final_id
        info_table = load_clinical(fpath, **sample_info_read_csv_kwargs)
    except Exception as ex:
        logger.error('Load info table of samples FAILED!')
        logger.error(ex)
        raise

    # load data
    data_dfs = []
    for i, fpath in enumerate(data_fpaths):
        try:
            df = pd.read_csv(fpath, sep='\t', header=0, index_col=0)
            logger.info('loaded data file with shape: ' + str(df.shape))
        except Exception as ex:
            logger.error('failed to read data file from: ' + str(fpath))
            logger.error(ex)
            raise

        # if datasets have different sample IDs
        # map them to a user defined common one
        if sample_data_ids[i] != sample_final_id:
            # get the two ids from the info_table
            matching_ids = info_table.reset_index()\
                .set_index(sample_data_ids[i])[sample_final_id]
            # add the new id and drop the old one
            # join help with the one-to-one mapping
            df = df.join(matching_ids, how='right')\
                .set_index(sample_final_id, drop=True)

        # add suffix to separate common genes between datasets
        df = df.add_suffix("__" + file_short_ids[i])

        data_dfs.append(df)

    # now we join the data features from the multiple datasets
    # on the common samples (inner join)
    data = pd.concat(data_dfs, axis=1, join='inner', sort=False)
    # sort the samples by name
    all_samples = natsorted(data.index.values)
    data = data.loc[all_samples, :]

    # heatmap of combined data (on features)
    _figure_x_size, _figure_y_size, _show_gene_names, _show_sample_names = \
        set_heatmap_size(data)

    plt.figure(figsize=(_figure_x_size, _figure_y_size))
    sns.heatmap(data,
                vmin=vmin,
                vmax=vmax,
                yticklabels=_show_sample_names,
                xticklabels=_show_gene_names,
                cmap=cmap_custom,
                cbar=True)
    plt.xticks(rotation=90)
    plt.title(txt_label)

    if saveReport:
        logger.info('Save heatmap')
        fpath = os.path.join(
            output_directory,
            'Fig_heatmap_with_' + sample_final_id + '_id' + img_ext)
        # ``frameon=False`` is no longer passed: transparent=True already
        # hides the figure background, and recent matplotlib versions do
        # not accept the ``frameon`` keyword in savefig any more.
        plt.savefig(fpath,
                    transparent=True,
                    bbox_inches='tight',
                    pad_inches=0.1)
        plt.close("all")
    else:
        plt.show()

    # save the combined data
    fname = 'data_with_' + sample_final_id + '_id.csv'
    fpath = os.path.join(output_directory, fname)
    logger.info('-save the combined data with different features ' +
                'with shape:\n' + str(data.shape))
    data.to_csv(fpath, sep='\t')
def feature_selection(**set_up_kwargs):
    # initialize script params
    saveReport = parse_arg_type(
        set_up_kwargs.get('saveReport', False),
        bool
    )
    toPrint = parse_arg_type(
        set_up_kwargs.get('toPrint', False),
        bool
    )
    reportName = set_up_kwargs.get('reportName', script_fname)
    txt_label = set_up_kwargs.get('txt_label', 'test_txt_label')
    sample_class_column = set_up_kwargs.get('sample_class_column', None)
    if sample_class_column is None:
        logger.error("NO class label was defined!")
        raise
    class_labels = set_up_kwargs.get('class_labels', None)
    if class_labels is not None:
        if ',' in class_labels:
            class_labels = class_labels.rsplit(',')
            class_labels = np.array(class_labels)
    class_values = set_up_kwargs.get('class_values', None)
    if class_values is not None:
        if ',' in class_values:
            class_values = class_values.rsplit(',')
            class_values = np.array(class_values).astype(int)

    # feature_selection_args
    feature_selection_args = set_up_kwargs.get('feature_selection_args', {})
    pval_thres = parse_arg_type(
        feature_selection_args.pop('pval_thres', 0.05),
        float
    )
    topN = parse_arg_type(
        feature_selection_args.pop('topN', 10),
        int
    )

    # plotting params
    plot_kwargs = set_up_kwargs.get('plot_kwargs', {})
    function_dict = plot_kwargs.get('function_dict', None)
    with_swarm = parse_arg_type(
        plot_kwargs.get('with_swarm', False),
        bool
    )
    highRes = parse_arg_type(
        plot_kwargs.get('highRes', False),
        bool
    )
    if highRes:
        img_ext = '.pdf'
    else:
        img_ext = '.png'
    cmap_custom = plot_kwargs.get('cmap_custom', None)
    vmin = parse_arg_type(
        plot_kwargs.get('vmin', None),
        int
    )
    vmax = parse_arg_type(
        plot_kwargs.get('vmax', None),
        int
    )
    if (cmap_custom is None) and (vmin is not None) and (vmax is not None):
        custom_div_cmap_arg = abs(vmin)+abs(vmax)
        if (vmin <= 0) and (vmax >= 0):
            custom_div_cmap_arg = custom_div_cmap_arg + 1
        mincol = plot_kwargs.get('mincol', None)
        midcol = plot_kwargs.get('midcol', None)
        maxcol = plot_kwargs.get('maxcol', None)
        if (
                (mincol is not None) and
                (midcol is not None) and
                (maxcol is not None)
                ):
            cmap_custom = custom_div_cmap(
                numcolors=custom_div_cmap_arg,
                mincol=mincol, midcol=midcol, maxcol=maxcol)
        else:
            cmap_custom = custom_div_cmap(numcolors=custom_div_cmap_arg)

    # initialize directories
    MainDataDir = set_path(os.path.join(script_path, '..', 'data'))

    # data input
    data_fpath = set_up_kwargs.get('data_fpath')
    if ',' in data_fpath:
        data_fpath = os.path.join(*data_fpath.rsplit(','))
        data_fpath = os.path.join(MainDataDir, data_fpath)

    # sample info input
    sample_info_fpath = set_up_kwargs.get('sample_info_fpath')
    if ',' in sample_info_fpath:
        sample_info_fpath = os.path.join(*sample_info_fpath.rsplit(','))
    sample_info_fpath = os.path.join(MainDataDir, sample_info_fpath)
    sample_info_read_csv_kwargs = set_up_kwargs.get(
        'sample_info_read_csv_kwargs', {})

    # dupl_genes_dict
    dupl_genes_dict_fpath = set_up_kwargs.get('dupl_genes_dict_fpath', None)
    if dupl_genes_dict_fpath is not None:
        if ',' in dupl_genes_dict_fpath:
            dupl_genes_dict_fpath = os.path.join(
                    *dupl_genes_dict_fpath.rsplit(','))
        dupl_genes_dict_fpath = os.path.join(
            MainDataDir, dupl_genes_dict_fpath)

    # data output
    output_directory = set_up_kwargs.get('output_directory')
    if ',' in output_directory:
        output_directory = os.path.join(*output_directory.rsplit(','))
    output_directory = set_directory(
        os.path.join(MainDataDir, output_directory, reportName)
    )

    # save the set_up_kwargs in the output dir for reproducibility
    fname = 'set_up_kwargs.json'
    f = os.path.join(output_directory, fname)
    if toPrint:
        logger.info(
            '-save set_up_kwargs dictionary for reproducibility in: '+f)
    with open(f, 'w') as fp:
        json.dump(set_up_kwargs, fp, indent=4)

    # load data
    try:
        data = pd.read_csv(data_fpath, sep='\t', header=0, index_col=0)
        logger.info('loaded data file with shape: '+str(data.shape))
    except:
        logger.error('failed to read data file from: '+str(data_fpath))
        raise

    # load info table of samples
    try:
        info_table = load_clinical(
            sample_info_fpath, **sample_info_read_csv_kwargs)
    except:
        logger.error('Load info table of samples FAILED!')
        raise

    # set the ground truth
    ground_truth = info_table.loc[data.index, sample_class_column]
    ground_truth.sort_values(inplace=True)
    data = data.reindex(ground_truth.index, axis=0)
    try:
        yticklabels = ground_truth.index.values+',' + \
            ground_truth.values.astype(int).flatten().astype(str)
    except:
        yticklabels = ground_truth.index.values+',' + \
            ground_truth.values.flatten().astype(str)

    # load duplicate genes dictionary
    #  we will need that for the featsel results table we will save later
    if dupl_genes_dict_fpath is not None:
        with open(dupl_genes_dict_fpath, 'r') as fp:
            dupl_genes_dict = json.load(fp)
    else:
        dupl_genes_dict = None

    # Feature Selection
    model, all_coefs, printed_results, \
        (pval, correct, wrong), _sample_pred_diffs = \
        _feature_selection_by_classification(
            data, ground_truth, **feature_selection_args)

    #  save sample prediction scores
    compare_predictions = pd.concat(
        [ground_truth, _sample_pred_diffs], axis=1)
    compare_predictions.loc[:, 'pred_diffs'].fillna(1, inplace=True)
    compare_predictions = compare_predictions.astype(int)
    fname = 'sample_prediciton_scores.csv'
    fpath = os.path.join(output_directory, fname)
    logger.info("-save sample prediction scores in :\n"+fpath)
    compare_predictions.to_csv(
        fpath, sep='\t', header=True, index=True)

    # plot count of correct/wrong predictions per class
    y_maxlim = np.histogram(
        compare_predictions[sample_class_column], bins=2)[0].max()
    axes = compare_predictions.hist(
        by=sample_class_column, column='pred_diffs',
        bins=2, rwidth=0.4, figsize=(10, 6))
    for ax in axes:
        ax.set_ylim(0, y_maxlim+1)
        ax.set_xlim(0, 1)
        ax.set_xticks([0.25, 0.75])
        ax.set_xticklabels(['correct', 'wrong'], rotation=0, fontsize=18)
        ax_title = class_labels[np.where(
            class_values == float(
                ax.get_title()))[0][0]]+':'+str(ax.get_title())
        ax.set_title(ax_title, fontsize=18)
        plt.suptitle(sample_class_column+' predictions', fontsize=20)
    if saveReport:
        logger.info('Save count plot')
        fpath = os.path.join(
            output_directory, 'Fig_count_plot'+img_ext
        )
        plt.savefig(
            fpath, transparent=True, bbox_inches='tight',
            pad_inches=0.1, frameon=False)
        plt.close("all")
    else:
        plt.show()

    # plot confusion matrix
    # Compute confusion matrix
    cnf_matrix = confusion_matrix(
        compare_predictions[sample_class_column],
        np.abs(
            compare_predictions[sample_class_column] -
            compare_predictions['pred_diffs']))
    np.set_printoptions(precision=2)
    _classes = [
        class_labels[class_values == 0][0],
        class_labels[class_values == 1][0]]

    # Plot non-normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(
        cnf_matrix, classes=_classes,
        title='Confusion matrix, without normalization')
    if saveReport:
        logger.info('Save count plot')
        fpath = os.path.join(
            output_directory, 'Fig_confusion_matrix'+img_ext
        )
        plt.savefig(
            fpath, transparent=True, bbox_inches='tight',
            pad_inches=0.1, frameon=False)
        plt.close("all")
    else:
        plt.show()

    # Plot normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(
        cnf_matrix, classes=_classes, normalize=True,
        title='Normalized confusion matrix')
    if saveReport:
        logger.info('Save count plot')
        fpath = os.path.join(
            output_directory, 'Fig_confusion_matrix_normalized'+img_ext
        )
        plt.savefig(
            fpath, transparent=True, bbox_inches='tight',
            pad_inches=0.1, frameon=False)
        plt.close("all")
    else:
        plt.show()

    # Save to model in the output_directory
    fname = 'joblib_model.pkl'
    fpath = os.path.join(output_directory, fname)
    logger.info('-save model with joblib')
    joblib.dump(model, fpath)

    # get the genes with the nnz coefficients in classification
    featsel_results = pd.DataFrame(index=data.columns.values)
    featsel_results.index.name = 'gene'
    featsel_results['nnz'] = 0
    featsel_results['mean_coef'] = all_coefs.mean(axis=0)
    featsel_results['std_coef'] = all_coefs.std(axis=0)
    nnz_coef_gene_names = (
        np.abs(all_coefs).max(axis=0) > 0
        ).index.values
    featsel_results.loc[nnz_coef_gene_names, 'nnz'] = 1
    n_names = nnz_coef_gene_names.shape[0]

    if dupl_genes_dict is not None:
        featsel_results = edit_names_with_duplicates(
            featsel_results, dupl_genes_dict)

        # change the name of the genes to indicate if they have duplicates
        newgeneNames_data = featsel_results.loc[
            data.columns, 'newGeneName'].values
        newgeneNames_coefs = featsel_results.loc[
            all_coefs.columns, 'newGeneName'].values
        featsel_results.reset_index(inplace=True, drop=False)
        featsel_results.set_index('newGeneName', inplace=True)
        data.columns = newgeneNames_data
        all_coefs.columns = newgeneNames_coefs

        nnz_coef_gene_names = (
            np.abs(all_coefs).max(axis=0) > 0
            ).index.values

    # boxplot of all nnz coefs
    coefs_to_plot = all_coefs.loc[:, nnz_coef_gene_names]
    boxplot(
        coefs_to_plot, coefs_to_plot.shape[1],
        coefs_to_plot.columns.values,
        title=txt_label+" - nnz coef genes",
        txtbox=printed_results, sidespace=2,
        swarm=False, n_names=coefs_to_plot.shape[1]
    )
    if saveReport:
        logger.info('Save boxplot')
        fpath = os.path.join(
            output_directory, 'Fig_boxplot_with_nnz_coefs'+img_ext
        )
        plt.savefig(
            fpath, transparent=True, bbox_inches='tight',
            pad_inches=0.1, frameon=False)
        plt.close("all")
    else:
        plt.show()

    # heatmap of genes with nnz coefs in classification
    fs_x, fs_y, _show_gene_names, _ = set_heatmap_size(data)
    plt.figure(figsize=(fs_x, fs_y))
    ax = sns.heatmap(data.loc[:, nnz_coef_gene_names], vmin=vmin, vmax=vmax,
                     yticklabels=yticklabels,
                     xticklabels=_show_gene_names,
                     cmap=cmap_custom, cbar=False)
    plt.xticks(rotation=90)
    cbar = ax.figure.colorbar(ax.collections[0])
    set_cbar_ticks(cbar, function_dict, custom_div_cmap_arg)
    plt.title(
        str(n_names)+' genes with nnz coefs in classification: ' +
        class_labels[0]+'['+str(class_values[0])+'] vs. ' +
        class_labels[1]+'['+str(class_values[1])+']'
    )

    if saveReport:
        logger.info('Save heatmap')
        fpath = os.path.join(
            output_directory, 'Fig_heatmap_with_nnz_coefs'+img_ext
        )
        plt.savefig(fpath,
                    transparent=True, bbox_inches='tight',
                    pad_inches=0.1, frameon=False)
        plt.close("all")
    else:
        plt.show()

    if (pval < pval_thres) and (correct > wrong):
        logger.info(
            "selecting genes because bionomial test with pValue < " +
            str(pval_thres)+" and #correct("+str(correct) +
            ") > #wrong("+str(wrong)+") answers"
        )
        featsel_results['abs_mean_coef'] = np.abs(featsel_results['mean_coef'])
        if topN > data.shape[1]:
            topN = data.shape[1]
        featsel_results['top'+str(topN)] = 0
        featsel_results.sort_values(
            by=['abs_mean_coef'], inplace=True, ascending=False)
        selected_gene_names = featsel_results.index.values[:topN]
        featsel_results.loc[selected_gene_names, 'top'+str(topN)] = 1

        # keep only those genes in the data
        data = data.loc[:, selected_gene_names]

        # save this data for future classification
        fname = 'data_features_class.csv'
        fpath = os.path.join(output_directory, fname)
        logger.info("-save data with selected genes\n"+fpath)
        data.to_csv(fpath, sep='\t', header=True, index=True)

        # save as tab-delimited csv file
        fname = 'featsel_results.csv'
        fpath = os.path.join(output_directory, fname)
        logger.info("-save selected genes in :\n"+fpath)
        featsel_results.to_csv(
            fpath, sep='\t', header=True, index=True)

        # save also as excel file
        fname = 'featsel_results.xlsx'
        fpath = os.path.join(output_directory, fname)
        logger.info('-save csv file as excel too')
        writer = pd.ExcelWriter(fpath)
        featsel_results.to_excel(writer)
        writer.save()

        # boxplot of selected coefs
        coefs_to_plot = all_coefs.loc[:, selected_gene_names]
        boxplot(
            coefs_to_plot, coefs_to_plot.shape[1],
            coefs_to_plot.columns.values,
            title=txt_label+" - selected top"+str(topN)+" genes",
            txtbox=printed_results, sidespace=2,
            swarm=False, n_names=coefs_to_plot.shape[1]
        )
        if saveReport:
            logger.info('Save boxplot')
            fpath = os.path.join(
                output_directory, 'Fig_boxplot_with_selected_genes'+img_ext
            )
            plt.savefig(
                fpath, transparent=True, bbox_inches='tight',
                pad_inches=0.1, frameon=False)
            plt.close("all")
        else:
            plt.show()

        # heatmap of selected genes
        fs_x, fs_y, _show_gene_names, _ = set_heatmap_size(data)
        plt.figure(figsize=(fs_x, fs_y))
        ax = sns.heatmap(
            data.loc[:, selected_gene_names], vmin=vmin, vmax=vmax,
            yticklabels=yticklabels,
            xticklabels=_show_gene_names,
            cmap=cmap_custom, cbar=False)
        plt.xticks(rotation=90)
        cbar = ax.figure.colorbar(ax.collections[0])
        set_cbar_ticks(cbar, function_dict, custom_div_cmap_arg)
        plt.title(
            'selected top'+str(topN)+' genes: ' +
            class_labels[0]+'['+str(class_values[0])+'] vs. ' +
            class_labels[1]+'['+str(class_values[1])+']'
        )

        if saveReport:
            logger.info('Save heatmap')
            fpath = os.path.join(
                output_directory, 'Fig_heatmap_with_selected_genes'+img_ext
            )
            plt.savefig(fpath,
                        transparent=True, bbox_inches='tight',
                        pad_inches=0.1, frameon=False)
            plt.close("all")
        else:
            plt.show()

    else:
        selected_gene_names = None

    if with_swarm:
        # swarmplots
        coefs_to_plot = all_coefs.loc[:, nnz_coef_gene_names]
        boxplot(
            coefs_to_plot, coefs_to_plot.shape[1],
            coefs_to_plot.columns.values,
            title=txt_label+" - nnz coef genes",
            txtbox=printed_results, sidespace=2,
            swarm=True, n_names=coefs_to_plot.shape[1]
        )
        if saveReport:
            logger.info('Save swarmplot')
            fpath = os.path.join(
                output_directory, 'Fig_swarmplot_with_nnz_coefs'+img_ext
            )
            plt.savefig(
                fpath, transparent=True, bbox_inches='tight',
                pad_inches=0.1, frameon=False)
            plt.close("all")
        else:
            plt.show()

        if selected_gene_names is not None:
            coefs_to_plot = all_coefs.loc[:, selected_gene_names]
            boxplot(
                coefs_to_plot, coefs_to_plot.shape[1],
                coefs_to_plot.columns.values,
                title=txt_label+" - nnz coef genes",
                txtbox=printed_results, sidespace=2,
                swarm=True, n_names=coefs_to_plot.shape[1]
            )
            if saveReport:
                logger.info('Save swarmplot')
                fpath = os.path.join(
                    output_directory,
                    'Fig_swarmplot_with_selected_genes'+img_ext
                )
                plt.savefig(
                    fpath, transparent=True, bbox_inches='tight',
                    pad_inches=0.1, frameon=False)
                plt.close("all")
            else:
                plt.show()
def classification(**set_up_kwargs):
    """Train (or load) a classifier, predict the test samples and report.

    The data table and the sample info table are loaded, the ground truth
    is taken from ``sample_class_column`` of the info table and then either
    a model is trained on a stratified train split (``toTrain`` true) or a
    saved joblib model is loaded from ``model_fpath`` and applied to all
    samples. Tables and figures are written to (or, for figures, shown
    instead of saved when ``saveReport`` is false)
    ``<MainDataDir>/<output_directory>/<reportName>``.

    Keyword arguments read from ``set_up_kwargs``:
        saveReport: save the figures instead of showing them.
        toPrint: log where the copy of ``set_up_kwargs`` is written.
        reportName: name of the output sub-directory.
        txt_label: title of the coefficient box/swarm plots.
        sample_class_column: info table column with the sample classes
            (required).
        class_labels, class_values: comma separated strings (or
            sequences) with the two class names/values used in the
            heatmap titles.
        toTrain: train a new model (default) or load an existing one.
        model_fpath: path of the joblib model to load when not training.
        classification_args: dict; ``split_train_size`` and
            ``split_random_state`` are consumed here, everything else is
            forwarded to ``_run_classification``.
        plot_kwargs: dict with ``function_dict``, ``with_swarm``,
            ``highRes``, ``cmap_custom``, ``vmin``, ``vmax``, ``mincol``,
            ``midcol`` and ``maxcol``.
        data_fpath, sample_info_fpath, dupl_genes_dict_fpath,
        output_directory: input/output locations; a comma separated value
            is turned into a path with ``os.path.join``.
        sample_info_read_csv_kwargs: forwarded to ``load_clinical``.

    Raises:
        RuntimeError: when ``sample_class_column`` is not given.
        ValueError: when a model has to be loaded but ``model_fpath`` is
            not given.
    """
    # initialize script params
    saveReport = parse_arg_type(
        set_up_kwargs.get('saveReport', False),
        bool
    )
    toPrint = parse_arg_type(
        set_up_kwargs.get('toPrint', False),
        bool
    )
    reportName = set_up_kwargs.get('reportName', script_fname)
    txt_label = set_up_kwargs.get('txt_label', 'test_txt_label')
    # NOTE(review): `toTrain` and `model_fpath` used to be referenced
    # without ever being defined in this function. They are now read from
    # set_up_kwargs; the default of training a new model is an assumption,
    # confirm against the callers/configs.
    toTrain = parse_arg_type(
        set_up_kwargs.get('toTrain', True),
        bool
    )
    model_fpath = set_up_kwargs.get('model_fpath', None)
    sample_class_column = set_up_kwargs.get('sample_class_column', None)
    if sample_class_column is None:
        logger.error("NO class label was defined!")
        # a bare `raise` here only produced "No active exception to
        # reraise"; keep the RuntimeError type but say what is wrong
        raise RuntimeError("NO class label was defined!")
    class_labels = set_up_kwargs.get('class_labels', None)
    if class_labels is not None:
        if ',' in class_labels:
            class_labels = class_labels.rsplit(',')
    class_values = set_up_kwargs.get('class_values', None)
    if class_values is not None:
        if ',' in class_values:
            class_values = class_values.rsplit(',')
            class_values = np.array(class_values).astype(int)

    # classification_args
    # work on a copy: popping from the caller's dict would drop the split
    # settings from the set_up_kwargs that are saved for reproducibility
    classification_args = dict(set_up_kwargs.get('classification_args', {}))
    split_train_size = classification_args.pop('split_train_size', 20)
    if isinstance(split_train_size, str):
        # "0.8" -> fraction of samples, "20" -> absolute number of samples
        size_type = float if '.' in split_train_size else int
        split_train_size = parse_arg_type(split_train_size, size_type)

    split_random_state = parse_arg_type(
        classification_args.pop('split_random_state', 0),
        int
    )

    # plotting params
    plot_kwargs = set_up_kwargs.get('plot_kwargs', {})
    function_dict = plot_kwargs.get('function_dict', None)
    with_swarm = parse_arg_type(
        plot_kwargs.get('with_swarm', False),
        bool
    )
    highRes = parse_arg_type(
        plot_kwargs.get('highRes', False),
        bool
    )
    if highRes:
        img_ext = '.pdf'
    else:
        img_ext = '.png'
    cmap_custom = plot_kwargs.get('cmap_custom', None)
    vmin = parse_arg_type(
        plot_kwargs.get('vmin', None),
        int
    )
    vmax = parse_arg_type(
        plot_kwargs.get('vmax', None),
        int
    )
    # number of colors / colorbar ticks; stays None when vmin/vmax are not
    # given so that the colorbar code below never hits an unbound name
    custom_div_cmap_arg = None
    if (vmin is not None) and (vmax is not None):
        custom_div_cmap_arg = abs(vmin)+abs(vmax)
        if (vmin <= 0) and (vmax >= 0):
            custom_div_cmap_arg = custom_div_cmap_arg + 1
        if cmap_custom is None:
            mincol = plot_kwargs.get('mincol', None)
            midcol = plot_kwargs.get('midcol', None)
            maxcol = plot_kwargs.get('maxcol', None)
            if (
                    (mincol is not None) and
                    (midcol is not None) and
                    (maxcol is not None)
                    ):
                cmap_custom = custom_div_cmap(
                    numcolors=custom_div_cmap_arg,
                    mincol=mincol, midcol=midcol, maxcol=maxcol)
            else:
                cmap_custom = custom_div_cmap(numcolors=custom_div_cmap_arg)

    # initialize directories
    MainDataDir = set_path(os.path.join(script_path, '..', 'data'))

    # data input
    # NOTE(review): a path without ',' is used as given (it is not joined
    # to MainDataDir), unlike sample_info_fpath below - confirm intended
    data_fpath = set_up_kwargs.get('data_fpath')
    if ',' in data_fpath:
        data_fpath = os.path.join(*data_fpath.rsplit(','))
        data_fpath = os.path.join(MainDataDir, data_fpath)

    # sample info input
    sample_info_fpath = set_up_kwargs.get('sample_info_fpath')
    if ',' in sample_info_fpath:
        sample_info_fpath = os.path.join(*sample_info_fpath.rsplit(','))
    sample_info_fpath = os.path.join(MainDataDir, sample_info_fpath)
    sample_info_read_csv_kwargs = set_up_kwargs.get(
        'sample_info_read_csv_kwargs', {})

    # dupl_genes_dict
    dupl_genes_dict_fpath = set_up_kwargs.get('dupl_genes_dict_fpath', None)
    if dupl_genes_dict_fpath is not None:
        if ',' in dupl_genes_dict_fpath:
            dupl_genes_dict_fpath = os.path.join(
                    *dupl_genes_dict_fpath.rsplit(','))
        dupl_genes_dict_fpath = os.path.join(
            MainDataDir, dupl_genes_dict_fpath)

    # model input (only needed when we do not train)
    if not toTrain:
        if model_fpath is None:
            logger.error("NO model_fpath was defined to load the model!")
            raise ValueError(
                "model_fpath is required when toTrain is False")
        # same convention as data_fpath: comma separated parts are
        # relative to MainDataDir, anything else is used as given
        if ',' in model_fpath:
            model_fpath = os.path.join(*model_fpath.rsplit(','))
            model_fpath = os.path.join(MainDataDir, model_fpath)

    # data output
    output_directory = set_up_kwargs.get('output_directory')
    if ',' in output_directory:
        output_directory = os.path.join(*output_directory.rsplit(','))
    output_directory = set_directory(
        os.path.join(MainDataDir, output_directory, reportName)
    )

    def _save_or_show(fig_fname, log_msg):
        # Save the current figure in output_directory or display it.
        if saveReport:
            logger.info(log_msg)
            fpath = os.path.join(output_directory, fig_fname+img_ext)
            # `frameon` is not passed anymore: it is redundant with
            # transparent=True and newer Matplotlib releases reject it
            plt.savefig(
                fpath, transparent=True, bbox_inches='tight',
                pad_inches=0.1)
            plt.close("all")
        else:
            plt.show()

    def _sample_ticklabels(labels):
        # Build "<sample id>,<class>" labels; classes as ints if castable.
        try:
            classes = labels.values.astype(int).flatten().astype(str)
        except (TypeError, ValueError):
            classes = labels.values.flatten().astype(str)
        return labels.index.values+','+classes

    def _plot_heatmap(table, yticklabels, title_prefix, fig_fname):
        # Heatmap of `table` with a colorbar and the class names as title.
        fs_x, fs_y, _show_gene_names, _ = set_heatmap_size(table)
        plt.figure(figsize=(fs_x, fs_y))
        ax = sns.heatmap(
            table, vmin=vmin, vmax=vmax,
            yticklabels=yticklabels,
            xticklabels=_show_gene_names,
            cmap=cmap_custom, cbar=False)
        plt.xticks(rotation=90)
        cbar = ax.figure.colorbar(ax.collections[0])
        if custom_div_cmap_arg is not None:
            set_cbar_ticks(cbar, function_dict, custom_div_cmap_arg)
        plt.title(
            title_prefix +
            class_labels[0]+'['+str(class_values[0])+'] vs. ' +
            class_labels[1]+'['+str(class_values[1])+']'
        )
        _save_or_show(fig_fname, 'Save heatmap')

    # save the set_up_kwargs in the output dir for reproducibility
    fname = 'set_up_kwargs.json'
    f = os.path.join(output_directory, fname)
    if toPrint:
        logger.info(
            '-save set_up_kwargs dictionary for reproducibility in: '+f)
    with open(f, 'w') as fp:
        json.dump(set_up_kwargs, fp, indent=4)

    # load data
    try:
        data = pd.read_csv(data_fpath, sep='\t', header=0, index_col=0)
        logger.info('loaded data file with shape: '+str(data.shape))
    except Exception:
        logger.error('failed to read data file from: '+str(data_fpath))
        raise

    # load info table of samples
    try:
        info_table = load_clinical(
            sample_info_fpath, **sample_info_read_csv_kwargs)
    except Exception:
        logger.error('Load info table of samples FAILED!')
        raise

    # set the ground truth
    ground_truth = info_table.loc[data.index, sample_class_column]

    # load duplicate genes dictionary
    #  we will need that for the results table we will save later
    if dupl_genes_dict_fpath is not None:
        with open(dupl_genes_dict_fpath, 'r') as fp:
            dupl_genes_dict = json.load(fp)
    else:
        dupl_genes_dict = None

    # Classification
    if toTrain:
        # choose labels to stratify train_test_split
        if 'dataset' in info_table.columns.values:
            stratify_by = pd.concat(
                [ground_truth, info_table['dataset']], axis=1)
        else:
            stratify_by = ground_truth
        # split data in train and test
        data_train, data_test, y_train, y_test = train_test_split(
                data, ground_truth,
                train_size=split_train_size,
                test_size=None,
                random_state=split_random_state,
                stratify=stratify_by)

        yticklabels_train = _sample_ticklabels(y_train)

        # train model
        model, all_coefs, y_train_predictions, y_train_scores = \
            _run_classification(
                data_train, y_train, **classification_args)
    else:
        # load model from file and use all the samples as test samples
        model = joblib.load(model_fpath)
        data_test = data
        y_test = ground_truth

    yticklabels_test = _sample_ticklabels(y_test)

    # Test the model
    y_test_score = model.score(data_test, y_test)
    y_test_predictions = model.predict(data_test)
    y_test_predictions = pd.Series(y_test_predictions, index=data_test.index)
    y_test_predictions.name = 'test_predictions'

    if toTrain:
        #################################################
        # plot accuracy scores of the train and test data
        # (test score in red at x=0, train fold scores in black from x=1)
        plt.figure(figsize=(10, 6))
        plt.scatter(
            np.arange(len(y_train_scores))+1, y_train_scores, color='black')
        plt.scatter(0, y_test_score, color='red')
        plt.xlim(-1, len(y_train_scores)+1)
        plt.ylim(0, 1)
        plt.xlabel("test and train kfolds")
        plt.ylabel("accuracy scores")
        _save_or_show('Fig_scatter', 'Save boxplot')

        # save train sample cross prediction scores
        y_train_all_labels = pd.concat(
            [y_train, y_train_predictions], axis=1)
        fname = 'y_train_all_labels.csv'
        fpath = os.path.join(output_directory, fname)
        logger.info("-save train labels in :\n"+fpath)
        y_train_all_labels.to_csv(
            fpath, sep='\t', header=True, index=True)

        # Save to model in the output_directory
        fname = 'joblib_model.pkl'
        fpath = os.path.join(output_directory, fname)
        logger.info('-save model with joblib')
        joblib.dump(model, fpath)

        # training classification results
        classification_results = pd.DataFrame(index=data.columns.values)
        classification_results.index.name = 'gene'
        classification_results['mean_coef'] = all_coefs.mean(axis=0)
        classification_results['std_coef'] = all_coefs.std(axis=0)

        if dupl_genes_dict is not None:
            classification_results = edit_names_with_duplicates(
                classification_results, dupl_genes_dict)

            # change the name of the genes to indicate if they have duplicates
            newgeneNames_data = classification_results.loc[
                data.columns, 'newGeneName'].values
            newgeneNames_coefs = classification_results.loc[
                all_coefs.columns, 'newGeneName'].values
            classification_results.reset_index(inplace=True, drop=False)
            classification_results.set_index('newGeneName', inplace=True)
            data.columns = newgeneNames_data
            all_coefs.columns = newgeneNames_coefs

        # boxplot of coefs
        coefs_to_plot = all_coefs
        boxplot(
            coefs_to_plot, coefs_to_plot.shape[1],
            coefs_to_plot.columns.values,
            title=txt_label,
            txtbox='', sidespace=2,
            swarm=False, n_names=coefs_to_plot.shape[1]
        )
        _save_or_show('Fig_boxplot_train', 'Save boxplot')

        # heatmap of genes in classification - train
        _plot_heatmap(
            data_train, yticklabels_train,
            'train data classification: ', 'Fig_heatmap_train')

        # save as tab-delimited csv file
        fname = 'classification_results.csv'
        fpath = os.path.join(output_directory, fname)
        logger.info("-save selected genes in :\n"+fpath)
        classification_results.to_csv(
            fpath, sep='\t', header=True, index=True)

        # save also as excel file
        fname = 'classification_results.xlsx'
        fpath = os.path.join(output_directory, fname)
        logger.info('-save csv file as excel too')
        # the context manager writes and closes the file on exit
        with pd.ExcelWriter(fpath) as writer:
            classification_results.to_excel(writer)

        if with_swarm:
            # swarmplots
            # there is no nnz gene selection in this function, so all the
            # coefficients are plotted (as in the boxplot above)
            coefs_to_plot = all_coefs
            boxplot(
                coefs_to_plot, coefs_to_plot.shape[1],
                coefs_to_plot.columns.values,
                title=txt_label,
                txtbox='', sidespace=2,
                swarm=True, n_names=coefs_to_plot.shape[1]
            )
            _save_or_show('Fig_swarmplot_train', 'Save swarmplot')
        #################################################

    # save test sample prediction scores
    y_test_all_labels = pd.concat(
        [y_test, y_test_predictions], axis=1)
    fname = 'y_test_all_labels.csv'
    fpath = os.path.join(output_directory, fname)
    logger.info("-save test labels in :\n"+fpath)
    y_test_all_labels.to_csv(
        fpath, sep='\t', header=True, index=True)

    # heatmap of genes in classification - test
    _plot_heatmap(
        data_test, yticklabels_test,
        'test data classification: ', 'Fig_heatmap_test')
Exemplo n.º 7
0
def process_data(**set_up_kwargs):
    """Filter, map and order a samples x genes table and plot it.

    The input table is read from ``input_directory/input_fname`` (tab
    separated, first column as index) and NaNs are replaced with 0. Genes
    can optionally be restricted to ``select_genes``, values can be
    replaced via ``map_values`` and the samples are chosen and sorted with
    ``choose_samples``. A heatmap is plotted before and, when the gene
    position info could be loaded, after ordering the genes by their
    ``order`` in the gene info table. With ``saveReport`` the figures and
    the resulting table (``data_processed.csv``) are written to the output
    directory.

    Keyword arguments read from ``set_up_kwargs``:
        saveReport: save figures/table instead of only showing figures.
        toPrint: log the progress of the individual steps.
        reportName: name of the output sub-directory.
        input_fname, input_directory: location of the data table.
        gene_info_fname, gene_info_directory: location of the gene
            position table (defaults to ``input_directory``).
        sample_info_fname, sample_info_directory,
        sample_info_read_csv_kwargs: sample info table, read with
            ``load_clinical``.
        chr_col, gene_id_col: column names of the gene position table.
        remove_patients, txt_label: read but not used in this function.
        select_genes: comma separated gene names to keep.
        old_data_sample_id: info table column with the sample ids used in
            the data; when given, the data index is replaced by the info
            table index.
        select_samples_from, select_samples_which, select_samples_sort_by:
            forwarded to ``choose_samples``; the sort-by columns are also
            used as row labels in the heatmaps.
        map_values: dict of old -> new values (keys and values are cast
            to int) or one of 'bin', 'binary', 'binarize'.
        plot_kwargs: dict with ``function_dict``, ``cmap_custom``,
            ``vmin``, ``vmax``, ``mincol``, ``midcol``, ``maxcol`` and
            ``highRes``.
        output_directory: output location below ``MainDataDir`` (defaults
            to ``input_directory``).
    """
    # initialize script params
    saveReport = parse_arg_type(set_up_kwargs.get('saveReport', False), bool)
    toPrint = parse_arg_type(set_up_kwargs.get('toPrint', False), bool)
    reportName = set_up_kwargs.get('reportName', script_fname)

    input_fname = set_up_kwargs.get('input_fname', 'data_processed.csv')
    gene_info_fname = set_up_kwargs.get('gene_info_fname',
                                        'gene_info_fname.csv')
    txt_label = set_up_kwargs.get('txt_label', 'test_txt_label')
    chr_col = set_up_kwargs.get('chr_col', 'chr_int')
    gene_id_col = set_up_kwargs.get('gene_id_col', 'gene')
    remove_patients = set_up_kwargs.get('remove_patients', None)
    if remove_patients is None or remove_patients == "":
        remove_patients_list = []
    else:
        remove_patients_list = remove_patients.rsplit(',')

    select_genes = set_up_kwargs.get('select_genes', None)
    if select_genes is None or select_genes == "":
        select_genes_list = []
    else:
        select_genes_list = select_genes.rsplit(',')

    sample_info_fname = set_up_kwargs.get('sample_info_fname',
                                          '20180704_emca.csv')
    if ',' in sample_info_fname:
        sample_info_fname = os.path.join(*sample_info_fname.rsplit(','))
    sample_info_read_csv_kwargs = set_up_kwargs.get(
        'sample_info_read_csv_kwargs', {})

    old_data_sample_id = set_up_kwargs.get('old_data_sample_id', None)
    change_id = old_data_sample_id is not None

    # chose sample set from data
    # function: choose_samples()
    select_samples_from = set_up_kwargs.get('select_samples_from', None)
    select_samples_which = parse_arg_type(
        set_up_kwargs.get('select_samples_which', None), int)
    select_samples_sort_by = set_up_kwargs.get('select_samples_sort_by', None)
    if select_samples_sort_by is not None:
        select_samples_sort_by = select_samples_sort_by.rsplit(',')
    # map_values_dict
    map_values = set_up_kwargs.get('map_values', None)
    map_values_dict = None
    if isinstance(map_values, dict):
        map_values_dict = {int(k): int(v) for k, v in map_values.items()}

    # plotting params
    plot_kwargs = set_up_kwargs.get('plot_kwargs', {})
    function_dict = plot_kwargs.get('function_dict', None)
    cmap_custom = plot_kwargs.get('cmap_custom', None)
    vmin = parse_arg_type(plot_kwargs.get('vmin', None), int)
    vmax = parse_arg_type(plot_kwargs.get('vmax', None), int)
    # number of colors / colorbar ticks; stays None when vmin/vmax are not
    # given so that the colorbar code below never hits an unbound name
    custom_div_cmap_arg = None
    if (vmin is not None) and (vmax is not None):
        custom_div_cmap_arg = abs(vmin) + abs(vmax)
        if (vmin <= 0) and (vmax >= 0):
            custom_div_cmap_arg = custom_div_cmap_arg + 1
        if cmap_custom is None:
            mincol = plot_kwargs.get('mincol', None)
            midcol = plot_kwargs.get('midcol', None)
            maxcol = plot_kwargs.get('maxcol', None)
            if ((mincol is not None) and (midcol is not None)
                    and (maxcol is not None)):
                cmap_custom = custom_div_cmap(numcolors=custom_div_cmap_arg,
                                              mincol=mincol,
                                              midcol=midcol,
                                              maxcol=maxcol)
            else:
                cmap_custom = custom_div_cmap(numcolors=custom_div_cmap_arg)
    highRes = parse_arg_type(plot_kwargs.get('highRes', False), bool)
    if highRes:
        img_ext = '.pdf'
    else:
        img_ext = '.png'

    # initialize directories
    MainDataDir = set_path(os.path.join(script_path, '..', 'data'))

    # data input
    input_directory = set_up_kwargs.get('input_directory')
    if ',' in input_directory:
        input_directory = os.path.join(*input_directory.rsplit(','))
    input_directory = os.path.join(MainDataDir, input_directory)

    # sample info input
    sample_info_directory = set_up_kwargs.get('sample_info_directory')
    if ',' in sample_info_directory:
        sample_info_directory = os.path.join(
            *sample_info_directory.rsplit(','))
    sample_info_directory = os.path.join(MainDataDir, sample_info_directory)

    # gene info input
    gene_info_directory = set_up_kwargs.get('gene_info_directory')
    if gene_info_directory is None:
        gene_info_directory = input_directory
    else:
        # NOTE(review): unlike the directories above, the join with
        # MainDataDir only happens for comma separated values - confirm
        # whether a plain directory name is meant to be used as given
        if ',' in gene_info_directory:
            gene_info_directory = os.path.join(
                *gene_info_directory.rsplit(','))
            gene_info_directory = os.path.join(MainDataDir,
                                               gene_info_directory)

    # data output
    output_directory = set_up_kwargs.get('output_directory')
    if output_directory is None:
        output_directory = set_directory(
            os.path.join(input_directory, reportName))
    else:
        if ',' in output_directory:
            output_directory = os.path.join(*output_directory.rsplit(','))
        output_directory = set_directory(
            os.path.join(MainDataDir, output_directory, reportName))

    # save the set_up_kwargs in the output dir for reproducibility
    fname = 'set_up_kwargs.json'
    f = os.path.join(output_directory, fname)
    if toPrint:
        logger.info('-save set_up_kwargs dictionary for reproducibility in: ' +
                    f)
    with open(f, 'w') as fp:
        json.dump(set_up_kwargs, fp, indent=4)
    #########################################
    # load input_data
    fpath = os.path.join(input_directory, input_fname)
    data = pd.read_csv(fpath, sep='\t', header=0, index_col=0)
    # NOTE(review): NaNs are replaced by 0 here, so for a non-empty table
    # the "empty genes"/"empty patients" checks further down presumably
    # can never trigger - confirm whether they should run before this line
    data = data.fillna(0)

    # load info table of samples
    if toPrint:
        logger.info('Load info table of samples')
    fpath = os.path.join(sample_info_directory, sample_info_fname)
    info_table = load_clinical(fpath, **sample_info_read_csv_kwargs)

    # load gene info
    fpath = os.path.join(gene_info_directory, gene_info_fname)
    # stays None when the table cannot be read, so that the later
    # `is not None` check works instead of raising a NameError
    genes_positions_table = None
    try:
        genes_positions_table = pd.read_csv(fpath,
                                            sep='\t',
                                            header=0,
                                            index_col=0)
        # get gene chrom position
        xlabels, xpos = get_chr_ticks(genes_positions_table,
                                      data,
                                      id_col='gene',
                                      chr_col=chr_col)
    except Exception:
        # best effort: without position info the gene ordering is skipped
        logger.warning('could not get genes position info')
        xlabels, xpos = None, None

    #########################################
    # CHECK if there are empty genes and remove them
    is_empty = (data.isnull()).all(axis=0)
    if is_empty.any():
        genes2remove = data.columns[is_empty]
        data.drop(genes2remove, axis=1, inplace=True)
        if toPrint:
            logger.info('remove the following genes because ' +
                        'they have no values in the table: ' +
                        str(genes2remove))

    # CHECK if there are empty patients BUT don't remove them
    empty_pat = data.sum(axis=1).isnull()
    if empty_pat.any():
        logger.info('Patients with missing values in all genes: ' +
                    str(data.index[empty_pat]))

    # SELECT specific genes (optional)
    if len(select_genes_list) > 0:
        # first take intersection of with data
        select_genes_list = list(
            set(data.columns.values).intersection(set(select_genes_list)))
        # then keep only these genes from in the data
        data = data.loc[:, select_genes_list].copy()
        if genes_positions_table is not None:
            xlabels, xpos = get_chr_ticks(genes_positions_table,
                                          data,
                                          id_col='gene',
                                          chr_col=chr_col)

    # MAP values with a dictionary (optional)
    if map_values is not None:
        if map_values_dict is not None:
            # values in the data (other than 0) without a replacement
            _diff_set = set(np.unique(data.values.flatten().astype(int)))\
                .difference(set([0]))\
                .difference(set(list(map_values_dict.keys())))
            if _diff_set:
                logger.warning(
                    "the user\'s dict to replace data values is incomplete " +
                    "the following values in the data are not accounted for " +
                    "and will remain the same:\n" + str(_diff_set))
            logger.info("replacing data values with user\'s dictionary:\n" +
                        str(map_values_dict))
            data.replace(map_values_dict, inplace=True)

        elif map_values in ['bin', 'binary', 'binarize']:
            # there is no dictionary in this branch, so nothing to append
            # to the message (it used to end with "None")
            logger.info("binarizing data values")
            binarize(data, copy=False)

        else:
            logger.warning(
                "invalid map_values argument, no action will be taken: \n" +
                str(map_values))

    # SELECT sample groups (optional)
    if change_id:
        temp = info_table.index.name
        info_table = (
            info_table
            # delete NaNs in the data id
            .dropna(subset=[old_data_sample_id])
            # do not lose original index
            .reset_index()
            # same index as data
            .set_index(old_data_sample_id, drop=False)
            # keep same as data ids
            .loc[data.index]
            # return back to original index
            .set_index(temp)
        )
    else:
        temp = info_table.index.name
        info_table = info_table.loc[data.index].copy()
        info_table.index.name = temp
    ids_tmp = choose_samples(info_table.reset_index(),
                             info_table.index.name,
                             choose_from=select_samples_from,
                             choose_what=select_samples_which,
                             sortby=select_samples_sort_by,
                             ascending=False)
    info_table = info_table.loc[ids_tmp, :].copy()
    if change_id:
        old_index_sorted = info_table[old_data_sample_id].values.copy()
        data = data.loc[old_index_sorted, :].copy()
        new_ids = info_table.index.values
        # data = data.reindex(new_ids, axis=0) # gives me nan values!
        data.index = new_ids
        # in case there are patients with no data
        data.dropna(axis=0, inplace=True)
        info_table = info_table.loc[data.index, :].copy()
    else:
        data = data.loc[ids_tmp, :].copy()

    # row labels of the heatmaps: sample id plus the sort-by columns
    pat_labels = info_table[select_samples_sort_by].copy()
    try:
        pat_labels_txt = pat_labels.astype(int).reset_index().values
    except (TypeError, ValueError):
        # labels that cannot be shown as ints are shown as they are
        pat_labels_txt = pat_labels.reset_index().values
    pat_labels_title = str(pat_labels.reset_index().columns.values)

    # PLOT heatmap without gene ordering
    if toPrint:
        logger.info('Plot heatmap before gene ordering')
    _figure_x_size, _figure_y_size, _show_gene_names, _ = \
        set_heatmap_size(data)
    plt.figure(figsize=(_figure_x_size, _figure_y_size))
    ax = sns.heatmap(data,
                     vmin=vmin,
                     vmax=vmax,
                     xticklabels=_show_gene_names,
                     yticklabels=pat_labels_txt,
                     cmap=cmap_custom,
                     cbar=False)
    ax.set_ylabel(pat_labels_title)
    plt.xticks(rotation=90)
    cbar = ax.figure.colorbar(ax.collections[0])
    if custom_div_cmap_arg is not None:
        set_cbar_ticks(cbar, function_dict, custom_div_cmap_arg)
    if saveReport:
        if toPrint:
            logger.info('Save heatmap')
        # `frameon` is not passed anymore: it is redundant with
        # transparent=True and newer Matplotlib releases reject it
        plt.savefig(os.path.join(output_directory, 'Fig_heatmap' + img_ext),
                    transparent=True,
                    bbox_inches='tight',
                    pad_inches=0.1)
        plt.close("all")
    else:
        plt.show()

    #########################################
    if (xlabels is not None) and (xpos is not None):
        # ORDER genes
        if toPrint:
            logger.info('Order data according to genomic position')

        # extract the gene relative order
        gene_order = genes_positions_table.set_index(
            gene_id_col).loc[:, 'order'].copy()
        # keep only the order of the genes that are in the data
        # (boolean mask instead of indexing with a set, which newer
        # pandas versions do not accept)
        gene_order = gene_order.loc[
            gene_order.index.isin(data.columns.values)].copy()
        gene_order = gene_order.sort_values()
        # then keep only these genes from the data
        data = data.loc[:, gene_order.index].copy()

        # PLOT heatmap after gene ordering
        if toPrint:
            logger.info('Plot heatmap after gene ordering')
        _figure_x_size, _figure_y_size, _show_gene_names, _ = \
            set_heatmap_size(data)
        plt.figure(figsize=(_figure_x_size, _figure_y_size))
        ax = sns.heatmap(data,
                         vmin=vmin,
                         vmax=vmax,
                         xticklabels=_show_gene_names,
                         yticklabels=pat_labels_txt,
                         cmap=cmap_custom,
                         cbar=False)
        ax.set_xticks(xpos)
        ax.set_xticklabels(xlabels, rotation=90)
        ax.set_ylabel(pat_labels_title)
        cbar = ax.figure.colorbar(ax.collections[0])
        if custom_div_cmap_arg is not None:
            set_cbar_ticks(cbar, function_dict, custom_div_cmap_arg)
        if saveReport:
            if toPrint:
                logger.info('Save heatmap')
            plt.savefig(os.path.join(output_directory,
                                     'Fig_heatmap_ordered' + img_ext),
                        transparent=True,
                        bbox_inches='tight',
                        pad_inches=0.1)
            plt.close("all")
        else:
            plt.show()

    #########################################
    # SAVE filtered data
    if saveReport:
        # save files
        fname = 'data_processed.csv'
        f = os.path.join(output_directory, fname)
        if toPrint:
            logger.info('-save ordered data: ' + f)
        data.to_csv(f, sep='\t', header=True, index=True)
Exemplo n.º 8
0
def set_up_data(**set_up_kwargs):
    """Load the raw per-sample data, merge it into one table and save it.

    Two scenarios are supported, selected by ``editWith``:

    * ``'genepanel'`` in ``editWith``: one variants table is read from the
      first data file and turned into the data table by ``edit_genepanel``.
    * anything else: the samples are loaded with ``load_and_process_files``
      (``load_files`` true) or ``load_and_process_summary_file``, then
      concatenated into one table (samples in rows, genes in columns).
      Genes reported on more than one chromosome across samples are
      dropped and a gene-position table ordered by genomic position is
      built.

    The received ``set_up_kwargs`` are always dumped to
    ``set_up_kwargs.json`` in the output directory for reproducibility.

    Keyword Args:
        saveReport: if true, write tables and figures to the output
            directory (default False).
        toPrint: if true, log progress messages (default False).
        reportName: sub-directory name of the output
            (default ``script_fname``).
        editWith: scenario selector (default ``'Oncoscan'``).
        txt_label: label used in log messages.
        chr_col: name of the chromosome column created in the gene
            position table (default ``'chr_int'``).
        gene_id_col: name of the gene id column (default ``'gene'``).
        sample_info_fname: sample info file, relative to the input
            directory; a comma separated value is joined as path parts.
        sample_info_read_csv_kwargs: forwarded to ``load_clinical``.
        sample_info_table_sortLabels: comma separated sample info columns
            used to order the samples in the abundance plots. When missing
            or empty the samples keep their order in the info table.
        plot_kwargs: dict; only ``highRes`` is read here (``.pdf`` instead
            of ``.png`` figures).
        input_directory: required; relative to the main data directory,
            comma separated parts are joined.
        output_directory: optional; relative to the main data directory.
            Defaults to ``<input_directory>/<reportName>``.
        data_directory: optional; defaults to the input directory.
        data_files: comma separated data file names.
        edit_kwargs: forwarded to ``edit_genepanel`` (genepanel only).
        load_files: choose between the two loaders (default False).

    Raises:
        ValueError: if a gene name appears more than once in the gene
            position table.
    """
    # initialize script params
    saveReport = parse_arg_type(set_up_kwargs.get('saveReport', False), bool)
    toPrint = parse_arg_type(set_up_kwargs.get('toPrint', False), bool)
    reportName = set_up_kwargs.get('reportName', script_fname)
    editWith = set_up_kwargs.get('editWith', 'Oncoscan')
    txt_label = set_up_kwargs.get('txt_label', 'test_txt_label')

    chr_col = set_up_kwargs.get('chr_col', 'chr_int')
    gene_id_col = set_up_kwargs.get('gene_id_col', 'gene')
    sample_info_fname = set_up_kwargs.get('sample_info_fname',
                                          '20180704_emca.csv')
    if ',' in sample_info_fname:
        sample_info_fname = os.path.join(*sample_info_fname.rsplit(','))
    sample_info_read_csv_kwargs = set_up_kwargs.get(
        'sample_info_read_csv_kwargs', {})

    # The sort labels are optional: with the default (None) or an empty
    # string there is nothing to split and no sorting is applied later on.
    sample_info_table_sortLabels = \
        set_up_kwargs.get('sample_info_table_sortLabels', None)
    if sample_info_table_sortLabels:
        sample_info_table_sortLabels_list = \
            sample_info_table_sortLabels.rsplit(',')
    else:
        sample_info_table_sortLabels_list = []

    # plotting params
    plot_kwargs = set_up_kwargs.get('plot_kwargs', {})
    highRes = parse_arg_type(plot_kwargs.get('highRes', False), bool)
    if highRes:
        img_ext = '.pdf'
    else:
        img_ext = '.png'

    # initialize directories
    MainDataDir = set_path(os.path.join(script_path, '..', 'data'))

    input_directory = set_up_kwargs.get('input_directory')
    if ',' in input_directory:
        input_directory = os.path.join(*input_directory.rsplit(','))
    input_directory = os.path.join(MainDataDir, input_directory)
    output_directory = set_up_kwargs.get('output_directory')
    if output_directory is None:
        output_directory = set_directory(
            os.path.join(input_directory, reportName))
    else:
        if ',' in output_directory:
            output_directory = os.path.join(*output_directory.rsplit(','))
        output_directory = set_directory(
            os.path.join(MainDataDir, output_directory, reportName))

    # save the set_up_kwargs in the output dir for reproducibility
    fname = 'set_up_kwargs.json'
    f = os.path.join(output_directory, fname)
    if toPrint:
        logger.info('-save set_up_kwargs dictionary for reproducibility in: ' +
                    f)
    with open(f, 'w') as fp:
        json.dump(set_up_kwargs, fp, indent=4)

    data_directory = set_up_kwargs.get('data_directory', None)
    if data_directory is None:
        data_directory = input_directory
    else:
        if ',' in data_directory:
            data_directory = os.path.join(*data_directory.rsplit(','))
    data_files = set_up_kwargs.get('data_files', '')
    data_files_list = data_files.rsplit(',')
    # NOTE(review): str.rsplit never returns an empty list (''.rsplit(',')
    # is ['']), so the else branch below is unreachable and an empty
    # 'data_files' yields a one-element list holding the directory path.
    # Left unchanged because the loaders' expectations are not visible
    # here - confirm before changing.
    if len(data_files_list) > 0:
        fpaths = [
            os.path.join(input_directory, data_directory, aFile)
            for aFile in data_files_list
        ]
    else:
        fpaths = os.path.join(input_directory, data_directory)

    # load info table of samples
    if toPrint:
        logger.info('Load info table of samples')
    fpath = os.path.join(input_directory, sample_info_fname)
    info_table = load_clinical(fpath, **sample_info_read_csv_kwargs)

    if toPrint:
        logger.info('Missing values for each column:\n')
        info_table_isna_sum = info_table.isna().sum()
        for _i in range(info_table_isna_sum.shape[0]):
            logger.info(
                str(info_table_isna_sum.index[_i]) + '\t' +
                str(info_table_isna_sum.iloc[_i]))

    #########################################
    if 'genepanel' in editWith:
        edit_kwargs = set_up_kwargs.get('edit_kwargs', {})

        # load data table: only the first file is used, so complain only
        # when more than one file was actually given
        if len(fpaths) > 1:
            logger.error('more than one data files were given!\n' +
                         str(fpaths))
        variants = pd.read_csv(fpaths[0], sep='\t', header=0)

        # EDIT:
        # map function impact to value with function_dict
        # substitute allele frequencies with impact values
        # aggregate rows to unique genes, choose how to merge
        # remove patients
        data = edit_genepanel(variants, **edit_kwargs)
        xlabels, xpos = None, None

    else:
        load_files = parse_arg_type(set_up_kwargs.get('load_files', False),
                                    bool)
        # load data/files from each patient
        if load_files:
            if toPrint:
                logger.info(txt_label + ': load files from all patients\n')

            pat_data_list, pat_data_or_dict, dropped_rows_filter, \
                dropped_rows_process, dropped_rows_edit, info_table = \
                load_and_process_files(
                    fpaths, info_table, **set_up_kwargs)
        else:
            if toPrint:
                logger.info(txt_label + ': load data from all patients\n')

            pat_data_list, pat_data_or_dict, dropped_rows_filter, \
                dropped_rows_process, dropped_rows_edit, info_table = \
                load_and_process_summary_file(
                    fpaths, info_table, **set_up_kwargs)

        if (dropped_rows_filter.shape[0] > 0) and (saveReport):
            f_new = 'allsamples__dropped_rows_filter.txt'
            if toPrint:
                logger.info('-save dropped rows from filtering in:\n' + f_new)
            dropped_rows_filter.to_csv(os.path.join(output_directory, f_new),
                                       sep='\t',
                                       header=True,
                                       index=True)

        if (dropped_rows_process.shape[0] > 0) and (saveReport):
            f_new = 'allsamples__dropped_rows_process.txt'
            if toPrint:
                logger.info('-save dropped rows from processing in:\n' + f_new)
            dropped_rows_process.to_csv(os.path.join(output_directory, f_new),
                                        sep='\t',
                                        header=True,
                                        index=True)

        if (dropped_rows_edit.shape[0] > 0) and (saveReport):
            f_new = 'allsamples__dropped_rows_edit.txt'
            if toPrint:
                logger.info('-save dropped rows from editing in:\n' + f_new)
            dropped_rows_edit.to_csv(os.path.join(output_directory, f_new),
                                     sep='\t',
                                     header=True,
                                     index=True)

        ##################################################

        # concat all samples in one table and keep union of all genes,
        # then fill NaNs with zero
        if toPrint:
            logger.info('Concantanate all ' + editWith +
                        ' samples in 2 tables (with position, only values)\n')
        # samples in rows, genes in columns
        table_withPos = pd.concat(pat_data_list,
                                  join='outer',
                                  axis=1,
                                  sort=False).T

        # CLEAN THE data FROM ALL SAMPLES
        # extract the start, end and chrom info from the table
        # and keep only the functions values
        start_table = \
            table_withPos[table_withPos.index.str.contains('start')].copy()
        end_table = table_withPos[table_withPos.index.str.contains(
            'end')].copy()
        chr_table = table_withPos[table_withPos.index.str.contains(
            'chr')].copy()
        data = table_withPos.drop(np.concatenate(
            [start_table.index, end_table.index, chr_table.index], axis=0),
                                  axis=0)
        if toPrint:
            logger.info('Dimensions of data (samples,genes): ' +
                        str(data.shape))
        # keep only the part of the row label before the first ':'
        data.index = [index_name.rsplit(':')[0] for index_name in data.index]
        start_table.index = [
            index_name.rsplit(':')[0] for index_name in start_table.index
        ]
        end_table.index = [
            index_name.rsplit(':')[0] for index_name in end_table.index
        ]
        chr_table.index = [
            index_name.rsplit(':')[0] for index_name in chr_table.index
        ]

        # remove genes that exist in multiple chromosomes across samples
        ll = [
            list(chr_table[col].dropna().unique()) for col in chr_table.columns
        ]
        n = max(map(len, ll))
        uniq_chr_per_gene = pd.DataFrame(
            [[_uniq_chr_per_gene(j, i) for j in ll] for i in range(n)],
            columns=chr_table.columns)
        # a gene with more than one non-null entry has several chromosomes
        genes2drop = uniq_chr_per_gene.columns[(
            ~uniq_chr_per_gene.isnull()).sum() > 1].values
        if toPrint:
            logger.info('Remove ' + str(genes2drop.shape[0]) +
                        ' genes that exist in multiple chromosomes ' +
                        'across samples:\n' + str(genes2drop))

        if (genes2drop.shape[0] > 0):
            start_table.drop(genes2drop, axis=1, inplace=True)
            end_table.drop(genes2drop, axis=1, inplace=True)
            chr_table.drop(genes2drop, axis=1, inplace=True)
            data.drop(genes2drop, axis=1, inplace=True)
            uniq_chr_per_gene.drop(genes2drop, axis=1, inplace=True)
            if toPrint:
                logger.info('Dimensions of data (samples,genes):' +
                            str(data.shape))
        # every remaining gene has a single chromosome, held in the first row
        uniq_chr_per_gene = uniq_chr_per_gene.iloc[0, :].copy()

        # ORDER THE GENES FROM ALL SAMPLES (SLOW?)
        if toPrint:
            logger.info('Create a Dataframe with the genes ' +
                        'and their genomic positions')
        gene_pos = pd.concat([
            start_table.apply(lambda x: pd.to_numeric(
                x, errors='ignore', downcast='integer')).min().astype(int),
            end_table.apply(lambda x: pd.to_numeric(
                x, errors='ignore', downcast='integer')).max().astype(int),
            uniq_chr_per_gene
        ],
                             axis=1,
                             sort=False)
        gene_pos.columns = ['start', 'end', 'chr']
        gene_pos.index.name = gene_id_col
        gene_pos.reset_index(inplace=True)
        gene_pos['chr_gene'] = gene_pos['chr'] + ':' + gene_pos[gene_id_col]
        gene_pos[chr_col] = gene_pos['chr'].str.split('chr', 2).str[1]
        # "<chr>:<start>:<end>" key used for the natural sort below
        gene_pos['toNatSort'] = [
            ':'.join([
                str(gene_pos[chr_col][row]),
                str(gene_pos['start'][row]),
                str(gene_pos['end'][row])
            ]) for row in range(gene_pos.shape[0])
        ]
        if toPrint:
            logger.info('Dataframes agree (?): ' +
                        str(gene_pos.shape[0] == data.shape[1]))

        # are the genes duplicated ?
        dupl_genes = gene_pos[gene_id_col].duplicated()
        if dupl_genes.any():
            logger.error('genes are duplicated, check your data first!')
            logger.info('duplicated genes:' +
                        str(gene_pos[gene_id_col][dupl_genes].values))
            raise ValueError('genes are duplicated, check your data first!')
        else:
            if toPrint:
                logger.info('gene names are unique, continue..')

        if toPrint:
            logger.info('Order genes according to genomic position')
        gene_order = index_natsorted(gene_pos['toNatSort'])
        gene_pos = gene_pos.iloc[gene_order, :].copy()
        gene_pos.reset_index(drop=True, inplace=True)
        gene_pos.index.name = 'order'
        gene_pos.reset_index(inplace=True)

        #########################################
        # CREATE dictionary of gene names and their order
        gene_order_dict = dict(
            (gene_pos[gene_id_col][i], int(gene_pos['order'][i]))
            for i in range(gene_pos.shape[0]))

        xlabels, xpos = get_chr_ticks(gene_pos,
                                      data,
                                      id_col=gene_id_col,
                                      chr_col=chr_col)

        # SAVE ordered table and gene pos info table
        if saveReport:
            fname = 'genes_info.csv'
            f = os.path.join(output_directory, fname)
            if toPrint:
                logger.info('-save genes info: ' + f)
            gene_pos.to_csv(f, sep='\t', header=True, index=True)

            fname = 'gene_order_dict.json'
            f = os.path.join(output_directory, fname)
            if toPrint:
                logger.info('-save genes order dictionary: ' + f)
            with open(f, 'w') as fp:
                json.dump(gene_order_dict, fp, indent=4)
        #########################################
        for label in [
                'rows_in_sample', 'rows_in_sample_filt',
                'rows_in_sample_processed', 'rows_in_sample_editted'
        ]:
            if label in info_table.columns:
                # PLOT Abundance of gene data per sample
                if toPrint:
                    logger.info('Plot ' + label + ' for each sample')
                mutCount = info_table[[label]].copy()
                patient_new_order = info_table.loc[mutCount.index]
                if sample_info_table_sortLabels_list:
                    patient_new_order = patient_new_order.sort_values(
                        by=sample_info_table_sortLabels_list)
                    xticklabels = list(
                        zip(
                            patient_new_order.index.values, info_table.loc[
                                patient_new_order.index,
                                sample_info_table_sortLabels_list].values))
                else:
                    # no sort labels given: keep the info table order and
                    # label the bars with the sample ids only
                    xticklabels = list(patient_new_order.index.values)
                mutCount = mutCount.loc[patient_new_order.index]
                # rank of each sample's count, used to pick its bar colour
                rank = mutCount[label].argsort().argsort().values
                pal = sns.cubehelix_palette(mutCount.shape[0],
                                            reverse=True,
                                            dark=.40,
                                            light=.95)
                plt.figure(figsize=(10, 5))
                g = sns.barplot(np.arange(mutCount.shape[0]),
                                mutCount[label],
                                palette=np.array(pal[::-1])[rank])
                g.set_xticklabels(xticklabels, rotation=90)
                g.set(xlabel='samples', ylabel='count')
                g.set_title('Abundance of ' + label + ' per sample: ' +
                            str((mutCount[label] <= 0).sum()) +
                            ' empty samples')
                if saveReport:
                    logger.info('Save figure')
                    plt.savefig(os.path.join(output_directory,
                                             'Fig_samples_' + label + img_ext),
                                transparent=True,
                                bbox_inches='tight',
                                pad_inches=0.1,
                                frameon=False)
                    plt.close("all")
                else:
                    plt.show()

    #  -- END IF -- #
    #########################################
    # SAVE data and sample_info
    if saveReport:
        # save files
        fname = 'data.csv'
        f = os.path.join(output_directory, fname)
        if toPrint:
            logger.info('-save ordered data: ' + f)
        data.to_csv(f, sep='\t', header=True, index=True)

        fname = 'sample_info.csv'
        f = os.path.join(output_directory, fname)
        if toPrint:
            logger.info('-save sample_info: ' + f)
        info_table.to_csv(f, sep='\t', header=True, index=True)
def remove_duplicate_genes(**set_up_kwargs):
    """Remove duplicated genes from a data table and plot both versions.

    The processed data table and the sample info table are loaded, a
    subset of samples is chosen with ``choose_samples``, all-zero gene
    columns are dropped and ``remove_andSave_duplicates`` is applied.
    For both the selected data and the de-duplicated data, distribution
    plots (only when ``select_samples_which`` is None), a heatmap and a
    pairwise sample correlation heatmap are drawn; they are saved when
    ``saveReport`` is true and shown otherwise.

    Keyword Args:
        select_samples_from, select_samples_which, select_samples_sort_by,
        select_samples_title: sample selection settings forwarded to
            ``choose_samples``; ``select_samples_sort_by`` is a comma
            separated list of sample info columns.
        saveReport: if true, save figures/outputs (default False).
        toPrint: if true, log extra progress messages (default False).
        reportName: output sub-directory name (default ``script_fname``).
        txt_label: label used in plot titles.
        input_fname: data table file name (default
            ``'data_processed.csv'``).
        gene_info_fname: gene position table file name (required).
        chr_col, gene_id_col: column names passed to ``get_chr_ticks``.
        sample_info_fname: sample info file name (required); comma
            separated parts are joined as a path.
        sample_info_read_csv_kwargs: forwarded to ``load_clinical``.
        plot_kwargs: dict with ``function_dict``, ``cmap_custom``,
            ``vmin``, ``vmax``, ``mincol``, ``midcol``, ``maxcol`` and
            ``highRes``.
        input_directory, sample_info_directory: required; relative to
            the main data directory.
        gene_info_directory: optional; defaults to the input directory.
        output_directory: optional; defaults to
            ``<input_directory>/<reportName>``.
        compute_pdist: force the computation of pairwise distances; it is
            also forced when the distances file is not found.
    """
    # chose sample set from data
    select_samples_from = set_up_kwargs.get('select_samples_from', None)
    select_samples_which = parse_arg_type(
        set_up_kwargs.get('select_samples_which', None),
        int
    )
    select_samples_sort_by = set_up_kwargs.get('select_samples_sort_by',
                                               None)
    if select_samples_sort_by is not None:
        select_samples_sort_by = select_samples_sort_by.rsplit(',')
    select_samples_title = set_up_kwargs.get('select_samples_title',
                                             'select_all')

    # initialize script params
    saveReport = parse_arg_type(
        set_up_kwargs.get('saveReport', False),
        bool
    )
    toPrint = parse_arg_type(
        set_up_kwargs.get('toPrint', False),
        bool
    )
    reportName = set_up_kwargs.get('reportName', script_fname)
    txt_label = set_up_kwargs.get('txt_label', 'test_txt_label')
    input_fname = set_up_kwargs.get('input_fname',
                                    'data_processed.csv')
    gene_info_fname = set_up_kwargs.get('gene_info_fname',
                                        None)
    chr_col = set_up_kwargs.get('chr_col', 'chr_int')
    gene_id_col = set_up_kwargs.get('gene_id_col', 'gene')
    sample_info_fname = set_up_kwargs.get('sample_info_fname',
                                          None)
    if ',' in sample_info_fname:
        sample_info_fname = os.path.join(*sample_info_fname.rsplit(','))
    sample_info_read_csv_kwargs = set_up_kwargs.get(
        'sample_info_read_csv_kwargs', {})

    # plotting params
    plot_kwargs = set_up_kwargs.get('plot_kwargs', {})
    function_dict = plot_kwargs.get('function_dict', None)
    cmap_custom = plot_kwargs.get('cmap_custom', None)
    vmin = parse_arg_type(
        plot_kwargs.get('vmin', None),
        int
    )
    vmax = parse_arg_type(
        plot_kwargs.get('vmax', None),
        int
    )
    # Always bind the name: it is passed to set_cbar_ticks below even when
    # a colormap was supplied or vmin/vmax are missing.
    custom_div_cmap_arg = None
    if (cmap_custom is None) and (vmin is not None) and (vmax is not None):
        custom_div_cmap_arg = abs(vmin)+abs(vmax)
        if (vmin <= 0) and (vmax >= 0):
            # one extra colour for the zero value
            custom_div_cmap_arg = custom_div_cmap_arg + 1
        mincol = plot_kwargs.get('mincol', None)
        midcol = plot_kwargs.get('midcol', None)
        maxcol = plot_kwargs.get('maxcol', None)
        if (
                (mincol is not None) and
                (midcol is not None) and
                (maxcol is not None)
                ):
            cmap_custom = custom_div_cmap(
                numcolors=custom_div_cmap_arg,
                mincol=mincol, midcol=midcol, maxcol=maxcol)
        else:
            cmap_custom = custom_div_cmap(numcolors=custom_div_cmap_arg)
    highRes = parse_arg_type(
        plot_kwargs.get('highRes', False),
        bool
    )
    if highRes:
        img_ext = '.pdf'
    else:
        img_ext = '.png'
    # initialize directories
    MainDataDir = set_path(os.path.join(script_path, '..', 'data'))

    # data input
    input_directory = set_up_kwargs.get('input_directory')
    if ',' in input_directory:
        input_directory = os.path.join(*input_directory.rsplit(','))
    input_directory = os.path.join(MainDataDir, input_directory)

    # sample info input
    sample_info_directory = set_up_kwargs.get('sample_info_directory')
    if ',' in sample_info_directory:
        sample_info_directory = os.path.join(
            *sample_info_directory.rsplit(','))
    sample_info_directory = os.path.join(MainDataDir, sample_info_directory)

    # gene info input
    gene_info_directory = set_up_kwargs.get('gene_info_directory')
    if gene_info_directory is None:
        gene_info_directory = input_directory
    else:
        # NOTE(review): unlike the other directories, MainDataDir is only
        # prepended when the value contains a comma - confirm whether a
        # single-part value is meant to be used as given.
        if ',' in gene_info_directory:
            gene_info_directory = os.path.join(
                *gene_info_directory.rsplit(','))
            gene_info_directory = os.path.join(
                MainDataDir, gene_info_directory)

    # data output
    output_directory = set_up_kwargs.get('output_directory')
    if output_directory is None:
        output_directory = set_directory(
            os.path.join(input_directory, reportName)
        )
    else:
        if ',' in output_directory:
            output_directory = os.path.join(*output_directory.rsplit(','))
        output_directory = set_directory(
            os.path.join(MainDataDir, output_directory, reportName)
        )
    # save the set_up_kwargs in the output dir for reproducibility
    fname = 'set_up_kwargs.json'
    f = os.path.join(output_directory, fname)
    if toPrint:
        logger.info(
            '-save set_up_kwargs dictionary for reproducibility in: '+f)
    with open(f, 'w') as fp:
        json.dump(set_up_kwargs, fp, indent=4)

    # pairwise distances params
    compute_pdist = parse_arg_type(
        set_up_kwargs.get('compute_pdist', False),
        bool
    )
    pdist_fname = 'data_'+select_samples_title+'__genes_pdist.h5'
    pdist_fpath = os.path.join(input_directory, pdist_fname)
    if not os.path.exists(pdist_fpath):
        # nothing stored yet, so the distances have to be computed
        compute_pdist = True

    # load info table of samples
    if toPrint:
        logger.info('Load info table of samples')
    fpath = os.path.join(sample_info_directory, sample_info_fname)
    info_table = load_clinical(fpath,  **sample_info_read_csv_kwargs)

    # load input_data
    fpath = os.path.join(input_directory, input_fname)
    input_data = pd.read_csv(fpath, sep='\t', header=0, index_col=0)
    empty_pat = input_data.sum(axis=1).isnull()
    if empty_pat.any():
        logger.info('Patients with missing values in all genes: ' +
                    str(input_data.index[empty_pat]))
    input_data = input_data.fillna(0)

    # keep only info_table with input_data
    ids_tmp = set(info_table.index.values
                  ).intersection(set(input_data.index.values))
    # .loc does not accept a set as indexer in recent pandas versions
    info_table = info_table.loc[list(ids_tmp)].copy()

    # load gene info
    fpath = os.path.join(gene_info_directory, gene_info_fname)
    genes_positions_table = pd.read_csv(fpath, sep='\t', header=0,
                                        index_col=0)
    # get gene chrom position
    xlabels, xpos = get_chr_ticks(genes_positions_table, input_data,
                                  id_col=gene_id_col, chr_col=chr_col)

    logger.info('select_samples_from: '+str(select_samples_from) +
                ', select_samples_which: '+str(select_samples_which) +
                ', select_samples_sort_by: '+str(select_samples_sort_by) +
                ', select_samples_title: '+str(select_samples_title))

    # keep only info_table with data (in the row order of the data);
    # the index name is restored because it is needed by choose_samples
    temp = info_table.index.name
    info_table = info_table.loc[input_data.index].copy()
    info_table.index.name = temp
    ids_tmp = choose_samples(info_table.reset_index(),
                             info_table.index.name,
                             choose_from=select_samples_from,
                             choose_what=select_samples_which,
                             sortby=select_samples_sort_by,
                             ascending=False)
    # keep a subpart of the info_table (rows and columns)
    info_table = info_table.loc[ids_tmp, select_samples_sort_by].copy()
    # keep only these samples from the data
    data = input_data.loc[ids_tmp, :].copy()
    try:
        # prefer integer labels when every selected column can be cast
        pat_labels_txt = info_table.astype(int).reset_index().values
    except (ValueError, TypeError, OverflowError):
        pat_labels_txt = info_table.reset_index().values
    pat_labels_title = str(info_table.reset_index().columns.values)

    # remove all zero columns!
    orphancols = np.where(abs(data).sum(axis=0) == 0)[0]
    if len(orphancols) > 0:
        logger.warning(
            'removing '+str(len(orphancols)) +
            ' genes from data with zero columns!')
        cols2drop = data.columns.values[orphancols]
        data = data.drop(cols2drop, axis=1).copy()

    # REMOVE DUPLICATES!!!!
    uniqdata, dupldict, _, _ = remove_andSave_duplicates(
        data, to_compute_euclidean_distances=compute_pdist,
        to_save_euclidean_distances=saveReport, to_save_output=saveReport,
        output_filename=input_fname.rsplit('.')[0]+'__'+select_samples_title,
        output_directory=output_directory)

    # get gene chrom position
    xlabels_uniq, xpos_uniq = get_chr_ticks(
        genes_positions_table, uniqdata,
        id_col=gene_id_col, chr_col=chr_col)

    # per-table settings: index 0 is the selected data, 1 the unique genes
    fext = ['', '_uniq']
    title_ext = ['', ' (uniq genes)']
    xlabels_choose = [xlabels, xlabels_uniq]
    xpos_choose = [xpos, xpos_uniq]
    for i_data, choose_data in enumerate([data, uniqdata]):
        if select_samples_which is None:
            # distplot DO NOT break Y-axis
            logger.info('Plotting distplot..')
            sns.distplot(choose_data.values.flatten(),
                         hist=True, kde=False, color='b')
            plt.title("Copy number abundance in "+txt_label +
                      title_ext[i_data])
            if saveReport:
                logger.info('Save distplot')
                plt.savefig(os.path.join(
                    output_directory, 'Fig_distplot_' +
                    select_samples_title+fext[i_data]+img_ext),
                    transparent=True, bbox_inches='tight',
                    pad_inches=0.1, frameon=False)
                plt.close("all")
            else:
                plt.show()

            # distplot break Y-axis
            logger.info('Plotting break Y-axis distplot..')
            _, uniq_count = np.unique(choose_data.values.flatten(),
                                      return_counts=True)
            sorted_count = np.sort(uniq_count)
            # with a single distinct value there is no second largest
            # count, fall back to the largest one
            if sorted_count.shape[0] > 1:
                second_count = sorted_count[-2]
            else:
                second_count = sorted_count[-1]
            # the lower panel ends above the second most frequent value,
            # the upper panel above the most frequent one
            ymax_bottom = int(math.ceil(
                second_count / 1000.0)
                ) * 1000
            ymax_top = int(math.ceil(
                sorted_count[-1] / 10000.0)
                ) * 10000
            distplot_breakYaxis(choose_data.values, ymax_bottom,
                                ymax_top, color='b', d=0.005,
                                pad=1.5, figsize=(10, 6),
                                mytitle='Copy number abundance in '+txt_label +
                                        ' with cropped y axis' +
                                        title_ext[i_data])
            if saveReport:
                logger.info('Save distplot')
                plt.savefig(os.path.join(
                    output_directory, 'Fig_distplot_breakYaxis_' +
                    select_samples_title+fext[i_data]+img_ext),
                    transparent=True, bbox_inches='tight',
                    pad_inches=0.1, frameon=False)
                plt.close("all")
            else:
                plt.show()

        # Plot heatmap
        # NOTE(review): the figure size is derived from `data` for both
        # tables, also when `uniqdata` is plotted - confirm this is wanted.
        _figure_x_size, _figure_y_size, _show_gene_names, _ = \
            set_heatmap_size(data)
        plt.figure(figsize=(_figure_x_size, _figure_y_size))
        ax = sns.heatmap(choose_data, vmin=vmin, vmax=vmax,
                         yticklabels=pat_labels_txt, xticklabels=False,
                         cmap=cmap_custom, cbar=False)
        if (_show_gene_names and (
                (xpos_choose[i_data] is None) or
                (xlabels_choose[i_data] is None))):
            plt.xticks(rotation=90)
        elif (
                (xpos_choose[i_data] is not None) and
                (xlabels_choose[i_data] is not None)):
            plt.xticks(xpos_choose[i_data], xlabels_choose[i_data], rotation=0)
        plt.xlabel('chromosomes (the number is aligned at the end ' +
                   'of the chr region)')
        plt.ylabel('samples '+select_samples_title+'\n'+pat_labels_title)
        cbar = ax.figure.colorbar(ax.collections[0])
        set_cbar_ticks(cbar, function_dict, custom_div_cmap_arg)

        plt.title(
            txt_label+'\nheatmap of ' +
            select_samples_title+' samples')

        if saveReport:
            logger.info('Save heatmap')
            plt.savefig(os.path.join(
                output_directory, 'Fig_heatmap_'+select_samples_title +
                fext[i_data]+img_ext),
                transparent=True, bbox_inches='tight',
                pad_inches=0.1, frameon=False)
            plt.close("all")
        else:
            plt.show()

        # Plot pairwise sample correlations
        # (pdist 'correlation' is a distance, 1 - distance the correlation)
        data_cor = 1-squareform(pdist(choose_data, 'correlation'))
        plt.figure(figsize=(15, 10))
        sns.heatmap(data_cor, vmin=-1, vmax=1, yticklabels=pat_labels_txt,
                    xticklabels=pat_labels_txt, cmap='PiYG', square=True)
        plt.xlabel("samples "+select_samples_title)
        plt.ylabel(pat_labels_title)
        plt.title("Auto-corerelation of "+select_samples_title +
                  " samples - "+txt_label)
        if saveReport:
            logger.info('Save heatmap')
            plt.savefig(os.path.join(
                output_directory, 'Fig_corr_'+select_samples_title +
                fext[i_data]+img_ext),
                transparent=True, bbox_inches='tight',
                pad_inches=0.1, frameon=False)
            plt.close("all")
        else:
            plt.show()