Пример #1
0
def eig_correlate_matrices(hic_data1, hic_data2, nvect=6, normalized=False,
                           savefig=None, show=False, savedata=None,
                           remove_bad_columns=True, **kwargs):
    """
    Compare the interactions of two Hi-C matrices using their first
    eigenvectors, with Pearson correlation

    :param hic_data1: Hi-C-data object
    :param hic_data2: Hi-C-data object
    :param 6 nvect: number of eigenvectors to compare
    :param False normalized: use normalized data
    :param None savefig: path to save the plot
    :param False show: displays the plot
    :param None savedata: path to a file where the correlation matrix is
       written as tab-separated text
    :param True remove_bad_columns: computes the union of bad columns between
       samples and exclude them from the comparison
    :param kwargs: any argument to pass to matplotlib imshow function. The
       key ``get_bads`` is consumed here and not forwarded: when true, the
       bad columns are returned together with the correlations

    :returns: matrix of correlations (list of lists) where ``corr[i][j]`` is
       the absolute Pearson correlation between the i-th eigenvector of
       ``hic_data1`` and the j-th eigenvector of ``hic_data2``, eigenvectors
       being ranked by decreasing eigenvalue. If ``get_bads`` is true, a
       tuple ``(corr, bads)`` is returned instead; ``bads`` is empty when
       ``remove_bad_columns`` is False.
    """
    # 'get_bads' is an option of this function: it must not reach imshow
    get_bads = kwargs.pop('get_bads', False)
    data1 = hic_data1.get_matrix(normalized=normalized)
    data2 = hic_data2.get_matrix(normalized=normalized)
    # always defined, so that get_bads works without column removal
    bads = {}
    ## reduce matrices to remove bad columns
    if remove_bad_columns:
        # union of bad columns
        bads = hic_data1.bads.copy()
        bads.update(hic_data2.bads)
        # remove them from both matrices, highest index first so that the
        # remaining indices stay valid
        for bad in sorted(bads, reverse=True):
            del data1[bad]
            del data2[bad]
            for row in data1:
                row.pop(bad)
            for row in data2:
                row.pop(bad)
    # get the log
    data1 = nozero_log(data1, np.log2)
    data2 = nozero_log(data2, np.log2)
    # get the eigenvectors
    ev1, evect1 = eigh(data1)
    ev2, evect2 = eigh(data2)
    # sort eigenvectors according to their eigenvalues => first is last!!
    # eigenvectors are the COLUMNS of evect, so columns are what is reordered
    sort_perm = ev1.argsort()
    ev1 = ev1[sort_perm]
    evect1 = evect1[:, sort_perm]
    sort_perm = ev2.argsort()
    ev2 = ev2[sort_perm]
    evect2 = evect2[:, sort_perm]
    # calculate Pearson correlation
    corr = [[abs(pearsonr(evect1[:, -i - 1], evect2[:, -j - 1])[0])
             for j in xrange(nvect)] for i in xrange(nvect)]
    # plot (axes are only created when a figure is actually wanted)
    if show or savefig:
        axe    = plt.axes([0.1, 0.1, 0.6, 0.8])
        cbaxes = plt.axes([0.85, 0.1, 0.03, 0.8])
        # imshow puts rows on the y axis: transpose so that experiment 1
        # runs along x and experiment 2 along y, as the labels below state
        im = axe.imshow(np.transpose(corr), interpolation="nearest",
                        origin='lower', **kwargs)
        axe.set_xlabel('Eigen Vectors exp. 1')
        axe.set_ylabel('Eigen Vectors exp. 2')
        axe.set_xticks(range(nvect))
        axe.set_yticks(range(nvect))
        # one label per tick: 1 .. nvect
        axe.set_xticklabels(range(1, nvect + 1))
        axe.set_yticklabels(range(1, nvect + 1))
        axe.xaxis.set_tick_params(length=0, width=0)
        axe.yaxis.set_tick_params(length=0, width=0)

        cbar = plt.colorbar(im, cax=cbaxes)
        cbar.ax.set_ylabel('Pearson correlation', rotation=90*3,
                           verticalalignment='bottom')
        # right-hand axis: eigenvalues of experiment 2, largest first
        axe2 = axe.twinx()
        axe2.set_yticks(range(nvect))
        axe2.set_yticklabels(['%.1f' % (e) for e in ev2[-nvect:][::-1]])
        axe2.set_ylabel('corresponding Eigen Values exp. 2', rotation=90*3,
                        verticalalignment='bottom')
        axe2.set_ylim((-0.5, nvect - 0.5))
        axe2.yaxis.set_tick_params(length=0, width=0)

        # top axis: eigenvalues of experiment 1, largest first
        axe3 = axe.twiny()
        axe3.set_xticks(range(nvect))
        axe3.set_xticklabels(['%.1f' % (e) for e in ev1[-nvect:][::-1]])
        axe3.set_xlabel('corresponding Eigen Values exp. 1')
        axe3.set_xlim((-0.5, nvect - 0.5))
        axe3.xaxis.set_tick_params(length=0, width=0)

        axe.set_ylim((-0.5, nvect - 0.5))
        axe.set_xlim((-0.5, nvect - 0.5))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        plt.close('all')

    if savedata:
        # context manager: the file is closed even if a write fails
        with open(savedata, 'w') as out:
            out.write('# ' + '\t'.join(['Eigen Vector %s'% i
                                        for i in xrange(nvect)]) + '\n')
            for i in xrange(nvect):
                out.write('\t'.join([str(corr[i][j])
                                     for j in xrange(nvect)]) + '\n')
    if get_bads:
        return corr, bads
    return corr
Пример #2
0
def draw_map(data, genome_seq, cumcs, savefig, show, one=False, clim=None,
             cmap='jet', decay=False, perc=10, name=None, cistrans=None,
             decay_resolution=10000, normalized=False, max_diff=None):
    """
    Draw a summary figure of an interaction matrix: the log2 heat-map, an
    histogram of the log2 values with a fitted normal density, the three
    last eigenvectors returned by eigh (E1, E2, E3) and, optionally, the
    decay of interactions with distance.

    :param data: matrix of interactions (sequence of rows); may not be square
    :param genome_seq: iterable of chromosome names, used as keys of *cumcs*
       (also passed to plot_distance_vs_interactions). If empty/None no
       chromosome limits are drawn
    :param cumcs: mapping from chromosome name to a pair (begin, end) used to
       place the chromosome limits on the heat-map
    :param savefig: path where to save the figure; if set the figure is not
       shown
    :param show: display the figure (only when *savefig* is not set)
    :param False one: if True chromosome names are not written on the y axis
    :param None clim: pair (min, max) of color limits for the heat-map;
       not used as color limits with the 'tadbit' color map
    :param 'jet' cmap: name of a matplotlib color map, or 'tadbit' to build a
       color map based on the percentiles of the data
    :param False decay: also plot interactions versus distance
    :param 10 perc: number of cuts used to build the 'tadbit' color map
    :param None name: text written on top of the statistics
    :param None cistrans: proportion of cis interactions (written as a
       percentage); nothing is written if None or NaN
    :param 10000 decay_resolution: passed to plot_distance_vs_interactions
    :param False normalized: passed to plot_distance_vs_interactions
    :param None max_diff: passed to plot_distance_vs_interactions, defaults
       to the number of rows of *data*
    """
    plt.figure(figsize=(15.,12.5))
    if not max_diff:
        max_diff = len(data)
    ax1 = plt.axes([0.34, 0.08, 0.6, 0.7205])
    ax2 = plt.axes([0.07, 0.65, 0.21, 0.15])
    if decay:
        ax3 = plt.axes([0.07, 0.42, 0.21, 0.15])
        plot_distance_vs_interactions(data, genome_seq=genome_seq, axe=ax3,
                                      resolution=decay_resolution,
                                      max_diff=max_diff, normalized=normalized)
    ax4 = plt.axes([0.34, 0.805, 0.6, 0.04], sharex=ax1)
    ax5 = plt.axes([0.34, 0.845, 0.6, 0.04], sharex=ax1)
    ax6 = plt.axes([0.34, 0.885, 0.6, 0.04], sharex=ax1)
    # statistics on the original (not log-transformed) data
    try:
        minoridata   = np.nanmin(data)
        maxoridata   = np.nanmax(data)
    except AttributeError:
        vals = [i for d in data for i in d if not np.isnan(i)]
        minoridata   = np.min(vals)
        maxoridata   = np.max(vals)
    totaloridata = np.nansum([data[i][j] for i in xrange(len(data))
                              for j in xrange(i, len(data[i]))]) # may not be square
    data = nozero_log(data, np.log2)
    vals = np.array([i for d in data for i in d])
    vals = vals[np.isfinite(vals)]

    mindata = np.nanmin(vals)
    maxdata = np.nanmax(vals)
    diff = maxdata - mindata
    posI = 0.01 if not clim else (float(clim[0]) / diff) if clim[0] is not None else 0.01
    posF = 1.0  if not clim else (float(clim[1]) / diff) if clim[1] is not None else 1.0
    if cmap == 'tadbit':
        cuts = perc
        cdict = {'red'  : [(0.0,  0.0, 0.0)],
                 'green': [(0.0,  0.0, 0.0)],
                 'blue' : [(0.0,  0.5, 0.5)]}
        prev_pos  = 0
        median = (np.median(vals) - mindata) / diff
        # linspace needs an integer number of samples: floor division
        # lower half of the values
        for prc in np.linspace(posI, median, cuts // 2, endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.) - mindata) / diff
                prc = ((prc - posI) / (median - posI)) + 1. / cuts
            except ValueError:
                pos = prc = 0
            # color map positions have to be strictly increasing
            if prev_pos >= pos:
                continue
            cdict['red'  ].append([pos, prc, prc])
            cdict['green'].append([pos, prc, prc])
            cdict['blue' ].append([pos, 1, 1])
            prev_pos  = pos
        # upper half of the values
        for prc in np.linspace(median + 1. / cuts, posF, cuts // 2, endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.) - mindata) / diff
                prc = ((prc - median) / (posF - median))
            except ValueError:
                pos = prc = 0
            if prev_pos >= pos:
                continue
            cdict['red'  ].append([pos, 1.0, 1.0])
            cdict['green'].append([pos, 1 - prc, 1 - prc])
            cdict['blue' ].append([pos, 1 - prc, 1 - prc])
            prev_pos  = pos
        pos = (np.percentile(vals ,97.) - mindata) / diff
        cdict['red'  ].append([pos, 0.1, 0.1])
        cdict['green'].append([pos, 0, 0])
        cdict['blue' ].append([pos, 0, 0])

        cdict['red'  ].append([1.0, 1, 1])
        cdict['green'].append([1.0, 1, 1])
        cdict['blue' ].append([1.0, 0, 0])
        cmap  = LinearSegmentedColormap(cmap, cdict)
        # limits are already encoded in the color map
        clim = None
    else:
        cmap = plt.get_cmap(cmap)
    cmap.set_bad('darkgrey', 1)

    ax1.imshow(data, interpolation='none',
               cmap=cmap, vmin=clim[0] if clim else None, vmax=clim[1] if clim else None)
    size1 = len(data)     # number of rows
    size2 = len(data[0])  # number of columns
    # NaNs are replaced by zeros (after drawing) for the eigen decomposition
    if size1 == size2:
        for i in xrange(size1):
            for j in xrange(i, size2):
                if np.isnan(data[i][j]):
                    data[i][j] = 0
                    data[j][i] = 0
    else:
        for i in xrange(size1):
            for j in xrange(size2):
                if np.isnan(data[i][j]):
                    data[i][j] = 0
    try:
        evals, evect = eigh(data)
        sort_perm = evals.argsort()
        # eigenvectors are the columns of evect: reorder columns
        evect = evect[:, sort_perm]
    except Exception:
        # best effort: the eigenvector tracks are simply left empty (e.g.
        # non-square matrix); KeyboardInterrupt/SystemExit are not swallowed
        evals, evect = None, None
    data = [i for d in data for i in d if not np.isnan(i)]
    gradient = np.linspace(np.nanmin(data),
                           np.nanmax(data), max(size1, size2))
    gradient = np.vstack((gradient, gradient))
    h  = ax2.hist(data, color='darkgrey', linewidth=2,
                  bins=20, histtype='step', normed=True)
    _  = ax2.imshow(gradient, aspect='auto', cmap=cmap,
                    extent=(np.nanmin(data), np.nanmax(data) , 0, max(h[0])))
    if genome_seq:
        # chromosome limits: white solid line under a black dashed line
        for crm in genome_seq:
            ax1.vlines([cumcs[crm][0]-.5, cumcs[crm][1]-.5], cumcs[crm][0]-.5, cumcs[crm][1]-.5,
                       color='w', linestyle='-', linewidth=1, alpha=1)
            ax1.hlines([cumcs[crm][1]-.5, cumcs[crm][0]-.5], cumcs[crm][0]-.5, cumcs[crm][1]-.5,
                       color='w', linestyle='-', linewidth=1, alpha=1)
            ax1.vlines([cumcs[crm][0]-.5, cumcs[crm][1]-.5], cumcs[crm][0]-.5, cumcs[crm][1]-.5,
                       color='k', linestyle='--')
            ax1.hlines([cumcs[crm][1]-.5, cumcs[crm][0]-.5], cumcs[crm][0]-.5, cumcs[crm][1]-.5,
                       color='k', linestyle='--')
        if not one:
            # major ticks at chromosome limits (no label), minor ticks in the
            # middle of each interval carrying the chromosome names
            vals = [0]
            keys = ['']
            for crm in genome_seq:
                vals.append(cumcs[crm][0])
                keys.append(crm)
            vals.append(cumcs[crm][1])
            ax1.set_yticks(vals)
            ax1.set_yticklabels('')
            ax1.set_yticks([float(vals[i]+vals[i+1])/2
                            for i in xrange(len(vals) - 1)], minor=True)
            ax1.set_yticklabels(keys, minor=True)
            for t in ax1.yaxis.get_minor_ticks():
                t.tick1On = False
                t.tick2On = False
    plt.figtext(0.05,0.25, ''.join([
        (name + '\n') if name else '',
        'Number of interactions: %s\n' % str(totaloridata),
        # cistrans defaults to None, which np.isnan can not handle
        ('' if cistrans is None or np.isnan(cistrans) else
         ('Percentage of cis interactions: %.0f%%\n' % (cistrans*100))),
        'Min interactions: %s\n' % (minoridata),
        'Max interactions: %s\n' % (maxoridata)]))
    ax2.set_xlim((np.nanmin(data), np.nanmax(data)))
    ax2.set_ylim((0, max(h[0])))
    # imshow puts columns on the x axis and rows on the y axis
    ax1.set_xlim ((-0.5, size2 - .5))
    ax1.set_ylim ((-0.5, size1 - .5))
    ax2.set_xlabel('log interaction count')
    # we reduce the number of dots displayed.... we just want to see the shape
    subdata = np.array(list(set([float(int(d*100))/100 for d in data])))
    try:
        normfit = sc_norm.pdf(subdata, np.nanmean(data), np.nanstd(data))
    except AttributeError:
        normfit = sc_norm.pdf(subdata, np.mean(data), np.std(data))
    ax2.plot(subdata, normfit, 'w.', markersize=2.5, alpha=.4)
    ax2.plot(subdata, normfit, 'k.', markersize=1.5, alpha=1)
    ax2.set_title('skew: %.3f, kurtosis: %.3f' % (skew(data),
                                                   kurtosis(data)))
    # eigenvector tracks; skipped when evect is None or has too few columns
    try:
        ax4.vlines(range(size1), 0, evect[:,-1], color='k')
    except (TypeError, IndexError):
        pass
    ax4.hlines(0, 0, size2, color='red')
    ax4.set_ylabel('E1')
    ax4.set_yticklabels([])
    try:
        ax5.vlines(range(size1), 0, evect[:,-2], color='k')
    except (TypeError, IndexError):
        pass
    ax5.hlines(0, 0, size2, color='red')
    ax5.set_ylabel('E2')
    ax5.set_yticklabels([])
    try:
        ax6.vlines(range(size1), 0, evect[:,-3], color='k')
    except (TypeError, IndexError):
        pass
    ax6.hlines(0, 0, size2, color='red')
    ax6.set_ylabel('E3')
    ax6.set_yticklabels([])
    xticklabels = ax4.get_xticklabels() + ax5.get_xticklabels() + ax6.get_xticklabels()
    plt.setp(xticklabels, visible=False)
    if savefig:
        tadbit_savefig(savefig)
    elif show:
        plt.show()
    plt.close('all')
Пример #3
0
def eig_correlate_matrices(hic_data1, hic_data2, nvect=6,
                           savefig=None, show=False, savedata=None, **kwargs):
    """
    Compare the interactions of two Hi-C matrices using their first
    eigenvectors, with Pearson correlation

    :param hic_data1: Hi-C-data object
    :param hic_data2: Hi-C-data object
    :param 6 nvect: number of eigenvectors to compare
    :param None savefig: path to save the plot
    :param False show: displays the plot
    :param None savedata: path to a file where the correlation matrix is
       written as tab-separated text
    :param kwargs: any argument to pass to matplotlib imshow function

    :returns: matrix of correlations (list of lists) where ``corr[i][j]`` is
       the absolute Pearson correlation between the i-th eigenvector of
       ``hic_data1`` and the j-th eigenvector of ``hic_data2``, eigenvectors
       being ranked by decreasing eigenvalue
    """
    # get the log
    data1 = nozero_log(hic_data1.get_matrix(), np.log2)
    data2 = nozero_log(hic_data2.get_matrix(), np.log2)
    # get the eigenvectors
    ev1, evect1 = eigh(data1)
    ev2, evect2 = eigh(data2)
    # sort eigenvectors according to their eigenvalues => first is last!!
    # eigenvectors are the COLUMNS of evect, so columns are what is reordered
    sort_perm = ev1.argsort()
    ev1 = ev1[sort_perm]
    evect1 = evect1[:, sort_perm]
    sort_perm = ev2.argsort()
    ev2 = ev2[sort_perm]
    evect2 = evect2[:, sort_perm]
    # calculate Pearson correlation
    corr = [[abs(pearsonr(evect1[:, -i - 1], evect2[:, -j - 1])[0])
             for j in xrange(nvect)] for i in xrange(nvect)]
    # plot (axes are only created when a figure is actually wanted)
    if show or savefig:
        axe    = plt.axes([0.1, 0.1, 0.6, 0.8])
        cbaxes = plt.axes([0.85, 0.1, 0.03, 0.8])
        # imshow puts rows on the y axis: transpose so that experiment 1
        # runs along x and experiment 2 along y, as the labels below state
        im = axe.imshow(np.transpose(corr), interpolation="nearest",
                        origin='lower', **kwargs)
        axe.set_xlabel('Eigen Vectors exp. 1')
        axe.set_ylabel('Eigen Vectors exp. 2')
        axe.set_xticks(range(nvect))
        axe.set_yticks(range(nvect))
        # one label per tick: 1 .. nvect
        axe.set_xticklabels(range(1, nvect + 1))
        axe.set_yticklabels(range(1, nvect + 1))
        axe.xaxis.set_tick_params(length=0, width=0)
        axe.yaxis.set_tick_params(length=0, width=0)

        cbar = plt.colorbar(im, cax=cbaxes)
        cbar.ax.set_ylabel('Pearson correlation', rotation=90*3,
                           verticalalignment='bottom')
        # right-hand axis: eigenvalues of experiment 2, largest first
        axe2 = axe.twinx()
        axe2.set_yticks(range(nvect))
        axe2.set_yticklabels(['%.1f' % (e) for e in ev2[-nvect:][::-1]])
        axe2.set_ylabel('corresponding Eigen Values exp. 2', rotation=90*3,
                        verticalalignment='bottom')
        axe2.set_ylim((-0.5, nvect - 0.5))
        axe2.yaxis.set_tick_params(length=0, width=0)

        # top axis: eigenvalues of experiment 1, largest first
        axe3 = axe.twiny()
        axe3.set_xticks(range(nvect))
        axe3.set_xticklabels(['%.1f' % (e) for e in ev1[-nvect:][::-1]])
        axe3.set_xlabel('corresponding Eigen Values exp. 1')
        axe3.set_xlim((-0.5, nvect - 0.5))
        axe3.xaxis.set_tick_params(length=0, width=0)

        axe.set_ylim((-0.5, nvect - 0.5))
        axe.set_xlim((-0.5, nvect - 0.5))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        plt.close('all')

    if savedata:
        # context manager: the file is closed even if a write fails
        with open(savedata, 'w') as out:
            out.write('# ' + '\t'.join(['Eigen Vector %s'% i
                                        for i in xrange(nvect)]) + '\n')
            for i in xrange(nvect):
                out.write('\t'.join([str(corr[i][j])
                                     for j in xrange(nvect)]) + '\n')

    return corr
Пример #4
0
def draw_map(data, genome_seq, cumcs, savefig, show, one=False, clim=None,
             cmap='jet', decay=False, perc=10, name=None, cistrans=None,
             decay_resolution=10000, normalized=False, max_diff=None):
    """
    Draw a summary figure of a square interaction matrix: the log2 heat-map,
    an histogram of the log2 values with a fitted normal density, the three
    last eigenvectors returned by eigh (E1, E2, E3) and, optionally, the
    decay of interactions with distance.

    :param data: square matrix of interactions (sequence of rows)
    :param genome_seq: iterable of chromosome names, used as keys of *cumcs*
       (also passed to plot_distance_vs_interactions). If empty/None no
       chromosome limits are drawn
    :param cumcs: mapping from chromosome name to a pair (begin, end) used to
       place the chromosome limits on the heat-map
    :param savefig: path where to save the figure; if set the figure is not
       shown
    :param show: display the figure (only when *savefig* is not set)
    :param False one: if True chromosome names are not written on the y axis
    :param None clim: pair (min, max) of color limits for the heat-map;
       not used as color limits with the 'tadbit' color map
    :param 'jet' cmap: name of a matplotlib color map, or 'tadbit' to build a
       color map based on the percentiles of the data
    :param False decay: also plot interactions versus distance
    :param 10 perc: number of cuts used to build the 'tadbit' color map
    :param None name: text written on top of the statistics
    :param None cistrans: proportion of cis interactions (written as a
       percentage); nothing is written if None or NaN
    :param 10000 decay_resolution: passed to plot_distance_vs_interactions
    :param False normalized: passed to plot_distance_vs_interactions
    :param None max_diff: passed to plot_distance_vs_interactions, defaults
       to the size of *data*
    """
    plt.figure(figsize=(15.,12.5))
    if not max_diff:
        max_diff = len(data)
    ax1 = plt.axes([0.34, 0.08, 0.6, 0.7205])
    ax2 = plt.axes([0.07, 0.65, 0.21, 0.15])
    if decay:
        ax3 = plt.axes([0.07, 0.42, 0.21, 0.15])
        plot_distance_vs_interactions(data, genome_seq=genome_seq, axe=ax3,
                                      resolution=decay_resolution,
                                      max_diff=max_diff, normalized=normalized)
    ax4 = plt.axes([0.34, 0.805, 0.6, 0.04], sharex=ax1)
    ax5 = plt.axes([0.34, 0.845, 0.6, 0.04], sharex=ax1)
    ax6 = plt.axes([0.34, 0.885, 0.6, 0.04], sharex=ax1)
    # statistics on the original (not log-transformed) data
    try:
        minoridata   = np.nanmin(data)
        maxoridata   = np.nanmax(data)
    except AttributeError:
        vals = [i for d in data for i in d if not np.isnan(i)]
        minoridata   = np.min(vals)
        maxoridata   = np.max(vals)
    # upper triangle only (diagonal included)
    totaloridata = np.nansum([data[i][j] for i in xrange(len(data))
                              for j in xrange(i, len(data))])
    data = nozero_log(data, np.log2)
    vals = np.array([i for d in data for i in d])
    vals = vals[np.isfinite(vals)]

    mindata = np.nanmin(vals)
    maxdata = np.nanmax(vals)
    diff = maxdata - mindata
    posI = 0.01 if not clim else (float(clim[0]) / diff) if clim[0] is not None else 0.01
    posF = 1.0  if not clim else (float(clim[1]) / diff) if clim[1] is not None else 1.0
    if cmap == 'tadbit':
        cuts = perc
        cdict = {'red'  : [(0.0,  0.0, 0.0)],
                 'green': [(0.0,  0.0, 0.0)],
                 'blue' : [(0.0,  0.5, 0.5)]}
        prev_pos  = 0
        median = (np.median(vals) - mindata) / diff
        # linspace needs an integer number of samples: floor division
        # lower half of the values
        for prc in np.linspace(posI, median, cuts // 2, endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.) - mindata) / diff
                prc = ((prc - posI) / (median - posI)) + 1. / cuts
            except ValueError:
                pos = prc = 0
            # color map positions have to be strictly increasing
            if prev_pos >= pos:
                continue
            cdict['red'  ].append([pos, prc, prc])
            cdict['green'].append([pos, prc, prc])
            cdict['blue' ].append([pos, 1, 1])
            prev_pos  = pos
        # upper half of the values
        for prc in np.linspace(median + 1. / cuts, posF, cuts // 2, endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.) - mindata) / diff
                prc = ((prc - median) / (posF - median))
            except ValueError:
                pos = prc = 0
            if prev_pos >= pos:
                continue
            cdict['red'  ].append([pos, 1.0, 1.0])
            cdict['green'].append([pos, 1 - prc, 1 - prc])
            cdict['blue' ].append([pos, 1 - prc, 1 - prc])
            prev_pos  = pos
        pos = (np.percentile(vals ,97.) - mindata) / diff
        cdict['red'  ].append([pos, 0.1, 0.1])
        cdict['green'].append([pos, 0, 0])
        cdict['blue' ].append([pos, 0, 0])

        cdict['red'  ].append([1.0, 1, 1])
        cdict['green'].append([1.0, 1, 1])
        cdict['blue' ].append([1.0, 0, 0])
        cmap  = LinearSegmentedColormap(cmap, cdict)
        # limits are already encoded in the color map
        clim = None
    else:
        cmap = plt.get_cmap(cmap)
    cmap.set_bad('darkgrey', 1)

    ax1.imshow(data, interpolation='none',
               cmap=cmap, vmin=clim[0] if clim else None, vmax=clim[1] if clim else None)
    size = len(data)
    # NaNs are replaced by zeros (after drawing) for the eigen decomposition
    for i in xrange(size):
        for j in xrange(i, size):
            if np.isnan(data[i][j]):
                data[i][j] = 0
                data[j][i] = 0
    evals, evect = eigh(data)
    sort_perm = evals.argsort()
    # eigenvectors are the columns of evect: reorder columns
    evect = evect[:, sort_perm]
    data = [i for d in data for i in d if not np.isnan(i)]
    gradient = np.linspace(np.nanmin(data),
                           np.nanmax(data), size)
    gradient = np.vstack((gradient, gradient))
    h  = ax2.hist(data, color='darkgrey', linewidth=2,
                  bins=20, histtype='step', normed=True)
    _  = ax2.imshow(gradient, aspect='auto', cmap=cmap,
                    extent=(np.nanmin(data), np.nanmax(data) , 0, max(h[0])))
    if genome_seq:
        # chromosome limits: white solid line under a black dashed line
        for crm in genome_seq:
            ax1.vlines([cumcs[crm][0]-.5, cumcs[crm][1]-.5], cumcs[crm][0]-.5, cumcs[crm][1]-.5,
                       color='w', linestyle='-', linewidth=1, alpha=1)
            ax1.hlines([cumcs[crm][1]-.5, cumcs[crm][0]-.5], cumcs[crm][0]-.5, cumcs[crm][1]-.5,
                       color='w', linestyle='-', linewidth=1, alpha=1)
            ax1.vlines([cumcs[crm][0]-.5, cumcs[crm][1]-.5], cumcs[crm][0]-.5, cumcs[crm][1]-.5,
                       color='k', linestyle='--')
            ax1.hlines([cumcs[crm][1]-.5, cumcs[crm][0]-.5], cumcs[crm][0]-.5, cumcs[crm][1]-.5,
                       color='k', linestyle='--')
        if not one:
            # major ticks at chromosome limits (no label), minor ticks in the
            # middle of each interval carrying the chromosome names
            vals = [0]
            keys = ['']
            for crm in genome_seq:
                vals.append(cumcs[crm][0])
                keys.append(crm)
            vals.append(cumcs[crm][1])
            ax1.set_yticks(vals)
            ax1.set_yticklabels('')
            ax1.set_yticks([float(vals[i]+vals[i+1])/2
                            for i in xrange(len(vals) - 1)], minor=True)
            ax1.set_yticklabels(keys, minor=True)
            for t in ax1.yaxis.get_minor_ticks():
                t.tick1On = False
                t.tick2On = False
    plt.figtext(0.05,0.25, ''.join([
        (name + '\n') if name else '',
        'Number of interactions: %s\n' % str(totaloridata),
        # cistrans defaults to None, which np.isnan can not handle
        ('' if cistrans is None or np.isnan(cistrans) else
         ('Percentage of cis interactions: %.0f%%\n' % (cistrans*100))),
        'Min interactions: %s\n' % (minoridata),
        'Max interactions: %s\n' % (maxoridata)]))
    ax2.set_xlim((np.nanmin(data), np.nanmax(data)))
    ax2.set_ylim((0, max(h[0])))
    ax1.set_xlim ((-0.5, size - .5))
    ax1.set_ylim ((-0.5, size - .5))
    ax2.set_xlabel('log interaction count')
    # we reduce the number of dots displayed.... we just want to see the shape
    subdata = np.array(list(set([float(int(d*100))/100 for d in data])))
    try:
        normfit = sc_norm.pdf(subdata, np.nanmean(data), np.nanstd(data))
    except AttributeError:
        normfit = sc_norm.pdf(subdata, np.mean(data), np.std(data))
    ax2.plot(subdata, normfit, 'w.', markersize=2.5, alpha=.4)
    ax2.plot(subdata, normfit, 'k.', markersize=1.5, alpha=1)
    ax2.set_title('skew: %.3f, kurtosis: %.3f' % (skew(data),
                                                   kurtosis(data)))
    # eigenvector tracks; E2 and E3 are skipped for matrices too small to
    # have them
    ax4.vlines(range(size), 0, evect[:,-1], color='k')
    ax4.hlines(0, 0, size, color='red')
    ax4.set_ylabel('E1')
    ax4.set_yticklabels([])
    try:
        ax5.vlines(range(size), 0, evect[:,-2], color='k')
    except IndexError:
        pass
    ax5.hlines(0, 0, size, color='red')
    ax5.set_ylabel('E2')
    ax5.set_yticklabels([])
    try:
        ax6.vlines(range(size), 0, evect[:,-3], color='k')
    except IndexError:
        pass
    ax6.hlines(0, 0, size, color='red')
    ax6.set_ylabel('E3')
    ax6.set_yticklabels([])
    xticklabels = ax4.get_xticklabels() + ax5.get_xticklabels() + ax6.get_xticklabels()
    plt.setp(xticklabels, visible=False)
    if savefig:
        tadbit_savefig(savefig)
    elif show:
        plt.show()
    plt.close('all')
Пример #5
0
def eig_correlate_matrices(hic_data1, hic_data2, nvect=6,
                           savefig=None, show=False, savedata=None, **kwargs):
    """
    Compare the interactions of two Hi-C matrices using their first
    eigenvectors, with Pearson correlation

    :param hic_data1: Hi-C-data object
    :param hic_data2: Hi-C-data object
    :param 6 nvect: number of eigenvectors to compare
    :param None savefig: path to save the plot
    :param False show: displays the plot
    :param None savedata: path to a file where the correlation matrix is
       written as tab-separated text
    :param kwargs: any argument to pass to matplotlib imshow function

    :returns: matrix of correlations (list of lists) where ``corr[i][j]`` is
       the absolute Pearson correlation between the i-th eigenvector of
       ``hic_data1`` and the j-th eigenvector of ``hic_data2``, eigenvectors
       being ranked by decreasing eigenvalue
    """
    # get the log
    data1 = nozero_log(hic_data1.get_matrix(), np.log2)
    data2 = nozero_log(hic_data2.get_matrix(), np.log2)
    # get the eigenvectors
    ev1, evect1 = eigh(data1)
    ev2, evect2 = eigh(data2)
    # sort eigenvectors according to their eigenvalues => first is last!!
    # eigenvectors are the COLUMNS of evect, so columns are what is reordered
    order1 = ev1.argsort()
    ev1 = ev1[order1]
    evect1 = evect1[:, order1]
    order2 = ev2.argsort()
    ev2 = ev2[order2]
    evect2 = evect2[:, order2]
    # calculate Pearson correlation
    corr = [[abs(pearsonr(evect1[:, -i - 1], evect2[:, -j - 1])[0])
             for j in xrange(nvect)] for i in xrange(nvect)]
    # plot (axes are only created when a figure is actually wanted)
    if show or savefig:
        axe    = plt.axes([0.1, 0.1, 0.6, 0.8])
        cbaxes = plt.axes([0.85, 0.1, 0.03, 0.8])
        # imshow puts rows on the y axis: transpose so that experiment 1
        # runs along x and experiment 2 along y, as the labels below state
        im = axe.imshow(np.transpose(corr), interpolation="nearest",
                        origin='lower', **kwargs)
        axe.set_xlabel('Eigen Vectors exp. 1')
        axe.set_ylabel('Eigen Vectors exp. 2')
        axe.set_xticks(range(nvect))
        axe.set_yticks(range(nvect))
        # one label per tick: 1 .. nvect
        axe.set_xticklabels(range(1, nvect + 1))
        axe.set_yticklabels(range(1, nvect + 1))
        axe.xaxis.set_tick_params(length=0, width=0)
        axe.yaxis.set_tick_params(length=0, width=0)

        cbar = plt.colorbar(im, cax=cbaxes)
        cbar.ax.set_ylabel('Pearson correlation', rotation=90*3,
                           verticalalignment='bottom')
        # right-hand axis: eigenvalues of experiment 2, largest first
        axe2 = axe.twinx()
        axe2.set_yticks(range(nvect))
        axe2.set_yticklabels(['%.1f' % (e) for e in ev2[-nvect:][::-1]])
        axe2.set_ylabel('corresponding Eigen Values exp. 2', rotation=90*3,
                        verticalalignment='bottom')
        axe2.set_ylim((-0.5, nvect - 0.5))
        axe2.yaxis.set_tick_params(length=0, width=0)

        # top axis: eigenvalues of experiment 1, largest first
        axe3 = axe.twiny()
        axe3.set_xticks(range(nvect))
        axe3.set_xticklabels(['%.1f' % (e) for e in ev1[-nvect:][::-1]])
        axe3.set_xlabel('corresponding Eigen Values exp. 1')
        axe3.set_xlim((-0.5, nvect - 0.5))
        axe3.xaxis.set_tick_params(length=0, width=0)

        axe.set_ylim((-0.5, nvect - 0.5))
        axe.set_xlim((-0.5, nvect - 0.5))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        plt.close('all')

    if savedata:
        # context manager: the file is closed even if a write fails
        with open(savedata, 'w') as out:
            out.write('# ' + '\t'.join(['Eigen Vector %s'% i
                                        for i in xrange(nvect)]) + '\n')
            for i in xrange(nvect):
                out.write('\t'.join([str(corr[i][j])
                                     for j in xrange(nvect)]) + '\n')

    return corr
Пример #6
0
def draw_map(
    data,
    genome_seq,
    cumcs,
    savefig,
    show,
    one=False,
    clim=None,
    cmap="jet",
    decay=False,
    perc=10,
    name=None,
    cistrans=None,
    decay_resolution=10000,
    normalized=False,
    max_diff=None,
):
    """
    Draw a summary figure of a square interaction matrix: the log2 heat-map,
    an histogram of the log2 values with a fitted normal density, the three
    last eigenvectors returned by eigh (E1, E2, E3) and, optionally, the
    decay of interactions with distance.

    :param data: square matrix of interactions (sequence of rows)
    :param genome_seq: iterable of chromosome names, used as keys of *cumcs*
       (also passed to plot_distance_vs_interactions). If empty/None no
       chromosome limits are drawn
    :param cumcs: mapping from chromosome name to a pair (begin, end) used to
       place the chromosome limits on the heat-map
    :param savefig: path where to save the figure; if set the figure is not
       shown
    :param show: display the figure (only when *savefig* is not set)
    :param False one: if True chromosome names are not written on the y axis
    :param None clim: pair (min, max) of color limits for the heat-map;
       not used as color limits with the 'tadbit' color map
    :param 'jet' cmap: name of a matplotlib color map, or 'tadbit' to build a
       color map based on the percentiles of the data
    :param False decay: also plot interactions versus distance
    :param 10 perc: number of cuts used to build the 'tadbit' color map
    :param None name: text written on top of the statistics
    :param None cistrans: proportion of cis interactions (written as a
       percentage); nothing is written if None or NaN
    :param 10000 decay_resolution: passed to plot_distance_vs_interactions
    :param False normalized: passed to plot_distance_vs_interactions
    :param None max_diff: passed to plot_distance_vs_interactions, defaults
       to the size of *data*
    """
    plt.figure(figsize=(15.0, 12.5))
    if not max_diff:
        max_diff = len(data)
    ax1 = plt.axes([0.34, 0.08, 0.6, 0.7205])
    ax2 = plt.axes([0.07, 0.65, 0.21, 0.15])
    if decay:
        ax3 = plt.axes([0.07, 0.42, 0.21, 0.15])
        plot_distance_vs_interactions(
            data, genome_seq=genome_seq, axe=ax3, resolution=decay_resolution, max_diff=max_diff, normalized=normalized
        )
    ax4 = plt.axes([0.34, 0.805, 0.6, 0.04], sharex=ax1)
    ax5 = plt.axes([0.34, 0.845, 0.6, 0.04], sharex=ax1)
    ax6 = plt.axes([0.34, 0.885, 0.6, 0.04], sharex=ax1)
    # statistics on the original (not log-transformed) data
    try:
        minoridata = np.nanmin(data)
        maxoridata = np.nanmax(data)
    except AttributeError:
        vals = [i for d in data for i in d if not np.isnan(i)]
        minoridata = np.min(vals)
        maxoridata = np.max(vals)
    # upper triangle only (diagonal included)
    totaloridata = np.nansum([data[i][j] for i in xrange(len(data)) for j in xrange(i, len(data))])
    data = nozero_log(data, np.log2)
    vals = np.array([i for d in data for i in d])
    vals = vals[np.isfinite(vals)]

    mindata = np.nanmin(vals)
    maxdata = np.nanmax(vals)
    diff = maxdata - mindata
    posI = 0.01 if not clim else (float(clim[0]) / diff) if clim[0] is not None else 0.01
    posF = 1.0 if not clim else (float(clim[1]) / diff) if clim[1] is not None else 1.0
    if cmap == "tadbit":
        cuts = perc
        cdict = {"red": [(0.0, 0.0, 0.0)], "green": [(0.0, 0.0, 0.0)], "blue": [(0.0, 0.5, 0.5)]}
        prev_pos = 0
        median = (np.median(vals) - mindata) / diff
        # linspace needs an integer number of samples: floor division
        # lower half of the values
        for prc in np.linspace(posI, median, cuts // 2, endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.0) - mindata) / diff
                prc = ((prc - posI) / (median - posI)) + 1.0 / cuts
            except ValueError:
                pos = prc = 0
            # color map positions have to be strictly increasing
            if prev_pos >= pos:
                continue
            cdict["red"].append([pos, prc, prc])
            cdict["green"].append([pos, prc, prc])
            cdict["blue"].append([pos, 1, 1])
            prev_pos = pos
        # upper half of the values
        for prc in np.linspace(median + 1.0 / cuts, posF, cuts // 2, endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.0) - mindata) / diff
                prc = (prc - median) / (posF - median)
            except ValueError:
                pos = prc = 0
            if prev_pos >= pos:
                continue
            cdict["red"].append([pos, 1.0, 1.0])
            cdict["green"].append([pos, 1 - prc, 1 - prc])
            cdict["blue"].append([pos, 1 - prc, 1 - prc])
            prev_pos = pos
        pos = (np.percentile(vals, 97.0) - mindata) / diff
        cdict["red"].append([pos, 0.1, 0.1])
        cdict["green"].append([pos, 0, 0])
        cdict["blue"].append([pos, 0, 0])

        cdict["red"].append([1.0, 1, 1])
        cdict["green"].append([1.0, 1, 1])
        cdict["blue"].append([1.0, 0, 0])
        cmap = LinearSegmentedColormap(cmap, cdict)
        # limits are already encoded in the color map
        clim = None
    else:
        cmap = plt.get_cmap(cmap)
    cmap.set_bad("darkgrey", 1)

    ax1.imshow(data, interpolation="none", cmap=cmap, vmin=clim[0] if clim else None, vmax=clim[1] if clim else None)
    size = len(data)
    # NaNs are replaced by zeros (after drawing) for the eigen decomposition
    for i in xrange(size):
        for j in xrange(i, size):
            if np.isnan(data[i][j]):
                data[i][j] = 0
                data[j][i] = 0
    evals, evect = eigh(data)
    sort_perm = evals.argsort()
    # eigenvectors are the columns of evect: reorder columns
    evect = evect[:, sort_perm]
    data = [i for d in data for i in d if not np.isnan(i)]
    gradient = np.linspace(np.nanmin(data), np.nanmax(data), size)
    gradient = np.vstack((gradient, gradient))
    h = ax2.hist(data, color="darkgrey", linewidth=2, bins=20, histtype="step", normed=True)
    _ = ax2.imshow(gradient, aspect="auto", cmap=cmap, extent=(np.nanmin(data), np.nanmax(data), 0, max(h[0])))
    if genome_seq:
        # chromosome limits: white solid line under a black dashed line
        for crm in genome_seq:
            ax1.vlines(
                [cumcs[crm][0] - 0.5, cumcs[crm][1] - 0.5],
                cumcs[crm][0] - 0.5,
                cumcs[crm][1] - 0.5,
                color="w",
                linestyle="-",
                linewidth=1,
                alpha=1,
            )
            ax1.hlines(
                [cumcs[crm][1] - 0.5, cumcs[crm][0] - 0.5],
                cumcs[crm][0] - 0.5,
                cumcs[crm][1] - 0.5,
                color="w",
                linestyle="-",
                linewidth=1,
                alpha=1,
            )
            ax1.vlines(
                [cumcs[crm][0] - 0.5, cumcs[crm][1] - 0.5],
                cumcs[crm][0] - 0.5,
                cumcs[crm][1] - 0.5,
                color="k",
                linestyle="--",
            )
            ax1.hlines(
                [cumcs[crm][1] - 0.5, cumcs[crm][0] - 0.5],
                cumcs[crm][0] - 0.5,
                cumcs[crm][1] - 0.5,
                color="k",
                linestyle="--",
            )
        if not one:
            # major ticks at chromosome limits (no label), minor ticks in the
            # middle of each interval carrying the chromosome names
            vals = [0]
            keys = [""]
            for crm in genome_seq:
                vals.append(cumcs[crm][0])
                keys.append(crm)
            vals.append(cumcs[crm][1])
            ax1.set_yticks(vals)
            ax1.set_yticklabels("")
            ax1.set_yticks([float(vals[i] + vals[i + 1]) / 2 for i in xrange(len(vals) - 1)], minor=True)
            ax1.set_yticklabels(keys, minor=True)
            for t in ax1.yaxis.get_minor_ticks():
                t.tick1On = False
                t.tick2On = False
    plt.figtext(
        0.05,
        0.25,
        "".join(
            [
                (name + "\n") if name else "",
                "Number of interactions: %s\n" % str(totaloridata),
                # cistrans defaults to None, which np.isnan can not handle
                (
                    ""
                    if cistrans is None or np.isnan(cistrans)
                    else ("Percentage of cis interactions: %.0f%%\n" % (cistrans * 100))
                ),
                "Min interactions: %s\n" % (minoridata),
                "Max interactions: %s\n" % (maxoridata),
            ]
        ),
    )
    ax2.set_xlim((np.nanmin(data), np.nanmax(data)))
    ax2.set_ylim((0, max(h[0])))
    ax1.set_xlim((-0.5, size - 0.5))
    ax1.set_ylim((-0.5, size - 0.5))
    ax2.set_xlabel("log interaction count")
    # we reduce the number of dots displayed.... we just want to see the shape
    subdata = np.array(list(set([float(int(d * 100)) / 100 for d in data])))
    try:
        normfit = sc_norm.pdf(subdata, np.nanmean(data), np.nanstd(data))
    except AttributeError:
        normfit = sc_norm.pdf(subdata, np.mean(data), np.std(data))
    ax2.plot(subdata, normfit, "w.", markersize=2.5, alpha=0.4)
    ax2.plot(subdata, normfit, "k.", markersize=1.5, alpha=1)
    ax2.set_title("skew: %.3f, kurtosis: %.3f" % (skew(data), kurtosis(data)))
    # eigenvector tracks; E2 and E3 are skipped for matrices too small to
    # have them
    ax4.vlines(range(size), 0, evect[:, -1], color="k")
    ax4.hlines(0, 0, size, color="red")
    ax4.set_ylabel("E1")
    ax4.set_yticklabels([])
    try:
        ax5.vlines(range(size), 0, evect[:, -2], color="k")
    except IndexError:
        pass
    ax5.hlines(0, 0, size, color="red")
    ax5.set_ylabel("E2")
    ax5.set_yticklabels([])
    try:
        ax6.vlines(range(size), 0, evect[:, -3], color="k")
    except IndexError:
        pass
    ax6.hlines(0, 0, size, color="red")
    ax6.set_ylabel("E3")
    ax6.set_yticklabels([])
    xticklabels = ax4.get_xticklabels() + ax5.get_xticklabels() + ax6.get_xticklabels()
    plt.setp(xticklabels, visible=False)
    if savefig:
        tadbit_savefig(savefig)
    elif show:
        plt.show()
    plt.close("all")