Пример #1
0
def plot_pairwise_analysis(data_mat, feature_columns, dependent_column, column_names):
    """
        Does a basic pairwise correlation analysis between features and a dependent variable,
        meaning it plots a scatter plot with a linear curve fit through it, with the R^2.
        Then it plots a correlation matrix for all features and the dependent variable.

        data_mat: an NxM matrix, where there are N samples, M-1 features, and 1 dependent variable.

        feature_columns: the column indices of the features in data_mat that are being examined.
                         Any sequence of indices is accepted; it is not modified.

        dependent_column: the column index of the dependent variable in data_mat

        column_names: a list of len(feature_columns)+1 feature/variable names. The last element is
                      the name of the dependent variable. Names are positional: column_names[k]
                      labels feature_columns[k], it is NOT indexed by data_mat column.

        Side effects: hands the per-feature scatter data (sorted by descending R2) to multi_plot,
        prints the correlation of every feature with the dependent variable (sorted by descending
        absolute value), and draws the correlation matrix on a new matplotlib figure.
        Returns nothing.
    """

    # the dependent variable is the same for every scatter plot
    y = data_mat[:, dependent_column]
    ylabel = column_names[-1]

    plot_data = list()
    for k, fname in enumerate(column_names[:-1]):
        x = data_mat[:, feature_columns[k]]

        pdata = dict()
        pdata['x'] = x
        pdata['y'] = y
        # label by position in column_names; feature_columns[k] is an index into
        # data_mat and is generally not a valid index into column_names
        pdata['xlabel'] = fname
        pdata['ylabel'] = ylabel
        pdata['R2'] = compute_R2(x, y)
        plot_data.append(pdata)

    #sort by R^2
    plot_data.sort(key=operator.itemgetter('R2'), reverse=True)
    multi_plot(plot_data, plot_pairwise_scatter, title=None, nrows=3, ncols=3)

    # build a fresh list so tuples/arrays work and the caller's sequence is untouched;
    # the dependent variable goes last so it is the last row/column of C
    all_columns = list(feature_columns) + [dependent_column]

    C = np.corrcoef(data_mat[:, all_columns].transpose())

    # last column of C: correlation of each variable with the dependent variable
    Cy = C[:, -1]
    corr_list = [(column_names[k], np.abs(Cy[k]), Cy[k]) for k in range(len(column_names)-1)]
    # strongest relationship first, regardless of sign
    corr_list.sort(key=operator.itemgetter(1), reverse=True)

    # single-argument parenthesised print gives identical output on Python 2 and 3
    print('Correlations  with %s' % ylabel)
    for cname, abscorr, corr in corr_list:
        print('\t%s: %0.6f' % (cname, corr))

    fig = plt.figure()
    plt.subplots_adjust(top=0.99, bottom=0.15, left=0.15)
    ax = fig.add_subplot(1, 1, 1)
    fig.autofmt_xdate(rotation=45)
    # correlations are bounded, so pin the color scale to [-1, 1]
    im = ax.imshow(C, interpolation='nearest', aspect='auto', vmin=-1.0, vmax=1.0, origin='lower')
    plt.colorbar(im)
    tick_positions = list(range(len(column_names)))
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(column_names)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(column_names)
Пример #2
0
def plot_pairwise_scatter(plot_data, ax):
    """
        Draws a scatter plot of y against x on ax, with a first-degree least-squares
        line through it, and puts the R2 value in the title.

        plot_data: a dict with the entries
            'x', 'y':  the samples to plot (anything np.asanyarray accepts; np.polyfit
                       needs them to be 1-D and of equal length)
            'xlabel':  name shown in the title (required)
            'ylabel':  y axis label (optional)
            'R2':      precomputed value for the title (optional; compute_R2(x, y) is
                       called when it is missing)
            'color':   marker face color (optional, defaults to '#aaaaaa')

        ax: the axes to draw on; plot, set_title, set_ylabel and set_ylim are called on it.
    """

    x = np.asanyarray(plot_data['x'])
    y = np.asanyarray(plot_data['y'])
    if 'R2' not in plot_data:
        R2 = compute_R2(x, y)
    else:
        R2 = plot_data['R2']
    slope, bias = np.polyfit(x, y, 1)
    # 25 equal intervals INCLUDING the right endpoint, so the fitted line spans the
    # whole data range; unlike an arange with a computed step, this also does not
    # fail when every x is identical (zero-width range)
    xrng = np.linspace(x.min(), x.max(), 26)

    clr = '#aaaaaa'
    if 'color' in plot_data:
        clr = plot_data['color']
    ax.plot(x, y, 'o', mfc=clr)
    ax.plot(xrng, slope * xrng + bias, 'k-')
    ax.set_title('%s: R2=%0.2f' % (plot_data['xlabel'], R2))
    if 'ylabel' in plot_data:
        ax.set_ylabel(plot_data['ylabel'])
    ax.set_ylim(y.min(), y.max())
Пример #3
0
def plot_pairwise_scatter(plot_data, ax):
    """
        Scatter-plots one x/y pair on the given axes and overlays the straight line
        obtained from a degree-1 np.polyfit; the title shows the x label and R2.

        plot_data: dict describing the plot
            'x', 'y':  sample values (converted with np.asanyarray; np.polyfit requires
                       1-D inputs of the same length)
            'xlabel':  required, used in the title
            'ylabel':  optional, set as the y axis label when present
            'R2':      optional, taken as-is for the title; otherwise compute_R2(x, y)
                       is called
            'color':   optional marker face color, '#aaaaaa' when absent

        ax: axes object to draw on (its plot, set_title, set_ylabel and set_ylim
            methods are used)
    """

    x = np.asanyarray(plot_data['x'])
    y = np.asanyarray(plot_data['y'])

    if 'R2' in plot_data:
        R2 = plot_data['R2']
    else:
        R2 = compute_R2(x, y)

    slope, bias = np.polyfit(x, y, 1)

    # Sample the fitted line on 26 evenly spaced points from min(x) to max(x), both
    # ends included. A step-based arange leaves out the right end (the line would stop
    # short of the largest x) and raises when the step is zero, i.e. when all x are equal.
    x_lo = x.min()
    x_hi = x.max()
    xrng = np.linspace(x_lo, x_hi, 26)
    fitted = slope*xrng + bias

    clr = '#aaaaaa'
    if 'color' in plot_data:
        clr = plot_data['color']

    ax.plot(x, y, 'o', mfc=clr)
    ax.plot(xrng, fitted, 'k-')
    ax.set_title('%s: R2=%0.2f' % (plot_data['xlabel'], R2))
    if 'ylabel' in plot_data:
        ax.set_ylabel(plot_data['ylabel'])
    ax.set_ylim(y.min(), y.max())