Example #1
def gaia_motion_analysis(data, norm=False, class_col='CLASS_PHOTO'):
    # Keep only sources with complete astrometry (parallax and both proper motions)
    movement_mask = ~data[['parallax', 'pmdec', 'pmra']].isnull().any(axis=1)
    data_movement = data.loc[movement_mask]

    for class_name in BASE_CLASSES:

        motions = ['parallax', 'pmra', 'pmdec']
        if norm and class_name == 'QSO':
            motions = [m + '_norm' for m in motions]
        result_df = pd.DataFrame(
            index=['mean', 'sigma', 'mean_error', 'median'], columns=motions)

        for motion in motions:
            data_of_interest = data_movement.loc[data_movement[class_col] ==
                                                 class_name, motion]
            # Fit a Gaussian; the error of the mean is sigma / sqrt(N)
            (mu, sigma) = stats.norm.fit(data_of_interest)
            median = np.median(data_of_interest)
            mu_error = sigma / math.sqrt(data_of_interest.shape[0])

            result_df.loc['mean', motion] = mu
            result_df.loc['sigma', motion] = sigma
            result_df.loc['mean_error', motion] = mu_error
            result_df.loc['median', motion] = median

            plt.figure()
            sns.distplot(data_of_interest,
                         color=get_cubehelix_palette(1)[0],
                         kde_kws=dict(bw=0.5))
            if motion == 'parallax':
                plt.xlim((-6, 6))
            plt.ylabel(class_name)

        print('{}:'.format(class_name))
        print(result_df)
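
A minimal usage sketch. It assumes the module-level imports these examples rely on (numpy as np, pandas as pd, math, scipy.stats as stats, matplotlib.pyplot as plt, seaborn as sns) plus the project helpers BASE_CLASSES and get_cubehelix_palette; the catalogue name below is hypothetical.

# data_x_gaia: a hypothetical cross-match with Gaia carrying 'parallax', 'pmra',
# 'pmdec' and the photometric class column; loading it is outside this example.
gaia_motion_analysis(data_x_gaia, norm=False, class_col='CLASS_PHOTO')
plt.show()
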
Example #2
def plot_z_hists(preds, z_max=None):
    preds_zlim = preds if z_max is None else preds.loc[preds['Z'] <= z_max]
    to_plot = [
        ('Z', 'CLASS'),
        ('Z_PHOTO', 'CLASS_PHOTO'),
    ]
    color_palette = get_cubehelix_palette(len(BASE_CLASSES))
    for x_col, cls_col in to_plot:
        is_cls_photo = (cls_col == 'CLASS_PHOTO')
        plt.figure()
        for i, cls in enumerate(['QSO', 'GALAXY']):
            hist, bins = np.histogram(
                preds_zlim.loc[preds_zlim[cls_col] == cls][x_col], bins=40)
            hist_norm = hist / max(hist)
            ax = sns.lineplot(x=bins[:-1],
                              y=hist_norm,
                              drawstyle='steps-post',
                              label=get_plot_text(cls, is_cls_photo),
                              color=color_palette[i])
            ax.lines[i].set_linestyle(get_line_style(i))

        plt.xlabel(get_plot_text(x_col))
        plt.ylabel('normalized counts per bin')
        plt.legend(framealpha=1.0)
        plt.show()
Example #3
def precision_z_report(predictions, col_true='CLASS', z_max=None):
    """
    Compare predicted classes against true redshifts
    :param predictions:
    :param col_true:
    :param z_max:
    :return:
    """
    predictions_zlim = (predictions if z_max is None
                        else predictions.loc[predictions['Z'] <= z_max])
    color_palette = get_cubehelix_palette(len(BASE_CLASSES))

    for cls_pred in BASE_CLASSES:

        photo_class_as_dict = {}
        for cls_true in BASE_CLASSES:
            photo_class_as_dict[cls_true] = predictions_zlim.loc[
                (predictions_zlim[col_true] == cls_true)
                & (predictions_zlim['CLASS_PHOTO'] == cls_pred)]['Z_PHOTO']

        plt.figure()
        _, bin_edges = np.histogram(np.hstack((
            photo_class_as_dict['QSO'],
            photo_class_as_dict['STAR'],
            photo_class_as_dict['GALAXY'],
        )), bins=40)

        for i, cls_true in enumerate(BASE_CLASSES):
            hist_kws = {
                'alpha': 1.0,
                'histtype': 'step',
                'linewidth': 1.5,
                'linestyle': get_line_style(i)
            }
            label = get_plot_text(cls_true)
            ax = sns.distplot(photo_class_as_dict[cls_true],
                              label=label,
                              bins=bin_edges,
                              kde=False,
                              rug=False,
                              color=color_palette[i],
                              hist_kws=hist_kws)
            ax.set(yscale='log')

        plt.title(get_plot_text(cls_pred, is_photo=True))
        plt.xlabel(get_plot_text('Z_PHOTO'))
        plt.ylabel('counts per bin')
        plt.legend(loc='upper right', framealpha=1.0)
        plt.show()
Example #4
def plot_linear_data(data, annotations=True):
    color_palette = get_cubehelix_palette(len(data))
    for i, (scale, bias, x_lim) in enumerate(data):
        label_base = '$10^{' + '{}'.format(scale) + ' * m'
        prefix = ''
        if annotations:
            if scale == 0.6:
                prefix = 'euclidean '
            else:
                prefix = 'eBOSS '
        label_base = prefix + label_base
        x_linear = np.arange(x_lim[0], x_lim[1] + 0.25, 0.25)
        y_linear = [10**(scale * m - bias) for m in x_linear]
        v_bias = abs(bias)
        sign = ' - ' if bias > 0 else ' + '
        label_bias = sign + str(v_bias) + '}$'
        plt.plot(x_linear,
                 y_linear,
                 '--',
                 c=color_palette[i],
                 label=(label_base + label_bias))
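
The data argument of plot_linear_data is a sequence of (scale, bias, x_lim) tuples, each drawn as the power law 10^(scale * m - bias) over the magnitude range x_lim. A hypothetical call follows; the bias values and magnitude ranges are placeholders, only the 0.6 slope (which triggers the 'euclidean' annotation) comes from the code above.

linear_data = [
    (0.6, 10.0, (18.0, 22.0)),  # slope 0.6 is labelled 'euclidean '
    (0.5, 8.0, (18.0, 22.0)),   # any other slope is labelled 'eBOSS '
]
plot_linear_data(linear_data, annotations=True)
plt.legend()
plt.show()
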
Example #5
def spatial_number_density(data_dict,
                           nside=128,
                           z_bin_step=0.5,
                           z_bin_size=0.5,
                           cosmo_model=cosmo_wmap9,
                           z_max=None,
                           legend_size=None):
    # Fraction of the full sky (~41253 deg^2) covered by a single healpix pixel
    volume_proportion = (hp.nside2pixarea(nside, degrees=True) / 41253.0)

    fig, ax = plt.subplots()
    to_plot_df = pd.DataFrame()
    x_col = 'z'
    y_col = r'spatial density [N / comoving Mpc$^3$]'
    for data_name, (data, count_map) in data_dict.items():
        z_column = 'Z' if 'Z' in data else 'Z_PHOTO'
        z_half_bin_size = z_bin_size / 2
        steps = np.arange(data[z_column].min() + z_half_bin_size,
                          data[z_column].max() + z_half_bin_size, z_bin_step)
        comoving_volumes = np.array([
            (cosmo_model.comoving_volume(step + z_half_bin_size) -
             cosmo_model.comoving_volume(step - z_half_bin_size)).value
            for step in steps
        ])
        mask_non_zero = np.nonzero(count_map)
        print('{} area: {:.2f} deg^2'.format(
            data_name,
            len(mask_non_zero[0]) * hp.nside2pixarea(nside, degrees=True)))

        density_v_max_mean, density_v_max_error = [], []
        ra_col, dec_col = (('RAJ2000', 'DECJ2000') if 'RAJ2000' in data
                           else ('RA', 'DEC'))
        for i, step in enumerate(steps):
            data_step = data.loc[(data[z_column] > step - z_half_bin_size)
                                 & (data[z_column] < step + z_half_bin_size)]
            step_map, _, _ = get_map(data_step[ra_col],
                                     data_step[dec_col],
                                     v=data_step['v_weight'].values,
                                     nside=nside)
            v_max_values = (step_map[mask_non_zero] / comoving_volumes[i] /
                            volume_proportion)
            (mu, sigma) = stats.norm.fit(v_max_values)
            density_v_max_mean.append(mu)
            density_v_max_error.append(sigma /
                                       math.sqrt(v_max_values.shape[0]))

        density_v_max_mean = np.array(density_v_max_mean)
        density_v_max_error = np.array(density_v_max_error)

        to_plot_df = pd.concat([
            to_plot_df,
            pd.DataFrame({
                x_col: steps,
                y_col: density_v_max_mean,
                'error': density_v_max_error,
                'data name': [data_name] * len(steps),
            })
        ], ignore_index=True)

    color_palette = (get_cubehelix_palette(len(data_dict), reverse=False)
                     if len(data_dict) > 1 else [(0, 0, 0)])
    sns.lineplot(x=x_col,
                 y=y_col,
                 data=to_plot_df,
                 hue='data name',
                 palette=color_palette,
                 style='data name',
                 markers=True,
                 dashes=False)

    ax = plt.gca()
    for i, data_name in enumerate(to_plot_df['data name'].unique()):
        to_plot_single_data = to_plot_df.loc[to_plot_df['data name'] == data_name]
        errors = to_plot_single_data['error'].values
        lower = to_plot_single_data[y_col].values - errors / 2
        upper = to_plot_single_data[y_col].values + errors / 2
        ax.fill_between(to_plot_single_data[x_col],
                        lower,
                        upper,
                        color=color_palette[i],
                        alpha=0.2)

    plt.xlim(right=z_max)
    plt.yscale('log')
    prop = {'size': legend_size} if legend_size else {}
    ax.legend(loc='upper right', framealpha=1.0, prop=prop)
    plt.setp(ax.get_legend().get_texts(), fontsize='9')
    plt.show()
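
Each entry of data_dict maps a label to a (catalogue, healpix map) pair; the non-empty pixels of the map are used both to estimate the covered area and to select pixels for the density statistics. A hypothetical call; the catalogue, map and label are illustrative and assumed to be built elsewhere (e.g. with get_map).

# qso_catalog: DataFrame with 'Z' or 'Z_PHOTO', RA/DEC (or RAJ2000/DECJ2000)
# and 'v_weight' columns; qso_map: its healpix counts map at the same nside.
data_dict = {'QSO photo sample': (qso_catalog, qso_map)}
spatial_number_density(data_dict, nside=128, z_bin_step=0.5, z_bin_size=0.5,
                       z_max=3.0)
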
Example #6
def number_counts(data_dict,
                  linear_data,
                  nside=128,
                  step=.1,
                  band_column='MAG_GAAP_r',
                  legend_loc='upper left',
                  legend_size=None):
    fig, ax = plt.subplots()
    to_plot_df = pd.DataFrame()
    x_col = pretty_print_magnitude(band_column)
    y_col = r'surface density (≤ m) [N / deg$^2$]'
    for i, (data_name, (data, count_map)) in enumerate(data_dict.items()):
        ra_col, dec_col = (('RAJ2000', 'DECJ2000') if 'RAJ2000' in data
                           else ('RA', 'DEC'))

        mask_non_zero = np.nonzero(count_map)
        print('{} area: {:.2f} deg^2'.format(
            data_name,
            len(mask_non_zero[0]) * hp.nside2pixarea(nside, degrees=True)))

        m_min = int(math.ceil(data[band_column].min()))
        m_max = int(math.ceil(data[band_column].max()))
        magnitude_arr = np.arange(m_min, m_max + step, step)
        density_mean_arr, density_error_arr = [], []
        for m_lim in magnitude_arr:
            data_m_max = data.loc[data[band_column] < m_lim]
            map_m_max, _, _ = get_map(data_m_max[ra_col],
                                      data_m_max[dec_col],
                                      nside=nside)
            densities = map_m_max[mask_non_zero] / hp.nside2pixarea(
                nside, degrees=True)
            (mu, sigma) = stats.norm.fit(densities)
            density_mean_arr.append(mu)
            density_error_arr.append(sigma / math.sqrt(densities.shape[0]))

        to_plot_df = pd.concat([
            to_plot_df,
            pd.DataFrame({
                x_col: magnitude_arr,
                y_col: density_mean_arr,
                'error': density_error_arr,
                'data name': [data_name] * len(magnitude_arr),
            })
        ], ignore_index=True)

    color_palette = (get_cubehelix_palette(len(data_dict), reverse=True)
                     if len(data_dict) > 1 else [(0, 0, 0)])
    sns.lineplot(x=x_col,
                 y=y_col,
                 data=to_plot_df,
                 hue='data name',
                 palette=color_palette,
                 style='data name',
                 markers=True)
    plot_linear_data(linear_data)

    ax = plt.gca()
    for i, data_name in enumerate(to_plot_df['data name'].unique()):
        to_plot_single_data = to_plot_df.loc[to_plot_df['data name'] == data_name]
        errors = to_plot_single_data['error'].values
        lower = to_plot_single_data[y_col].values - errors / 2
        upper = to_plot_single_data[y_col].values + errors / 2
        ax.fill_between(to_plot_single_data[x_col],
                        lower,
                        upper,
                        color=color_palette[i],
                        alpha=0.2)

    plt.yscale('log')
    prop = {'size': legend_size} if legend_size else {}
    ax.legend(loc=legend_loc, framealpha=1.0, prop=prop)
    plt.setp(ax.get_legend().get_texts(), fontsize='9')
    plt.show()
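
number_counts takes the same data_dict structure as spatial_number_density, plus the (scale, bias, x_lim) tuples consumed by plot_linear_data. A hypothetical call; the label and coefficients are placeholders.

data_dict = {'QSO photo sample': (qso_catalog, qso_map)}  # as sketched above
linear_data = [(0.6, 10.0, (18.0, 23.0))]                 # placeholder power law
number_counts(data_dict, linear_data, nside=128, step=0.1,
              band_column='MAG_GAAP_r')
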
Example #7
def plot_cleaning_metrics(preds_class,
                          cls,
                          metrics_to_plot,
                          thresholds,
                          step,
                          cleaning,
                          y_lim=None):
    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    label = ('{} probability threshold' if cleaning == 'clf_proba'
             else '{} redshift uncertainty threshold')
    ax1.set_xlabel(label.format(get_plot_text(cls, is_photo=True)))
    if cleaning == 'z_std_dev':
        ax1.invert_xaxis()
    ax_arr = [ax1, ax2]
    plotted_arr = []
    color_palette = get_cubehelix_palette(len(metrics_to_plot))
    for i, (metric_name, metric_func) in enumerate(metrics_to_plot):
        metric_std_func = None
        if isinstance(metric_func, tuple):
            metric_func, metric_std_func = metric_func

        # Get metrics in thresholds
        metric_values = []
        metric_errors = []
        thresholds_to_use = thresholds
        if metric_name == 'fraction of objects':
            thresholds_to_use = np.append(thresholds, [thresholds[-1] + step])
        for thr in thresholds_to_use:
            if cleaning == 'clf_proba':
                preds_lim = preds_class.loc[
                    preds_class['{}_PHOTO'.format(cls)] >= thr]
            else:
                preds_lim = preds_class.loc[
                    preds_class['Z_PHOTO_STDDEV'] <= thr]

            # Get mean and standard error
            metric_mean = metric_func(preds_lim['Z'], preds_lim['Z_PHOTO'])
            metric_error = None
            if metric_std_func:
                metric_std = metric_std_func(preds_lim['Z'],
                                             preds_lim['Z_PHOTO'])
                metric_error = metric_std / math.sqrt(preds_lim.shape[0])
            metric_values.append(np.around(metric_mean, 4))
            metric_errors.append(metric_error)

        # Make plots
        plotted, = ax_arr[i].plot(thresholds_to_use,
                                  metric_values,
                                  label=metric_name,
                                  color=color_palette[i],
                                  linestyle=get_line_style(i))
        if metric_errors[0] is not None:
            lower = np.array(metric_values) - np.array(metric_errors) / 2
            upper = np.array(metric_values) + np.array(metric_errors) / 2
            ax_arr[i].fill_between(thresholds_to_use,
                                   lower,
                                   upper,
                                   color=color_palette[i],
                                   alpha=0.2)

        ax_arr[i].tick_params(axis='y', labelcolor=color_palette[i])
        ax_arr[i].set_ylabel(metric_name)
        plotted_arr.append(plotted)
    ax_arr[1].yaxis.grid(False)
    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.legend(handles=plotted_arr, loc='lower left', framealpha=1.0)
    if y_lim:
        ax_arr[0].set_ylim(y_lim)
    plt.show()
    return ax_arr[0].get_ylim()
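
metrics_to_plot pairs a label with either a plain callable or a (metric, spread) tuple; when the spread function is given, it is used to draw an error band around the metric, and at most two metrics fit on the twin axes. A hypothetical setup; the metric choices, preds_qso and n_total are illustrative and not taken from the source.

# preds_qso: hypothetical predictions for one class with 'Z', 'Z_PHOTO'
# and 'QSO_PHOTO' columns, built elsewhere.
n_total = len(preds_qso)
thresholds = np.arange(0.3, 1.0, 0.02)
metrics_to_plot = [
    # (label, (mean metric, spread used for the error band))
    ('redshift error', (lambda z_t, z_p: np.mean(z_p - z_t),
                        lambda z_t, z_p: np.std(z_p - z_t))),
    # plain callable: no error band is drawn
    ('fraction of objects', lambda z_t, z_p: len(z_t) / n_total),
]
plot_cleaning_metrics(preds_qso, 'QSO', metrics_to_plot, thresholds,
                      step=0.02, cleaning='clf_proba')
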
Example #8
def proba_motion_analysis(data_x_gaia,
                          motions=None,
                          x_lim=(0.3, 1),
                          step=0.004,
                          mean_y_lines=None):
    motions = ['parallax'] if motions is None else motions
    mu_dict, sigma_dict = defaultdict(list), defaultdict(list)
    median_dict, error_dict = defaultdict(list), defaultdict(list)

    # Get QSOs
    qso_x_gaia = data_x_gaia.loc[data_x_gaia['CLASS_PHOTO'] == 'QSO']

    # Limit QSOs to proba thresholds
    thresholds = np.arange(x_lim[0], x_lim[1], step)
    for thr in thresholds:
        qso_x_gaia_limited = qso_x_gaia.loc[qso_x_gaia['QSO_PHOTO'] >= thr]

        for motion in motions:
            # Get stats
            (mu, sigma) = stats.norm.fit(qso_x_gaia_limited[motion])
            median = np.median(qso_x_gaia_limited[motion])
            error = sigma / math.sqrt(qso_x_gaia_limited.shape[0])

            # Store values
            mu_dict[motion].append(mu)
            sigma_dict[motion].append(sigma)
            median_dict[motion].append(median)
            error_dict[motion].append(error)

    # Plot statistics
    to_plot = [((mu_dict, error_dict), 'mean'), (sigma_dict, 'sigma'),
               (median_dict, 'median')]
    color_palette = get_cubehelix_palette(len(motions))

    for t in to_plot:
        plt.figure()

        label = None
        for i, motion in enumerate(motions):
            if len(motions) != 1:
                label = motion

            if t[1] == 'mean':
                vals = t[0][0][motion]
                errors = t[0][1][motion]
            else:
                vals = t[0][motion]
                errors = None

            plt.plot(thresholds,
                     vals,
                     label=label,
                     color=color_palette[i],
                     linestyle=get_line_style(i))
            ax = plt.gca()
            if errors:
                lower = np.array(vals) - np.array(errors) / 2
                upper = np.array(vals) + np.array(errors) / 2
                ax.fill_between(thresholds,
                                lower,
                                upper,
                                color=color_palette[i],
                                alpha=0.2)

            if t[1] == 'mean' and mean_y_lines is not None:
                cur_x_lim = ax.get_xlim()
                thr_x_lim = np.arange(cur_x_lim[0], cur_x_lim[1] + 0.01, 0.01)
                for line_name, y, y_err in mean_y_lines:
                    plt.axhline(y, linestyle='--', color='b')
                    ax.fill_between(thr_x_lim,
                                    y - y_err / 2,
                                    y + y_err / 2,
                                    color='b',
                                    alpha=0.2)
                    plt.text(
                        thresholds[0] +
                        0.01 * abs(max(thresholds) - min(thresholds)),
                        y + 0.06 * abs(max(vals) - min(vals)), line_name)
                ax.set_xlim(cur_x_lim)

            plt.xlabel('minimum classification probability')
            plt.ylabel('{} parallax [mas]'.format(t[1]))

        if label:
            plt.legend(framealpha=1.0)
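
mean_y_lines provides horizontal reference lines for the mean panel as (name, value, error) tuples. A hypothetical call; data_x_gaia and the reference value are placeholders, not measured numbers.

proba_motion_analysis(data_x_gaia,
                      motions=['parallax'],
                      x_lim=(0.3, 1.0),
                      step=0.004,
                      mean_y_lines=[('reference parallax', -0.02, 0.01)])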