def calc_metrics(self, data, gpi_info):
    """
    Calculate the desired statistics for one grid point.

    Parameters
    ----------
    data : pd.DataFrame
        with >2 columns, the first column is the reference dataset
        named 'ref' other columns are the datasets to compare against
        named 'other_i'
    gpi_info : tuple
        of (gpi, lon, lat)

    Returns
    -------
    dataset : dict
        Result template filled with the computed metrics; each value is
        a one-element array indexed with ``[0]``.

    Notes
    -----
    Kendall tau calculation is optional at the moment
    because the scipy implementation is very slow which is problematic
    for global comparisons
    """
    # Let the parent class fill in gpi/lon/lat (and any template fields).
    dataset = super(IntercomparisonMetrics, self).calc_metrics(
        data, gpi_info)

    # All rows of the temporally matched frame are used, so the number
    # of observations is simply the frame length.  (The original code
    # allocated an all-True boolean mask just to sum it.)
    n_obs = len(data)
    if n_obs < self.min_obs:
        # Not enough data: return the template with metrics left empty.
        return dataset

    dataset['n_obs'][0] = n_obs

    # --- pairwise metrics over all dataset combinations -------------
    # df_metrics helpers return namedtuples keyed by the combined
    # column names; convert to dicts for lookup by tds_name below.

    # calculate Pearson correlation
    pearson_R, pearson_p = df_metrics.pearsonr(data)
    pearson_R, pearson_p = pearson_R._asdict(), pearson_p._asdict()

    # calculate Spearman correlation
    spea_rho, spea_p = df_metrics.spearmanr(data)
    spea_rho, spea_p = spea_rho._asdict(), spea_p._asdict()

    # calculate bias
    bias_nT = df_metrics.bias(data)
    bias_dict = bias_nT._asdict()

    # calculate RMSD
    rmsd = df_metrics.rmsd(data)
    rmsd_dict = rmsd._asdict()

    # calculate MSE and its decomposition (correlation/bias/variance)
    mse, mse_corr, mse_bias, mse_var = df_metrics.mse(data)
    mse_dict = mse._asdict()
    mse_corr_dict = mse_corr._asdict()
    mse_bias_dict = mse_bias._asdict()
    mse_var_dict = mse_var._asdict()

    # calculate RSS
    rss = df_metrics.RSS(data)
    rss_dict = rss._asdict()

    # calculate tau (optional: scipy's kendalltau is slow)
    if self.calc_tau:
        tau, p_tau = df_metrics.kendalltau(data)
        tau_dict, p_tau_dict = tau._asdict(), p_tau._asdict()
    else:
        tau = p_tau = p_tau_dict = tau_dict = None

    # always scale for ubRMSD with mean std
    data_scaled = scale(data, method='mean_std')
    ubRMSD_nT = df_metrics.ubrmsd(data_scaled)
    ubRMSD_dict = ubRMSD_nT._asdict()

    for tds_name in self.tds_names:
        R, p_R = pearson_R[tds_name], pearson_p[tds_name]
        rho, p_rho = spea_rho[tds_name], spea_p[tds_name]
        bias = bias_dict[tds_name]
        mse = mse_dict[tds_name]
        mse_corr = mse_corr_dict[tds_name]
        mse_bias = mse_bias_dict[tds_name]
        mse_var = mse_var_dict[tds_name]
        rmsd = rmsd_dict[tds_name]
        ubRMSD = ubRMSD_dict[tds_name]
        rss = rss_dict[tds_name]

        if tau_dict and p_tau_dict:
            tau = tau_dict[tds_name]
            p_tau = p_tau_dict[tds_name]

        # Map raw column names onto the configured output dataset names.
        split_tds_name = tds_name.split(self.ds_names_split)
        tds_name_key = self.ds_names_split.join([
            self.ds_names_lut[split_tds_name[0]],
            self.ds_names_lut[split_tds_name[1]]
        ])

        dataset[self.metric_ds_split.join(['R', tds_name_key])][0] = R
        dataset[self.metric_ds_split.join(['p_R', tds_name_key])][0] = p_R
        dataset[self.metric_ds_split.join(['rho', tds_name_key])][0] = rho
        dataset[self.metric_ds_split.join(['p_rho', tds_name_key])][0] = p_rho
        dataset[self.metric_ds_split.join(['BIAS', tds_name_key])][0] = bias
        dataset[self.metric_ds_split.join(['mse', tds_name_key])][0] = mse
        dataset[self.metric_ds_split.join(['mse_corr', tds_name_key])][0] = mse_corr
        dataset[self.metric_ds_split.join(['mse_bias', tds_name_key])][0] = mse_bias
        dataset[self.metric_ds_split.join(['mse_var', tds_name_key])][0] = mse_var
        dataset[self.metric_ds_split.join(['RMSD', tds_name_key])][0] = rmsd
        dataset[self.metric_ds_split.join(['urmsd', tds_name_key])][0] = ubRMSD
        dataset[self.metric_ds_split.join(['RSS', tds_name_key])][0] = rss

        if self.calc_tau:
            dataset[self.metric_ds_split.join(['tau', tds_name_key])][0] = tau
            dataset[self.metric_ds_split.join(['p_tau', tds_name_key])][0] = p_tau

    return dataset
def calc_metrics(self, data, gpi_info):
    """
    Calculate the desired statistics for one grid point.

    Parameters
    ----------
    data : pandas.DataFrame
        with >2 columns, the first column is the reference dataset
        named 'ref' other columns are the data sets to compare against
        named 'other_i'
    gpi_info : tuple
        of (gpi, lon, lat)

    Returns
    -------
    dataset : dict
        Result template filled with the computed metrics; each value is
        a one-element array indexed with ``[0]``.

    Notes
    -----
    Kendall tau is calculation is optional at the moment
    because the scipy implementation is very slow which is problematic
    for global comparisons
    """
    dataset = copy.deepcopy(self.result_template)

    dataset['gpi'][0] = gpi_info[0]
    dataset['lon'][0] = gpi_info[1]
    dataset['lat'][0] = gpi_info[2]

    # number of observations; the all-True mask is kept because it is
    # reused below to index the raw value arrays for the SNR inputs
    subset = np.ones(len(data), dtype=bool)
    n_obs = subset.sum()
    # Minimum sample size; honour a configured `min_obs` attribute if
    # present (consistent with the other calculators in this module),
    # falling back to the previous hard-coded threshold of 10.
    if n_obs < getattr(self, 'min_obs', 10):
        return dataset

    dataset['n_obs'][0] = n_obs

    # calculate Pearson correlation
    pearson_R, pearson_p = df_metrics.pearsonr(data)
    pearson_R = pearson_R._asdict()
    pearson_p = pearson_p._asdict()

    # calculate Spearman correlation
    spea_rho, spea_p = df_metrics.spearmanr(data)
    spea_rho = spea_rho._asdict()
    spea_p = spea_p._asdict()

    # calculate bias
    bias_nT = df_metrics.bias(data)
    bias_dict = bias_nT._asdict()

    # calculate RMSD
    rmsd = df_metrics.rmsd(data)
    rmsd_dict = rmsd._asdict()

    # calculate MSE and its decomposition
    mse, mse_corr, mse_bias, mse_var = df_metrics.mse(data)
    mse_dict = mse._asdict()
    mse_corr_dict = mse_corr._asdict()
    mse_bias_dict = mse_bias._asdict()
    mse_var_dict = mse_var._asdict()

    # calculate tau (optional: scipy's kendalltau is slow)
    if self.calc_tau:
        tau, p_tau = df_metrics.kendalltau(data)
        tau_dict = tau._asdict()
        p_tau_dict = p_tau._asdict()
    else:
        tau = p_tau = p_tau_dict = tau_dict = None

    # calculate ubRMSD
    # NOTE: unlike the other calculators, this one deliberately does
    # NOT rescale the data first (the scale() call was commented out).
    ubRMSD_nT = df_metrics.ubrmsd(data)
    ubRMSD_dict = ubRMSD_nT._asdict()

    # get single dataset metrics
    # calculate SNR via triple collocation on the first three columns
    x = data[self.df_columns[0]].values[subset]
    y = data[self.df_columns[1]].values[subset]
    z = data[self.df_columns[2]].values[subset]

    snr, err, beta = metrics.tcol_snr(x, y, z)

    for i, name in enumerate(self.ds_names):
        dataset['{:}_snr'.format(name)][0] = snr[i]
        dataset['{:}_err_var'.format(name)][0] = err[i]
        dataset['{:}_beta'.format(name)][0] = beta[i]

    for tds_name in self.tds_names:
        R = pearson_R[tds_name]
        p_R = pearson_p[tds_name]
        rho = spea_rho[tds_name]
        p_rho = spea_p[tds_name]
        bias = bias_dict[tds_name]
        mse = mse_dict[tds_name]
        mse_corr = mse_corr_dict[tds_name]
        mse_bias = mse_bias_dict[tds_name]
        mse_var = mse_var_dict[tds_name]
        rmsd = rmsd_dict[tds_name]
        ubRMSD = ubRMSD_dict[tds_name]

        if tau_dict and p_tau_dict:
            tau = tau_dict[tds_name]
            p_tau = p_tau_dict[tds_name]

        # Map raw column names onto the configured output dataset names.
        split_tds_name = tds_name.split('_and_')
        tds_name_key = "{:}_{:}".format(
            self.ds_names_lut[split_tds_name[0]],
            self.ds_names_lut[split_tds_name[1]])

        dataset['R_between_{:}'.format(tds_name_key)][0] = R
        dataset['p_R_between_{:}'.format(tds_name_key)][0] = p_R
        dataset['rho_between_{:}'.format(tds_name_key)][0] = rho
        dataset['p_rho_between_{:}'.format(tds_name_key)][0] = p_rho
        dataset['bias_between_{:}'.format(tds_name_key)][0] = bias
        dataset['mse_between_{:}'.format(tds_name_key)][0] = mse
        dataset['mse_corr_between_{:}'.format(tds_name_key)][0] = mse_corr
        dataset['mse_bias_between_{:}'.format(tds_name_key)][0] = mse_bias
        dataset['mse_var_between_{:}'.format(tds_name_key)][0] = mse_var
        dataset['rmsd_between_{:}'.format(tds_name_key)][0] = rmsd
        dataset['ubRMSD_between_{:}'.format(tds_name_key)][0] = ubRMSD

        if self.calc_tau:
            dataset['tau_between_{:}'.format(tds_name_key)][0] = tau
            dataset['p_tau_between_{:}'.format(tds_name_key)][0] = p_tau

    return dataset
def calc_metrics(self, data, gpi_info):
    """
    Calculate Triple Collocation metrics

    Parameters
    ----------
    data : pd.DataFrame
        with >2 columns, the first column is the reference dataset
        named 'ref' other columns are the data sets to compare against
        named 'other_i'
    gpi_info : tuple
        of (gpi, lon, lat)

    Returns
    -------
    dataset : dict
        Result template filled with the computed metrics; each value is
        a one-element array indexed with ``[0]``.

    Notes
    -----
    Kendall tau is calculation is optional at the moment
    because the scipy implementation is very slow which is problematic
    for global comparisons
    """
    dataset = copy.deepcopy(self.result_template)

    dataset['gpi'][0] = gpi_info[0]
    dataset['lon'][0] = gpi_info[1]
    dataset['lat'][0] = gpi_info[2]

    # Copy per-point metadata fields (supplied in gpi_info[3]) into the
    # result.  `is not None` is the correct None check (PEP 8); the
    # template values themselves are not needed, only the keys.
    if self.metadata_template is not None:
        for key in self.metadata_template.keys():
            dataset[key][0] = gpi_info[3][key]

    # number of observations: all rows of the matched frame are used
    n_obs = len(data)
    if n_obs < self.min_obs:
        # Not enough data: return the template with metrics left empty.
        return dataset

    dataset['n_obs'][0] = n_obs

    # calculate Pearson correlation
    pearson_R, pearson_p = df_metrics.pearsonr(data)
    pearson_R, pearson_p = pearson_R._asdict(), pearson_p._asdict()

    # calculate Spearman correlation
    spea_rho, spea_p = df_metrics.spearmanr(data)
    spea_rho, spea_p = spea_rho._asdict(), spea_p._asdict()

    # calculate bias
    bias_nT = df_metrics.bias(data)
    bias_dict = bias_nT._asdict()

    # calculate RMSD
    rmsd = df_metrics.rmsd(data)
    rmsd_dict = rmsd._asdict()

    # calculate MSE and its decomposition
    mse, mse_corr, mse_bias, mse_var = df_metrics.mse(data)
    mse_dict = mse._asdict()
    mse_corr_dict = mse_corr._asdict()
    mse_bias_dict = mse_bias._asdict()
    mse_var_dict = mse_var._asdict()

    # calculate RSS
    rss = df_metrics.RSS(data)
    rss_dict = rss._asdict()

    # calculate ubRMSD
    # todo: we could use the TC derived scaling parameters here?
    data_scaled = scale(data, method='mean_std')
    ubRMSD_nT = df_metrics.ubrmsd(data_scaled)
    ubRMSD_dict = ubRMSD_nT._asdict()

    # calculate tau (optional: scipy's kendalltau is slow)
    if self.calc_tau:
        tau, p_tau = df_metrics.kendalltau(data)
        tau_dict, p_tau_dict = tau._asdict(), p_tau._asdict()
    else:
        tau = p_tau = p_tau_dict = tau_dict = None

    # calculate TC metrics, anchored on the reference column
    ref_ind = np.where(np.array(data.columns) == self.ref_name)[0][0]
    snrs, err_stds, betas = df_metrics.tcol_snr(data, ref_ind=ref_ind)
    snr_dict = self._tc_res_dict(snrs)
    err_std_dict = self._tc_res_dict(err_stds)
    beta_dict = self._tc_res_dict(betas)

    # store TC results
    for thds_name in self.thds_names:
        snr = snr_dict[thds_name]
        err_std = err_std_dict[thds_name]
        beta = beta_dict[thds_name]

        split_thds_name = thds_name.split(self.ds_names_split)
        thds_name_key = self.ds_names_split.join([
            self.ds_names_lut[split_thds_name[0]],
            self.ds_names_lut[split_thds_name[1]],
            self.ds_names_lut[split_thds_name[2]]
        ])

        # Only store keys that exist in the result template; TC
        # results are per-dataset within each triple.
        for metr, res in dict(snr=snr, err_std=err_std, beta=beta).items():
            for ds, ds_res in res.items():
                m_ds = "{}_{}".format(metr, self.ds_names_lut[ds])
                n = '{}{}{}'.format(m_ds, self.metric_ds_split,
                                    thds_name_key)
                if n in dataset.keys():
                    dataset[n][0] = ds_res

    # Store basic metrics results
    for tds_name in self.tds_names:
        R, p_R = pearson_R[tds_name], pearson_p[tds_name]
        rho, p_rho = spea_rho[tds_name], spea_p[tds_name]
        bias = bias_dict[tds_name]
        mse = mse_dict[tds_name]
        mse_corr = mse_corr_dict[tds_name]
        mse_bias = mse_bias_dict[tds_name]
        mse_var = mse_var_dict[tds_name]
        rmsd = rmsd_dict[tds_name]
        ubRMSD = ubRMSD_dict[tds_name]
        rss = rss_dict[tds_name]

        if tau_dict and p_tau_dict:
            tau = tau_dict[tds_name]
            p_tau = p_tau_dict[tds_name]

        # Map raw column names onto the configured output dataset names.
        split_tds_name = tds_name.split(self.ds_names_split)
        tds_name_key = self.ds_names_split.join([
            self.ds_names_lut[split_tds_name[0]],
            self.ds_names_lut[split_tds_name[1]]
        ])

        dataset[self.metric_ds_split.join(['R', tds_name_key])][0] = R
        dataset[self.metric_ds_split.join(['p_R', tds_name_key])][0] = p_R
        dataset[self.metric_ds_split.join(['rho', tds_name_key])][0] = rho
        dataset[self.metric_ds_split.join(['p_rho', tds_name_key])][0] = p_rho
        dataset[self.metric_ds_split.join(['BIAS', tds_name_key])][0] = bias
        dataset[self.metric_ds_split.join(['mse', tds_name_key])][0] = mse
        dataset[self.metric_ds_split.join(['mse_corr', tds_name_key])][0] = mse_corr
        dataset[self.metric_ds_split.join(['mse_bias', tds_name_key])][0] = mse_bias
        dataset[self.metric_ds_split.join(['mse_var', tds_name_key])][0] = mse_var
        dataset[self.metric_ds_split.join(['RMSD', tds_name_key])][0] = rmsd
        dataset[self.metric_ds_split.join(['urmsd', tds_name_key])][0] = ubRMSD
        dataset[self.metric_ds_split.join(['RSS', tds_name_key])][0] = rss

        if self.calc_tau:
            dataset[self.metric_ds_split.join(['tau', tds_name_key])][0] = tau
            dataset[self.metric_ds_split.join(['p_tau', tds_name_key])][0] = p_tau

    return dataset
def calc_metrics(self, data, gpi_info):
    """
    Calculate the desired statistics for one grid point.

    Parameters
    ----------
    data : pandas.DataFrame
        with >2 columns, the first column is the reference dataset
        named 'ref' other columns are the data sets to compare against
        named 'other_i'
    gpi_info : tuple
        of (gpi, lon, lat)

    Returns
    -------
    dataset : dict
        Result template filled with the computed metrics; each value is
        a one-element array indexed with ``[0]``.

    Notes
    -----
    Kendall tau is calculation is optional at the moment
    because the scipy implementation is very slow which is problematic
    for global comparisons
    """
    dataset = copy.deepcopy(self.result_template)

    dataset['gpi'][0] = gpi_info[0]
    dataset['lon'][0] = gpi_info[1]
    dataset['lat'][0] = gpi_info[2]

    # number of observations; the all-True mask is kept because it is
    # reused below to index the raw value arrays for the SNR inputs
    subset = np.ones(len(data), dtype=bool)
    n_obs = subset.sum()
    # Minimum sample size; honour a configured `min_obs` attribute if
    # present (consistent with the other calculators in this module),
    # falling back to the previous hard-coded threshold of 10.
    if n_obs < getattr(self, 'min_obs', 10):
        return dataset

    dataset['n_obs'][0] = n_obs

    # calculate Pearson correlation
    pearson_R, pearson_p = df_metrics.pearsonr(data)
    pearson_R = pearson_R._asdict()
    pearson_p = pearson_p._asdict()

    # calculate Spearman correlation
    spea_rho, spea_p = df_metrics.spearmanr(data)
    spea_rho = spea_rho._asdict()
    spea_p = spea_p._asdict()

    # calculate bias
    bias_nT = df_metrics.bias(data)
    bias_dict = bias_nT._asdict()

    # calculate RMSD
    rmsd = df_metrics.rmsd(data)
    rmsd_dict = rmsd._asdict()

    # calculate MSE and its decomposition
    mse, mse_corr, mse_bias, mse_var = df_metrics.mse(data)
    mse_dict = mse._asdict()
    mse_corr_dict = mse_corr._asdict()
    mse_bias_dict = mse_bias._asdict()
    mse_var_dict = mse_var._asdict()

    # calculate tau (optional: scipy's kendalltau is slow)
    if self.calc_tau:
        tau, p_tau = df_metrics.kendalltau(data)
        tau_dict = tau._asdict()
        p_tau_dict = p_tau._asdict()
    else:
        tau = p_tau = p_tau_dict = tau_dict = None

    # calculate ubRMSD
    # NOTE: unlike the other calculators, this one deliberately does
    # NOT rescale the data first (the scale() call was commented out).
    ubRMSD_nT = df_metrics.ubrmsd(data)
    ubRMSD_dict = ubRMSD_nT._asdict()

    # get single dataset metrics
    # calculate SNR via triple collocation on the first three columns
    x = data[self.df_columns[0]].values[subset]
    y = data[self.df_columns[1]].values[subset]
    z = data[self.df_columns[2]].values[subset]

    snr, err, beta = metrics.tcol_snr(x, y, z)

    for i, name in enumerate(self.ds_names):
        dataset['{:}_snr'.format(name)][0] = snr[i]
        dataset['{:}_err_var'.format(name)][0] = err[i]
        dataset['{:}_beta'.format(name)][0] = beta[i]

    for tds_name in self.tds_names:
        R = pearson_R[tds_name]
        p_R = pearson_p[tds_name]
        rho = spea_rho[tds_name]
        p_rho = spea_p[tds_name]
        bias = bias_dict[tds_name]
        mse = mse_dict[tds_name]
        mse_corr = mse_corr_dict[tds_name]
        mse_bias = mse_bias_dict[tds_name]
        mse_var = mse_var_dict[tds_name]
        rmsd = rmsd_dict[tds_name]
        ubRMSD = ubRMSD_dict[tds_name]

        if tau_dict and p_tau_dict:
            tau = tau_dict[tds_name]
            p_tau = p_tau_dict[tds_name]

        # Map raw column names onto the configured output dataset names.
        split_tds_name = tds_name.split('_and_')
        tds_name_key = "{:}_{:}".format(
            self.ds_names_lut[split_tds_name[0]],
            self.ds_names_lut[split_tds_name[1]])

        dataset['R_between_{:}'.format(tds_name_key)][0] = R
        dataset['p_R_between_{:}'.format(tds_name_key)][0] = p_R
        dataset['rho_between_{:}'.format(tds_name_key)][0] = rho
        dataset['p_rho_between_{:}'.format(tds_name_key)][0] = p_rho
        dataset['bias_between_{:}'.format(tds_name_key)][0] = bias
        dataset['mse_between_{:}'.format(tds_name_key)][0] = mse
        dataset['mse_corr_between_{:}'.format(tds_name_key)][0] = mse_corr
        dataset['mse_bias_between_{:}'.format(tds_name_key)][0] = mse_bias
        dataset['mse_var_between_{:}'.format(tds_name_key)][0] = mse_var
        dataset['rmsd_between_{:}'.format(tds_name_key)][0] = rmsd
        dataset['ubRMSD_between_{:}'.format(tds_name_key)][0] = ubRMSD

        if self.calc_tau:
            dataset['tau_between_{:}'.format(tds_name_key)][0] = tau
            dataset['p_tau_between_{:}'.format(tds_name_key)][0] = p_tau

    return dataset