def spacetime2(reg, sReg, df, ko, omega_age_smooth, lambda_time_smooth,
               lambda_time_smooth_nodata, zeta_space_smooth,
               zeta_space_smooth_nodata):
    """
    Compute the spacetime weight matrix for a super region.

    Full data set tells which values need weights, train data set are the
    residuals which need weighting.

    Parameters
    ----------
    reg : region identifier used to subset ``df`` to the prediction rows
    sReg : super-region identifier used to subset ``df`` to the training rows
    df : pandas.DataFrame with (at least) ``region``, ``super_region`` and
        ``year`` columns
    ko : boolean mask aligned with ``df`` selecting the knock-out/training rows
    omega_age_smooth, lambda_time_smooth, lambda_time_smooth_nodata :
        smoothing parameters forwarded to ``timeW``
    zeta_space_smooth, zeta_space_smooth_nodata :
        spatial smoothing parameters forwarded to ``calculate_xi_matrix``

    Returns
    -------
    float32 array of spacetime weights with columns normalized to sum to 1.

    NOTE(review): ``EV`` is presumably ``numexpr.evaluate`` — the strings it
    receives reference the surrounding local variables by name, so local
    names in this function must not be changed. TODO confirm.
    """
    # Rows to predict for (one region) vs. rows to train on (whole super
    # region, restricted by the knock-out mask).
    full_sub = df[(df.region == reg)]
    train_sub = df[(df.super_region == sReg) & (ko)]
    year_start = np.min(df.year)
    year_end = np.max(df.year)
    # Age/time weight matrix between every full observation and every
    # training observation.
    Wat = timeW(full_sub, train_sub, omega_age_smooth, lambda_time_smooth,
                lambda_time_smooth_nodata, year_start, year_end).astype("float32")
    # Indicator matrices of valid training rows per spatial level
    # (not-representative, subnational, country, region, super region).
    NR, SN, C, R, SR = matCRS(full_sub, train_sub)
    # Per-observation xi weights: one column per spatial level, rows sum to 1.
    xi_mat = calculate_xi_matrix(full_sub, train_sub, zeta_space_smooth,
                                 zeta_space_smooth_nodata).astype("float32")
    # Re-weight each spatial level so its columns sum to the level's xi value.
    NR = weight_matrix(NR, xi_mat[:,0], Wat).astype("float32")
    SN = weight_matrix(SN, xi_mat[:,1], Wat).astype("float32")
    C = weight_matrix(C, xi_mat[:,2], Wat).astype("float32")
    R = weight_matrix(R, xi_mat[:,3], Wat).astype("float32")
    SR = weight_matrix(SR, xi_mat[:,4], Wat).astype("float32")
    # Combine all levels into the final weight matrix.
    final = EV("NR + SN + C + R + SR").astype("float32")
    del NR, SN, C, R, SR  # free the large intermediates before normalizing
    # Normalize each column to sum to 1; columns that summed to zero are
    # divided by 1 instead, leaving them all-zero rather than NaN.
    account_missing = final.sum(0)
    account_missing[account_missing == .0] = 1.
    return EV("final / account_missing").astype("float32")
def weight_matrix(valid_positions, xi_vector, weight_matrix):
    """
    (matrix, vector, matrix)

    Given a matrix of valid positions for an analytic region
    (valid_positions), a vector of appropriate xi weights to use for each
    column in that vector (xi_vector), and an age year weighted matrix
    generated by timeW will return a matrix re-weighted so that each column
    adds up to the corresponding xi value in the xi_vector.

    NOTE(review): the third parameter shadows this function's own name;
    left as-is because the parameter name is part of the call signature.
    ``EV`` (presumably ``numexpr.evaluate`` — TODO confirm) evaluates its
    string against these local names, so none of them may be renamed.
    """
    # Mask the time/age weights down to the valid positions for this level.
    weights = EV("valid_positions * weight_matrix")
    # Column sums for normalization; replace zero sums with 1 so that
    # all-zero columns divide to zero instead of NaN.
    sum_of_weights = weights.sum(0)
    sum_of_weights[sum_of_weights == .0] = 1.
    # Scale each normalized column by its xi weight.
    return EV("(weights / sum_of_weights) * xi_vector")
def calculate_xi_matrix(full, train, zeta_space_smooth, zeta_space_smooth_nodata):
    '''
    (data frame, data frame, float, float) -> array

    Given two data frames ("full", "train") where train is a subset of full
    used to train a model and two float values for possible use of xi value
    ("zeta_space_smooth", "zeta_space_smooth_nodata") returns a matrix of xi
    values with a number of rows equal to the number of observations in the
    full data and 5 columns. Each cell is given a value depending on the
    weighting that should be used for each observation in the full set in
    comparison to training observations if they share the same sub_national,
    country, region or super region for columns 2 through 5. The first
    column is the weight of data that is at the most specific level for that
    observation but not representative. Each row sum should add up to 1.

    NOTE(review): ``EV`` is presumably ``numexpr.evaluate`` and reads the
    surrounding locals by name — TODO confirm; local names used inside the
    EV string must not change.
    '''
    depths = location_depth(full, train)
    # Base xi weights per observation, one row per observation in `full`.
    # A list comprehension (rather than np.array(map(...))) is required on
    # Python 3, where map() returns an iterator and np.array would wrap it
    # as a 0-d object array, breaking every base[:, i] index below.
    base = np.array([calc_xi_vec(x, zeta_space_smooth, zeta_space_smooth_nodata)
                     for x in depths])
    # Locations that have any non-nationally-representative training data.
    non_rep_loc = train[train.national != 1].location_id.unique()
    # Vectorized membership test (equivalent to the per-element
    # `x in non_rep_loc` lambda, but a single C-level pass).
    non_rep = full.location_id.isin(non_rep_loc)
    # Weight assigned to the non-representative level for each observation;
    # zero wherever the location has no non-representative data.
    non_rep_vec = EV(
        "zeta_space_smooth * (non_rep - non_rep * zeta_space_smooth)")
    # Keep track of the places that only have non-representative data so we
    # can give them the full location weight only.
    only_non_rep_loc = np.setdiff1d(
        non_rep_loc, train[train.national == 1].location_id.unique())
    only_non_rep = full.location_id.isin(only_non_rep_loc)
    # Take the non-representative weight out of the most specific level
    # present: the subnational column when it is non-zero, else the country
    # column.
    modify_SN = non_rep_vec * (base[:, 0] != 0).astype(int)
    modify_C = non_rep_vec * (base[:, 0] == 0).astype(int)
    base[:, 0] = base[:, 0] - modify_SN
    base[:, 1] = base[:, 1] - modify_C
    # Prepend the non-representative weight as the first column.
    base = np.append(non_rep_vec.reshape(len(base), 1), base, 1)
    # For locations with ONLY non-representative data, fold the level-specific
    # weight back into column 0 so they receive the full location weight.
    # Depth semantics (3 vs 4) come from location_depth — presumably
    # country-level vs subnational-level observations; TODO confirm.
    base[only_non_rep.values & (depths == 3), 0] = \
        base[only_non_rep.values & (depths == 3), :][:, [0, 2]].sum(axis=1)
    base[only_non_rep.values & (depths == 3), 2] = 0
    base[only_non_rep.values & (depths == 4), 0] = \
        base[only_non_rep.values & (depths == 4), :][:, [0, 1]].sum(axis=1)
    base[only_non_rep.values & (depths == 4), 1] = 0
    return base