def spacetime2(reg, sReg, df, ko, omega_age_smooth, lambda_time_smooth,
               lambda_time_smooth_nodata, zeta_space_smooth,
               zeta_space_smooth_nodata):
    """
    Build the normalized space-time weight matrix for one region.

    The rows of ``df`` belonging to region ``reg`` are the observations that
    need weights; the rows of ``df`` belonging to super region ``sReg`` and
    selected by the mask ``ko`` are the training residuals being weighted.

    Steps:
      1. compute the age/time weights with ``timeW``;
      2. get the five location indicator matrices (NR, SN, C, R, SR) from
         ``matCRS``;
      3. get the per-level xi weights from ``calculate_xi_matrix`` (one
         column per indicator matrix, in the same order);
      4. combine each indicator matrix with its xi column and the age/time
         weights through ``weight_matrix`` and add the five results;
      5. divide by the sums taken along axis 0, treating a zero sum as 1 so
         that empty columns do not cause a division by zero.

    Returns the resulting matrix as float32.
    """
    region_rows = df[df.region == reg]
    training_rows = df[(df.super_region == sReg) & (ko)]

    # The year range is taken from the whole data frame, not the subsets.
    first_year, last_year = np.min(df.year), np.max(df.year)

    age_time = timeW(
        region_rows, training_rows, omega_age_smooth, lambda_time_smooth,
        lambda_time_smooth_nodata, first_year, last_year,
    ).astype("float32")

    NR, SN, C, R, SR = matCRS(region_rows, training_rows)

    xi = calculate_xi_matrix(
        region_rows, training_rows, zeta_space_smooth,
        zeta_space_smooth_nodata,
    ).astype("float32")

    # Each indicator matrix is replaced in place by its weighted version so
    # the indicator itself can be released as early as possible.
    NR = weight_matrix(NR, xi[:, 0], age_time).astype("float32")
    SN = weight_matrix(SN, xi[:, 1], age_time).astype("float32")
    C = weight_matrix(C, xi[:, 2], age_time).astype("float32")
    R = weight_matrix(R, xi[:, 3], age_time).astype("float32")
    SR = weight_matrix(SR, xi[:, 4], age_time).astype("float32")

    # EV is given only the expression text, so the operand names inside the
    # strings below have to stay in sync with the local variable names.
    final = EV("NR + SN + C + R + SR").astype("float32")
    del NR, SN, C, R, SR

    account_missing = final.sum(axis=0)
    empty_columns = account_missing == 0.0
    account_missing[empty_columns] = 1.0
    return EV("final / account_missing").astype("float32")
def timeW(full_sub, train_sub, omega_age_smooth, lambda_time_smooth,
          lambda_time_smooth_nodata, year_start=None, year_end=None):
    '''
    Gets the age-by-time weight of a super region given a full and a
    training data set.

    The weight of a residual/observation pair is the product of
      * an age weight   ``1 / exp(omega_age_smooth * |age difference|)`` and
      * a time weight   ``(1 - (|year difference| / (aMax + 1)) ** l) ** 3``
    where ``aMax`` is, for each observation in ``full_sub``, the larger of
    its distances to the first and to the last year of the time series, and
    ``l`` is the lambda array returned by ``calculate_lamda_array``.

    full_sub, train_sub : data frames with ``ageC`` and ``year`` columns.
    omega_age_smooth : age smoothing parameter.
    lambda_time_smooth, lambda_time_smooth_nodata : passed through to
        ``calculate_lamda_array``.
    year_start, year_end : optional first and last year of the time series.
        ``spacetime2`` passes the year range of its whole data frame here.
        When omitted (None) the range of ``full_sub.year`` is used, which is
        what this function did before the two parameters existed.

    Returns the weight matrix. According to the original documentation it
    has one row per training residual and one column per observation of the
    full data set; the actual layout is determined by ``makeS``, which is
    defined elsewhere.
    '''
    # NOTE: EV is given only the expression text, so the operand names used
    # in the strings below must match these local variable names exactly.
    ageS = makeS(full_sub, train_sub, "ageC").astype("float32")  # stride of age values
    yearS = makeS(full_sub, train_sub, "year").astype("float32")  # stride of year values
    l = calculate_lamda_array(
        full_sub, train_sub, lambda_time_smooth, lambda_time_smooth_nodata
    ).astype("float32")  # lambda assigned by calculate_lamda_array
    i1 = full_sub.ageC.values.astype("float32")  # age vector
    i2 = full_sub.year.values.astype("float32")  # year vector

    # Prefer the caller-supplied year range; fall back to the subset's own.
    start = full_sub.year.min() if year_start is None else year_start
    end = full_sub.year.max() if year_end is None else year_end

    # Largest distance from each observation's year to either end of the
    # time series.
    aMax = np.maximum(EV("abs(i2-start)"), EV("abs(end-i2)")).astype("float32")
    return EV(
        "(1/exp(omega_age_smooth*abs(ageS-i1))) * (1 - (abs(yearS-i2)/(aMax+1))**l)**3"
    )
def matCRS(full_sub, train_sub):
    """
    Build the five location indicator matrices relating every training
    residual (train_sub) to every observation (full_sub).

    Returns the tuple (NR, SN, C, R, SR). As computed below:
      NR - pair has the same location_id and the training row has
           national == 0 (not representative data);
      SN - pair has the same location_id, the training row's location_id
           differs from its country_id (a sub-national location), and NR is
           not set;
      C  - pair has the same country_id, excluding pairs already counted
           in SN or NR;
      R  - pair has the same region, excluding pairs counted in C, SN
           or NR;
      SR - every remaining pair (1 minus the sum of the other four).

    Each row corresponds to a training residual and each column to an
    observation of the full data frame (this is the layout stated by the
    original documentation and it relies on what makeS returns, which is
    defined elsewhere).

    NOTE: EV receives only the expression text, so the variable names used
    inside the expression strings must match the local names exactly; do
    not rename locals without updating the strings.
    """
    # Location identifiers arranged by makeS so they can be compared
    # against the vectors taken from full_sub below.
    sub_nat_S = makeS(full_sub, train_sub, "location_id")
    country_S = makeS(full_sub, train_sub, "country_id")
    region_S = makeS(full_sub, train_sub, "region")
    # Identifier vectors of the observations in the full data set.
    sub_nat_V = full_sub.location_id.values
    country_V = full_sub.country_id.values
    region_V = full_sub.region.values
    # 1 where the training row's location differs from its country.
    has_sub_nat_V = (train_sub.country_id != train_sub.location_id).values.astype(np.int8)
    # 0**0 == 1 and 0**1 == 0, so this is 1 exactly where national == 0.
    not_representitive_V = (0**train_sub.national.values).astype(np.int8)
    # Same-location indicator, transposed so that the per-training-row
    # vectors above broadcast along the last axis.
    SN = EV("sub_nat_S == sub_nat_V").astype(np.int8).T
    # Both products are transposed back after applying the training vector.
    NR = EV("SN * not_representitive_V").T
    SN = EV("SN * has_sub_nat_V").T
    C = EV("country_S == country_V").astype(np.int8)
    # Remove the sub-national matches from the country matches.
    C = EV("C - SN")
    # 0**NR is 1 where NR == 0 and 0 where NR == 1: clear SN and C wherever
    # the pair was already classified as not representative.
    SN = EV("SN * 0**NR")
    C = EV("C * 0**NR")
    R = EV("region_S == region_V").astype(np.int8)
    # Keep only region matches not already covered by a finer level.
    R = EV("R - C - SN - NR")
    # Whatever is left over belongs to the super-region level.
    SR = EV("1 - R - C - SN - NR")
    return NR, SN, C, R, SR
def weight_matrix(valid_positions, xi_vector, weight_matrix):
    """
    (matrix, vector, matrix) -> matrix

    Mask an age/time weight matrix and rescale it by the xi weights.

    valid_positions : indicator matrix marking which cells belong to the
        analytic level being weighted.
    xi_vector : xi weights that are multiplied into the normalized weights.
    weight_matrix : age/time weight matrix (as produced by timeW).

    The masked weights are divided by their sums along axis 0 and then
    multiplied by xi_vector. A sum of exactly zero is replaced by 1 before
    dividing, so groups without any valid position stay at zero instead of
    producing a division by zero.
    """
    # EV is given only the expression text, so the operand names in the
    # strings must match the parameter/local names of this function.
    weights = EV("valid_positions * weight_matrix")

    sum_of_weights = weights.sum(axis=0)
    nothing_to_weight = sum_of_weights == 0.0
    sum_of_weights[nothing_to_weight] = 1.0

    return EV("(weights / sum_of_weights) * xi_vector")
def calculate_xi_matrix(full, train, zeta_space_smooth, zeta_space_smooth_nodata):
    '''
    (data frame, data frame, float, float) -> array

    Given two data frames ("full", "train"), where train is the subset of
    data used to train a model, and the two zeta parameters
    ("zeta_space_smooth", "zeta_space_smooth_nodata"), returns a matrix of
    xi values with 5 columns.

    Column 0 is the weight given to data from the same location that is not
    representative (national != 1 in train). Columns 1 through 4 are the
    weights for the sub-national, country, region and super-region levels;
    they start from the vectors returned by calc_xi_vec for each entry of
    location_depth(full, train).

    Where a location has non-representative data, the column-0 weight is
    subtracted from the sub-national weight if that weight is non-zero and
    from the country weight otherwise, so the row total is unchanged. For
    locations that only have non-representative data, the whole location
    level weight is moved into column 0: the country column for depth 3 and
    the sub-national column for depth 4.

    NOTE(review): the original documentation says there is one row per
    training observation, but the rows are built from
    location_depth(full, train) and masked with vectors derived from
    full.location_id, which suggests one row per observation of "full" --
    confirm against location_depth.
    '''
    depths = location_depth(full, train)

    # Build the list explicitly: on Python 3 map() returns an iterator and
    # np.array(map(...)) would produce a 0-d object array that cannot be
    # indexed by column.
    base = np.array([
        calc_xi_vec(depth, zeta_space_smooth, zeta_space_smooth_nodata)
        for depth in depths
    ])

    # Locations that have at least one non-representative training row.
    non_rep_loc = train[train.national != 1].location_id.unique()
    non_rep = full.location_id.isin(non_rep_loc)
    # EV is given only the expression text; the names in the string must
    # match the local names. Evaluates to zeta * (1 - zeta) where non_rep
    # is set and to 0 elsewhere.
    non_rep_vec = EV(
        "zeta_space_smooth * (non_rep - non_rep * zeta_space_smooth)")

    # keep track of the places that only have non-representative data so we
    # can give them the full location weight
    only_non_rep_loc = np.setdiff1d(
        non_rep_loc, train[train.national == 1].location_id.unique())
    only_non_rep = full.location_id.isin(only_non_rep_loc)

    # Take the non-representative weight out of the sub-national weight when
    # there is one, otherwise out of the country weight.
    modify_SN = non_rep_vec * (base[:, 0] != 0).astype(int)
    modify_C = non_rep_vec * (base[:, 0] == 0).astype(int)
    base[:, 0] = base[:, 0] - modify_SN
    base[:, 1] = base[:, 1] - modify_C

    # Prepend the non-representative weights as the new column 0; the
    # calc_xi_vec columns shift one position to the right.
    base = np.append(non_rep_vec.reshape(len(base), 1), base, 1)

    only_non_rep_mask = only_non_rep.values
    depth3_only = only_non_rep_mask & (depths == 3)
    depth4_only = only_non_rep_mask & (depths == 4)

    # Depth 3: fold the country weight (column 2) into column 0.
    base[depth3_only, 0] = base[depth3_only, :][:, [0, 2]].sum(axis=1)
    base[depth3_only, 2] = 0
    # Depth 4: fold the sub-national weight (column 1) into column 0.
    base[depth4_only, 0] = base[depth4_only, :][:, [0, 1]].sum(axis=1)
    base[depth4_only, 1] = 0
    return base