def confidence_calibration_check(round_idx, respondent_idx, d_metric_dict, n_neighbors):
    """
    For every test point, compute the proportion of its n_neighbors nearest
    neighbors (in the learned metric space) whose observed label is positive,
    and store it alongside the weighted prediction for calibration analysis.

    :param round_idx: integer, ranging from 1 to 10
    :param respondent_idx: id of respondents, ranging from the 1 to 35
    :param d_metric_dict: dict to save learned distance metric, indexed by
        the respondent_id and the round_idx
    :param n_neighbors: the number of nearest neighbors defining each
        point's neighborhood
    :return: DataFrame indexed like y_test with columns
        ['weighted_y_test', 'y_test', 'neighbor reci prop']
    """
    _, X_test, _, y_test, _ = load_dataset(onehot=True)
    X_test = preprocessing_data_with_unit_var(X_test)
    d_metric = d_metric_dict['R%d_W%d' % (respondent_idx, round_idx)]
    weighted_y_test = pd.DataFrame(
        weighted_prediction_round(round_idx)[respondent_idx - 1].values,
        index=y_test.index,
        columns=['weighted_y_test'])
    weighted_y_test['y_test'] = y_test
    # Placeholder values; every row is overwritten in the loop below.
    weighted_y_test['neighbor reci prop'] = y_test
    X = X_test.values  # features never change; hoist out of the loop
    for idx in X_test.index:
        diff = X - X_test.loc[idx].values
        # Squared metric distance d_i = diff_i^T M diff_i for every point.
        # einsum yields just the needed diagonal instead of building the
        # full N x N product and discarding all off-diagonal entries.
        dist = pd.Series(np.einsum('ij,ij->i', diff @ d_metric, diff),
                         index=X_test.index)
        # n_neighbors-th smallest distance (position 0 is the point itself,
        # at distance 0), i.e. the radius of the neighborhood hypersphere.
        max_d = np.sort(dist.values)[n_neighbors]
        neighbor_list = [
            my_idx for my_idx in dist[dist <= max_d].index if my_idx != idx
        ]
        # BUGFIX: the original wrote running partial averages into each
        # *neighbor's* row inside the accumulation loop; the proportion of
        # positive-labelled neighbors belongs to the center point idx and
        # must be written once, after the sum is complete.
        reci = weighted_y_test.loc[neighbor_list, 'y_test'].sum()
        weighted_y_test.loc[idx, 'neighbor reci prop'] = reci / n_neighbors
    return weighted_y_test
def similarity_fig_from_metric_by_nca_part(round_idx, respondent_idx, n_neighbors, d_metric_dict):
    """
    similar as nca, only here the neighborhood is defined as the minimal
    hypersphere that contains the n_neighbors similar points of x_i

    :param round_idx: integer, ranging from 1 to 10
    :param respondent_idx: id of respondents, ranging from the 1 to 35
    :param n_neighbors: the number of similar points to define the neighborhood
    :param d_metric_dict: dict to save learned distance metric,
        indexed by the respondent_id and the round_idx
    :return: a similarity connection graph and the similarity edge list
    """
    _, X_test, _, y_test, _ = load_dataset(onehot=True)
    X_test = preprocessing_data_with_unit_var(X_test)
    d_metric = d_metric_dict['R%d_W%d' % (respondent_idx, round_idx)]
    weighted_y_test = pd.DataFrame(weighted_prediction_quantile(
        weighted_prediction_round(round_idx))[respondent_idx - 1].values,
        index=y_test.index)
    similarity_edges = []
    X = X_test.values  # features never change; hoist out of the loop
    for idx in X_test.index:
        diff = X - X_test.loc[idx].values
        # Squared metric distance per point; einsum computes only the
        # diagonal instead of the full N x N product it was taken from.
        dist = pd.Series(np.einsum('ij,ij->i', diff @ d_metric, diff),
                         index=X_test.index)
        same_class_mask = (
            weighted_y_test == weighted_y_test.loc[idx]).values.ravel()
        same_class_dist = dist[same_class_mask]
        # Radius = distance of the n_neighbors-th closest same-class point,
        # capped when fewer same-class points exist.
        max_d = np.sort(same_class_dist.values)[min(
            len(same_class_dist) - 1, n_neighbors)]
        # Connect idx to everything inside the hypersphere except itself.
        neighbor_list = [
            my_idx for my_idx in dist[dist <= max_d].index if my_idx != idx
        ]
        similarity_edges.extend((idx, item) for item in neighbor_list)
    fig, edges = similarity_fig_from_weighted_prediction(
        round_idx=round_idx,
        respondent_idx=respondent_idx,
        similarity_edge_list=similarity_edges)
    return fig, edges
def similarity_fig_from_metric_by_nca(round_idx, respondent_idx, d_metric_dict):
    """
    nca -> neighborhood component analysis
    First is to define the neighborhood of data point x_i in the transformed
    space, it is the minimal hypersphere which centers at x_i, and its radius
    is equal to the farthest point which is similar with x_i. This function
    is to build similarity edges between x_i and all the data points within
    its neighborhood.

    :param round_idx: integer, ranging from 1 to 10
    :param respondent_idx: id of respondents, ranging from the 1 to 35
    :param d_metric_dict: dict to save learned distance metric,
        indexed by the respondent_id and the round_idx
    :return: a similarity connection graph and the similarity edge list
    """
    _, X_test, _, y_test, _ = load_dataset(onehot=True)
    X_test = preprocessing_data_with_unit_var(X_test)
    d_metric = d_metric_dict['R%d_W%d' % (respondent_idx, round_idx)]
    weighted_y_test = pd.DataFrame(weighted_prediction_quantile(
        weighted_prediction_round(round_idx))[respondent_idx - 1].values,
        index=y_test.index)
    similarity_edges = []
    X = X_test.values  # features never change; hoist out of the loop
    for idx in X_test.index:
        diff = X - X_test.loc[idx].values
        # Squared metric distance per point; einsum computes only the
        # diagonal instead of the full N x N product it was taken from.
        dist = pd.Series(np.einsum('ij,ij->i', diff @ d_metric, diff),
                         index=X_test.index)
        # Radius = distance to the farthest point predicted in the same
        # class as idx.
        same_class_mask = (
            weighted_y_test == weighted_y_test.loc[idx]).values.ravel()
        max_d = dist[same_class_mask].max()
        # Connect idx to everything inside the hypersphere except itself.
        neighbor_list = [
            my_idx for my_idx in dist[dist <= max_d].index if my_idx != idx
        ]
        similarity_edges.extend((idx, item) for item in neighbor_list)
    fig, edges = similarity_fig_from_weighted_prediction(
        round_idx=round_idx,
        respondent_idx=respondent_idx,
        similarity_edge_list=similarity_edges)
    return fig, edges
def cal_distance_with_df(d_metric_dict):
    """
    Precompute the pairwise squared metric distances between all test points
    for every (respondent, round) metric and persist them to disk.

    :param d_metric_dict: dict of learned distance metrics, indexed by
        'R<respondent>_W<round>'
    :return: None; side effect is writing ../new_data/lmnn_distance.npy,
        a pickled dict mapping each key to an N x N distance DataFrame
        (column idx holds the distances of every point to point idx).
    """
    _, X_test, _, y_test, _ = load_dataset(onehot=True)
    X_test = preprocessing_data_with_unit_var(X_test)
    X = X_test.values  # features never change; hoist out of all loops
    distance_dict = dict()
    for round_idx in range(1, 11):
        for respondent_idx in range(1, 36):
            key = 'R%d_W%d' % (respondent_idx, round_idx)
            d_metric = d_metric_dict[key]
            distance_df = pd.DataFrame(np.zeros((len(X_test), len(X_test))),
                                       index=X_test.index,
                                       columns=X_test.index)
            for idx in X_test.index:
                diff = X - X_test.loc[idx].values
                # Squared metric distance d_i = diff_i^T M diff_i; einsum
                # yields just the diagonal instead of building the full
                # N x N product and discarding the rest.
                distance_df[idx] = np.einsum('ij,ij->i', diff @ d_metric,
                                             diff)
            distance_dict[key] = distance_df
            print(respondent_idx, round_idx)  # progress indicator
    # np.save pickles the dict (allow_pickle defaults to True).
    np.save(r'../new_data/lmnn_distance.npy', distance_dict)
    return None
def similarity_edges_from_metric_by_knn(round_idx, respondent_idx, n_neighbors, d_metric_dict):
    """
    Build a k-nearest-neighbor adjacency mapping in the learned metric space:
    each test point is mapped to the points inside the minimal hypersphere
    containing its n_neighbors closest points.

    :param round_idx: integer, ranging from 1 to 10
    :param respondent_idx: id of respondents, ranging from the 1 to 35
    :param n_neighbors: the number of nearest neighbors defining each
        point's neighborhood
    :param d_metric_dict: dict to save learned distance metric,
        indexed by the respondent_id and the round_idx
    :return: dict mapping each test index to its list of neighbor indices
    """
    _, X_test, _, y_test, _ = load_dataset(onehot=True)
    X_test = preprocessing_data_with_unit_var(X_test)
    d_metric = d_metric_dict['R%d_W%d' % (respondent_idx, round_idx)]
    similarity_edges = {}
    X = X_test.values  # features never change; hoist out of the loop
    for idx in X_test.index:
        diff = X - X_test.loc[idx].values
        # Squared metric distance per point; einsum computes only the
        # diagonal instead of the full N x N product it was taken from.
        dist = pd.Series(np.einsum('ij,ij->i', diff @ d_metric, diff),
                         index=X_test.index)
        # n_neighbors-th smallest distance (position 0 is the point itself).
        max_d = np.sort(dist.values)[n_neighbors]
        similarity_edges[idx] = [
            my_idx for my_idx in dist[dist <= max_d].index if my_idx != idx
        ]
    return similarity_edges
def similarity_fig_from_metric_by_thres(round_idx, respondent_idx, d_metric_dict, d_thres):
    """
    Basically to build the similarity edges between node_i and node_j if
    there d^2(node_i, node_j) within the transformed feature space is less
    than the threshold value, d_thres.

    :param round_idx: integer, ranging from 1 to 10
    :param respondent_idx: id of respondents, ranging from the 1 to 35
    :param d_metric_dict: dict to save learned distance metric,
        indexed by the respondent_id and the round_idx
    :param d_thres: threshold value to determine whether the similarity
        edges should be built
    :return: a similarity connection graph and the similarity edge list
    """
    _, X_test, _, y_test, _ = load_dataset(onehot=True)
    X_test = preprocessing_data_with_unit_var(X_test)
    d_metric = d_metric_dict['R%d_W%d' % (respondent_idx, round_idx)]
    similarity_edges = []
    X = X_test.values  # features never change; hoist out of the loop
    for idx in X_test.index:
        diff = X - X_test.loc[idx].values
        # Squared metric distance per point; einsum computes only the
        # diagonal instead of the full N x N product it was taken from.
        dist = pd.Series(np.einsum('ij,ij->i', diff @ d_metric, diff),
                         index=X_test.index)
        # Connect idx to every point within the fixed threshold, minus itself.
        neighbor_list = [
            my_idx for my_idx in dist[dist <= d_thres].index if my_idx != idx
        ]
        similarity_edges.extend((idx, item) for item in neighbor_list)
    fig, edges = similarity_fig_from_weighted_prediction(
        round_idx=round_idx,
        respondent_idx=respondent_idx,
        similarity_edge_list=similarity_edges)
    return fig, edges