# --- Load the comparison matrices and the "used pair" masks for each data split. ---
# Each CSV holds a square matrix; np.sum is printed as a quick sanity check on
# the total number of comparisons / used pairs loaded.
# NOTE(review): assumes `name`, `idx`, `comparison_train`, `train_features`,
# `validate_features`, `test_features`, and `fit_model_class` are defined
# earlier in this file / session — confirm before running standalone.
comparison_validate = np.array(pd.read_csv('PATH/comparison_validate_%s.csv' % (name), header=None))
print(np.sum(comparison_validate))
comparison_test = np.array(pd.read_csv('PATH/comparison_test_%s.csv' % (name), header=None))
print(np.sum(comparison_test))
used_train = np.array(pd.read_csv('PATH/used_train_%s.csv' % (name), header=None))
print(np.sum(used_train))
used_train_coarse = np.array(pd.read_csv('PATH/used_train_coarse_%s.csv' % (name), header=None))
# BUG FIX: originally printed np.sum(used_train) a second time (copy-paste
# error); the sanity check for the coarse mask should report the coarse mask.
print(np.sum(used_train_coarse))
used_validate = np.array(pd.read_csv('PATH/used_validate_coarse_%s.csv' % (name), header=None))
print(np.sum(used_validate))
used_test = np.array(pd.read_csv('PATH/used_test_coarse_%s.csv' % (name), header=None))
print(np.sum(used_test))

# Build standardized pairwise-comparison datasets. The scaler is fit on the
# training data and reused for the coarse/validate/test splits so all splits
# share one standardization.
# NOTE(review): the keyword is spelled `StandardScalarObject` while the
# attribute is `StandardScalerObject` — presumably the parameter really is
# spelled that way in fit_model_class; verify there before "fixing" it.
train_pairwise_data = fit_model_class.pairwise_comparisons(comparison_train, train_features, used_train, {}, standardized=True)
train_coarse_pairwise_data = fit_model_class.pairwise_comparisons(comparison_train, train_features, used_train_coarse, {}, standardized=True, StandardScalarObject=train_pairwise_data.StandardScalerObject)
validate_pairwise_data = fit_model_class.pairwise_comparisons(comparison_validate, validate_features, used_validate, {}, standardized=True, StandardScalarObject=train_pairwise_data.StandardScalerObject)
test_pairwise_data = fit_model_class.pairwise_comparisons(comparison_test, test_features, used_test, {}, standardized=True, StandardScalarObject=train_pairwise_data.StandardScalerObject)

# first number is the l2 regularization strength, second number is the number of hidden nodes
parameters = [(0.001, 400), (0.0001, 50), (0.0001, 200), (0.0001, 400),
              (0.0001, 600), (0.001, 50), (0.001, 200), (0.001, 600),
              (0.01, 50), (0.01, 200), (0.01, 400), (0.01, 600)]
# `idx` selects one hyperparameter pair (presumably a job-array index — verify).
c, nodes = parameters[idx]
print('c is', float(c))
print('nodes is', int(nodes))

total_comparisons = int(np.sum(train_pairwise_data.comparison_data))
# d = feature dimension, num_items = number of items being compared.
d, num_items = train_pairwise_data.standardized_features.shape

ct_to_item1 = []  # accumulator; filled later in the file — purpose not visible here
def threshold_sweep_samples(dim, num_items, num_exp, thresholds, num_samples, rescale=False):
    '''
    this function gets the estimation error and the kendall tau correlation as the threshold varies

    For each of `num_exp` experiments a fresh random embedding is drawn, and for
    each threshold k in `thresholds` a synthetic pairwise-comparison dataset of
    `num_samples` comparisons is generated and two models are fit:
      1. the threshold model fit with threshold k,
      2. a baseline fit with threshold = dim (presumably equivalent to plain BTL
         on the full dimension — confirm against fit_model_class.fit_threshold).

    Parameters
    ----------
    dim : embedding dimension.
    num_items : number of items in the synthetic embedding.
    num_exp : number of independent repetitions.
    thresholds : iterable of threshold values k to sweep over.
    num_samples : comparisons drawn per (experiment, threshold) cell.
    rescale : if True, rescales the embedding by sqrt(dim/k) at each threshold
        so the effective scale is held comparable across thresholds.

    Returns
    -------
    (results, btl_results, kt_results, btl_kt_results): four lists, one inner
    list per threshold, each accumulating `num_exp` values:
    relative w-estimation errors for the threshold model and the baseline, and
    the corresponding score metrics from get_ranking_error_and_score_rsme.
    '''
    threshold_function = 'var'
    threshold_type = 'top'
    # One accumulator list per threshold value.
    results = [[] for i in range(len(thresholds))]
    btl_results = [[] for i in range(len(thresholds))]
    kt_results = [[] for i in range(len(thresholds))]
    btl_kt_results = [[] for i in range(len(thresholds))]
    for i in range(num_exp):
        # Fresh random embedding per experiment; entries ~ N(0, 1/dim) presumably
        # (args are mean 0, scale sqrt(1/dim)) — confirm in embedding_class.
        embedding_object = embedding_class.embedding(num_items, dim, 0, np.sqrt(1 / dim))
        # Keep a pristine copy of U: the rescale branch below overwrites
        # embedding_object.U in place at every threshold.
        original_U = np.copy(embedding_object.U)
        for idx, k in enumerate(thresholds):
            print('on exp {} and threshold {}'.format(i, k))
            if rescale:
                # Rescale from the ORIGINAL U each time so rescalings don't compound.
                embedding_object.U = (np.sqrt(dim) / np.sqrt(k)) * original_U
            synthetic_object = synthetic_pair_class.synthetic_pairs(
                embedding_object, {
                    'threshold_function': threshold_function,
                    'threshold_type': threshold_type,
                    'threshold': k,
                    'relative_flag': True
                })
            comparison_data = synthetic_object.get_comparison_matrix(
                num_samples)
            # All pairs marked usable (ones matrix); empty dict = default options.
            fit_model_instance = fit_model_class.pairwise_comparisons(
                comparison_data, embedding_object.U,
                np.ones((num_items, num_items)), {})
            # fit_threshold(threshold, weights, ?, [regularization key?], ...);
            # result[0][1000000000][2][0] indexes the fitted w for the single
            # regularization value 1000000000 — structure defined in fit_model_class.
            result = fit_model_instance.fit_threshold(k, [1 for i in range(dim)],
                                                      -1, [1000000000],
                                                      threshold_type,
                                                      threshold_function, False,
                                                      solver='sag')
            orig_norm = np.linalg.norm(embedding_object.w)
            # Normalize the estimation error by the true weight norm.
            results[idx].append(
                synthetic_object.get_w_est_error(result[0][1000000000][2][0]) /
                orig_norm)
            # Second return value is the metric actually recorded here.
            kt, my_kt = synthetic_object.get_ranking_error_and_score_rsme(
                result[0][1000000000][2][0])
            kt_results[idx].append(my_kt)
            # Baseline fit: same data, but threshold set to `dim` instead of k.
            fit_model_instance = fit_model_class.pairwise_comparisons(
                comparison_data, embedding_object.U,
                np.ones((num_items, num_items)), {})
            result = fit_model_instance.fit_threshold(dim, [1 for i in range(dim)],
                                                      -1, [1000000000],
                                                      threshold_type,
                                                      threshold_function, False,
                                                      solver='sag')
            orig_norm = np.linalg.norm(embedding_object.w)
            btl_results[idx].append(
                synthetic_object.get_w_est_error(result[0][1000000000][2][0]) /
                orig_norm)
            kt, my_kt = synthetic_object.get_ranking_error_and_score_rsme(
                result[0][1000000000][2][0])
            btl_kt_results[idx].append(my_kt)
    return results, btl_results, kt_results, btl_kt_results
def sample_sweep(dim, num_items, num_exp, threshold, samples):
    '''
    this function gets the estimation error, kendall tau correlation, and the
    pairwise accuracy rates as the number of samples vary

    A single random embedding (and synthetic comparison generator) is created
    once; for each of `num_exp` experiments and each sample count s in
    `samples`, s comparisons are drawn and two models are fit: the threshold
    model (threshold=`threshold`) and a baseline with threshold=dim. Ranking
    and prediction errors are evaluated on a SECOND, independently drawn
    embedding ("unseen") that shares the same true weight vector w.

    Parameters
    ----------
    dim, num_items : embedding dimension and number of items.
    num_exp : number of repetitions.
    threshold : threshold value used by the threshold model.
    samples : iterable of sample counts to sweep over.

    Returns
    -------
    (results, btl_results, bound, kt_results, btl_kt_results,
     prediction_results, btl_prediction_results) — per-sample-count lists of
    estimation errors, a theoretical 1/sqrt(s) bound, kendall-tau values, and
    held-out prediction errors for both models.
    '''
    threshold_function = 'var'
    threshold_type = 'top'
    # One accumulator list per sample count.
    results = [[] for i in range(len(samples))]
    btl_results = [[] for i in range(len(samples))]
    prediction_results = [[] for i in range(len(samples))]
    kt_results = [[] for i in range(len(samples))]
    btl_kt_results = [[] for i in range(len(samples))]
    btl_prediction_results = [[] for i in range(len(samples))]
    embedding_object = embedding_class.embedding(num_items, dim, 0,
                                                 np.sqrt(1 / dim))
    synthetic_object = synthetic_pair_class.synthetic_pairs(
        embedding_object, {
            'threshold_function': threshold_function,
            'threshold_type': threshold_type,
            'threshold': threshold,
            'relative_flag': True
        })
    # Held-out evaluation world: new random item features, but the SAME true
    # weight vector w as the training world.
    embedding_object_unseen = embedding_class.embedding(
        num_items, dim, 0, np.sqrt(1 / dim))
    embedding_object_unseen.w = np.copy(synthetic_object.w)
    parameters = synthetic_object.get_parameters()
    synthetic_object_unseen = synthetic_pair_class.synthetic_pairs(
        embedding_object_unseen, {
            'threshold_function': threshold_function,
            'threshold_type': threshold_type,
            'threshold': threshold,
            'relative_flag': True
        })
    print('object unsen@')
    # NOTE(review): `parameters` is immediately overwritten below; these calls
    # are kept in order in case get_parameters() has side effects (e.g.
    # printing) — confirm in synthetic_pair_class.
    parameters = synthetic_object_unseen.get_parameters()
    parameters = synthetic_object.get_parameters()
    # Theoretical error bound proportional to 1/sqrt(s); parameters[5] is
    # presumably the bound constant — confirm against get_parameters().
    bound = [[parameters[5] * (1 / np.sqrt(i)) for j in range(10)]
             for i in samples]
    print('samples', parameters[6], parameters[7])
    model_info = {
        'threshold_function': threshold_function,
        'threshold_type': threshold_type,
        'threshold': threshold,
        'relative_flag': True
    }
    for i in range(num_exp):
        for idx, s in enumerate(samples):
            print('on exp {} and samples {}'.format(i, s))
            comparison_data = synthetic_object.get_comparison_matrix(s)
            # Threshold model fit on the training world.
            fit_model_instance = fit_model_class.pairwise_comparisons(
                comparison_data, embedding_object.U,
                np.ones((num_items, num_items)), {})
            # result[0][1000000000][2][0] indexes the fitted w for the single
            # regularization value 1000000000 — layout defined in fit_model_class.
            result = fit_model_instance.fit_threshold(threshold,
                                                      [1 for i in range(dim)],
                                                      -1, [1000000000],
                                                      threshold_type,
                                                      threshold_function, False,
                                                      solver='sag')
            results[idx].append(
                synthetic_object.get_w_est_error(result[0][1000000000][2][0]))
            # Ranking/prediction quality is measured on the UNSEEN world.
            kt, _ = synthetic_object_unseen.get_ranking_error_and_score_rsme(
                result[0][1000000000][2][0])
            kt_results[idx].append(kt)
            prediction_results[idx].append(
                synthetic_object_unseen.get_prediction_error(
                    result[0][1000000000][2][0], model_info, btl=False))
            # Baseline: same data, threshold set to `dim` instead.
            fit_model_instance = fit_model_class.pairwise_comparisons(
                comparison_data, embedding_object.U,
                np.ones((num_items, num_items)), {})
            result = fit_model_instance.fit_threshold(dim,
                                                      [1 for i in range(dim)],
                                                      -1, [1000000000],
                                                      threshold_type,
                                                      threshold_function, False,
                                                      solver='sag')
            btl_results[idx].append(
                synthetic_object.get_w_est_error(result[0][1000000000][2][0]))
            kt, _ = synthetic_object_unseen.get_ranking_error_and_score_rsme(
                result[0][1000000000][2][0])
            btl_kt_results[idx].append(kt)
            btl_prediction_results[idx].append(
                synthetic_object_unseen.get_prediction_error(
                    result[0][1000000000][2][0], model_info, btl=True))
    return results, btl_results, bound, kt_results, btl_kt_results, prediction_results, btl_prediction_results
def get_df(name, min_comparisons, confidence=0):
    """Load the district pairwise-comparison data and build a pairwise_comparisons object.

    Reads 'district_data/paired_comparisons.csv' and
    'district_data/subset_features.csv', optionally filters to one study,
    tallies wins into a comparison matrix, extracts the feature matrix, and
    prunes pairs with too few comparisons or (optionally) statistically
    indecisive outcomes.

    Parameters
    ----------
    name : study name to filter on, or 'all' to keep every study.
    min_comparisons : pairs with fewer total comparisons than this are zeroed out.
    confidence : if nonzero, additionally zero out pairs whose empirical win
        probability lies inside (1 - confidence, confidence), i.e. pairs with
        no decisive winner.

    Returns
    -------
    (pairwise_data, their_ranking, districts_in_shiny_item_num_dict):
    the fitted-data object, the districts ranked by the reference
    'compactness' column (most compact first), and the bidirectional
    district-id <-> index dict.
    """
    usable_features = ['points', 'var_xcoord', 'var_ycoord', 'varcoord_ratio',
                       'avgline', 'varline', 'boyce', 'lenwid', 'jagged',
                       'parts', 'hull', 'bbox', 'reock', 'polsby',
                       'schwartzberg', 'circle_area', 'circle_perim',
                       'hull_area', 'hull_perim', 'orig_area',
                       'district_perim', 'corners', 'xvar', 'yvar',
                       'cornervar_ratio', 'sym_x', 'sym_y']
    df = pd.read_csv('district_data/paired_comparisons.csv')
    features_df = pd.read_csv('district_data/subset_features.csv')
    if name != 'all':
        df = df.loc[df['study'] == name]
    # Drop degenerate self-comparisons.
    df = df.loc[df['alternate_id_1'] != df['alternate_id_2']]
    districts_in_shiny = set(df.alternate_id_1.unique()).union(
        set(df.alternate_id_2.unique()))
    # Bidirectional map: district id -> index AND index -> district id.
    # NOTE(review): assumes district ids never equal the int indices (no key
    # collisions) and that set iteration order is acceptable — confirm.
    districts_in_shiny_item_num_dict = {}
    for idx, district in enumerate(districts_in_shiny):
        districts_in_shiny_item_num_dict[district] = idx
        districts_in_shiny_item_num_dict[idx] = district
    num_districts = len(districts_in_shiny)
    comparisons = np.zeros([num_districts, num_districts])
    seen_pairs = np.zeros([num_districts, num_districts])
    features = np.zeros([len(usable_features), num_districts])
    # comparisons[a, b] counts how many times a beat b.
    for index, row in df.iterrows():
        district1 = districts_in_shiny_item_num_dict[row['alternate_id_1']]
        district2 = districts_in_shiny_item_num_dict[row['alternate_id_2']]
        winner = districts_in_shiny_item_num_dict[row['alternate_id_winner']]
        if winner == district1:
            comparisons[district1, district2] += 1
        else:
            comparisons[district2, district1] += 1
        seen_pairs[district1, district2] = 1
        seen_pairs[district2, district1] = 1
    # Reference ranking from the study's own compactness score, best first.
    their_compactness_measure = []
    for i in range(num_districts):
        district_row = features_df.loc[
            features_df['district'] == districts_in_shiny_item_num_dict[i]]
        # .iloc[0]: float() on a whole Series is deprecated and raises a
        # TypeError in pandas >= 2.0; take the (single) matching value explicitly.
        their_compactness_measure.append(float(district_row['compactness'].iloc[0]))
    their_ranking = np.argsort(np.array(their_compactness_measure))[::-1]
    print('their ranking', their_ranking)
    # Fill the feature matrix; hoist the per-district row lookup out of the
    # inner feature loop (it is invariant in `feat`).
    for i in range(num_districts):
        district_row = features_df.loc[
            features_df['district'] == districts_in_shiny_item_num_dict[i]]
        for idx, feat in enumerate(usable_features):
            features[idx, i] = district_row[feat].iloc[0]
    # Prune pairs that were compared too rarely.
    for i in range(num_districts):
        for j in range(i + 1, num_districts):
            if comparisons[i, j] + comparisons[j, i] < min_comparisons:
                comparisons[i, j] = 0
                comparisons[j, i] = 0
                seen_pairs[i, j] = 0
                seen_pairs[j, i] = 0
    if confidence != 0:
        # Prune indecisive pairs: keep only those where one district wins with
        # empirical probability > confidence (in either direction).
        for i in range(num_districts):
            for j in range(i + 1, num_districts):
                if comparisons[i, j] + comparisons[j, i] > 0:
                    prob_i_beats_j = comparisons[i, j] / (
                        comparisons[j, i] + comparisons[i, j])
                    if not (prob_i_beats_j > confidence
                            or prob_i_beats_j < 1 - confidence):
                        print(i, j, prob_i_beats_j,
                              comparisons[i, j] + comparisons[j, i])
                        comparisons[i, j] = 0
                        comparisons[j, i] = 0
                        seen_pairs[i, j] = 0
                        seen_pairs[j, i] = 0
    pairwise_data = fit_model_class.pairwise_comparisons(
        comparisons, features, seen_pairs, districts_in_shiny_item_num_dict,
        standardized=True)
    return pairwise_data, their_ranking, districts_in_shiny_item_num_dict