import os
import sys

import numpy as np
from numpy.random import rand
from tqdm import tqdm

# NOTE: `subsample`, `read_mnist`, `format_triplets_from_distance`,
# `sparsify_instance`, and the constants STE_NUM_DIGITS, ROE_SAMPLES,
# BAR_POSITION_OFFSET, CONTAMINATION_PERCENTAGE, and outer_density are
# project-local helpers/constants assumed to be defined elsewhere in this package.


def _gen_mnist(c_per):
    constraints = []
    if os.path.exists("mnist_saved_matrix.npy"):
        distance_matrix = np.load("mnist_saved_matrix.npy")
    else:
        x_test, y_test, class_distribution = read_mnist(subsample=False)
        # Subsample STE_NUM_DIGITS digits
        subsample_idxs = subsample(range(len(list(x_test))), STE_NUM_DIGITS)
        subsampled_x = [x_test[idx] for idx in subsample_idxs]
        # Labels of the subsampled digits (currently unused here)
        subsampled_labels = {i: np.argmax(y_test[digit_idx])
                             for i, digit_idx in enumerate(subsample_idxs)}
        # Create the pairwise Euclidean distance matrix
        distance_matrix = np.zeros((STE_NUM_DIGITS, STE_NUM_DIGITS))
        for i in tqdm(range(len(subsampled_x)), desc="Distance Generation: ",
                      position=BAR_POSITION_OFFSET + 1, leave=False):
            for j in range(len(subsampled_x)):
                distance_matrix[i, j] = np.linalg.norm(subsampled_x[i] - subsampled_x[j], ord=2)
        np.save("mnist_saved_matrix", distance_matrix)
    for i in tqdm(range(STE_NUM_DIGITS), desc="Triplet Generation : ",
                  position=BAR_POSITION_OFFSET + 2, leave=False):
        # Rank all points by distance from the anchor i
        by_distance = sorted(enumerate(distance_matrix[i, :]), key=lambda pair: pair[1])
        # Take the 50 nearest neighbours (skipping the anchor itself, which sits at distance 0)
        closest_indices = [idx for idx, _ in by_distance if idx != i][:50]
        # Take the 50 farthest neighbours (identical for every close_index, so computed once)
        farthest_indices = [idx for idx, _ in reversed(by_distance)][:50]
        for close_index in closest_indices:
            for far_index in farthest_indices:
                if rand() >= c_per:
                    constraints.append([i, close_index, far_index])
                else:
                    # Contaminated triplet: the close and far points are swapped
                    constraints.append([i, far_index, close_index])
    # Subsample again to reduce the number of constraints
    subsampled_constraints = subsample(constraints, 3000)
    return subsampled_constraints
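# `subsample` is a project-local helper used throughout this module. From its call
# sites (a population plus a target size, where the target can exceed the population
# in `format_ml_dataset` below), a random.sample-style sketch might look like the
# following. This is an assumption for illustration, not the project's actual
# implementation, which is why it is left commented out:
#
#     import random
#
#     def subsample(population, n):
#         # Raises ValueError when n exceeds the population size, matching the
#         # try/except guard in format_ml_dataset below
#         return random.sample(list(population), n)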
def _create_sin(contamination_percentage):
    # Noisy sine curve in the plane
    x = np.linspace(0, 1, ROE_SAMPLES)
    y = 3 * np.sin(20 * x) + np.random.rand(ROE_SAMPLES) * 2
    dataset = np.array([x, y])  # shape (2, ROE_SAMPLES); points are the columns
    distance_matrix = np.zeros((len(dataset[0]), len(dataset[0])))
    for i in tqdm(range(len(distance_matrix)), desc="Distance Generation: ", leave=False):
        for j in range(len(distance_matrix)):
            distance_matrix[i, j] = np.linalg.norm(dataset[:, i] - dataset[:, j], ord=2)
    constraints = format_triplets_from_distance(distance_matrix, poison_perc=contamination_percentage)
    subsampled_constraints = subsample(constraints, 3000)
    return subsampled_constraints
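# `format_triplets_from_distance` is defined elsewhere in the project. Based on how
# it is called here (a square distance matrix plus a `poison_perc` contamination
# rate) and on the inline triplet loops in `_gen_mnist`, a minimal sketch of its
# presumed contract is given below. The function name suffix, the `k=50` default,
# and the body are illustrative assumptions, not the project's implementation.
def _format_triplets_from_distance_sketch(distance_matrix, poison_perc=0.0, k=50):
    constraints = []
    for anchor in range(len(distance_matrix)):
        # Rank all points by distance from the anchor
        by_distance = sorted(enumerate(distance_matrix[anchor, :]), key=lambda pair: pair[1])
        closest = [idx for idx, _ in by_distance if idx != anchor][:k]
        farthest = [idx for idx, _ in reversed(by_distance)][:k]
        for close_index in closest:
            for far_index in farthest:
                if rand() >= poison_perc:
                    constraints.append([anchor, close_index, far_index])
                else:
                    # Poisoned triplet: close and far are swapped
                    constraints.append([anchor, far_index, close_index])
    return constraints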
def _create_n_density_squares(contamination_percentage):
    # Three uniform squares at increasing distance from the origin
    close_to_zero = np.random.rand(int(ROE_SAMPLES / 3), 2) / 4
    mid_from_zero = np.random.rand(int(ROE_SAMPLES / 3), 2) / 3 + 0.5
    far_from_zero = np.random.rand(int(ROE_SAMPLES / 3), 2) / 2 + 1
    dataset = np.concatenate((close_to_zero, mid_from_zero, far_from_zero))
    # Use the actual dataset size: 3 * (ROE_SAMPLES // 3) is smaller than
    # ROE_SAMPLES when ROE_SAMPLES is not divisible by 3
    n_points = len(dataset)
    distance_matrix = np.zeros((n_points, n_points))
    for i in tqdm(range(len(distance_matrix)), desc="Distance Generation: ", leave=False):
        for j in range(len(distance_matrix)):
            distance_matrix[i, j] = np.linalg.norm(dataset[i, :] - dataset[j, :], ord=2)
    constraints = format_triplets_from_distance(distance_matrix, poison_perc=contamination_percentage)
    subsampled_constraints = subsample(constraints, 3000)
    return subsampled_constraints
def _create_dd_squares(contamination_percentage):
    # Two uniform squares with different densities: a large outer square and a
    # smaller inner one. `outer_density` is a module-level constant defined elsewhere.
    num_points_inner = int((1 - outer_density) * ROE_SAMPLES * 2)
    num_points_outer = int(outer_density * ROE_SAMPLES * 2)
    points_outer = np.random.rand(num_points_outer, 2) * 2 - 1
    points_inner = np.random.rand(num_points_inner, 2) - 0.5
    dataset = np.concatenate((points_outer, points_inner))
    n_points = num_points_inner + num_points_outer
    distance_matrix = np.zeros((n_points, n_points))
    # NOTE: the passed-in contamination is overridden here, so triplets for this
    # dataset are always generated uncontaminated
    contamination_percentage = 0
    for i in tqdm(range(len(distance_matrix)), desc="Distance Generation: ", leave=False):
        for j in range(len(distance_matrix)):
            distance_matrix[i, j] = np.linalg.norm(dataset[i, :] - dataset[j, :], ord=2)
    constraints = format_triplets_from_distance(distance_matrix, poison_perc=contamination_percentage)
    return subsample(constraints, 3000)
def format_ml_dataset(x, y, using="features", dataset_name="None", subsample_factor=0.0):
    # `subsample_factor` is currently unused
    constraints = []
    if using == "features":
        # Triplets from feature-space Euclidean distances
        distance_matrix = np.zeros((len(x), len(x)))
        for i in tqdm(range(len(x)), desc=f"[{dataset_name.upper()}]Distance Generation: ", leave=False):
            for j in range(len(x)):
                distance_matrix[i, j] = np.linalg.norm(x[i] - x[j], ord=2)
        for i in tqdm(range(len(x)), desc=f"[{dataset_name.upper()}]Triplet Generation : ",
                      position=BAR_POSITION_OFFSET + 2, leave=False):
            # Rank all points by distance from the anchor i
            by_distance = sorted(enumerate(distance_matrix[i, :]), key=lambda pair: pair[1])
            # Take the 50 nearest neighbours (skipping the anchor itself)
            closest_indices = [idx for idx, _ in by_distance if idx != i][:50]
            # Take the 50 farthest neighbours (identical for every close_index)
            farthest_indices = [idx for idx, _ in reversed(by_distance)][:50]
            for close_index in closest_indices:
                for far_index in farthest_indices:
                    constraints.append([i, close_index, far_index])
    elif using == "labels":
        # Triplets from class labels: (i, j, k) where i and j share a label and k differs
        for i, el_1 in tqdm(enumerate(y), desc=f"[{dataset_name.upper()}]Triplet Generation : "):
            for j, el_2 in enumerate(y):
                if j != i:
                    for k, el_3 in enumerate(y):
                        if k != i and j != k:
                            close = el_1 == el_2
                            distant = el_1 != el_3
                            if close and distant:
                                constraints.append([i, j, k])
    try:
        constraints = subsample(constraints, len(y) * 70 * 70)
    except ValueError:
        # Fewer constraints than requested; keep them all (assumes subsample
        # raises ValueError, random.sample-style, when over-asked)
        pass
    return constraints
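# A minimal usage sketch for `format_ml_dataset` (the demo arrays and sizes below
# are illustrative, not part of the original code), left commented out so the
# module has no import-time side effects:
#
#     x_demo = np.random.rand(100, 10)             # 100 points, 10 features each
#     y_demo = np.random.randint(0, 3, size=100)   # 3 classes
#     feature_triplets = format_ml_dataset(x_demo, y_demo, using="features", dataset_name="demo")
#     label_triplets = format_ml_dataset(x_demo, y_demo, using="labels", dataset_name="demo")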
def create_random_dataset(contamination_percentage=CONTAMINATION_PERCENTAGE, sparsify=False):
    # First check whether a dataset with that contamination percentage already exists
    if os.path.exists(f"./datasets/random/random-{contamination_percentage}.txt"):
        print("Using old dataset", file=sys.stderr)
        with open(f"./datasets/random/random-{contamination_percentage}.txt") as random_ds:
            constraints = []
            for line in random_ds.readlines():
                i, j, k = [int(x) for x in line.strip().split(",")]
                constraints.append([i, j, k])
            return constraints, ROE_SAMPLES
    elif os.path.exists("./datasets/random/random-0.0.txt"):
        # Derive a contaminated dataset from the cached clean (0.0) one
        print("Using old dataset", file=sys.stderr)
        with open("./datasets/random/random-0.0.txt") as random_ds:
            constraints = []
            for line in random_ds.readlines():
                i, j, k = [int(x) for x in line.strip().split(",")]
                if rand() > contamination_percentage:
                    constraints.append([i, j, k])
                else:
                    constraints.append([i, k, j])
        with open(f"./datasets/random/random-{contamination_percentage}.txt", "w+") as random_ds:
            random_ds.write("\n".join(f"{i},{j},{k}" for i, j, k in constraints))
        return constraints, ROE_SAMPLES
    else:
        # Create the dataset from scratch
        print("Creating dataset", file=sys.stderr)
        dataset = [np.random.rand(1, 10) * 1 / 20 for _ in range(ROE_SAMPLES)]
        distance_matrix = np.zeros((len(dataset), len(dataset)))
        constraints = []
        for i in tqdm(range(len(dataset)), desc="Distance Generation: ", leave=False):
            for j in range(len(dataset)):
                distance_matrix[i, j] = np.linalg.norm(dataset[i] - dataset[j], ord=2)
        for i in tqdm(range(len(dataset)), desc="Triplet Generation : ", leave=False):
            # Rank all points by distance from the anchor i
            by_distance = sorted(enumerate(distance_matrix[i, :]), key=lambda pair: pair[1])
            # Take the 50 nearest neighbours (skipping the anchor itself)
            closest_indices = [idx for idx, _ in by_distance if idx != i][:50]
            # Take the 50 farthest neighbours (identical for every close_index)
            farthest_indices = [idx for idx, _ in reversed(by_distance)][:50]
            for close_index in closest_indices:
                for far_index in farthest_indices:
                    pair = [close_index, far_index]  # renamed from `next` to avoid shadowing the builtin
                    if rand() >= contamination_percentage:
                        constraints.append([i, *pair])
                    else:
                        # Contamination via a random permutation, so roughly half of
                        # the "contaminated" triplets keep their original order
                        constraints.append([i, *np.random.permutation(pair)])
        subsampled_constraints = subsample(constraints, 3000)
        if sparsify:
            # Return the (constraints, n_points) pair like every other branch
            return sparsify_instance(subsampled_constraints), ROE_SAMPLES
        # Save it as a file
        with open(f"./datasets/random/random-{contamination_percentage}.txt", "w+") as random_ds:
            random_ds.write("\n".join(f"{i},{j},{k}" for i, j, k in subsampled_constraints))
        return subsampled_constraints, ROE_SAMPLES
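# A minimal usage sketch for `create_random_dataset` (the 0.1 contamination value
# is illustrative): the first call generates the triplets and caches them under
# ./datasets/random/, and later calls with the same contamination reload them from
# disk. Kept commented out to avoid import-time side effects:
#
#     constraints, n_points = create_random_dataset(contamination_percentage=0.1)
#     print(len(constraints), "triplets over", n_points, "points")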