import numpy as np
import torch
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform

# repository helpers assumed in scope: load_hc_data, index_to_one_hot,
# embed_strings, DISTANCE_MATRIX, to_nx_tree, dasgupta_cost


def hierarchical_clustering_testing(encoder_model, data_path, batch_size, device, distance):
    # load data
    strings, similarities = load_hc_data(data_path)
    strings = torch.from_numpy(strings).long()
    print("Hierarchical", strings.shape)
    strings = index_to_one_hot(strings)
    strings_loader = torch.utils.data.DataLoader(strings, batch_size=batch_size, shuffle=False)

    # embed sequences and compute distance matrix
    embedded_strings = embed_strings(strings_loader, encoder_model, device)
    estimate_distances = DISTANCE_MATRIX[distance](embedded_strings, embedded_strings, encoder_model.scaling)

    # fix the problems caused by floating-point arithmetic: the matrix must be symmetric with a zero diagonal
    estimate_distances = (estimate_distances + estimate_distances.T) / 2
    ind = np.diag_indices(estimate_distances.shape[0])
    estimate_distances[ind[0], ind[1]] = 0.0

    # run agglomerative clustering algorithms and score each tree with the Dasgupta cost
    metrics = {}
    for method in ["single", "complete", "average", "ward"]:
        metrics[method] = {}
        baseline_tree = to_nx_tree(linkage(squareform(estimate_distances), method))
        dc = dasgupta_cost(baseline_tree, similarities)
        metrics[method]["DC"] = dc
    print(metrics)
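
# Minimal, self-contained sketch of the clean-up + linkage step above,
# using a random matrix in place of the learned embedding distances
# (illustration only, not repository code). The symmetry / zero-diagonal fix
# matters because scipy's squareform() validates both properties (with
# checks=True, its default) before condensing a square matrix.
import numpy as np
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform

rng = np.random.default_rng(0)
raw = rng.random((8, 8))                      # stand-in for the learned distance matrix
dist = (raw + raw.T) / 2                      # enforce symmetry
dist[np.diag_indices(dist.shape[0])] = 0.0    # enforce zero diagonal

tree = linkage(squareform(dist), "average")   # (n-1) x 4 merge table
print(tree.shape)                             # (7, 4)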
def __init__(self, sequences, distances):
    self.len_sequence = sequences.shape[-1]
    self.sequences = index_to_one_hot(sequences)
    self.distances = distances
    self.N_sequences = sequences.shape[0]

    # Normalise labels
    self.normalisation_constant = self.sequences.shape[-2]
    self.distances = self.distances / self.normalisation_constant
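
# index_to_one_hot is a repository helper; a plausible stand-in (assumed, not
# the actual implementation) turns integer-coded sequences (B, L) into one-hot
# tensors (B, L, A), consistent with shape[-2] above being the sequence length.
import torch
import torch.nn.functional as F

def index_to_one_hot_sketch(sequences, alphabet_size=4):   # alphabet size assumed (e.g. DNA)
    return F.one_hot(sequences.long(), num_classes=alphabet_size).float()

print(index_to_one_hot_sketch(torch.randint(0, 4, (2, 5))).shape)   # torch.Size([2, 5, 4])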
def __init__(self, sequences, distances, multiplicity=1):
    # multiplicity indicates half the number of times each string is sampled per epoch
    self.len_sequence = sequences.shape[-1]
    self.sequences = index_to_one_hot(sequences)
    self.distances = distances
    self.N_batches = self.sequences.shape[0]
    self.batch_size = self.sequences.shape[1]
    self.multiplicity = multiplicity

    # Normalise labels
    self.normalisation_constant = self.sequences.shape[-2]
    self.distances = self.distances / self.normalisation_constant
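
# Hypothetical companion methods (assumed for illustration; the repository's
# actual sampling logic may differ): one way multiplicity can enter the
# Dataset interface is by stretching the epoch so each pre-generated batch
# is revisited `multiplicity` times.
def __len__(self):
    return self.N_batches * self.multiplicity

def __getitem__(self, index):
    index = index % self.N_batches   # wrap around on repeated passes
    return self.sequences[index], self.distances[index]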
def __init__(self, sequences, distances):
    self.len_sequence = sequences.shape[-1]
    self.sequences = index_to_one_hot(sequences)
    self.distances = distances
    self.N_batches = self.sequences.shape[0]
    self.batch_size = self.sequences.shape[1]

    # Normalise labels
    self.normalisation_constant = self.sequences.shape[-2]
    self.distances = [
        d / (self.normalisation_constant * 2**p)
        for p, d in enumerate(self.distances)
    ]
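
# Numeric illustration (inputs assumed) of the per-level scaling above:
# with sequence length 8, the distances at level p are divided by 8 * 2**p,
# so identical raw distances shrink geometrically as p grows.
import torch

length = 8
distances = [torch.tensor([4.0, 8.0]),    # level 0
             torch.tensor([4.0, 8.0]),    # level 1
             torch.tensor([4.0, 8.0])]    # level 2
scaled = [d / (length * 2**p) for p, d in enumerate(distances)]
print(scaled)
# level 0: [0.5000, 1.0000], level 1: [0.2500, 0.5000], level 2: [0.1250, 0.2500]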
def forward(self, sequence):
    # sequence: (B, N) batch of integer-coded strings
    (B, N) = sequence.shape
    sequence = index_to_one_hot(sequence, device=self.device)
    # flatten the one-hot tensor and embed it with the MLP
    embedding = self.mlp(sequence.reshape(B, -1))
    return embedding
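
# Hypothetical enclosing module for the forward pass above (architecture,
# names, and sizes assumed for illustration; not the repository's encoder).
# F.one_hot stands in for index_to_one_hot to keep the sketch self-contained.
import torch
import torch.nn as nn
import torch.nn.functional as F

class MLPEncoderSketch(nn.Module):
    def __init__(self, len_sequence, alphabet_size, embedding_size, hidden_size=128, device="cpu"):
        super().__init__()
        self.device = device
        self.alphabet_size = alphabet_size
        self.mlp = nn.Sequential(
            nn.Linear(len_sequence * alphabet_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, embedding_size),
        )

    def forward(self, sequence):
        (B, N) = sequence.shape
        one_hot = F.one_hot(sequence.to(self.device).long(),
                            num_classes=self.alphabet_size).float()
        return self.mlp(one_hot.reshape(B, -1))

enc = MLPEncoderSketch(len_sequence=5, alphabet_size=4, embedding_size=16)
print(enc(torch.randint(0, 4, (2, 5))).shape)   # torch.Size([2, 16])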
def __init__(self, sequences):
    self.sequences = index_to_one_hot(sequences)
def __init__(self, sequences, labels):
    self.sequences = index_to_one_hot(sequences)
    self.labels = labels
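
# Hypothetical completion of the labelled dataset above (class name and
# methods assumed, for illustration only): with __len__ and __getitem__,
# it plugs straight into a torch DataLoader.
import torch
from torch.utils.data import Dataset, DataLoader

class LabeledSequenceDatasetSketch(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences   # already one-hot in the original code
        self.labels = labels

    def __len__(self):
        return self.sequences.shape[0]

    def __getitem__(self, index):
        return self.sequences[index], self.labels[index]

data = LabeledSequenceDatasetSketch(torch.randn(10, 5, 4), torch.arange(10))
for seqs, labels in DataLoader(data, batch_size=4):
    print(seqs.shape, labels.shape)   # e.g. torch.Size([4, 5, 4]) torch.Size([4])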