def test_same(self):
    original = '/home/user/test.txt'
    new = '/home/user/test.txt'
    ret = paths.find_transformation(original, new)
    self.assertEqual(ret, '')

def test_down_file(self):
    new = '/home/user/doc.pdf'
    original = '/home/user'
    ret = paths.find_transformation(original, new)
    self.assertEqual(ret, 'doc.pdf')

def test_down_folder(self):
    new = '/home/user/typography/doc.pdf'
    original = '/home/user'
    ret = paths.find_transformation(original, new)
    self.assertEqual(ret, 'typography')

def test_up_different(self):
    original = '/home/user/typography/doc.pdf'
    new = '/home/user/test.txt'
    ret = paths.find_transformation(original, new)
    self.assertEqual(ret, '..')
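
# The four tests above pin down the contract of paths.find_transformation:
# it returns '' when the paths already match, the next component to descend
# into when `new` lies below `original`, and '..' otherwise.  The sketch
# below is a hypothetical reconstruction consistent with those tests, not
# the project's actual paths module; it assumes absolute, normalised
# POSIX-style paths without trailing separators.
import os.path


def find_transformation_sketch(original, new):
    """Return one step that moves `original` towards `new`."""
    if original == new:
        return ''
    relative = os.path.relpath(new, original)
    parts = relative.split(os.sep)
    if parts[0] != '..':
        # `new` lies below `original`: descend into the next component
        return parts[0]
    # Otherwise the only safe single step is one level up
    return '..'
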
def find_clusters(self, n):
    # Short names for static methods
    find_median = self.find_median
    transform = self.transform
    # Choose cluster centers randomly from the neurons
    centers = random.sample(self.neurons, n)
    improvement = True
    count = 0
    while improvement:
        count += 1
        # print('clustering', count)
        improvement = False
        # Convert centers to tuples so they can be hashed
        centers = [tuple(x) for x in centers]
        # Determine the categories of the neurons
        categories = defaultdict(list)  # dictionary center -> [neurons]
        for i, neuron in enumerate(self.neurons):
            bmu = min(((self._distance(center, neuron), center, index)
                       for index, center in enumerate(centers)),
                      key=lambda x: x[0])
            categories[bmu[1]].append(neuron)
        self.categories = categories
        # Compute new values for each category
        new_centers = [list(x) for x in centers]

        ### Process ###
        # `feature` is the active feature of the vector
        feature = 0
        for center_id, center in enumerate(centers):
            processes = [x[feature] for x in categories.get(center, [])]
            if not processes:
                continue
            process_counter = BetterCounter(processes)
            most_common_list = process_counter.most_common()
            most_common_process = random.choice(most_common_list)[0]
            if new_centers[center_id][feature] != most_common_process:
                improvement = True
                new_centers[center_id][feature] = most_common_process

        ### Path ###
        for center_id, center in enumerate(centers):
            # Compute the new path for each center from its neighborhood
            # (the input vectors assigned to this cluster)
            neighborhood = categories.get(center, [])
            # When there are no neighbours:
            if not neighborhood:
                # empty_neighborhood += 1
                continue
            # Compute the average path of the neighborhood
            average, error = find_median(neighborhood)
            improvement = True
            while improvement:
                counter = BetterCounter()
                # Search how the average path needs to be changed to become
                # the same as each of its neighbours. Count the transformations.
                # There are three transformations:
                # 1) go up (..)
                # 2) go down in the directory structure
                # 3) do nothing (paths are equal)
                for neighbour in neighborhood:
                    transformation = find_transformation(average, neighbour)
                    if transformation == '.':
                        breakpoint()
                    counter.update([transformation])
                # Apply the most frequent transformation
                most_frequent = counter.most_common()
                if len(most_frequent) == 1:
                    # There were no ties --- ideal case
                    transformation = most_frequent[0][0]
                    if not transformation:
                        # empty string means the paths are equal, we are
                        # finished --- unlikely to happen
                        break
                    transformations = [transformation]
                elif len(most_frequent) > 1:
                    # There were ties. We are going to try each transformation
                    # and find one that gives a smaller error value
                    transformations = [x[0] for x in most_frequent]
                else:
                    raise RuntimeError('Unexpected length of most_frequent')
                for t in transformations:
                    new_average = transform(average, t)
                    new_error = SOM.summed_distance(new_average, neighborhood)
                    if new_error < error:
                        average = new_average
                        error = new_error
                        break
                else:
                    # No transformation was better --- we are finished with
                    # this cluster center
                    improvement = False
            # This is the new path for the `center_id`-th cluster center
            if new_centers[center_id][1] != average and count < 50:
                improvement = True
                # print(center_id, 'improved')
                new_centers[center_id][1] = average

        # Permissions
        centers = new_centers
    self.centers = centers
    # Determine the categories of the neurons once again, because the
    # current values are stale from the previous run of the loop
    categories = defaultdict(list)  # dictionary center -> [neurons]
    for i, neuron in enumerate(self.neurons):
        bmu = min(((self._distance(center, neuron), center, index)
                   for index, center in enumerate(centers)),
                  key=lambda x: x[0])
        categories[bmu[2]].append(neuron)
    self.neurons_by_cat_id = categories
    return centers
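
# find_clusters() and _update_paths() both read BetterCounter.most_common()
# as "only the entries tied for the highest count": a single entry means a
# clear winner, several entries mean a tie to break.  That is not what
# collections.Counter.most_common() returns, so BetterCounter presumably
# overrides it.  A plausible sketch under that assumption (hypothetical,
# not the project's actual class):
from collections import Counter


class BetterCounterSketch(Counter):
    def most_common(self, n=None):
        ranked = super().most_common(n)
        if not ranked:
            return ranked
        top_count = ranked[0][1]
        # Keep only the elements that share the highest count
        return [item for item in ranked if item[1] == top_count]
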
def _update_paths(self, iteration: int):
    """Updates the file paths of the neurons.

    The update is not stored immediately; the new values are returned
    in a list.

    :param iteration: iteration of the update function
    """
    find_median = self.find_median
    transform = self.transform
    # This is where we store the new paths for the neurons; they are kept
    # in the original order
    new_paths = []
    # Counter for neurons with an empty neighborhood
    empty_neighborhood = 0
    # Compute the neighborhood of each neuron in the *input space*
    neighborhood_for_neuron = self._neuron_neighborhood()
    distance = int((1 - self.col_sz / 3.5) / (self.max_iter - 1) * iteration
                   + self.col_sz / 3.5)
    print('distance is', distance)
    for p_id, p in enumerate(self.neurons):
        # Compute the new value for each neuron.
        # Get the neighborhood of the neuron (input vectors from the
        # topological neighborhood)
        neighborhood = self._neighborhood_set(p_id, distance,
                                              neighborhood_for_neuron)
        neighborhood = list(neighborhood)
        # When there are no neighbours:
        if not neighborhood:
            new_paths.append(p[PATH_CATEGORY])
            empty_neighborhood += 1
            continue
        # Compute the average path of the neighborhood
        average, error = find_median(neighborhood)
        improvement = True
        while improvement:
            counter = BetterCounter()
            # Search how the average path needs to be changed to become the
            # same as each of its neighbours. Count the transformations.
            # There are three transformations:
            # 1) go up (..)
            # 2) go down in the directory structure
            # 3) do nothing (paths are equal)
            for neighbour in neighborhood:
                transformation = find_transformation(average, neighbour)
                # print(average, neighbour, transformation)
                if transformation == '.':
                    breakpoint()
                counter.update([transformation])
            # Apply the most frequent transformation
            most_frequent = counter.most_common()
            if len(most_frequent) == 1:
                # There were no ties --- ideal case
                transformation = most_frequent[0][0]
                if not transformation:
                    # empty string means the paths are equal, we are
                    # finished --- unlikely to happen
                    break
                transformations = [transformation]
            elif len(most_frequent) > 1:
                # There were ties. We are going to try each transformation
                # and find one that gives a smaller error value
                transformations = [x[0] for x in most_frequent]
            else:
                raise RuntimeError('Unexpected length of most_frequent')
            for t in transformations:
                new_average = transform(average, t)
                new_error = SOM.summed_distance(new_average, neighborhood)
                # TODO Try all possible transformations and choose the best
                # one; disadvantage: slows down the algorithm
                if new_error < error:
                    average = new_average
                    error = new_error
                    break
            else:
                # No transformation was better --- we are finished with
                # neuron `p`
                improvement = False
        # This is the new path for the `p_id`-th neuron
        # print('appending', average)
        new_paths.append(average)
    # print('_update_paths(): neighborhood was empty', empty_neighborhood, 'times')
    return new_paths
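
# Both methods above treat find_median() as "pick a representative path and
# its error" and SOM.summed_distance() as the error of a candidate path
# against the whole neighborhood.  The sketch below is hypothetical: it
# assumes the neighborhood is a list of plain path strings and that the
# distance between two paths counts the components that differ; the real
# SOM methods may instead operate on full neuron vectors indexed by
# PATH_CATEGORY.
import os


def path_distance_sketch(a, b):
    """Number of single-component steps needed to turn path `a` into `b`."""
    parts_a = [part for part in a.split(os.sep) if part]
    parts_b = [part for part in b.split(os.sep) if part]
    common = 0
    for x, y in zip(parts_a, parts_b):
        if x != y:
            break
        common += 1
    # Steps up from `a` to the common prefix, then steps down to `b`
    return (len(parts_a) - common) + (len(parts_b) - common)


def summed_distance_sketch(path, neighborhood):
    """Total path distance from `path` to every path in the neighborhood."""
    return sum(path_distance_sketch(path, other) for other in neighborhood)


def find_median_sketch(neighborhood):
    """Neighbour path minimising the summed distance, returned with that error."""
    best = min(neighborhood,
               key=lambda p: summed_distance_sketch(p, neighborhood))
    return best, summed_distance_sketch(best, neighborhood)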