def test_route_steps_distance(self):
    """Check that per-leg values reported by ``tsp.collect_beer`` match the matrix.

    A fixed start coordinate is prepended to the geocode table, the distance
    matrix is computed for it, and a route is searched. For every stop after
    the first, ``brewery[3]`` (presumably the leg distance -- confirm against
    ``tsp.collect_beer``) must be within 1 of the rounded matrix entry for
    the previous and current route elements.
    """
    # Build paths from components so the test is not tied to the Windows
    # path separator.
    data_dir = os.path.join(os.path.abspath(".."), "Data")

    matrix_instance = DistanceMatrix(os.path.dirname(os.path.abspath(".")))
    # NOTE(review): ``error_bad_lines`` is deprecated in recent pandas in
    # favour of ``on_bad_lines``; kept as-is because the pandas version this
    # project targets is not visible here.
    df = pd.read_csv(os.path.join(data_dir, "Geocodes.csv"),
                     error_bad_lines=False)

    # Prepend the starting point so that it becomes row 0.
    new_row = pd.DataFrame({
        'latitude': 51.742503,
        'longitude': 19.432956
    }, index=[0])
    df = pd.concat([new_row, df]).reset_index(drop=True)

    matrix = matrix_instance.calculate_distance_matrix(df)
    self.assertEqual(matrix.shape[0], df.shape[0])

    route, _cost = tsp.min_distance_search(df.shape[0])
    df_brews = pd.read_csv(os.path.join(data_dir, "Breweries.csv"),
                           error_bad_lines=False)
    df_beers = pd.read_csv(os.path.join(data_dir, "Beers.csv"),
                           error_bad_lines=False)
    _collected, breweries = tsp.collect_beer(route, df_beers, df, df_brews)
    self.assertEqual(len(breweries), len(route))

    for index, brewery in enumerate(breweries):
        if index == 0:
            # The first stop has no previous leg to compare against.
            continue
        expected = np.round(matrix[route[index - 1], route[index]], 0)[0]
        self.assertLessEqual(abs(brewery[3] - expected), 1)
def __init__(self, distMatrtix, alignment=None, names=None):
    """Store the inputs and wrap the raw distances in a ``DistanceMatrix``.

    The labels passed to ``DistanceMatrix`` are the ``id`` of every element
    of ``alignment`` when ``alignment`` is truthy, otherwise ``names`` when
    that is truthy.

    Raises:
        RuntimeError: if neither ``alignment`` nor ``names`` is truthy.
    """
    self._alignment = alignment
    self._names = names

    if alignment:
        labels = [record.id for record in alignment]
    elif names:
        labels = names
    else:
        raise RuntimeError("You must pass either an alignment or a list of names.")

    self._distMatrix = DistanceMatrix(distMatrtix, labels)
def test_find_nearest_with_one_circle_point(self):
    """With a single 'blue' entry, k=1 must yield ['blue']."""
    matrix = DistanceMatrix()
    matrix.add_distance(3.8, 'blue')

    nearest = matrix.find_nearest_label(k=1)

    self.assertEqual(['blue'], nearest)
def test_find_nearest_with_same_label(self):
    """Two identical 'blue' entries must still yield a single 'blue' for k=1."""
    matrix = DistanceMatrix()
    for _ in range(2):
        matrix.add_distance(3.8, 'blue')

    nearest = matrix.find_nearest_label(k=1)

    wanted = ['blue']
    self.assertEqual(sorted(wanted), sorted(nearest))
def put_distance_matrices(self, metrics, tmpdir="/tmp", normalise=False):
    """Compute and store one distance matrix per requested metric.

    ``metrics`` may be a single metric or a list of metrics; anything that
    is not a ``list`` is wrapped in a one-element list. For every metric a
    new ``DistanceMatrix`` is built from the trees of ``self.get_records()``
    (``tmpdir`` is forwarded to it), ``get_distance_matrix`` is called with
    the metric and ``normalise``, and the object is stored in
    ``self.distance_matrices`` under the metric.
    """
    metric_list = metrics if isinstance(metrics, list) else [metrics]
    trees = [record.tree for record in self.get_records()]

    for name in metric_list:
        matrix = DistanceMatrix(trees, tmpdir=tmpdir)
        matrix.get_distance_matrix(name, normalise=normalise)
        self.distance_matrices[name] = matrix
def test_distance_matrix(self):
    """Compare every entry of the generated matrix with a direct haversine call.

    The first 30 geocode rows are loaded, ``test_disttance_matrix(30)`` is
    asked for the matrix, and each cell ``[row][col]`` is compared for
    equality against ``haversine.haversine`` evaluated on the longitude and
    latitude of the two corresponding rows.
    """
    sample_size = 30

    # Build the path from components so the test is not tied to the Windows
    # path separator.
    geocodes_path = os.path.join(os.path.abspath(".."), "Data", "Geocodes.csv")
    # NOTE(review): ``error_bad_lines`` is deprecated in recent pandas in
    # favour of ``on_bad_lines``; kept as-is because the targeted pandas
    # version is not visible here.
    df = pd.read_csv(geocodes_path, error_bad_lines=False)
    df = df.drop(columns=['id', 'brewery_id', 'accuracy'])
    df = df.head(sample_size)

    matrix_instance = DistanceMatrix(os.path.dirname(os.path.abspath(".")))
    matrix = matrix_instance.test_disttance_matrix(sample_size)

    for row_idx, row in enumerate(matrix):
        # Look the origin up once per matrix row rather than once per cell.
        origin = df.iloc[row_idx]
        for val_idx, value in enumerate(row):
            target = df.iloc[val_idx]
            actual_val = haversine.haversine(origin['longitude'],
                                             origin['latitude'],
                                             target['longitude'],
                                             target['latitude'])
            self.assertEqual(value, actual_val)
def fit(self, n_clusters):
    '''
    Cluster the samples with the divisive analysis (DIANA) procedure.

    Starting from one cluster that holds every sample, the cluster with the
    largest diameter is split repeatedly until ``n_clusters`` clusters exist.

    arguments
    ----------
    n_clusters - integer
                 number of clusters wanted; must satisfy
                 1 <= n_clusters <= self.n_samples

    returns
    -------
    cluster_labels - numpy array
                     cluster_labels[i] is the cluster number of sample i

    raises
    ------
    ValueError - if n_clusters is outside 1..self.n_samples. Previously such
                 values kept splitting until an empty cluster made the
                 diameter computation fail.
    '''
    if n_clusters < 1 or n_clusters > self.n_samples:
        raise ValueError(
            "n_clusters must be between 1 and n_samples ({}), got {}".format(
                self.n_samples, n_clusters))
    if n_clusters == 1:
        # Nothing to split: every sample belongs to cluster 0.
        return np.zeros(self.n_samples)

    # Assumes DistanceMatrix(self.data) returns a square NumPy array of
    # pairwise dissimilarities -- it is indexed that way below.
    similarity_matrix = DistanceMatrix(self.data)

    # Initially the whole dataset is a single cluster.
    clusters = [list(range(self.n_samples))]

    while len(clusters) < n_clusters:
        # Diameter of a cluster = largest pairwise value inside it.
        diameters = [
            np.max(similarity_matrix[cluster][:, cluster])
            for cluster in clusters
        ]
        widest = np.argmax(diameters)
        remaining = clusters[widest]

        # The member with the largest mean dissimilarity to the rest of its
        # cluster seeds the splinter group.
        seed = np.argmax(
            np.mean(similarity_matrix[remaining][:, remaining], axis=1))
        splinters = [remaining[seed]]
        del remaining[seed]

        # Keep moving members that are, on average, at least as close to the
        # splinter group as to the rest of their own cluster. A single
        # remaining member has nothing to compare against and is never moved
        # (the original reached the same outcome through a NaN comparison).
        moved = True
        while moved and len(remaining) > 1:
            moved = False
            for j in reversed(range(len(remaining))):
                splinter_distances = similarity_matrix[remaining[j], splinters]
                last_distances = similarity_matrix[
                    remaining[j], np.delete(remaining, j, axis=0)]
                if np.mean(splinter_distances) <= np.mean(last_distances):
                    splinters.append(remaining[j])
                    del remaining[j]
                    moved = True
                    break

        del clusters[widest]
        clusters.append(splinters)
        clusters.append(remaining)

    cluster_labels = np.zeros(self.n_samples)
    for label, members in enumerate(clusters):
        cluster_labels[members] = label
    return cluster_labels
def get_distances(csv_file):
    """Look up a distance result for every sufficiently cheap row of a CSV.

    The first row (index 0) is skipped. For every other row the URL is read
    from column 0, the price from column 1 (currency symbols and thousands
    separators stripped) and the address from column 10. Rows priced above
    200000 are skipped; for the rest ``DistanceMatrix(address).get()`` is
    stored under the row index.

    The underlying file is closed even when processing fails.

    Returns:
        dict mapping row index to the value returned by ``DistanceMatrix.get``.
    """
    url_col, price_col, address_col = 0, 1, 10
    csv_file = CsvHelper(csv_file)
    try:
        distance_matrix = {}
        for index, row in enumerate(csv_file.read()):
            if index == 0:
                continue

            url = row[url_col]
            price_text = row[price_col]
            for symbol in ("£", "$", "€", ","):
                price_text = price_text.replace(symbol, "")
            price = int(price_text)
            address = row[address_col]

            if price <= 200000:
                print("distance matrix:", url)
                distance_matrix[index] = DistanceMatrix(address).get()
    finally:
        csv_file.file.close()
    return distance_matrix
def put_distance_matrices(
    self,
    metrics,
    tmpdir='/tmp',
    normalise=False,
):
    """Compute and store one distance matrix per requested metric.

    ``metrics`` is either one metric or a list of them; a non-list value is
    wrapped in a single-element list. Each metric gets its own
    ``DistanceMatrix`` built from the trees of ``self.get_records()`` with
    ``tmpdir`` forwarded, has ``get_distance_matrix`` invoked with the metric
    and ``normalise``, and is saved in ``self.distance_matrices[metric]``.
    """
    if isinstance(metrics, list):
        wanted = metrics
    else:
        wanted = [metrics]

    trees = [record.tree for record in self.get_records()]

    for metric_name in wanted:
        computed = DistanceMatrix(trees, tmpdir=tmpdir)
        computed.get_distance_matrix(metric_name, normalise=normalise)
        self.distance_matrices[metric_name] = computed
def __init__(self, dm):
    """Keep a ``DistanceMatrix``, converting a NumPy array if one is given.

    Raises:
        ValueError: if ``dm`` is neither a ``numpy.ndarray`` nor a
            ``DistanceMatrix``.
    """
    candidate = DistanceMatrix(dm) if isinstance(dm, np.ndarray) else dm
    if isinstance(candidate, DistanceMatrix):
        self.dm = candidate
        return
    raise ValueError(
        'Distance matrix should be a numpy array or treeCl.DistanceMatrix')
def test_indexing(y):
    """Element access on a DistanceMatrix must agree with the square form."""
    condensed = pdist(y)
    square = squareform(condensed)
    dm = DistanceMatrix(condensed)

    # Self-distance is zero and off-diagonal access is symmetric.
    assert dm[0, 0] == 0
    assert dm[0, 1] == dm[1, 0]

    # Both orientations match scipy's square representation.
    for i, j in ((0, 1), (1, 0)):
        assert dm[i, j] == square[i, j]
def test_attrs(y):
    """Array-like attributes of a DistanceMatrix must mirror the square form."""
    condensed = pdist(y)
    square = squareform(condensed)
    dm = DistanceMatrix(condensed)

    assert dm.T == dm
    # NOTE: a comparison of dm.flat[0] against condensed[0] was commented
    # out in the original test and remains disabled here.
    assert dm.size == square.size
    assert dm.ndim == 2
    assert dm.shape == square.shape
    assert len(dm) == dm.shape[0]
def get_inter_tree_distances(self, metric, normalise=False, batchsize=100,
                             background=False):
    """Generate a distance matrix from a fully-populated Collection.

    All arguments are forwarded, together with ``self.trees``, to
    ``_get_inter_tree_distances``. When ``background`` is truthy the raw
    result of that call is handed back unchanged (per the original comment,
    an IPython.parallel map result returned before the jobs finish);
    otherwise it is wrapped in a ``DistanceMatrix`` labelled with
    ``self.names``.
    """
    result = _get_inter_tree_distances(metric, self.trees, normalise,
                                       batchsize, background)
    return result if background else DistanceMatrix(result, self.names)
def test_find_nearest_with_two_identical_labels(self):
    """Two 'blue' entries at different distances give ['blue'] for k=3."""
    matrix = DistanceMatrix()
    for distance in (3.8, 3):
        matrix.add_distance(distance, 'blue')

    nearest = matrix.find_nearest_label(k=3)

    self.assertEqual(['blue'], nearest)
def test_find_nearest_point_between_two(self):
    """With k=1 the label of the smaller distance ('blue' at 3) is expected."""
    matrix = DistanceMatrix()
    for distance, label in ((3.8, 'red'), (3, 'blue')):
        matrix.add_distance(distance, label)

    nearest = matrix.find_nearest_label(k=1)

    self.assertEqual(['blue'], nearest)
def test_find_most_present_label_between_three(self):
    """Two 'red' entries and one 'blue' entry give ['red'] for k=3."""
    matrix = DistanceMatrix()
    entries = ((3.8, 'red'), (3.8, 'red'), (3, 'blue'))
    for distance, label in entries:
        matrix.add_distance(distance, label)

    nearest = matrix.find_nearest_label(k=3)

    self.assertEqual(['red'], nearest)
def test_find_most_present_ambigous_labels_between_four(self):
    """Two 'red' and two 'blue' entries give both labels for k=3 (any order)."""
    matrix = DistanceMatrix()
    entries = ((3.8, 'red'), (3.8, 'red'), (3, 'blue'), (3, 'blue'))
    for distance, label in entries:
        matrix.add_distance(distance, label)

    nearest = matrix.find_nearest_label(k=3)

    wanted = ['blue', 'red']
    self.assertEqual(sorted(wanted), sorted(nearest))
def write_distances(csv_file, json_file):
    """Write an enriched copy of ``csv_file`` using results stored in ``json_file``.

    Rows are numbered from 1; only rows whose number (as a string) is a key
    of the JSON data are written to the new file. For such a row the
    ``"Price"`` field is converted to an int, and the keys/values of
    ``Postal(row["Address"]).info()`` followed by those of
    ``DistanceMatrix.to_h(...)`` are inserted right after column index 10
    (or appended when the row is shorter).

    The output file is named ``new_props_<timestamp>``.
    """
    # Read the JSON before opening any CSV so that a failure here leaves no
    # open handles and no empty output file behind.
    distance_matrix = JsonHelper(json_file).read()

    address_index = 10
    insert_at = address_index + 1

    original_csv_file = CsvHelper(csv_file)
    try:
        new_csv_file = CsvHelper(f"new_props_{time.time()}")
        try:
            for index, row in enumerate(original_csv_file.read()):
                key = str(index + 1)
                if key not in distance_matrix:
                    continue

                result = DistanceMatrix.to_h(distance_matrix[key])
                row["Price"] = int(row["Price"].replace("£", "").replace(
                    "$", "").replace("€", "").replace(",", ""))

                row_keys = list(row.keys())
                row_values = list(row.values())
                postal_info = Postal(row["Address"]).info()

                # Postal fields first, distance fields after them; a slice
                # insertion places them exactly where the element-by-element
                # inserts did.
                row_keys[insert_at:insert_at] = (
                    list(postal_info.keys()) + list(result.keys()))
                row_values[insert_at:insert_at] = (
                    list(postal_info.values()) + list(result.values()))

                new_csv_file.write(dict(zip(row_keys, row_values)))
        finally:
            new_csv_file.file.close()
    finally:
        original_csv_file.file.close()
def main(source_path, destination_path, tensor_dimension):
    """Process every file found under ``source_path``.

    For each file name a ``DistanceMatrix`` is created from ``source_path``,
    ``destination_path``, the part of the name before the first ``'_'`` and
    ``tensor_dimension``; it is then standardised/flattened and saved.
    Both steps are best-effort: a failure is logged and processing goes on
    (saving is still attempted when standardising failed, as before).
    """
    import logging

    logger = logging.getLogger(__name__)

    # NOTE(review): files in sub-directories are still handed ``source_path``
    # rather than the directory they were found in -- confirm this is what
    # DistanceMatrix expects.
    for _root, _dirs, files in os.walk(source_path, topdown=False):
        for name in tqdm(files):
            dis = DistanceMatrix(source_path, destination_path,
                                 name.split('_')[0], tensor_dimension)
            # Catch Exception rather than everything so that Ctrl-C and
            # SystemExit still stop the run.
            try:
                dis.standardise_flatten_distance_matrix()
            except Exception:
                logger.exception("Could not standardise/flatten %r", name)
            try:
                dis.save_file()
            except Exception:
                logger.exception("Could not save result for %r", name)
def test_complex(y):
    """The .imag/.real views must equal matrices built from x.imag/x.real."""
    condensed = pdist(y)
    dm = DistanceMatrix(condensed)

    for part in ("imag", "real"):
        np.testing.assert_array_equal(
            getattr(dm, part).values,
            DistanceMatrix(getattr(condensed, part)).values)
def test_conversions(y):
    """Dense and sparse conversions must equal the upper triangle of the square form."""
    condensed = pdist(y)
    dm = DistanceMatrix(condensed)
    square = squareform(condensed)

    np.testing.assert_array_equal(dm.toarray(), np.triu(square))

    mismatch = dm.tosparse() != sps.coo_matrix(np.triu(square))
    assert mismatch.nnz == 0
def update_distance_matrix(lat, lon):
    """Call ``update_csv`` with ``lat``/``lon`` on a matrix rooted at this file's directory.

    The directory containing this module is passed both to the
    ``DistanceMatrix`` constructor and to ``update_csv``.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    DistanceMatrix(module_dir).update_csv(lat, lon, module_dir)
def dist(self, obj1, obj2):
    """Return the negated ``[0][1]`` entry of the two-tree distance matrix.

    The matrix is built from ``obj1.tree`` and ``obj2.tree`` using
    ``self.metric``.
    """
    pair = [obj1.tree, obj2.tree]
    matrix = DistanceMatrix(pair, self.metric)
    return -matrix[0][1]
class NeighborJoining:
    ##
    # The constructor just saves the data for the execution.
    #
    # @param distMatrtix the distance matrix
    # @param alignment should be specified when the names parameter is not present
    # @param names the names of the taxa in the distance matrix
    # @throws RuntimeError when neither alignment nor names is given
    def __init__(self, distMatrtix, alignment=None, names=None):
        self._alignment = alignment
        self._names = names
        if self._alignment:
            labels = [x.id for x in self._alignment]
        elif self._names:
            labels = self._names
        else:
            raise RuntimeError("You must pass either an alignment or a list of names.")
        self._distMatrix = DistanceMatrix(distMatrtix, labels)

    ##
    # Neighbor-Joining implementation.
    #
    # Starts from a star tree with one child per column name, then repeatedly
    # joins the pair reported by getNearestNeigbors() under a new node
    # "X<iteration>" until only two names are left; those two are finally
    # connected by the remaining distance.
    #
    # NOTE: the distance matrix is modified while the tree is built
    # (removeData / appendData are called on it).
    #
    # @return constructed tree with edges weighted by distances
    @property
    def tree(self):
        L = self._distMatrix.columnNames
        tree = Tree()
        tree.name = "root"
        tree.dist = 0
        for seq in L:
            tree.add_child(name=seq, dist=0)

        iter_count = 1
        while len(L) > 2:
            nearest_nbs = self._distMatrix.getNearestNeigbors()
            node_i = tree.search_nodes(name=nearest_nbs[0])[0]
            node_j = tree.search_nodes(name=nearest_nbs[1])[0]
            L.remove(nearest_nbs[0])
            L.remove(nearest_nbs[1])

            # New internal node that becomes the parent of i and j.
            node_k = Tree()
            node_k.dist = 0
            node_k.name = "X" + str(iter_count)

            d_ij = self._distMatrix.getDistance(node_i.name, node_j.name)
            assert d_ij > 0

            # Branch lengths from the new node to the two joined nodes.
            sep_i = self._distMatrix.getSeparation(node_i.name)
            sep_j = self._distMatrix.getSeparation(node_j.name)
            d_ik = 0.5 * d_ij + 0.5 * (sep_i - sep_j)
            d_jk = 0.5 * d_ij + 0.5 * (sep_j - sep_i)

            tree.remove_child(node_i)
            tree.remove_child(node_j)
            node_k.add_child(node_i, dist=d_ik)
            node_k.add_child(node_j, dist=d_jk)
            tree.add_child(node_k)

            # Distances from the new node to every remaining name.
            # The former "assert d_km > 0" compared this list with an int,
            # which raises TypeError on Python 3 and is always true on
            # Python 2, so it has been dropped.
            d_km = [
                0.5 * (self._distMatrix.getDistance(node_i.name, node_m)
                       + self._distMatrix.getDistance(node_j.name, node_m)
                       - d_ij)
                for node_m in L
            ]

            self._distMatrix.removeData((node_i.name, node_j.name))
            self._distMatrix.appendData(d_km, node_k.name)

            iter_count += 1
            L = self._distMatrix.columnNames

        # Connect the two remaining nodes with the last distance.
        last_nodes = tree.get_children()
        d_ij = self._distMatrix.getDistance(last_nodes[0].name, last_nodes[1].name)
        leaf = None
        new_root = None
        for node in last_nodes:
            if node.is_leaf():
                node.dist = d_ij
                leaf = node.detach()
            else:
                new_root = node.detach()

        if not leaf:
            leaf = last_nodes[0]
            leaf.dist = d_ij

        # NOTE(review): when both remaining nodes are leaves new_root is
        # still None here and the next line fails -- confirm whether
        # two-taxon input has to be supported.
        new_root.add_child(leaf)
        return new_root

    ##
    # @var _distMatrix
    # the distance matrix in more or less arbitrary form
    # @var _names
    # taxa identification strings
    # @var _alignment
    # multiple sequence alignment
def test_return_only_the_minimal_necessary_point(self):
    """Five mixed entries with k=3 must yield exactly 'blue' and 'green' (any order)."""
    matrix = DistanceMatrix()
    entries = (
        (2, 'blue'),
        (3, 'green'),
        (4, 'blue'),
        (4, 'green'),
        (4, 'red'),
    )
    for distance, label in entries:
        matrix.add_distance(distance, label)

    nearest = matrix.find_nearest_label(k=3)

    wanted = ['blue', 'green']
    self.assertEqual(sorted(wanted), sorted(nearest))
def distance_matrix(self, metric, **kwargs):
    """Generate a distance matrix from a fully-populated Collection.

    The trees of ``self.records`` and ``metric`` are passed to
    ``DistanceMatrix`` together with ``tmpdir=self.tmpdir`` and any extra
    keyword arguments.
    """
    return DistanceMatrix(
        [record.tree for record in self.records],
        metric,
        tmpdir=self.tmpdir,
        **kwargs
    )
import numpy as np from distance_matrix import DistanceMatrix import random import pandas as pd import os matrix = DistanceMatrix(os.path.dirname(os.path.abspath(__file__))) limit = 2000 def __initialize__(size): random_list = random.sample(range(1, 1305), size) route = np.array([0] + random_list + [0]) return route def __initialize_empty__(size): route = np.zeros((size, 2)) return route def evaluate_distance(): pass def min_distance_search(size): # initialize values route = __initialize_empty__(size) # route contains vertex id and distance to it from previous vertex cost = 0 # total distance vertex = 0 # current vertex read = {} # read lines from distance matrix file (minimizes reading from file)