Exemplo n.º 1
0
    def test_route_steps_distance(self):
        """Check each collected brewery's step distance against the matrix.

        A start location is prepended to the geocode table, the distance
        matrix is computed for it, and a route is searched. For every
        brewery entry after the first, element 3 of the entry must be within
        1 of the rounded matrix value looked up for the previous and the
        current route entries.
        """
        matrix_instance = DistanceMatrix(os.path.dirname(os.path.abspath(".")))
        # Path components are passed separately so os.path.join picks the
        # separator; a literal backslash only resolves on Windows.
        # NOTE(review): error_bad_lines was removed in pandas 2.0
        # (on_bad_lines="skip" is the replacement) - confirm the pinned
        # pandas version before upgrading.
        df = pd.read_csv(os.path.join(os.path.abspath(".."), "Data",
                                      "Geocodes.csv"),
                         error_bad_lines=False)

        # Row 0 becomes the start location of the route.
        new_row = pd.DataFrame({
            'latitude': 51.742503,
            'longitude': 19.432956
        },
                               index=[0])
        df = pd.concat([new_row, df]).reset_index(drop=True)
        matrix = matrix_instance.calculate_distance_matrix(df)
        self.assertEqual(matrix.shape[0], df.shape[0])
        route, cost = tsp.min_distance_search(df.shape[0])

        df_brews = pd.read_csv(os.path.join(os.path.abspath(".."), "Data",
                                            "Breweries.csv"),
                               error_bad_lines=False)
        df_beers = pd.read_csv(os.path.join(os.path.abspath(".."), "Data",
                                            "Beers.csv"),
                               error_bad_lines=False)
        collected, breweries = tsp.collect_beer(route, df_beers, df, df_brews)
        self.assertEqual(len(breweries), len(route))
        for index, brewery in enumerate(breweries):
            # The first entry has no predecessor to measure from.
            if index == 0:
                continue
            expected_step = np.round(
                matrix[route[index - 1], route[index]], 0)[0]
            self.assertLessEqual(abs(brewery[3] - expected_step), 1)
Exemplo n.º 2
0
 def __init__(self, distMatrtix, alignment=None, names=None):
     # Keep both label sources, then wrap the raw matrix together with the
     # labels taken from the alignment (preferred) or from the names list.
     self._alignment = alignment
     self._names = names
     if self._alignment:
         labels = [record.id for record in self._alignment]
     elif self._names:
         labels = self._names
     else:
         raise RuntimeError("You must pass either an alignment or a list of names.")
     self._distMatrix = DistanceMatrix(distMatrtix, labels)
    def test_find_nearest_with_one_circle_point(self):
        """With a single 'blue' entry, the k=1 lookup must yield ['blue']."""
        dm = DistanceMatrix()
        dm.add_distance(3.8, 'blue')

        nearest = dm.find_nearest_label(k=1)

        self.assertEqual(['blue'], nearest)
    def test_find_nearest_with_same_label(self):
        """Two identical 'blue' entries must still yield a single 'blue'."""
        dm = DistanceMatrix()
        for _ in range(2):
            dm.add_distance(3.8, 'blue')

        nearest = dm.find_nearest_label(k=1)

        # Compared order-insensitively.
        self.assertEqual(sorted(['blue']), sorted(nearest))
    def put_distance_matrices(self, metrics, tmpdir="/tmp", normalise=False):
        """
        Build one DistanceMatrix per metric and store it on this object.

        metrics   - a metric or a list of metrics; anything that is not a
                    list is wrapped into a one-element list
        tmpdir    - passed on to the DistanceMatrix constructor
        normalise - passed on to get_distance_matrix
        Each matrix is saved in self.distance_matrices keyed by its metric.
        """
        metric_list = metrics if isinstance(metrics, list) else [metrics]
        record_trees = [record.tree for record in self.get_records()]
        for name in metric_list:
            matrix = DistanceMatrix(record_trees, tmpdir=tmpdir)
            matrix.get_distance_matrix(name, normalise=normalise)
            self.distance_matrices[name] = matrix
Exemplo n.º 6
0
 def test_distance_matrix(self):
     """Compare the first 30x30 matrix entries with direct haversine values.

     The first 30 geocode rows are paired with each other and every value
     returned by test_disttance_matrix(30) must equal the haversine result
     for the corresponding pair of rows.
     """
     # Path components are passed separately so os.path.join picks the
     # separator; a literal backslash only resolves on Windows.
     # NOTE(review): error_bad_lines was removed in pandas 2.0
     # (on_bad_lines="skip" is the replacement) - confirm the pinned
     # pandas version before upgrading.
     df = pd.read_csv(os.path.join(os.path.abspath(".."), "Data",
                                   "Geocodes.csv"),
                      error_bad_lines=False)
     df = df.drop(columns=['id', 'brewery_id', 'accuracy'])
     df = df.head(30)
     matrix_instance = DistanceMatrix(os.path.dirname(os.path.abspath(".")))
     matrix = matrix_instance.test_disttance_matrix(30)
     for row_idx, row in enumerate(matrix):
         origin = df.iloc[row_idx]
         for val_idx, value in enumerate(row):
             target = df.iloc[val_idx]
             # Arguments are passed longitude first, then latitude.
             actual_val = haversine.haversine(origin['longitude'],
                                              origin['latitude'],
                                              target['longitude'],
                                              target['latitude'])
             self.assertEqual(value, actual_val)
Exemplo n.º 7
0
    def fit(self, n_clusters):
        '''
        Cluster the data with the Divisive Analysis algorithm.

        Starting from one cluster that holds every sample, the cluster with
        the largest diameter is split into a splinter group and a remainder.
        Each pass adds exactly one cluster, and splitting stops as soon as
        n_clusters clusters exist (so n_clusters <= 1 performs no split).

        arguments
        ---------
        n_clusters - integer
                     number of clusters we want

        returns
        -------
        cluster_labels - numpy array
                         entry i holds the cluster number assigned to sample i
        '''
        similarity_matrix = DistanceMatrix(
            self.data)  # similarity matrix of the data
        clusters = [
            list(range(self.n_samples))
        ]  # list of clusters, initially the whole dataset is a single cluster

        # The cluster count starts at 1 and grows by one per pass, so an
        # equality test against n_clusters would never be met for
        # n_clusters <= 1; compare with "<" instead.
        while len(clusters) < n_clusters:
            # Diameter of a cluster = largest value in its own sub-matrix.
            c_diameters = [
                np.max(similarity_matrix[cluster][:, cluster])
                for cluster in clusters
            ]
            # Index of the cluster with the largest diameter.
            max_cluster_dia = np.argmax(c_diameters)
            # Position (inside that cluster) of the member whose mean value
            # against the rest of the cluster is largest; it seeds the
            # splinter group.
            max_difference_index = np.argmax(
                np.mean(similarity_matrix[clusters[max_cluster_dia]]
                        [:, clusters[max_cluster_dia]],
                        axis=1))
            splinters = [clusters[max_cluster_dia][max_difference_index]]
            # Same list object as clusters[max_cluster_dia]; it is removed
            # from `clusters` and re-appended once the split is finished.
            last_clusters = clusters[max_cluster_dia]
            del last_clusters[max_difference_index]
            while True:
                split = False
                # Walk the remainder from the back; move the first member
                # that is on average no farther from the splinter group than
                # from the rest of the remainder, then rescan.
                for j in range(len(last_clusters))[::-1]:
                    splinter_distances = similarity_matrix[last_clusters[j],
                                                           splinters]
                    last_distances = similarity_matrix[
                        last_clusters[j],
                        np.delete(last_clusters, j, axis=0)]
                    if np.mean(splinter_distances) <= np.mean(last_distances):
                        splinters.append(last_clusters[j])
                        del last_clusters[j]
                        split = True
                        break
                if not split:
                    break
            del clusters[max_cluster_dia]
            clusters.append(splinters)
            clusters.append(last_clusters)

        cluster_labels = np.zeros(self.n_samples)
        for i in range(len(clusters)):
            cluster_labels[clusters[i]] = i

        return cluster_labels
Exemplo n.º 8
0
def get_distances(csv_file):
    """Look up a DistanceMatrix result for every affordable row of a CSV.

    The first row is skipped. For each later row the URL (column 0), the
    price (column 1, stripped of currency symbols and thousands commas) and
    the address (column 10) are read; rows priced above 200000 are ignored.
    Returns a dict mapping the row index to ``DistanceMatrix(address).get()``.
    The helper's underlying file is closed whether or not an error occurs.
    """
    url_col, price_col, address_col = 0, 1, 10
    reader = CsvHelper(csv_file)
    try:
        results = {}
        for row_number, row in enumerate(reader.read()):
            if row_number == 0:
                continue

            url = row[url_col]
            price_text = row[price_col]
            for symbol in ("£", "$", "€", ","):
                price_text = price_text.replace(symbol, "")
            price = int(price_text)
            address = row[address_col]

            if price <= 200000:
                print("distance matrix:", url)
                results[row_number] = DistanceMatrix(address).get()
    finally:
        reader.file.close()

    return results
    def put_distance_matrices(
        self,
        metrics,
        tmpdir='/tmp',
        normalise=False,
    ):
        """
        Compute a DistanceMatrix for each metric and keep it on this object.

        metrics   - one metric or a list of them (a non-list value is
                    wrapped into a single-element list)
        tmpdir    - handed to the DistanceMatrix constructor
        normalise - handed to get_distance_matrix
        Results are stored in self.distance_matrices under the metric.
        """
        if isinstance(metrics, list):
            wanted = metrics
        else:
            wanted = [metrics]

        all_trees = [record.tree for record in self.get_records()]
        for current in wanted:
            built = DistanceMatrix(all_trees, tmpdir=tmpdir)
            built.get_distance_matrix(current, normalise=normalise)
            self.distance_matrices[current] = built
Exemplo n.º 10
0
    def __init__(self, dm):
        """Store a DistanceMatrix, wrapping a raw numpy array if given one.

        Raises ValueError when ``dm`` is neither a numpy array nor a
        DistanceMatrix.
        """
        matrix = DistanceMatrix(dm) if isinstance(dm, np.ndarray) else dm
        if isinstance(matrix, DistanceMatrix):
            self.dm = matrix
            return
        raise ValueError(
            'Distance matrix should be a numpy array or treeCl.DistanceMatrix'
        )
Exemplo n.º 11
0
def test_indexing(y):
    """Indexing a DistanceMatrix must agree with the squareform array."""
    condensed = pdist(y)
    square = squareform(condensed)
    dm = DistanceMatrix(condensed)

    assert dm[0, 0] == 0
    assert dm[0, 1] == dm[1, 0]
    # Both off-diagonal orientations match the dense reference.
    for i, j in ((0, 1), (1, 0)):
        assert dm[i, j] == square[i, j]
Exemplo n.º 12
0
def test_attrs(y):
    """Array-like attributes must match those of the squareform array."""
    condensed = pdist(y)
    square = squareform(condensed)
    dm = DistanceMatrix(condensed)

    assert dm.T == dm
    # Disabled check kept from the original:
    # np.testing.assert_array_equal(d.flat[0], x[0])
    assert dm.size == square.size
    assert dm.ndim == 2
    assert dm.shape == square.shape
    assert len(dm) == dm.shape[0]
Exemplo n.º 13
0
 def get_inter_tree_distances(self,
                              metric,
                              normalise=False,
                              batchsize=100,
                              background=False):
     """ Compute the distances between this object's trees.

     With ``background`` set, the value returned by the helper is handed
     back unchanged (per the original comment, an IPython.parallel map
     result that may not be finished yet); otherwise it is wrapped in a
     DistanceMatrix labelled with self.names.
     """
     distances = _get_inter_tree_distances(metric, self.trees, normalise,
                                           batchsize, background)
     return distances if background else DistanceMatrix(distances, self.names)
    def test_find_nearest_with_two_identical_labels(self):
        """Two 'blue' entries at different distances give ['blue'] for k=3."""
        dm = DistanceMatrix()
        for distance in (3.8, 3):
            dm.add_distance(distance, 'blue')

        nearest = dm.find_nearest_label(k=3)

        self.assertEqual(['blue'], nearest)
    def test_find_nearest_point_between_two(self):
        """With k=1, the closer entry ('blue' at 3) beats 'red' at 3.8."""
        dm = DistanceMatrix()
        for distance, label in ((3.8, 'red'), (3, 'blue')):
            dm.add_distance(distance, label)

        nearest = dm.find_nearest_label(k=1)

        self.assertEqual(['blue'], nearest)
    def test_find_most_present_label_between_three(self):
        """With k=3, two 'red' entries outvote the single closer 'blue'."""
        dm = DistanceMatrix()
        for distance, label in ((3.8, 'red'), (3.8, 'red'), (3, 'blue')):
            dm.add_distance(distance, label)

        nearest = dm.find_nearest_label(k=3)

        self.assertEqual(['red'], nearest)
    def test_find_most_present_ambigous_labels_between_four(self):
        """Two 'red' and two 'blue' entries with k=3 yield both labels."""
        dm = DistanceMatrix()
        entries = ((3.8, 'red'), (3.8, 'red'), (3, 'blue'), (3, 'blue'))
        for distance, label in entries:
            dm.add_distance(distance, label)

        nearest = dm.find_nearest_label(k=3)

        # Compared order-insensitively.
        self.assertEqual(sorted(['blue', 'red']), sorted(nearest))
Exemplo n.º 18
0
def write_distances(csv_file, json_file):
    """Write a new CSV whose rows are enriched with postal and distance data.

    For every row of ``csv_file`` whose 1-based position (as a string) is a
    key of the JSON data, the price is converted to an int, and the keys and
    values of ``Postal(address).info()`` followed by those of
    ``DistanceMatrix.to_h(...)`` are inserted right after position 10 of the
    row. The resulting row is written to a new file named
    ``new_props_<timestamp>``. Both helper files are closed at the end.
    """
    source = CsvHelper(csv_file)
    target = CsvHelper(f"new_props_{time.time()}")
    distances_by_row = JsonHelper(json_file).read()
    address_index = 10

    try:
        for position, row in enumerate(source.read()):
            lookup = str(position + 1)
            if lookup not in distances_by_row:
                continue

            result = DistanceMatrix.to_h(distances_by_row[lookup])

            price_text = row["Price"]
            for symbol in ("£", "$", "€", ","):
                price_text = price_text.replace(symbol, "")
            row["Price"] = int(price_text)

            row_keys = list(row.keys())
            row_values = list(row.values())

            postal_info = Postal(row["Address"]).info()

            # Postal columns first, distance columns directly after them,
            # all placed immediately behind the address position.
            splice_at = address_index + 1
            row_keys[splice_at:splice_at] = [*postal_info.keys(),
                                             *result.keys()]
            row_values[splice_at:splice_at] = [*postal_info.values(),
                                               *result.values()]

            target.write(dict(zip(row_keys, row_values)))
    finally:
        source.file.close()
        target.file.close()
Exemplo n.º 19
0
def main(source_path, destination_path, tensor_dimension):
    """Standardise and save a distance matrix for every file under a tree.

    Walks ``source_path`` bottom-up. For each file name found, a
    DistanceMatrix is built from ``source_path``, ``destination_path``, the
    part of the file name before the first underscore, and
    ``tensor_dimension``; it is then standardised/flattened and saved.
    Both steps are best-effort: a failure in one file does not stop the
    walk, and saving is still attempted after a failed standardisation.
    """
    # NOTE(review): the walked directory is not passed on - every file is
    # handed source_path, even when found in a sub-directory. Confirm intent.
    for _root, _dirs, files in os.walk(source_path, topdown=False):
        for name in tqdm(files):
            dis = DistanceMatrix(source_path, destination_path,
                                 name.split('_')[0], tensor_dimension)
            # Catch Exception rather than everything so Ctrl-C and
            # SystemExit still interrupt a long run.
            try:
                dis.standardise_flatten_distance_matrix()
            except Exception:
                pass
            try:
                dis.save_file()
            except Exception:
                pass
Exemplo n.º 20
0
def test_complex(y):
    """The imag/real views must equal matrices built from imag/real parts."""
    condensed = pdist(y)
    dm = DistanceMatrix(condensed)
    expected_imag = DistanceMatrix(condensed.imag).values
    np.testing.assert_array_equal(dm.imag.values, expected_imag)
    expected_real = DistanceMatrix(condensed.real).values
    np.testing.assert_array_equal(dm.real.values, expected_real)
Exemplo n.º 21
0
def test_conversions(y):
    """Dense and sparse conversions must equal the upper triangle."""
    condensed = pdist(y)
    dm = DistanceMatrix(condensed)
    upper = np.triu(squareform(condensed))
    np.testing.assert_array_equal(dm.toarray(), upper)
    # No stored element may differ from the sparse reference.
    mismatches = dm.tosparse() != sps.coo_matrix(upper)
    assert mismatches.nnz == 0
Exemplo n.º 22
0
def update_distance_matrix(lat, lon):
    """Call update_csv for (lat, lon) on a matrix rooted at this file's dir."""
    here = os.path.dirname(os.path.abspath(__file__))
    DistanceMatrix(here).update_csv(lat, lon, here)
Exemplo n.º 23
0
 def dist(self, obj1, obj2):
     """Return the negated [0][1] entry of the two objects' tree matrix."""
     pair_matrix = DistanceMatrix([obj1.tree, obj2.tree], self.metric)
     return -pair_matrix[0][1]
Exemplo n.º 24
0
class NeighborJoining:

    ##
    # The constructor just saves the data for the execution.
    #
    # Labels for the matrix come from the record ids of the alignment when
    # one is given, otherwise from the names list.
    #
    # @param distMatrtix the distance matrix
    # @param alignment should be specified when the names parameter is not present
    # @param names the names of the taxa in the distance matrix
    # @exception RuntimeError when neither an alignment nor names are passed
    def __init__(self, distMatrtix, alignment=None, names=None):
        self._alignment = alignment
        self._names = names
        if self._alignment:
            self._distMatrix = DistanceMatrix(distMatrtix, [x.id for x in self._alignment])
        elif self._names:
            self._distMatrix = DistanceMatrix(distMatrtix, self._names)
        else:
            raise RuntimeError("You must pass either an alignment or a list of names.")

    ##
    # Neighbor-Joining implementation.
    #
    # Starts from a star tree with one child per column name and repeatedly
    # joins the pair reported by getNearestNeigbors() under a new node
    # "X<n>", updating the distance matrix, until two columns remain.
    #
    # @return constructed tree with edges weighted by distances
    @property
    def tree(self):
        L = self._distMatrix.columnNames
        tree = Tree()
        tree.name = "root"
        tree.dist = 0
        for seq in L:
            tree.add_child(name=seq, dist=0)

        iter_count = 1
        while len(L) > 2:
            nearest_nbs = self._distMatrix.getNearestNeigbors()
            node_i = tree.search_nodes(name=nearest_nbs[0])[0]
            node_j = tree.search_nodes(name=nearest_nbs[1])[0]
            L.remove(nearest_nbs[0])
            L.remove(nearest_nbs[1])

            # New internal node that becomes the parent of the joined pair.
            node_k = Tree()
            node_k.dist = 0
            node_k.name = "X" + str(iter_count)
            d_ij = self._distMatrix.getDistance(node_i.name, node_j.name)
            assert d_ij > 0
            # Branch lengths from the new node to each joined neighbour.
            d_ik = 0.5 * d_ij + 0.5 * (self._distMatrix.getSeparation(node_i.name) - self._distMatrix.getSeparation(node_j.name))
            d_jk = 0.5 * d_ij + 0.5 * (self._distMatrix.getSeparation(node_j.name) - self._distMatrix.getSeparation(node_i.name))

            tree.remove_child(node_i)
            tree.remove_child(node_j)
            node_k.add_child(node_i, dist=d_ik)
            node_k.add_child(node_j, dist=d_jk)
            tree.add_child(node_k)

            # Distances from the new node to every remaining column.
            d_km = []
            for node_m in L:
                d_m = 0.5 * (self._distMatrix.getDistance(node_i.name, node_m) + self._distMatrix.getDistance(node_j.name, node_m) - d_ij)
                # Check the value just computed: comparing the whole list
                # with 0 raises TypeError on Python 3.
                assert d_m > 0
                d_km.append(d_m)

            self._distMatrix.removeData((node_i.name, node_j.name))
            self._distMatrix.appendData(d_km, node_k.name)

            iter_count+=1
            L = self._distMatrix.columnNames

        # Two nodes remain under the temporary root: hang one under the
        # other and return that one as the new root.
        last_nodes = tree.get_children()
        d_ij = self._distMatrix.getDistance(last_nodes[0].name, last_nodes[1].name)
        leaf = None
        new_root = None
        for node in last_nodes:
            if node.is_leaf():
                node.dist = d_ij
                leaf = node.detach()
            else:
                new_root = node.detach()
        if not leaf:
            # Neither remaining node is a leaf: attach the first one.
            leaf = last_nodes[0]
            leaf.dist = d_ij
        new_root.add_child(leaf)

        return new_root

    ##
    # @var _distMatrix
    # the distance matrix in more or less arbitrary form
    # @var _names
    # taxa identification strings
    # @var _alignment
    # multiple sequence alignment
    def test_return_only_the_minimal_necessary_point(self):
        """With k=3 and a three-way tie at distance 4, expect blue and green."""
        dm = DistanceMatrix()
        entries = (
            (2, 'blue'),
            (3, 'green'),
            (4, 'blue'),
            (4, 'green'),
            (4, 'red'),
        )
        for distance, label in entries:
            dm.add_distance(distance, label)

        nearest = dm.find_nearest_label(k=3)

        # Compared order-insensitively.
        self.assertEqual(sorted(['blue', 'green']), sorted(nearest))
Exemplo n.º 26
0
 def distance_matrix(self, metric, **kwargs):
     """ Build a DistanceMatrix over the trees of this object's records.

     ``metric`` and any extra keyword arguments are forwarded to the
     DistanceMatrix constructor together with ``tmpdir=self.tmpdir``.
     """
     return DistanceMatrix([record.tree for record in self.records],
                           metric,
                           tmpdir=self.tmpdir,
                           **kwargs)
Exemplo n.º 27
0
import numpy as np
from distance_matrix import DistanceMatrix
import random
import pandas as pd
import os

# Module-level DistanceMatrix built from the directory containing this file;
# it is created at import time.
matrix = DistanceMatrix(os.path.dirname(os.path.abspath(__file__)))
# NOTE(review): presumably an upper bound used by the route search; its use
# and units are not visible here - confirm.
limit = 2000


def __initialize__(size):
    """Return a random route array that starts and ends at vertex 0.

    ``size`` distinct ids are sampled from 1..1304 and placed between a
    leading and a trailing 0.
    """
    picks = random.sample(range(1, 1305), size)
    return np.array([0, *picks, 0])


def __initialize_empty__(size):
    """Return a zero-filled array with ``size`` rows and 2 columns."""
    return np.zeros((size, 2))


def evaluate_distance():
    """Placeholder with no behaviour yet; returns None."""


def min_distance_search(size):
    # initialize values
    route = __initialize_empty__(size)  # route contains vertex id and distance to it from previous vertex
    cost = 0  # total distance
    vertex = 0  # current vertex
    read = {}  # read lines from distance matrix file (minimizes reading from file)