Пример #1
0
    def fit(self, tweets):
        """Fit region-to-region transition probabilities from tweet data.

        The first row of every ``region`` group supplies the
        ``latitude``/``longitude`` used for the pairwise great-circle
        distances (scaled by 6371.0088, i.e. kilometres).  Two ingredients
        are combined:

        * a distance-decay kernel ``exp(-beta * distance_km)``, normalised so
          each row sums to one (stored as ``self.seed``);
        * a rank-based prior ``rank ** -zipfs`` where regions are ranked by
          their number of tweets, largest first (stored as
          ``self.region_probabilities``).

        Their row-normalised product is stored, stacked into a Series, as
        ``self.transition_mx``.  ``self.regions`` and ``self.distances`` are
        set as well.

        Parameters
        ----------
        tweets : pandas.DataFrame
            Must provide ``region``, ``latitude`` and ``longitude`` columns.
        """
        by_region = tweets.groupby('region')

        # One representative row per region, ordered by region label.
        region_table = by_region.head(1).set_index('region').sort_index()
        self.regions = region_table

        coords_rad = np.radians(region_table[['latitude', 'longitude']])
        km_matrix = pd.DataFrame(
            6371.0088 * haversine_distances(coords_rad),
            index=region_table.index,
            columns=region_table.index,
        )
        self.distances = km_matrix.stack()

        # Distance-decay kernel; the tiny offset keeps every entry positive.
        decay = np.exp(-self.beta * km_matrix)
        decay += 0.0000001
        decay = decay.div(decay.sum(axis=1), axis=0)
        self.seed = decay

        # Zipf-like prior: probability depends only on the size rank.
        tweets_per_region = by_region.size().sort_values(ascending=False)
        ranks = np.arange(1, tweets_per_region.shape[0] + 1)
        prior = np.power(ranks, -self.zipfs)
        prior += 0.0000001
        prior = pd.Series(
            prior / np.sum(prior),
            index=tweets_per_region.index,
        ).sort_index()
        self.region_probabilities = prior

        weighted = prior * decay
        self.transition_mx = weighted.div(weighted.sum(axis=1), axis=0).stack()
    def test_haversine_vectorized(self):
        """Check ``haversine_dist`` against scikit-learn on all staypoint pairs."""
        csv_path = os.path.join("tests", "data", "geolife", "geolife_staypoints.csv")
        staypoints = ti.read_staypoints_csv(csv_path, tz="utc", index_col="id")
        xs = staypoints.geometry.x.values
        ys = staypoints.geometry.y.values

        # Every unordered pair (i, j) with i < j exactly once.
        first, second = np.triu_indices(len(xs), k=1)

        # our distance
        ours = haversine_dist(xs[first], ys[first], xs[second], ys[second])

        # their distance: coordinates in radians, passed as (y, x) columns
        xs_rad = np.asarray([radians(value) for value in xs])
        ys_rad = np.asarray([radians(value) for value in ys])
        yx_rad = np.column_stack((ys_rad, xs_rad))

        theirs = (haversine_distances(yx_rad, yx_rad) * 6371000)[first, second]

        # Summed absolute deviation over all pairs must stay below 1 cm.
        assert np.sum(np.abs(ours - theirs)) < 0.01
Пример #3
0
 def process_similarity(self, similarity):
     """Fill ``self._similarity_matrix`` from ``self._attribute_matrix``.

     ``"dot"`` replaces the whole matrix with the dense product of the
     attribute matrix and its transpose.  Every other supported name writes
     only the strict upper triangle: ``"cosine"`` stores the cosine
     similarity, the remaining ones store ``1 / (1 + value)`` of the
     corresponding pairwise function.

     Raises
     ------
     Exception
         If ``similarity`` is not one of the supported names.
     """
     if similarity == "dot":
         self._similarity_matrix = (self._attribute_matrix @ self._attribute_matrix.T).toarray()
         return

     # Metrics that need a dense attribute matrix.
     dense_only = ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']

     if similarity not in ("cosine", "euclidean", "manhattan", "haversine", "chi2",
                           'cityblock', 'l1', 'l2') and similarity not in dense_only:
         raise Exception("Not implemented similarity")

     rows, cols = np.triu_indices(self._similarity_matrix.shape[0], k=1)

     if similarity == "cosine":
         scores = cosine_similarity(self._attribute_matrix)
     elif similarity == "euclidean":
         scores = 1 / (1 + euclidean_distances(self._attribute_matrix))
     elif similarity == "manhattan":
         scores = 1 / (1 + manhattan_distances(self._attribute_matrix))
     elif similarity == "haversine":
         scores = 1 / (1 + haversine_distances(self._attribute_matrix))
     elif similarity == "chi2":
         scores = 1 / (1 + chi2_kernel(self._attribute_matrix))
     elif similarity in ['cityblock', 'l1', 'l2']:
         scores = 1 / (1 + pairwise_distances(self._attribute_matrix, metric=similarity))
     else:
         scores = 1 / (1 + pairwise_distances(self._attribute_matrix.toarray(), metric=similarity))

     # Only the strict upper triangle is written.
     self._similarity_matrix[rows, cols] = scores[rows, cols]
    def compute_clusters(self):
        """
        Find clusters using DBSCAN algorithm

        The clustering runs on a precomputed distance that blends the
        normalized haversine distance between points with the date
        distances returned by ``transform_data``.

        Returns
        -------
            tup : tuple
                centroids, number of points, and sizes of the clusters found,
                exactly as returned by ``get_cluster_data``
        """
        X, date_distances = self.transform_data()

        # scikit method takes radians
        X_rad = np.radians(X)

        # 2-D table of haversine distances between each pair of points
        distance_pairs = haversine_distances(X_rad, X_rad)

        # Normalize distances. When all points coincide the maximum is 0 and
        # dividing would turn the whole table into NaN (which DBSCAN cannot
        # use), so the all-zero table is kept unchanged in that case.
        max_distance = distance_pairs.max()
        if max_distance > 0:
            distance_pairs /= max_distance

        # Weight of space and time distances
        # Found by experimentation
        prop = 0.98

        # Distance is weighted average of space distance and time distance
        space_time_distance = prop * distance_pairs + (1 - prop) * date_distances

        # epsilon is the max distance for 2 points to be considered "close"
        # 0.014 has been found by experimentation
        Y = DBSCAN(eps=0.014,
                   metric="precomputed").fit_predict(space_time_distance)

        return self.get_cluster_data(X, Y)
Пример #5
0
def find_closest_ll(input_ll, reference_ll, n=1):
    """ Find the closest pairing of longitude and latitudes from the input set
        to the reference set

    Neighbours are selected with ``NearestNeighbors`` on the raw
    ``longitude``/``latitude`` columns; the great-circle (haversine) distance
    to every selected neighbour is then computed.

    As in earlier versions, both frames gain ``longitude_rad`` and
    ``latitude_rad`` columns as a side effect.

    Parameters
    ----------
    input_ll, reference_ll : pandas.DataFrame
        Frames with ``longitude`` and ``latitude`` columns in degrees.
    n : int
        Number of neighbours to look up per input row.

    Returns
    -------
    distances : numpy.ndarray
        1-D array; ``distances[k]`` belongs to ``indices.flatten()[k]``.
        Scaled by ``EARTH_RADIUS / 1000`` (kilometres, assuming
        ``EARTH_RADIUS`` is given in metres).
    indices : numpy.ndarray
        Positions into ``reference_ll`` as returned by ``kneighbors``,
        one row per input row.
    """

    nbrs = NearestNeighbors(n_neighbors=n).fit(
        reference_ll[['longitude', 'latitude']].values)
    _, indices = nbrs.kneighbors(input_ll[['longitude', 'latitude']].values)

    input_ll['longitude_rad'] = input_ll['longitude'].apply(radians)
    input_ll['latitude_rad'] = input_ll['latitude'].apply(radians)
    reference_ll['longitude_rad'] = reference_ll['longitude'].apply(radians)
    reference_ll['latitude_rad'] = reference_ll['latitude'].apply(radians)

    # Haversine needs (latitude, longitude); the previous code passed the
    # columns in (longitude, latitude) order and got wrong distances.
    rad_cols = ['latitude_rad', 'longitude_rad']
    neighbours_per_row = indices.shape[1]

    # Repeat each input row so it lines up with each of its neighbours
    # (row-major, matching indices.flatten()).
    origin = np.repeat(
        np.asarray(input_ll[rad_cols].values, dtype=float),
        neighbours_per_row,
        axis=0,
    )
    target = np.asarray(
        reference_ll[rad_cols].values, dtype=float)[indices.flatten()]

    # Row-wise haversine formula.
    half_dlat = (target[:, 0] - origin[:, 0]) / 2
    half_dlon = (target[:, 1] - origin[:, 1]) / 2
    a = (np.sin(half_dlat) ** 2
         + np.cos(origin[:, 0]) * np.cos(target[:, 0]) * np.sin(half_dlon) ** 2)
    # Guard against rounding pushing `a` marginally above 1.
    angular = 2 * np.arcsin(np.sqrt(np.minimum(a, 1.0)))

    distances = angular * EARTH_RADIUS / 1000  # km

    return distances, indices
Пример #6
0
def binned_variance_batch(inds1, inds2, bin_edges, coords, X):
    """Bin pairwise differences of ``X`` by haversine distance for one batch.

    Parameters
    ----------
    inds1, inds2 : sequence of two ints
        ``(start, stop)`` ranges into ``coords``/``X`` selecting the rows and
        the columns of this batch's distance matrix.
    bin_edges : numpy.ndarray
        1-D array of distance bin edges in km; bin ``i`` covers
        ``[bin_edges[i], bin_edges[i + 1])``.
    coords : array-like
        Points handed to ``haversine_distances`` (which expects
        ``[latitude, longitude]`` in radians).
    X : array-like
        Observations aligned positionally with ``coords``.

    Returns
    -------
    n_samples : numpy.ndarray of int64
        Number of point pairs in each bin.
    means, variances : numpy.ndarray
        Mean and variance of the pairwise differences in each bin
        (0 for empty bins).
    """
    row_start, col_start = inds1[0], inds2[0]

    # Compute distances.
    distances = haversine_distances(coords[row_start:inds1[1]],
                                    coords[col_start:inds2[1]])

    # Convert distances to km.
    distances *= 6371000 / 1000

    # Entries on or above the k-th diagonal are overwritten with -1 so that
    # they fall outside every bin (as long as the bin edges are
    # non-negative); only the remaining entries are binned below.
    distances[np.triu_indices(n=distances.shape[0],
                              m=distances.shape[1],
                              k=abs(row_start - col_start))] = -1

    values = np.asarray(X)
    n_bins = bin_edges.shape[0] - 1
    n_samples = np.empty((n_bins, ), dtype=np.int64)
    means = np.empty((n_bins, ))
    variances = np.empty((n_bins, ))

    for bin_index, (lower, upper) in enumerate(zip(bin_edges[:-1],
                                                   bin_edges[1:])):
        # Positions of all pairs whose distance falls into this bin.
        rows, cols = np.nonzero((lower <= distances) & (distances < upper))
        # One vectorised lookup replaces the former per-pair Python loop.
        diffs = (values[row_start + rows]
                 - values[col_start + cols]).astype(np.float64)

        n = diffs.size
        n_samples[bin_index] = n
        means[bin_index] = np.mean(diffs) if n else 0
        variances[bin_index] = np.var(diffs) if n else 0

    return n_samples, means, variances
Пример #7
0
def obtain_dist(c_a, c_b):
    """Return the haversine distance between two points given in degrees.

    Both arguments are iterables of angles; each angle is converted to
    radians before the points are handed to ``haversine_distances``.  The
    result is scaled by 6371000 (Earth's mean radius in metres).
    """
    points_rad = [[radians(angle) for angle in point] for point in (c_a, c_b)]
    pairwise = haversine_distances(points_rad)
    # Entry [0][1] of the 2x2 matrix is the distance between the two points.
    return pairwise[0][1] * 6371000
def gen_distance_matrix(df: pd.DataFrame, cluster: int) -> pd.DataFrame:
    """
    Build the pairwise haversine distance matrix for the points of one cluster.

    :param df: dataframe with 'cluster', 'x_coordinate', 'y_coordinate' and
        'identificatie_vbo' columns
    :param cluster: value of the 'cluster' column to select
    :return:
        return_df: a pandas dataframe, using 'identificatie_vbo' as both row
        and column labels, with the distance between those points as value
        in kilometers
    """
    members = df[df['cluster'] == cluster]

    # Degrees -> radians, keeping the (x_coordinate, y_coordinate) order.
    # NOTE(review): haversine_distances reads the first value of each point
    # as latitude and the second as longitude - confirm that x_coordinate
    # holds the latitude.
    points_rad = [[radians(x), radians(y)]
                  for x, y in members[['x_coordinate', 'y_coordinate']].values]

    # Scale the unit-sphere distances by Earth's mean radius in km.
    kilometres = haversine_distances(points_rad) * 6371.0088

    labels = members.identificatie_vbo
    return pd.DataFrame(kilometres, columns=labels, index=labels)
Пример #9
0
def distance_matrix(X1: np.ndarray,
                    X2: np.ndarray,
                    units: str = "km",
                    fast_dist: bool = False) -> np.ndarray:
    """
    Compute the distance between every point of X1 and every point of X2.

    Three modes, checked in this order:

    - ``fast_dist=True``: great circle (haversine) distance scaled by
      ``EARTH_RADIUS``; ``units`` is ignored.
    - ``units`` given: geodesic distance via ``geopy.distance.geodesic``,
      read from the attribute named by ``units`` (e.g. ``"km"``).
    - ``units=None``: plain Euclidean distance from
      ``scipy.spatial.distance.cdist``.

    NOTE:
    - points should be formatted in rows as [lat, lon]
    - a single point may be passed as a 1-D sequence
    """
    # enforce 2d array in case of single point
    first = np.atleast_2d(X1)
    second = np.atleast_2d(X2)

    if fast_dist:
        # great circle distances; haversine works on radians
        return haversine_distances(np.radians(first),
                                   np.radians(second)) * EARTH_RADIUS

    if units is None:
        # Euclidean distance
        return cdist(first, second)

    def geodesic_in_units(s_i, s_j):
        # geodesic distance expressed in the requested units
        return getattr(geodesic(s_i, s_j), units)

    return cdist(first, second, geodesic_in_units)
Пример #10
0
def get_haversine(x):
    """Return the scaled haversine distance from a row to a fixed point.

    ``x`` must provide ``'Latitude'`` and ``'Longitude'`` in degrees; the
    fixed reference point is (41.8889, -87.6264).
    """
    reference_lat, reference_lon = 41.8889, -87.6264
    here = [radians(x['Latitude']), radians(x['Longitude'])]
    there = [radians(reference_lat), radians(reference_lon)]
    # NOTE(review): 6357000 is used as the sphere radius; the mean Earth
    # radius is usually taken as about 6371000 m - confirm this is intended.
    scaled = haversine_distances([here, there]) * 6357000
    return scaled[0][1]
Пример #11
0
def sklearn_example():
    """Print the 2x2 distance matrix (km) between two airports."""
    # Ezeiza Airport (Buenos Aires, Argentina) and
    # Charles de Gaulle Airport (Paris, France), as [lat, lon] in degrees
    airports_deg = (
        [-34.83333, -58.5166646],
        [49.0083899664, 2.53844117956],
    )
    airports_rad = [[radians(angle) for angle in airport]
                    for airport in airports_deg]
    angular = haversine_distances(airports_rad)
    # multiply by Earth radius to get kilometers
    print(angular * 6371000/1000)
def haversine(row):
    """Return the distance in km between the two stations of a row.

    ``row`` must provide ``rad_lat_i``/``rad_lon_i`` and
    ``rad_lat_j``/``rad_lon_j``; the values are passed to
    ``haversine_distances`` unchanged.
    """
    endpoints = [
        [row['rad_lat_i'], row['rad_lon_i']],
        [row['rad_lat_j'], row['rad_lon_j']],
    ]
    # multiply by Earth radius to get kilometers
    kilometres = haversine_distances(endpoints) * 6371000/1000
    return kilometres[0][1]
Пример #13
0
def haversine_distance(orig_long, orig_lat, dest_long, dest_lat):
    """Return the great-circle distance in km between origin and destination.

    All four arguments are angles in degrees.
    """
    # Points are handed over as [latitude, longitude] in radians.
    origin = [radians(orig_lat), radians(orig_long)]
    destination = [radians(dest_lat), radians(dest_long)]
    pairwise = haversine_distances([origin, destination])
    return pairwise[0][1] * 6371000 / 1000
Пример #14
0
def great_circle(loc1, lat2, long2):
    """Return the distances in km from the points in ``loc1`` to one point.

    ``loc1`` is converted with ``np.array`` and passed on as the first
    argument of ``haversine_distances``; ``(lat2, long2)`` forms the single
    comparison point.  All angles are converted from degrees to radians.
    The result has one column (the comparison point).
    """
    others_rad = np.radians(np.array(loc1))
    target_rad = np.radians(np.array([lat2, long2]).reshape(1, 2))
    return haversine_distances(others_rad, target_rad) * 6371000 / 1000
Пример #15
0
def calc_matrices(invar, lon, lat, return_all=False):
    """
    Calculate correlation, covariance, and distance matrices in preparation
    for clustering.

    Parameters
    ----------
    invar : ARRAY (Time x Lat x Lon)
        Input variable
    lon : ARRAY (Lon)
        Longitudes
    lat : ARRAY (Lat)
        Latitudes
    return_all : BOOL, optional
        Set to true to return non-nan points, indices, and coordinates. The default is False.

    Returns
    -------
    srho: ARRAY [npts x npts]
        Correlation Matrix
    scov: ARRAY [npts x npts]
        Covariance Matrix
    sdist: ARRAY [npts x npts]
        Distance Matrix (haversine distance scaled by 6371, i.e. km)
    okdata: ARRAY, only if return_all
        Data of the retained points, as returned by proc.find_nan
    okpts: only if return_all
        Index of the retained points, as returned by proc.find_nan
    coords2: ARRAY [npts x 2], only if return_all
        [latitude, longitude] of the retained points, in radians

    """

    # ---------------------
    # Remove All NaN Points
    # ---------------------
    ntime, nlat, nlon = invar.shape
    varrs = invar.reshape(ntime, nlat * nlon)
    okdata, _, okpts = proc.find_nan(varrs, 0)

    # ---------------------------------------------
    # Calculate Correlation and Covariance Matrices
    # ---------------------------------------------
    # One row per point. The single-argument form already yields the
    # npts x npts matrices; passing the data twice built a matrix four
    # times as large only to slice this block out of it.
    # atleast_2d keeps the [1 x 1] shape when a single point remains.
    points = okdata.T
    srho = np.atleast_2d(np.corrcoef(points))
    scov = np.atleast_2d(np.cov(points))

    # --------------------------
    # Calculate Distance Matrix
    # --------------------------
    lonmesh, latmesh = np.meshgrid(lon, lat)
    lonlat = np.vstack([lonmesh.flatten(), latmesh.flatten()]).T[okpts, :]
    # Swap columns to [latitude, longitude] and convert to radians,
    # which is what haversine_distances expects.
    coords2 = np.radians(lonlat[:, ::-1])
    sdist = haversine_distances(coords2, coords2) * 6371

    if return_all:
        return srho, scov, sdist, okdata, okpts, coords2
    return srho, scov, sdist
def get_pairwise_dists(df, lat_col, lng_col):
    """Return the pairwise haversine distances between the rows of ``df``.

    ``lat_col`` and ``lng_col`` name the latitude and longitude columns (in
    degrees).  The result is a square DataFrame labelled by ``df.index`` on
    both axes, scaled to feet.
    """
    # approximate radius of earth in ft (mi * ft/mi)
    earth_radius_ft = 3959.87433 * 5280

    coords_rad = pd.DataFrame([
        df[lat_col].apply(math.radians),
        df[lng_col].apply(math.radians),
    ]).T
    angular = pd.DataFrame(
        haversine_distances(coords_rad),
        index=df.index,
        columns=df.index,
    )
    # converting radians to feet
    return angular * earth_radius_ft
Пример #17
0
def mask_sig_to_cluster(mask_and_data_s, wght_area, distance_eps, min_area_samples,
                        n_jobs=-1):
    """Cluster the unmasked grid cells with DBSCAN, separately per sign.

    For each sign (-1, then +1) the coordinates of all selected cells across
    all lags are clustered together on a precomputed haversine distance
    matrix in kilometers.  Labels of the second sign continue after the
    largest label assigned so far.

    Parameters
    ----------
    mask_and_data_s : object
        Must expose ``mask``, ``data``, ``longitude``, ``latitude`` and
        ``lag`` attributes.
        NOTE(review): presumably an xarray object - confirm against caller.
    wght_area : array
        Per-cell weights; indexed with a 2-D boolean (lat x lon) mask and
        passed to DBSCAN as ``sample_weight``.
    distance_eps : float
        DBSCAN ``eps``; compared against distances in kilometers.
    min_area_samples : float
        DBSCAN ``min_samples``.
    n_jobs : int, optional
        Forwarded to DBSCAN.

    Returns
    -------
    np_regs : numpy.ndarray of int, shape (n_lags, n_lat, n_lon)
        Cluster label per cell; 0 where no cluster was assigned.
    labels_sign_lag : list of tuple
        ``(label, sign)`` for every non-zero label found.
    """
    from sklearn import cluster
    from math import radians as _r
    from sklearn.metrics.pairwise import haversine_distances

    # True where the input mask is False, i.e. cells that are kept.
    mask_sig_1d = mask_and_data_s.mask.astype('bool').values == False
    data = mask_and_data_s.data
    lons = mask_and_data_s.longitude.values
    lats = mask_and_data_s.latitude.values
    n_lags = mask_and_data_s.lag.size

    np_dbregs   = np.zeros( (n_lags, lats.size, lons.size), dtype=int )
    labels_sign_lag = []
    # Largest label handed out so far; new labels are offset by it.
    label_start = 0

    for sign in [-1, 1]:
        # Keep only the selected cells whose data has the current sign.
        mask = mask_sig_1d.copy()
        mask[np.sign(data) != sign] = False
        n_gc_sig_sign = mask[mask==True].size
        # Row l marks which entries of the flattened coordinate list
        # belong to lag l.
        labels_for_lag = np.zeros( (n_lags, n_gc_sig_sign), dtype=bool)
        meshgrid = np.meshgrid(lons.data, lats.data)
        mask_sig = np.reshape(mask, (n_lags, lats.size, lons.size))
        sign_coords = [] ; count=0
        weights_core_samples = []
        for l in range(n_lags):
            # (longitudes, latitudes) of the selected cells at this lag
            sign_c = meshgrid[0][ mask_sig[l,:,:] ], meshgrid[1][ mask_sig[l,:,:] ]
            n_sign_c_lag = len(sign_c[0])
            labels_for_lag[l][count:count+n_sign_c_lag] = True
            count += n_sign_c_lag
            # shape sign_coords = [(lats, lons)], in radians.
            # 180 is subtracted from the longitude before the conversion
            # NOTE(review): presumably maps 0..360 to -180..180 - confirm.
            sign_coords.append( [[_r(sign_c[1][i]), _r(sign_c[0][i]-180)] for i in range(sign_c[0].size)] )
            weights_core_samples.append(wght_area[mask_sig[l,:,:]].reshape(-1))

        sign_coords = flatten(sign_coords)
        if len(sign_coords) != 0:
            weights_core_samples = flatten(weights_core_samples)
            # calculate distance between sign coords across all lags to keep labels
            # more consistent when clustering
            distance = haversine_distances(sign_coords) * 6371000/1000 # multiply by Earth radius to get kilometers
            dbresult = cluster.DBSCAN(eps=distance_eps, min_samples=min_area_samples,
                                      metric='precomputed', n_jobs=n_jobs).fit(distance,
                                      sample_weight=weights_core_samples)
            # Shift so that clusters start at 1 and noise (-1) becomes 0.
            labels = dbresult.labels_ + 1
            # all labels == -1 (now 0) are seen as noise:
            # setting them to -label_start makes them 0 again after the
            # offset is added on the next line.
            labels[labels==0] = -label_start
            individual_labels = labels + label_start
            # Record every real (non-zero) label together with its sign.
            [labels_sign_lag.append((l, sign)) for l in np.unique(individual_labels) if l != 0]

            # Write the labels back onto the grid, lag by lag.
            for l in range(n_lags):
                mask_sig_lag = mask[l,:,:]==True
                np_dbregs[l,:,:][mask_sig_lag] = individual_labels[labels_for_lag[l]]
            # The next sign continues numbering after the largest label used.
            label_start = int(np_dbregs[mask].max())
        else:
            # Nothing selected for this sign: leave the grid untouched.
            pass
        np_regs = np.array(np_dbregs, dtype='int')
    return np_regs, labels_sign_lag
Пример #18
0
def take_dist_mat(df):
    '''
    Pairwise haversine distance matrix between the rows of ``df``, in km,
    rounded to two decimals.  ``df.lat`` and ``df.lon`` are read as degrees.
    '''
    points_rad = [
        [radians(lat), radians(lon)]
        for lat, lon in zip(df.lat.tolist(), df.lon.tolist())
    ]
    kilometres = haversine_distances(points_rad, points_rad) * 6371
    return np.round(kilometres, 2)
Пример #19
0
def distance_to_station(my_cords, station_cords):
    """Return the distance in kilometers between two coordinates.

    Both arguments are iterables of angles in degrees.
    """
    points_rad = [
        [radians(angle) for angle in my_cords],
        [radians(angle) for angle in station_cords],
    ]
    # multiply by Earth radius to get kilometers
    kilometres = haversine_distances(points_rad) * 6371000 / 1000
    # Off-diagonal entry of the 2x2 matrix.
    return kilometres[1][0]
Пример #20
0
    def test_example_from_sklean(self):
        """``haversine_dist`` agrees with scikit-learn on the airport example."""
        bsas = [-34.83333, -58.5166646]
        paris = [49.0083899664, 2.53844117956]

        expected = haversine_distances([
            [radians(angle) for angle in bsas],
            [radians(angle) for angle in paris],
        ]) * 6371000

        # haversine_dist is called with the second coordinate first.
        actual = haversine_dist(bsas[1], bsas[0], paris[1], paris[0])

        assert np.abs(expected[1][0] - actual) < 0.01
Пример #21
0
def calc_distance(a, b):
    """Return the haversine distance in kilometers between ``a`` and ``b``.

    Both positions are iterables of angles in degrees.
    """
    # Convert positions a and b from degrees to radians
    endpoints_rad = [[math.radians(angle) for angle in position]
                     for position in (a, b)]

    # multiply by Earth radius to get kilometers
    kilometres = haversine_distances(endpoints_rad) * 6371

    return kilometres[0, 1]
Пример #22
0
def get_max_distance(coordinates):
    """Gets the maximum pairwise haversine distance of a set of co-ordinates.

    The value is returned exactly as computed by ``haversine_distances``;
    no Earth radius is applied.

    Parameters:
        coordinates (numpy array of lat, lon): list of points, passed to
            ``haversine_distances`` unchanged

    Returns:
        maximum distance between given points
    """
    return np.max(haversine_distances(coordinates))
    def haversine_adapted(point_1, point_2):
        """Return the haversine distance in km between two points in degrees."""
        # lat lon to radians for haversine
        endpoints = [[radians(angle) for angle in point]
                     for point in (point_1, point_2)]

        # convert to km
        kilometres = haversine_distances(endpoints) * (6371000 / 1000)

        # 2x2 matrix with zeros on the diagonal and the distance elsewhere
        return kilometres[0][1]
Пример #24
0
def test_haversine_distances():
    """Compare ``haversine_distances`` with a direct per-pair formula."""
    def slow_haversine_distances(x, y):
        # Reference implementation for a single pair of points.
        lat_x, lon_x = x[0], x[1]
        lat_y, lon_y = y[0], y[1]
        a = np.sin((lat_y - lat_x) / 2)**2 + (np.cos(lat_x) * np.cos(lat_y) *
                                              np.sin((lon_y - lon_x) / 2)**2)
        return 2 * np.arcsin(np.sqrt(a))

    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 2))
    Y = rng.random_sample((10, 2))
    expected = np.array([[slow_haversine_distances(x, y) for y in Y]
                         for x in X])
    assert_array_almost_equal(expected, haversine_distances(X, Y))

    # Input with n_features != 2 must be rejected.
    X = rng.random_sample((10, 3))
    with pytest.raises(ValueError,
                       match="Haversine distance only valid in 2 dimensions"):
        haversine_distances(X)
Пример #25
0
def gps_distance(p1: list, p2: list):
    """
    Haversine distance between two coordinate points.

    @param p[1/2]:  Coordinate Point (latitude, longitude) in degrees
    @return:        The Distance between the coordinate points [meter]

    """
    r_earth = 6371000  # earth radius in meter
    endpoints_rad = [[math.radians(angle) for angle in point]
                     for point in (p1, p2)]
    return (haversine_distances(endpoints_rad) * r_earth)[0][1]
Пример #26
0
def get_pairs(geos, df):
    """Get pairwise comparisons

    Within every state, each treated tract (``status == "Selected"``) is
    paired with the nearest untreated tract that has non-missing 2018
    ``annual_change`` data.  Nearness is the haversine distance between the
    ``intptlat``/``intptlon`` points.  States without any treated or without
    any untreated tract are skipped.

    Parameters
    ----------
    geos : DataFrame with ``geoid``, ``status``, ``statefp``, ``intptlat``
        and ``intptlon`` columns.
    df : DataFrame with ``tract``, ``year`` and ``annual_change`` columns.

    Returns
    -------
    DataFrame in long format, one row per (pair, treatment role, year), with
    ``pair_id``, ``treatment``, ``tract``, ``dist`` (great-circle angle in
    radians), ``statefp``, ``annual_change``, ``year`` and
    ``post_treatment`` (``year >= 2018``).
    """

    # Clean centroid data
    geos_for_pairwise_comp = (
        geos.set_index("geoid")
        .assign(treated=lambda x: x["status"] == "Selected")[
            ["statefp", "intptlat", "intptlon", "treated"]]
        .transform_column("intptlat", float)
        .transform_column("intptlon", float))

    # Tracts with nonmissing housing price data
    with_data = set(
        df.query("year == 2018").dropna(
            subset=["annual_change"])["tract"].unique())

    # haversine_distances reads each point as [latitude, longitude]; the
    # columns used to be passed the other way round, which distorted the
    # distances and therefore the choice of the nearest control.
    coord_cols = ["intptlat", "intptlon"]

    pair_dfs = []
    for state in geos_for_pairwise_comp.statefp.unique():
        state_data = geos_for_pairwise_comp.query("statefp == @state")
        x = np.radians(state_data.query("treated")[coord_cols])
        y = np.radians(state_data.query("not treated")[coord_cols])
        if x.empty or y.empty:
            # Nothing to pair in this state (and the distance routine
            # rejects empty input).
            continue
        x_index = x.index
        y_index = y.index

        y_index_data = y_index.isin(with_data)
        dist_mat = haversine_distances(X=x, Y=y)

        # Distance is infinity to places with missing data in order to exclude them
        dist_mat[:, ~y_index_data] = np.inf
        min_dist_control = y_index[dist_mat.argmin(axis=1)]

        pair_dfs.append(
            pd.DataFrame({
                "treated": x_index,
                "untreated": min_dist_control,
                "dist": dist_mat.min(axis=1),
            }).assign(statefp=state))
    pair_df = pd.concat(pair_dfs)

    # One row per tract of each pair, joined with its yearly outcomes.
    pair_df = (
        pair_df.reset_index(drop=True)
        .reset_index()
        .melt(["statefp", "index", "dist"])
        .sort_values("index")
        .rename_column("variable", "treatment")
        .rename_column("value", "tract")
        .reset_index(drop=True)
        .merge(df[["tract", "annual_change", "year"]], on="tract", how="left")
        .sort_values(["statefp", "year", "index", "treatment"])
        .rename_column("index", "pair_id")
        .assign(post_treatment=lambda x: x.year >= 2018))
    return pair_df
Пример #27
0
def kantenmodell(d):
    """Add per-edge deltas between consecutive rows ("Kantenmodell" = edge model).

    Row ``i`` receives the change from row ``i`` to row ``i + 1``; the last
    row has no successor and gets NaN in every new column.

    Parameters
    ----------
    d : pandas.DataFrame
        Needs the columns ``Elevation``, ``HV Battery SOC_%_``,
        ``Latitude_deg_`` and ``Longitude_deg_`` (angles in degrees).

    Returns
    -------
    pandas.DataFrame
        The same frame ``d`` (modified in place) with the added columns
        ``elev_delta``, ``soc_delta`` and ``distance`` (great-circle
        distance to the next row in km).
    """
    ### Elevation Change
    d['elev_delta'] = d['Elevation'].shift(-1) - d['Elevation']
    ### State of Charge Change
    d['soc_delta'] = d['HV Battery SOC_%_'].shift(-1) - d['HV Battery SOC_%_']
    ### Distance
    # The haversine formula works on radians; the columns hold degrees.
    position = np.radians(d[['Latitude_deg_', 'Longitude_deg_']].astype(float))
    following = position.shift(-1)

    lat_from = position['Latitude_deg_'].values
    lon_from = position['Longitude_deg_'].values
    lat_to = following['Latitude_deg_'].values
    lon_to = following['Longitude_deg_'].values

    # Row-wise haversine: only consecutive pairs are needed, so no full
    # N x N distance matrix is built. The NaN successor of the last row
    # propagates to a NaN distance.
    a = (np.sin((lat_to - lat_from) / 2) ** 2
         + np.cos(lat_from) * np.cos(lat_to)
         * np.sin((lon_to - lon_from) / 2) ** 2)
    angular = 2 * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))

    # multiply by Earth radius (m) and convert to kilometers
    d['distance'] = angular * 6371000 / 1000
    return d
    def get_cluster_data(self, X, Y):
        """
        Use clustering computed by DBSCAN to find:
        * centroid of each cluster
        * number of points per cluster
        * radius of each cluster

        Points labelled -1 (noise) belong to no cluster and are ignored.

        Parameters
        ----------
            X : numpy array
                points clustered, in degrees
            Y : numpy array
                cluster decision vector
        Returns
        -------
            centroids : list
                centroids of clusters
            num_points : list
                number of points in clusters
            sizes : list
                sizes of clusters (in kilometers)
        """

        centroids = []
        num_points = []
        sizes = []

        labels = np.asarray(Y)
        # An empty label vector means no clusters (np.max would raise).
        n_clusters = int(labels.max()) + 1 if labels.size else 0

        for label in range(n_clusters):
            points_in_cluster = X[labels == label]

            # Centroid is arithmetic mean of point coordinates
            centroid = np.mean(points_in_cluster, axis=0)
            centroids.append(centroid)

            num_points.append(len(points_in_cluster))

            # Radius of cluster is distance from centroid to farthest point.
            # All points are compared with the centroid in a single call.
            if len(points_in_cluster):
                size = haversine_distances(
                    np.radians(points_in_cluster),
                    np.radians([centroid]),
                ).max()
            else:
                size = 0

            # Multiply by radius of Earth to get kilometers
            sizes.append(size * 6371)

        return centroids, num_points, sizes
Пример #29
0
def parse_toy_data(data_dir="."):
    """Load the toy city list and return normalised pairwise distances.

    Reads ``cities-us0.txt`` from ``data_dir``, skipping its first line.
    On every other line the second and third whitespace-separated fields are
    read as latitude and longitude in degrees, and the remaining fields form
    the city name.

    Returns
    -------
    tuple
        ``(condensed, names, count)``: the haversine distances divided by
        their maximum and condensed with ``squareform``, the city names as
        an array, and the number of cities.
    """
    coords, names = [], []

    with open(f"{data_dir}/cities-us0.txt", "r") as in_file:
        # ignore first line
        next(in_file, None)
        for line in in_file:
            fields = line.split()
            coords.append((radians(float(fields[1])),
                           radians(float(fields[2]))))
            names.append(" ".join(fields[3:]))

    # Unit-sphere distances; multiply by 6371 for km if ever needed.
    dists = haversine_distances(np.array(coords))
    dists /= dists.max()
    return squareform(dists), np.array(names), len(names)
def connectTrafficData(accData, trafData, inplace=True, hardsave=False):
    '''
    Attaches traffic data to accident data by matching every accident to the
    closest traffic checkpoint (CP) of the same year.

    Parameters:
        accData: Pandas dataframe of the accident data with 'Year', 'Latitude'
            and 'Longitude' columns (degrees).
        trafData: Pandas dataframe of traffic data with 'year', 'latitude',
            'longitude' (degrees), 'count_point_id' and 'all_motor_vehicles'
            columns.
        inplace: Default True. If True, adds the columns 'CP', 'Traffic',
            'CPlatitude' and 'CPlongitude' to accData and returns None.
            If False, accData is left untouched and the closest array is
            returned.
        hardsave: Default False. If True (and inplace is True) the resulting
            DataFrame is saved in the data directory.

    Returns:
        closest: (only when inplace is False) array with one row per accident,
            in the row order of accData, holding
            [distance, count_point_id, all_motor_vehicles, CP latitude,
            CP longitude]. The distance is the great-circle angle in radians
            (multiply by 6371 for kilometers). Rows of a year without any
            traffic data keep the placeholder value 10 in every column.
    '''
    #Haversine distance finds the actual distance between two points given their latitude and longitude
    #Accuracy for Haversine formula is within 1%, doesn't account for ellipsoidal shape of the earth.
    from sklearn.metrics.pairwise import haversine_distances

    closest = np.ones((len(accData), 5)) * 10

    acc_years = accData['Year'].values
    # haversine_distances works on radians, not degrees.
    acc_locs = np.radians(
        np.asarray(accData[['Latitude', 'Longitude']].values, dtype=float))

    for year in np.unique(acc_years):
        # Positions of this year's accidents inside accData. Writing to these
        # rows keeps `closest` aligned with accData even when accData is not
        # sorted by year.
        acc_rows = np.flatnonzero(acc_years == year)

        curTraf = trafData[trafData['year'] == year]
        if curTraf.empty:
            # No checkpoint to match against: keep the placeholder values.
            continue

        traf_deg = np.asarray(
            curTraf[['latitude', 'longitude']].values, dtype=float)
        traf_locs = np.radians(traf_deg)
        cp_ids = curTraf['count_point_id'].values
        volumes = curTraf['all_motor_vehicles'].values

        # One accident at a time keeps memory bounded for large inputs.
        for row in acc_rows:
            distances = haversine_distances(acc_locs[row].reshape((1, -1)),
                                            traf_locs)
            CPindex = distances.argmin()
            closest[row, 0] = distances[0, CPindex]
            closest[row, 1] = cp_ids[CPindex]
            closest[row, 2] = volumes[CPindex]
            closest[row, 3] = traf_deg[CPindex, 0]
            closest[row, 4] = traf_deg[CPindex, 1]

    if inplace:
        accData['CP'] = closest[:, 1].copy()
        accData['Traffic'] = closest[:, 2].copy()
        accData['CPlatitude'] = closest[:, 3].copy()
        accData['CPlongitude'] = closest[:, 4].copy()
        if hardsave:
            accData.to_csv("data/accidents_2005_to_2014_wTraffic.csv")
    else:
        return closest
Пример #31
0
def test_haversine_distances():
    """Compare ``haversine_distances`` with a direct per-pair formula."""
    def slow_haversine_distances(x, y):
        # Reference implementation for a single pair of points.
        lat_x, lon_x = x[0], x[1]
        lat_y, lon_y = y[0], y[1]
        a = np.sin((lat_y - lat_x) / 2) ** 2 + (
            np.cos(lat_x) * np.cos(lat_y) * np.sin((lon_y - lon_x)/2) ** 2
        )
        return 2 * np.arcsin(np.sqrt(a))

    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 2))
    Y = rng.random_sample((10, 2))
    expected = np.array([[slow_haversine_distances(x, y) for y in Y]
                         for x in X])
    assert_array_almost_equal(expected, haversine_distances(X, Y))

    # Input with n_features != 2 must be rejected.
    X = rng.random_sample((10, 3))
    assert_raise_message(ValueError,
                         "Haversine distance only valid in 2 dimensions",
                         haversine_distances, X)
Пример #32
0
def test_pairwise_distances():
    """Check ``pairwise_distances`` against the dedicated metric functions.

    All inputs are drawn from one ``RandomState(0)``, so the order of the
    ``random_sample`` calls below determines the data of every later check.
    Covered: Euclidean (dense, tuples, sparse), haversine, cityblock /
    manhattan, cosine, minkowski with a keyword argument, and the errors
    raised for sparse input with a scipy metric and for an unknown metric.
    """
    # Test the pairwise_distance helper function.
    rng = np.random.RandomState(0)

    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    S = pairwise_distances(X, metric="euclidean")
    S2 = euclidean_distances(X)
    assert_array_almost_equal(S, S2)

    # Euclidean distance, with Y != X.
    Y = rng.random_sample((2, 4))
    S = pairwise_distances(X, Y, metric="euclidean")
    S2 = euclidean_distances(X, Y)
    assert_array_almost_equal(S, S2)

    # Test with tuples as X and Y
    X_tuples = tuple([tuple([v for v in row]) for row in X])
    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
    S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
    assert_array_almost_equal(S, S2)

    # Test haversine distance
    # The data should be valid latitude and longitude
    # (first column scaled to [-pi/2, pi/2], second to [-pi, pi])
    X = rng.random_sample((5, 2))
    X[:, 0] = (X[:, 0] - 0.5) * 2 * np.pi/2
    X[:, 1] = (X[:, 1] - 0.5) * 2 * np.pi
    S = pairwise_distances(X, metric="haversine")
    S2 = haversine_distances(X)
    assert_array_almost_equal(S, S2)

    # Test haversine distance, with Y != X
    Y = rng.random_sample((2, 2))
    Y[:, 0] = (Y[:, 0] - 0.5)*2*np.pi/2
    Y[:, 1] = (Y[:, 1] - 0.5)*2*np.pi
    S = pairwise_distances(X, Y, metric="haversine")
    S2 = haversine_distances(X, Y)
    assert_array_almost_equal(S, S2)

    # "cityblock" uses scikit-learn metric, cityblock (function) is
    # scipy.spatial.
    S = pairwise_distances(X, metric="cityblock")
    S2 = pairwise_distances(X, metric=cityblock)
    assert_equal(S.shape[0], S.shape[1])
    assert_equal(S.shape[0], X.shape[0])
    assert_array_almost_equal(S, S2)

    # The manhattan metric should be equivalent to cityblock.
    S = pairwise_distances(X, Y, metric="manhattan")
    S2 = pairwise_distances(X, Y, metric=cityblock)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)

    # Test cosine as a string metric versus cosine callable
    # The string "cosine" uses sklearn.metric,
    # while the function cosine is scipy.spatial
    S = pairwise_distances(X, Y, metric="cosine")
    S2 = pairwise_distances(X, Y, metric=cosine)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)

    # Test with sparse X and Y,
    # currently only supported for Euclidean, L1 and cosine.
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)
    S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
    S2 = euclidean_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
    S2 = cosine_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    # Manhattan is checked with mixed sparse formats (CSC, BSR, COO) ...
    S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan")
    S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo())
    assert_array_almost_equal(S, S2)
    # ... and against the dense computation.
    S2 = manhattan_distances(X, Y)
    assert_array_almost_equal(S, S2)

    # Test with scipy.spatial.distance metric, with a kwd
    kwds = {"p": 2.0}
    S = pairwise_distances(X, Y, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, Y, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)

    # same with Y = None
    kwds = {"p": 2.0}
    S = pairwise_distances(X, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)

    # Test that scipy distance metrics throw an error if sparse matrix given
    assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski")
    assert_raises(TypeError, pairwise_distances, X, Y_sparse,
                  metric="minkowski")

    # Test that a value error is raised if the metric is unknown
    assert_raises(ValueError, pairwise_distances, X, Y, metric="blah")