Example #1
def test_ball_tree_pickle():
    np.random.seed(0)
    X = np.random.random((10, 3))

    bt1 = BallTree(X, leaf_size=1)
    # Test if BallTree with callable metric is picklable
    bt1_pyfunc = BallTree(X, metric=dist_func, leaf_size=1, p=2)

    # BallTree.query returns (distances, indices)
    dist1, ind1 = bt1.query(X)
    dist1_pyfunc, ind1_pyfunc = bt1_pyfunc.query(X)

    def check_pickle_protocol(protocol):
        s = pickle.dumps(bt1, protocol=protocol)
        bt2 = pickle.loads(s)

        s_pyfunc = pickle.dumps(bt1_pyfunc, protocol=protocol)
        bt2_pyfunc = pickle.loads(s_pyfunc)

        dist2, ind2 = bt2.query(X)
        dist2_pyfunc, ind2_pyfunc = bt2_pyfunc.query(X)

        assert_array_almost_equal(ind1, ind2)
        assert_array_almost_equal(dist1, dist2)

        assert_array_almost_equal(ind1_pyfunc, ind2_pyfunc)
        assert_array_almost_equal(dist1_pyfunc, dist2_pyfunc)

    for protocol in (0, 1, 2):
        yield check_pickle_protocol, protocol
Example #2
def test_ball_tree_pickle():
    rng = check_random_state(0)
    X = rng.random_sample((10, 3))

    bt1 = BallTree(X, leaf_size=1)
    # Test if BallTree with callable metric is picklable
    bt1_pyfunc = BallTree(X, metric=dist_func, leaf_size=1, p=2)

    dist1, ind1 = bt1.query(X)
    dist1_pyfunc, ind1_pyfunc = bt1_pyfunc.query(X)

    def check_pickle_protocol(protocol):
        s = pickle.dumps(bt1, protocol=protocol)
        bt2 = pickle.loads(s)

        s_pyfunc = pickle.dumps(bt1_pyfunc, protocol=protocol)
        bt2_pyfunc = pickle.loads(s_pyfunc)

        dist2, ind2 = bt2.query(X)
        dist2_pyfunc, ind2_pyfunc = bt2_pyfunc.query(X)

        assert_array_almost_equal(ind1, ind2)
        assert_array_almost_equal(dist1, dist2)

        assert_array_almost_equal(ind1_pyfunc, ind2_pyfunc)
        assert_array_almost_equal(dist1_pyfunc, dist2_pyfunc)

        assert isinstance(bt2, BallTree)

    for protocol in (0, 1, 2):
        check_pickle_protocol(protocol)
Example #3
def similar_products2(deep_f):
    qs = Product.objects.all()
    df = read_frame(qs)
    df['idx'] = range(1, len(df) + 1)
    feature_list = []
    asin_list = []

    for prod in qs:
        feature_list.append(prod.get_features())
        asin_list.append(prod.asin)

    nparray = np.asarray(feature_list)
    # print(nparray)
    tree = BallTree(nparray)
    # query expects a 2D array: one row per query point
    dist, ind = tree.query(np.asarray(deep_f).reshape(1, -1), k=5)
    print(ind)
    recom = ind[0]
    recommended_asins = []

    for i in recom:
        recommended_asins.append(asin_list[i])
    recommended_prods = Product.objects.filter(asin__in=recommended_asins)
    return recommended_prods

#    image_train = graphlab.SFrame(data=df)
#    cur_prod = image_train[18:19]
#    print cur_prod
#    print image_train
#    knn_model = graphlab.nearest_neighbors.create(image_train, features = ['features'],label = 'asin',distance = 'levenshtein',method = 'ball_tree')
#    knn_model.save('my_knn')
#    #knn_model= graphlab.load_model('my_knn')
#    #print knn_model.query(cur_prod)
#    #knn_model = graphlab.nearest_neighbors.create(image_train, features = ['features'],label = 'keywords')
Example #4
def similar_products(product):
    qs = Product.objects.all()
    df = read_frame(qs)
    df['idx'] = range(1, len(df) + 1)
    feature_list = []
    asin_list = []
    product_index = 0
    inn = 0
    for prod in qs:
        feature_list.append(prod.get_features())
        asin_list.append(prod.asin)
        if prod.asin == product.asin:
            product_index = inn
        inn += 1

    nparray = np.asarray(feature_list)
    # print(nparray)
    tree = BallTree(nparray)
    # query expects a 2D array; the nearest hit is the product itself, so skip it
    dist, ind = tree.query(nparray[product_index].reshape(1, -1), k=5)
    print(ind)
    recom = ind[0][1:]
    recommended_asins = []

    for i in recom:
        recommended_asins.append(asin_list[i])
    recommended_prods = Product.objects.filter(asin__in=recommended_asins)
    return recommended_prods
Example #5
    def check_neighbors(dualtree, breadth_first, k, metric, kwargs):
        bt = BallTree(X, leaf_size=1, metric=metric, **kwargs)
        dist1, ind1 = bt.query(Y, k, dualtree=dualtree, breadth_first=breadth_first)
        dist2, ind2 = brute_force_neighbors(X, Y, k, metric, **kwargs)

        # don't check indices here: if there are any duplicate distances,
        # the indices may not match.  Distances should not have this problem.
        assert_array_almost_equal(dist1, dist2)
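Several of these tests call a brute_force_neighbors helper that is not shown in this section. A minimal sketch of such a helper, assuming sklearn's DistanceMetric, and not the actual scikit-learn test utility:

import numpy as np
from sklearn.neighbors import DistanceMetric  # moved to sklearn.metrics in newer releases

def brute_force_neighbors(X, Y, k, metric, **kwargs):
    # all pairwise distances from each query row in Y to the data X,
    # then keep the k smallest per row
    D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X)
    ind = np.argsort(D, axis=1)[:, :k]
    dist = np.sort(D, axis=1)[:, :k]
    return dist, ind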
Example #6
def test_query_haversine():
    rng = check_random_state(0)
    X = 2 * np.pi * rng.random_sample((40, 2))
    bt = BallTree(X, leaf_size=1, metric='haversine')
    dist1, ind1 = bt.query(X, k=5)
    dist2, ind2 = brute_force_neighbors(X, X, k=5, metric='haversine')

    assert_array_almost_equal(dist1, dist2)
    assert_array_almost_equal(ind1, ind2)
Example #7
def test_query_haversine():
    np.random.seed(0)
    X = 2 * np.pi * np.random.random((40, 2))
    bt = BallTree(X, leaf_size=1, metric='haversine')
    dist1, ind1 = bt.query(X, k=5)
    dist2, ind2 = brute_force_neighbors(X, X, k=5, metric='haversine')

    assert_array_almost_equal(dist1, dist2)
    assert_array_almost_equal(ind1, ind2)
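Both haversine tests pass coordinates that are already in radians. As a usage sketch (the city coordinates are illustrative, not from the original): the haversine metric operates on (lat, lon) pairs in radians and returns unit-sphere distances, so multiply by the Earth's radius for kilometres.

import numpy as np
from sklearn.neighbors import BallTree

coords_deg = np.array([[52.5200, 13.4050],   # Berlin (lat, lon)
                       [48.8566, 2.3522],    # Paris
                       [51.5074, -0.1278]])  # London
tree = BallTree(np.deg2rad(coords_deg), metric='haversine')
dist, ind = tree.query(np.deg2rad(coords_deg[:1]), k=2)
print(dist * 6371.0)  # nearest-neighbour distances in km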
Example #8
def test_ball_tree_kde(kernel, h, rtol, atol, breadth_first, n_samples=100,
                       n_features=3):
    rng = np.random.RandomState(0)
    X = rng.random_sample((n_samples, n_features))
    Y = rng.random_sample((n_samples, n_features))
    bt = BallTree(X, leaf_size=10)

    dens_true = compute_kernel_slow(Y, X, kernel, h)

    dens = bt.kernel_density(Y, h, atol=atol, rtol=rtol,
                             kernel=kernel,
                             breadth_first=breadth_first)
    assert_allclose(dens, dens_true,
                    atol=atol, rtol=max(rtol, 1e-7))
Example #9
def test_gaussian_kde(n_samples=1000):
    # Compare gaussian KDE results to scipy.stats.gaussian_kde
    from scipy.stats import gaussian_kde
    rng = check_random_state(0)
    x_in = rng.normal(0, 1, n_samples)
    x_out = np.linspace(-5, 5, 30)

    for h in [0.01, 0.1, 1]:
        bt = BallTree(x_in[:, None])
        gkde = gaussian_kde(x_in, bw_method=h / np.std(x_in))

        dens_bt = bt.kernel_density(x_out[:, None], h) / n_samples
        dens_gkde = gkde.evaluate(x_out)

        assert_array_almost_equal(dens_bt, dens_gkde, decimal=3)
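A note on the conversion above (inferred from the two APIs, not stated in the original): BallTree.kernel_density returns the sum of kernel contributions over the training points, hence the division by n_samples, while scipy's gaussian_kde is already normalized and interprets bw_method as a factor relative to the sample standard deviation, hence h / np.std(x_in). A minimal sketch:

import numpy as np
from sklearn.neighbors import BallTree

x = np.random.RandomState(0).normal(0, 1, 500)
bt = BallTree(x[:, None])
dens = bt.kernel_density(np.array([[0.0]]), h=0.3) / len(x)  # normalized density estimate at 0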
Example #10
def test_ball_tree_query_metrics(metric):
    rng = check_random_state(0)
    if metric in BOOLEAN_METRICS:
        X = rng.random_sample((40, 10)).round(0)
        Y = rng.random_sample((10, 10)).round(0)
    elif metric in DISCRETE_METRICS:
        X = (4 * rng.random_sample((40, 10))).round(0)
        Y = (4 * rng.random_sample((10, 10))).round(0)

    k = 5

    bt = BallTree(X, leaf_size=1, metric=metric)
    dist1, ind1 = bt.query(Y, k)
    dist2, ind2 = brute_force_neighbors(X, Y, k, metric)
    assert_array_almost_equal(dist1, dist2)
Example #11
def test_ball_tree_query(metric, k, dualtree, breadth_first):
    rng = check_random_state(0)
    X = rng.random_sample((40, DIMENSION))
    Y = rng.random_sample((10, DIMENSION))

    kwargs = METRICS[metric]

    bt = BallTree(X, leaf_size=1, metric=metric, **kwargs)
    dist1, ind1 = bt.query(Y, k, dualtree=dualtree,
                           breadth_first=breadth_first)
    dist2, ind2 = brute_force_neighbors(X, Y, k, metric, **kwargs)

    # don't check indices here: if there are any duplicate distances,
    # the indices may not match.  Distances should not have this problem.
    assert_array_almost_equal(dist1, dist2)
Example #12
def test_ball_tree_pickle():
    import pickle
    np.random.seed(0)
    X = np.random.random((10, 3))
    bt1 = BallTree(X, leaf_size=1)
    dist1, ind1 = bt1.query(X)  # query returns (distances, indices)

    def check_pickle_protocol(protocol):
        s = pickle.dumps(bt1, protocol=protocol)
        bt2 = pickle.loads(s)
        dist2, ind2 = bt2.query(X)
        assert_allclose(ind1, ind2)
        assert_allclose(dist1, dist2)

    for protocol in (0, 1, 2):
        yield check_pickle_protocol, protocol
Example #13
def test_ball_tree_query_radius(n_samples=100, n_features=10):
    np.random.seed(0)
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    bt = BallTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        ind = bt.query_radius([query_pt], r + eps)[0]  # query_radius expects a 2D array
        i = np.where(rad <= r + eps)[0]

        ind.sort()
        i.sort()

        assert_array_almost_equal(i, ind)
Example #14
def test_ball_tree_query_radius_distance(n_samples=100, n_features=10):
    np.random.seed(0)
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    bt = BallTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        ind, dist = bt.query_radius([query_pt], r + eps, return_distance=True)

        ind = ind[0]
        dist = dist[0]

        d = np.sqrt(((query_pt - X[ind]) ** 2).sum(1))

        assert_array_almost_equal(d, dist)
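Beyond return_distance, query_radius can also return plain counts or distance-sorted results. A brief sketch (assumed usage, not from the original tests):

import numpy as np
from sklearn.neighbors import BallTree

rng = np.random.RandomState(0)
X = rng.random_sample((100, 3))
tree = BallTree(X)
counts = tree.query_radius(X[:1], r=0.3, count_only=True)
ind, dist = tree.query_radius(X[:1], r=0.3, return_distance=True,
                              sort_results=True)  # neighbours ordered by distance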
Example #15
def test_gaussian_kde(n_samples=1000):
    """Compare gaussian KDE results to scipy.stats.gaussian_kde"""
    from scipy.stats import gaussian_kde
    np.random.seed(0)
    x_in = np.random.normal(0, 1, n_samples)
    x_out = np.linspace(-5, 5, 30)

    for h in [0.01, 0.1, 1]:
        bt = BallTree(x_in[:, None])
        try:
            gkde = gaussian_kde(x_in, bw_method=h / np.std(x_in))
        except TypeError:
            raise SkipTest("Old version of scipy, doesn't accept explicit bandwidth.")

        dens_bt = bt.kernel_density(x_out[:, None], h) / n_samples
        dens_gkde = gkde.evaluate(x_out)

        assert_array_almost_equal(dens_bt, dens_gkde, decimal=3)
Example #16
def test_gaussian_kde(n_samples=1000):
    """Compare gaussian KDE results to scipy.stats.gaussian_kde"""
    from scipy.stats import gaussian_kde
    np.random.seed(0)
    x_in = np.random.normal(0, 1, n_samples)
    x_out = np.linspace(-5, 5, 30)

    for h in [0.01, 0.1, 1]:
        bt = BallTree(x_in[:, None])
        try:
            gkde = gaussian_kde(x_in, bw_method=h / np.std(x_in))
        except TypeError:
            # older versions of scipy don't accept explicit bandwidth
            raise SkipTest

        dens_bt = bt.kernel_density(x_out[:, None], h) / n_samples
        dens_gkde = gkde.evaluate(x_out)

        assert_allclose(dens_bt, dens_gkde, rtol=1E-3, atol=1E-3)
Example #17
def _nonlocalmeans_clustered(img, n_small=5, n_components=9, n_neighbors=30, h=10):

    Nw = (2 * n_small + 1) ** 2
    h2 = h * h
    n_rows, n_cols = img.shape

    # precompute the coordinate difference for the big patch
    small_rows, small_cols = np.indices(((2 * n_small + 1), (2 * n_small + 1))) - n_small

    # put all patches so we can cluster them
    n_padded = np.pad(img, n_small, mode='reflect')
    patches = np.zeros((n_rows * n_cols, Nw))

    n = 0
    for r in range(n_small, n_small + n_rows):
        for c in range(n_small, n_small + n_cols):
            window = n_padded[r + small_rows, c + small_cols].flatten()
            patches[n, :] = window
            n += 1

    transformed = PCA(n_components=n_components).fit_transform(patches)
    # index the patches into a tree
    tree = BallTree(transformed, leaf_size=2)

    print("Denoising")
    new_img = np.zeros_like(img)
    for r in range(n_rows):
        for c in range(n_cols):
            idx = r * n_cols + c
            # query expects a 2D array: pass a one-row slice
            dist, ind = tree.query(transformed[idx:idx + 1], k=n_neighbors)
            ridx = np.array([(int(i / n_cols), int(i % n_cols)) for i in ind[0, 1:]])
            colors = img[ridx[:, 0], ridx[:, 1]]
            w = np.exp(-dist[0, 1:] / h2)
            new_img[r, c] = np.sum(w * colors) / np.sum(w)

    return new_img
Example #18
def compute_distances():
    # Load IXP-GST positions
    altitude = 1150
    min_elev = 40
    orbits = 32
    sat_per_orbit = 50
    inclination = 53
    gst_file = "data/raw/ixp_geolocation.csv"
    src_file = "data/raw/WUP2018-F22-Cities_Over_300K_Annual.csv"

    # Load geo information
    sat_pos, gst_pos, src_pos = load_locations(altitude,
                                               orbits,
                                               sat_per_orbit,
                                               inclination,
                                               gst_file,
                                               src_file,
                                               time=15000)

    lon_sort_idx_src = np.argsort(src_pos[:, 1])
    src_pos = (src_pos[lon_sort_idx_src])

    # Remove SRCs that are too high in latitude
    higher = np.where(src_pos[:, 0] > 56)[0]
    src_pos = np.delete(src_pos, higher, axis=0)

    lon_sort_idx_gst = np.argsort(gst_pos[:, 1])
    gst_pos = (gst_pos[lon_sort_idx_gst])

    # %%
    sat_sat_dist = compute_sat_sat_distance(sat_pos, altitude, orbits,
                                            sat_per_orbit)
    # Compute the BallTree for the satellites. This gives nn to satellites.
    sat_tree = BallTree(np.deg2rad(sat_pos),
                        metric=DistanceMetric.get_metric("haversine"))

    # Get the satellites that are in reach for the ground stations
    #   and their distance.
    sat_gst_ind_city, sat_gst_dist_city = compute_gst_sat_distance(
        altitude, min_elev, src_pos, sat_tree)

    src_src_satellite = gsts_optimization(sat_gst_ind_city,
                                          sat_gst_dist_city,
                                          sat_sat_dist,
                                          n_gsts=src_pos.shape[0])

    src_src_latency = src_src_satellite / LIGHT_IN_VACUUM

    # %%
    sat_gst_ind_ixp, sat_gst_dist_ixp = compute_gst_sat_distance(
        altitude, min_elev, gst_pos, sat_tree)

    gst_gst_satellite = gsts_optimization(sat_gst_ind_ixp,
                                          sat_gst_dist_ixp,
                                          sat_sat_dist,
                                          n_gsts=gst_pos.shape[0])

    src_gst_ind, src_gst_dist = src_nearest_gst_distance(src_pos, gst_pos)

    n_src = src_pos.shape[0]
    src_gst_latency = compute_src_dst_latency(n_src, [], src_gst_ind,
                                              src_gst_dist, [], [],
                                              gst_gst_satellite)

    return src_gst_latency, src_src_latency, src_pos
Example #19
batch_size = 512
X = np.random.random(size=(n_points, d)).astype(np.float32)

res = faiss.StandardGpuResources()
flat_config = faiss.GpuIndexFlatConfig()
flat_config.device = 0
index = faiss.GpuIndexFlatL2(res, d, flat_config)
index.add(X)

for bi in range(3, 10):
    for ki in range(3, 10):
        t = time.time()
        D, I = index.search(X[0:2**bi, :], 2**ki)
        print(2**bi, 2**ki, int((time.time() - t) * 1000))

t = time.time()
cpu_index = BallTree(X)
print("BallTree build time (mins)", int((time.time()-t)/60))

#t = time.time()
#D, I = cpu_index.query(X[0:batch_size,:], k)
#print int((time.time()-t)*1000)

for bi in range(3, 10):
    for ki in range(3, 10):
        t = time.time()
        D, I = cpu_index.query(X[0:2**bi, :], 2**ki)
        print(2**bi, 2**ki, int((time.time() - t) * 1000))


Example #20
    #print(WandV['pressure'])
    #X = np.array((WandV.values()))
    #print(len(WandV.values()))
    #print(WandV.values())
    import pandas as pd
    # DataFrame.append is deprecated in pandas >= 1.4; build the frame in one go
    df = pd.DataFrame([pd.Series(i) for i in WandV.values()])
    #print("temp head", df.head())
    #print("temp shape", df.shape)

    from sklearn.neighbors import BallTree  # sklearn.neighbors.ball_tree is a deprecated private module
    print("KNN ...........")
    tree = BallTree(df, leaf_size=2)
    print("finding neighbor words .....")
    dist, ind = tree.query(df[:1], k=3)
    print(ind)   # indices of the 3 closest neighbors
    print(dist)  # distances to the 3 closest neighbors

    v1 = df.iloc[0, :]
    v2 = df.iloc[363, :]
    v3 = df.iloc[3774, :]

    V1 = np.array(v1)
    V2 = np.array(v2)
    V3 = np.array(v3)
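BallTree has no cosine metric (cosine distance is not a true metric), which is often what one wants for word vectors. A common workaround, sketched here as an assumption rather than part of the original code, is to L2-normalize the rows so that Euclidean nearest neighbours give the same ranking as cosine similarity:

import numpy as np
from sklearn.neighbors import BallTree

vecs = df.to_numpy(dtype=float)                      # df as built above
vecs /= np.linalg.norm(vecs, axis=1, keepdims=True)  # unit-length rows
tree_cos = BallTree(vecs, leaf_size=2)               # Euclidean ranking == cosine ranking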
Example #21
def get_ball_tree_index(X):
    return BallTree(X)
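A usage sketch for the wrapper above (data and sizes are illustrative):

import numpy as np

X = np.random.RandomState(0).random_sample((1000, 8))
index = get_ball_tree_index(X)
dist, ind = index.query(X[:5], k=3)  # 3 nearest neighbours of the first 5 rows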
Example #22
    def __init__(
        self,
        reciprocal_lattice: Lattice,
        original_points: np.ndarray,
        original_dim: np.ndarray,
        extra_points: np.ndarray,
        ir_to_full_idx: Optional[np.ndarray] = None,
        extra_ir_points_idx: Optional[np.ndarray] = None,
        nworkers: int = pdefaults["nworkers"],
    ):
        """

        Add a warning about only using the symmetry options if you are sure your
        extra k-points have been symmetrized

        Args:
            original_points:
            nworkers:
        """
        self._nworkers = nworkers if nworkers != -1 else cpu_count()
        self._final_points = np.concatenate([original_points, extra_points])
        self._reciprocal_lattice = reciprocal_lattice

        if ir_to_full_idx is None:
            ir_to_full_idx = np.arange(
                len(original_points) + len(extra_points))

        if extra_ir_points_idx is None:
            extra_ir_points_idx = np.arange(len(extra_points))

        logger.debug("Initializing periodic Voronoi calculator")
        all_points = np.concatenate((original_points, extra_points))

        logger.debug("  ├── getting supercell k-points")
        supercell_points = get_supercell_points(all_points)
        supercell_idxs = np.arange(supercell_points.shape[0])

        # filter points far from the zone boundary; this will lead to errors
        # for very small meshes (< 5x5x5) but we are not interested in those
        mask = ((supercell_points > -0.75) &
                (supercell_points < 0.75)).all(axis=1)
        supercell_points = supercell_points[mask]
        supercell_idxs = supercell_idxs[mask]

        # want points in cartesian space so we can define a regular spherical
        # cutoff even if reciprocal lattice is not cubic. If we used a
        # fractional cutoff, the cutoff regions would not be spherical
        logger.debug("  ├── getting cartesian points")
        cart_points = reciprocal_lattice.get_cartesian_coords(supercell_points)
        cart_extra_points = reciprocal_lattice.get_cartesian_coords(
            extra_points[extra_ir_points_idx])

        # the small cutoff is slightly larger than the max regular grid
        # spacing, so at least one neighbour point will always be included in
        # each direction; we need the cartesian length that covers the
        # longest direction of the mesh
        spacing = 1 / original_dim
        body_diagonal = reciprocal_lattice.get_cartesian_coords(spacing)
        xy = reciprocal_lattice.get_cartesian_coords(
            [spacing[0], spacing[1], 0])
        xz = reciprocal_lattice.get_cartesian_coords(
            [spacing[0], 0, spacing[2]])
        yz = reciprocal_lattice.get_cartesian_coords(
            [0, spacing[1], spacing[2]])

        len_diagonal = np.linalg.norm(body_diagonal)
        len_xy = np.linalg.norm(xy)
        len_xz = np.linalg.norm(xz)
        len_yz = np.linalg.norm(yz)

        small_cutoff = (np.max([len_diagonal, len_xy, len_xz, len_yz]) * 1.6)
        big_cutoff = (small_cutoff * 1.77)

        logger.debug("  ├── initializing ball tree")

        # use BallTree for quickly evaluating which points are within cutoffs
        tree = BallTree(cart_points)

        n_supercell_points = len(supercell_points)

        # big points are those which surround the extra points within the big cutoff
        # (including the extra points themselves)
        logger.debug("  ├── calculating points in big radius")
        big_points_idx = _query_radius_iteratively(tree, n_supercell_points,
                                                   cart_extra_points,
                                                   big_cutoff)

        # Voronoi points are those we actually include in the Voronoi diagram
        self._voronoi_points = cart_points[big_points_idx]

        # small points are the points in all_points (i.e., original + extra points) for
        # which we want to calculate the Voronoi volumes. Outside the small cutoff, the
        # weights will just be the regular grid weight.
        logger.debug("  └── calculating points in small radius")
        small_points_idx = _query_radius_iteratively(tree, n_supercell_points,
                                                     cart_extra_points,
                                                     small_cutoff)

        # get the irreducible small points
        small_points_in_all_points = supercell_idxs[small_points_idx] % len(
            all_points)
        mapping = ir_to_full_idx[small_points_in_all_points]
        unique_mappings, ir_idx = np.unique(mapping, return_index=True)
        small_points_idx = small_points_idx[ir_idx]

        # get a mapping to go from the ir small points to the full BZ.
        groups = groupby(np.arange(len(all_points)), ir_to_full_idx)
        grouped_ir = groups[unique_mappings]
        counts = [len(g) for g in grouped_ir]
        self._expand_ir = np.repeat(np.arange(len(ir_idx)), counts)

        # get the indices of the expanded ir_small_points in all_points
        self._volume_in_final_idx = np.concatenate(grouped_ir)

        # get the indices of ir_small_points_idx (i.e., the points for which we will
        # calculate the volume) in voronoi_points
        self._volume_points_idx = _get_loc(big_points_idx, small_points_idx)

        # Prepopulate the final volumes array. By default, each point has the
        # volume of the original mesh. Note: at this point, the extra points
        # will have zero volume. This array will be updated by
        # compute_volumes.
        self._volume = reciprocal_lattice.volume
        self._final_volumes = np.full(len(all_points),
                                      1 / len(original_points))
        self._final_volumes[len(original_points):] = 0
        self._final_volumes[self._volume_in_final_idx] = 0
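The _query_radius_iteratively helper used above is not shown. A hypothetical sketch consistent with how it is called here (tree, total point count, query centers, cutoff), querying in chunks to bound memory and returning the unique matching indices:

import numpy as np

def _query_radius_iteratively(tree, n_points, centers, radius, chunk_size=1000):
    # mark every tree point that falls within `radius` of any center
    found = np.zeros(n_points, dtype=bool)
    for start in range(0, len(centers), chunk_size):
        for idx in tree.query_radius(centers[start:start + chunk_size], radius):
            found[idx] = True
    return np.where(found)[0]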
Example #23
    def calculate_band_rates(self,
                             spin: Spin,
                             b_idx: int,
                             nsplits: int):
        integral_conversion = (
                (2 * np.pi) ** 3
                / (self.amset_data.structure.lattice.volume * A_to_nm ** 3)
                / self.gauss_width)

        # prefactors have shape [nscatterers, ndoping, ntemp)
        elastic_prefactors = integral_conversion * np.array(
            [m.prefactor(spin, b_idx) for m in self.elastic_scatterers])
        inelastic_prefactors = integral_conversion * np.array(
            [m.prefactor(spin, b_idx) for m in self.inelastic_scatterers])

        if self.use_symmetry:
            kpoints_idx = self.amset_data.ir_kpoints_idx
        else:
            kpoints_idx = np.arange(len(self.amset_data.full_kpoints))

        nkpoints = len(kpoints_idx)

        band_energies = self.amset_data.energies[spin][b_idx, kpoints_idx]

        # filter points far from the band edge where Fermi integrals are very small
        ball_band_energies = copy.deepcopy(band_energies)
        mask = (band_energies < self.scattering_energy_cutoffs[0]) | (
                band_energies > self.scattering_energy_cutoffs[1])

        # set the energies out of range to infinite so that they will not be
        # included in the scattering rate calculations
        ball_band_energies[mask] = float("inf")

        ball_tree = BallTree(ball_band_energies[:, None], leaf_size=100)
        g = np.ones(self.amset_data.fermi_levels.shape +
                    (len(self.amset_data.energies[spin][b_idx]), )) * 1e-9

        s_g, g = create_shared_array(g, return_buffer=True)

        s_energies = create_shared_array(band_energies)
        s_kpoints = create_shared_array(self.amset_data.full_kpoints)
        s_k_norms = create_shared_array(self.amset_data.kpoint_norms)
        s_k_weights = create_shared_array(self.amset_data.kpoint_weights)
        s_a_factor = create_shared_array(
            self.amset_data.a_factor[spin][b_idx, kpoints_idx])
        s_c_factor = create_shared_array(
            self.amset_data.c_factor[spin][b_idx, kpoints_idx])

        rlat = self.amset_data.structure.lattice.reciprocal_lattice.matrix

        # spawn as many worker processes as needed, put all bands in the queue,
        # and let them work until all the required rates have been computed.
        workers = []
        iqueue = Queue()
        oqueue = Queue()

        for i in range(self.nworkers):
            args = (self.scatterers, ball_tree, spin, b_idx,
                    self.gauss_width * units.eV,
                    s_g, s_energies, s_kpoints, s_k_norms, s_k_weights,
                    s_a_factor, s_c_factor, len(band_energies), rlat, iqueue,
                    oqueue)
            if self.use_symmetry:
                kwargs = {
                    "grouped_ir_to_full": self.amset_data.grouped_ir_to_full,
                    "ir_kpoints_idx": self.amset_data.ir_kpoints_idx}

                workers.append(Process(target=scattering_worker, args=args,
                                       kwargs=kwargs))
            else:
                workers.append(Process(target=scattering_worker, args=args))

        slices = list(gen_even_slices(nkpoints, nsplits))

        for w in workers:
            w.start()

        elastic_rates = None
        if self.elastic_scatterers:
            elastic_rates = self._fill_workers(
                nkpoints, slices, iqueue, oqueue, desc="elastic",
                scattering_mask=mask)
            elastic_rates *= elastic_prefactors[..., None]

        if self.inelastic_scatterers:
            # currently only supports one inelastic scattering energy difference
            # convert frequency to THz and get energy in Rydberg
            energy_diff = (self.materials_properties["pop_frequency"] * 1e12
                           * 2 * np.pi * hbar * units.eV)

            n_inelastic = len(self.inelastic_scatterers)
            shape = (n_inelastic, len(self.amset_data.doping),
                     len(self.amset_data.temperatures), nkpoints)
            in_rates = np.zeros(shape)
            out_rates = np.zeros(shape)

            # in 1/s
            # force = (self.amset_data.dfdk[spin][:, :, b_idx] *
            #          default_small_e / hbar)
            force = np.zeros(self.amset_data.dfde[spin][:, :, b_idx].shape)

            # if max_iter == 1 then RTA, don't calculate in rates
            calculate_in_rate = self.max_g_iter != 1
            calculate_out_rate = True

            for _ in range(self.max_g_iter):

                # rates are formatted as s1_i, s2_i, ..., s1_o, s2_o, etc.
                inelastic_rates = self._fill_workers(
                    nkpoints, slices, iqueue, oqueue, energy_diff=energy_diff,
                    desc="inelastic", calculate_out_rate=calculate_out_rate,
                    calculate_in_rate=calculate_in_rate, scattering_mask=mask)

                if calculate_in_rate:
                    # in rates always returned first
                    in_rates = inelastic_rates[:n_inelastic]
                    in_rates *= inelastic_prefactors[..., None]

                if calculate_out_rate:
                    # out rate is independent of g so it only needs to be calculated once
                    idx = n_inelastic if calculate_in_rate else 0
                    out_rates = inelastic_rates[idx:]
                    out_rates *= inelastic_prefactors[..., None]
                    calculate_out_rate = False

                if self.max_g_iter != 1:
                    new_g = calculate_g(
                        out_rates, in_rates, elastic_rates, force)
                    g_diff = np.abs(np.average(new_g-g))
                    logger.debug("  ├── difference in g value: {:.2g}".format(
                        g_diff))

                    if g_diff < self.g_tol:
                        break

                    # update the shared buffer
                    g[:] = new_g[:]

            to_stack = [elastic_rates] if elastic_rates is not None else []
            if calculate_in_rate:
                to_stack.append(in_rates)
            to_stack.append(out_rates)
            all_band_rates = np.vstack(to_stack)

        else:
            all_band_rates = elastic_rates

        # The "None"s at the end of the queue signal the workers that there are
        # no more jobs left and they must therefore exit.
        for i in range(self.nworkers):
            iqueue.put(None)

        for w in workers:
            w.join()
            w.terminate()

        return all_band_rates
Example #24
def preprocess_table(input_file_path, output_file_path):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    encoders = {}
    logger = logging.getLogger(__name__)

    df_full_train, output_filepath_df_train, output_filepath_misc_train = read_table(
        input_file_path, logger, output_file_path, suffix="Train")

    df_full_test, output_filepath_df_test, output_filepath_misc_test = read_table(
        input_file_path, logger, output_file_path, suffix="Test")

    df_full_val, output_filepath_df_val, output_filepath_misc_val = read_table(
        input_file_path, logger, output_file_path, suffix="Validation")

    # Convert categoricals to strings before encoding
    for cat in CAT_COLUMNS:
        logger.info(f"to category: {cat}")
        df_full_train[cat] = df_full_train[cat].astype(str)
        df_full_test[cat] = df_full_test[cat].astype(str)
        df_full_val[cat] = df_full_val[cat].astype(str)

    CALC_COUNT_COLUMNS = []
    df_to_fit_le = pd.concat([df_full_train, df_full_val],
                             axis=0)[df_full_test.columns]

    # Label encode categoricals
    label_encoder = ce.CountEncoder(return_df=True,
                                    cols=CAT_COLUMNS,
                                    verbose=1,
                                    normalize=True)
    count_encoder = ce.CountEncoder(return_df=True,
                                    cols=COUNT_COLUMNS + CALC_COUNT_COLUMNS,
                                    verbose=1,
                                    normalize=True)

    # Encode train and test with LE
    label_encoder.fit(df_to_fit_le)
    df_full_train[df_full_test.columns] = label_encoder.transform(
        df_full_train[df_full_test.columns])
    df_full_test = label_encoder.transform(df_full_test)
    df_full_val[df_full_test.columns] = label_encoder.transform(
        df_full_val[df_full_test.columns])
    # Encode train and test with CE
    count_encoder.fit(df_to_fit_le)
    df_full_train[df_full_test.columns] = count_encoder.transform(
        df_full_train[df_full_test.columns])
    df_full_test = count_encoder.transform(df_full_test)
    df_full_val[df_full_test.columns] = count_encoder.transform(
        df_full_val[df_full_test.columns])
    # Encode aggregate statistics using BallTree:
    X = pd.concat(
        [df_full_train[['lat', 'long']], df_full_val[['lat', 'long']]],
        axis=0).values
    # Build a tree:
    tree = BallTree(X)
    # Calculate aggregate statistics using tree:
    X_to_get_data = pd.concat([df_full_train, df_full_val], axis=0)
    #
    # df_full_train = calculate_agg_statistics(tree,X_to_get_data,df_full_train)
    # df_full_val = calculate_agg_statistics(tree, X_to_get_data, df_full_val)
    # df_full_test = calculate_agg_statistics(tree, X_to_get_data, df_full_test)

    #
    print(df_full_train.shape)
    print(df_full_test.shape)
    print(df_full_val.shape)
    # Encode test:

    misc = {}
    misc["encoder_dict"] = encoders
    # profile = feature_df.profile_report(title=f'Pandas Profiling Report for {suffix}')
    # profile.to_file(output_file=os.path.join(project_dir, f"output_{suffix}.html"))

    df_full_train.to_pickle(output_filepath_df_train)
    df_full_test.to_pickle(output_filepath_df_test)
    df_full_val.to_pickle(output_filepath_df_val)

    with open(output_filepath_misc_train, "wb") as f:
        pickle.dump(misc, f)

    return 0
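One caveat on the tree above (an observation, not part of the original pipeline): BallTree with the default metric measures Euclidean distance in raw degrees, which distorts east-west distances away from the equator. For geographic neighbours, a haversine tree over radians is usually preferable:

import numpy as np
from sklearn.neighbors import BallTree

# X holds the [lat, long] columns exactly as built above
geo_tree = BallTree(np.deg2rad(X), metric='haversine')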
Example #25
def check_neighbors(metric):
    bt = BallTree(X, leaf_size=1, metric=metric)
    dist1, ind1 = bt.query(Y, k)
    dist2, ind2 = brute_force_neighbors(X, Y, k, metric)
    assert_array_almost_equal(dist1, dist2)
Example #26
    def _fit(self, X):
        self._check_algorithm_metric()
        self._check_hubness_algorithm()
        self._check_algorithm_hubness_compatibility()
        if self.metric_params is None:
            self.effective_metric_params_ = {}
        else:
            self.effective_metric_params_ = self.metric_params.copy()

        effective_p = self.effective_metric_params_.get('p', self.p)
        if self.metric in ['wminkowski', 'minkowski']:
            self.effective_metric_params_['p'] = effective_p

        self.effective_metric_ = self.metric
        # For minkowski distance, use more efficient methods where available
        if self.metric == 'minkowski':
            p = self.effective_metric_params_.pop('p', 2)
            if p <= 0:
                raise ValueError(f"p must be greater than one for minkowski metric, "
                                 f"or in ]0, 1[ for fractional norms.")
            elif p == 1:
                self.effective_metric_ = 'manhattan'
            elif p == 2:
                self.effective_metric_ = 'euclidean'
            elif p == np.inf:
                self.effective_metric_ = 'chebyshev'
            else:
                self.effective_metric_params_['p'] = p

        if isinstance(X, NeighborsBase):
            self._fit_X = X._fit_X
            self._tree = X._tree
            self._fit_method = X._fit_method
            self._index = X._index
            self._hubness_reduction = X._hubness_reduction
            return self

        elif isinstance(X, BallTree):
            self._fit_X = X.data
            self._tree = X
            self._fit_method = 'ball_tree'
            return self

        elif isinstance(X, KDTree):
            self._fit_X = X.data
            self._tree = X
            self._fit_method = 'kd_tree'
            return self

        elif isinstance(X, ApproximateNearestNeighbor):
            self._tree = None
            if isinstance(X, PuffinnLSH):
                self._fit_X = np.array([X.index_.get(i) for i in range(X.n_indexed_)]) * X.X_indexed_norm_
                self._fit_method = 'lsh'
            elif isinstance(X, FalconnLSH):
                self._fit_X = X.X_train_
                self._fit_method = 'falconn_lsh'
            elif isinstance(X, NNG):
                self._fit_X = None
                self._fit_method = 'nng'
            elif isinstance(X, HNSW):
                self._fit_X = None
                self._fit_method = 'hnsw'
            elif isinstance(X, RandomProjectionTree):
                self._fit_X = None
                self._fit_method = 'rptree'
            self._index = X
            # TODO enable hubness reduction here.
            # We do not store X_train in all cases atm.
            # self._hubness_reduction_method = self.hubness
            # self._set_hubness_reduction(self._fit_X)
            return self

        X = check_array(X, accept_sparse='csr')

        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError(f"n_samples must be greater than 0 (but was {n_samples}.")

        if issparse(X):
            if self.algorithm not in ('auto', 'brute'):
                warnings.warn("cannot use tree with sparse input: "
                              "using brute force")
            if self.effective_metric_ not in VALID_METRICS_SPARSE['brute'] \
                    and not callable(self.effective_metric_):
                raise ValueError(f"Metric '{self.effective_metric_}' not valid for sparse input. "
                                 f"Use sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute']) "
                                 f"to get valid options. Metric can also be a callable function.")
            self._fit_X = X.copy()
            self._tree = None
            self._fit_method = 'brute'
            if self.hubness is not None:
                warnings.warn(f'cannot use hubness reduction with sparse data: disabling hubness reduction.')
                self.hubness = None
            self._hubness_reduction_method = None
            self._hubness_reduction = NoHubnessReduction()
            return self

        self._fit_method = self.algorithm
        self._fit_X = X
        self._hubness_reduction_method = self.hubness

        if self._fit_method == 'auto':
            # A tree approach is better for small number of neighbors,
            # and KDTree is generally faster when available
            if ((self.n_neighbors is None or
                 self.n_neighbors < self._fit_X.shape[0] // 2) and
                    self.metric != 'precomputed'):
                if self.effective_metric_ in VALID_METRICS['kd_tree']:
                    self._fit_method = 'kd_tree'
                elif (callable(self.effective_metric_) or
                      self.effective_metric_ in VALID_METRICS['ball_tree']):
                    self._fit_method = 'ball_tree'
                else:
                    self._fit_method = 'brute'
            else:
                self._fit_method = 'brute'
            self._index = None

        if self._fit_method == 'ball_tree':
            self._tree = BallTree(X, self.leaf_size,
                                  metric=self.effective_metric_,
                                  **self.effective_metric_params_)
            self._index = None
        elif self._fit_method == 'kd_tree':
            self._tree = KDTree(X, self.leaf_size,
                                metric=self.effective_metric_,
                                **self.effective_metric_params_)
            self._index = None
        elif self._fit_method == 'brute':
            self._tree = None
            self._index = None
        elif self._fit_method == 'lsh':
            self._index = PuffinnLSH(**self.algorithm_params)
            self._index.fit(X)
            self._tree = None
        elif self._fit_method == 'falconn_lsh':
            self._index = FalconnLSH(**self.algorithm_params)
            self._index.fit(X)
            self._tree = None
        elif self._fit_method == 'nng':
            self._index = NNG(**self.algorithm_params)
            self._index.fit(X)
            self._tree = None
        elif self._fit_method == 'hnsw':
            self._index = HNSW(**self.algorithm_params)
            self._index.fit(X)
            self._tree = None
        elif self._fit_method == 'rptree':
            self._index = RandomProjectionTree(**self.algorithm_params)
            self._index.fit(X)
            self._tree = None  # because it's a tree, but not an sklearn tree...
        else:
            raise ValueError(f"algorithm = '{self.algorithm}' not recognized")

        # Fit hubness reduction method
        self._set_hubness_reduction(X)

        if self.n_neighbors is not None:
            if self.n_neighbors <= 0:
                raise ValueError(f"Expected n_neighbors > 0. Got {self.n_neighbors:d}")
            else:
                if not np.issubdtype(type(self.n_neighbors), np.integer):
                    raise TypeError(
                        f"n_neighbors does not take {type(self.n_neighbors)} value, "
                        f"enter integer value"
                        )

        return self
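As the isinstance(X, BallTree) branch above shows, a prebuilt tree can be passed to fit directly and is reused as the index. A sketch mirroring the scikit-learn NearestNeighbors API, which this class follows (assumed to behave the same way here):

import numpy as np
from sklearn.neighbors import BallTree, NearestNeighbors

X = np.random.RandomState(0).random_sample((50, 4))
nn = NearestNeighbors(algorithm='ball_tree').fit(BallTree(X))
dist, ind = nn.kneighbors(X[:3], n_neighbors=2)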
Example #27
    def __init__(self,
                 reciprocal_lattice: Lattice,
                 original_points: np.ndarray,
                 original_dim: np.ndarray,
                 extra_points: np.ndarray,
                 nworkers: int = pdefaults["nworkers"]):
        """

        Args:
            original_points:
            nworkers:
        """
        self._nworkers = nworkers if nworkers != -1 else cpu_count()

        supercell_points = get_supercell_points([2, 2, 2], original_points)

        # want points in cartesian space so we can define a regular spherical
        # cutoff even if reciprocal lattice is not cubic. If we used a
        # fractional cutoff, the cutoff regions would not be spherical
        cart_points = reciprocal_lattice.get_cartesian_coords(supercell_points)

        cart_extra_points = reciprocal_lattice.get_cartesian_coords(
            extra_points)

        # the small cutoff is slightly larger than the max regular grid
        # spacing, so at least one neighbour point will always be included in
        # each direction
        dim_lengths = np.dot(1 / original_dim, reciprocal_lattice.matrix)
        small_cutoff = np.max(dim_lengths) * 1.01
        big_cutoff = small_cutoff * 2

        # use BallTree for quickly evaluating which points are within cutoffs
        tree = BallTree(cart_points)

        # big cutoff points are those which surround the extra points within
        # the big cutoff (it does not include the extra points themselves)
        big_cutoff_points_idx = np.concatenate(
            tree.query_radius(cart_extra_points, big_cutoff), axis=0)

        # Voronoi points are those we actually calculate in the Voronoi diagram
        # e.g. the big points + extra points
        voronoi_points = supercell_points[big_cutoff_points_idx]
        self._voronoi_points = np.concatenate((voronoi_points, extra_points))

        # small points are the points in original_points for which we want to
        # calculate the Voronoi volumes. Note this does not include the
        # indices of the extra points. Outside the small cutoff, the weights
        # will just be the regular grid weight.
        small_cutoff_points_idx = np.concatenate(
            tree.query_radius(cart_extra_points, small_cutoff), axis=0)

        # get the indices of small_cutoff_points in voronoi_points
        small_in_voronoi_idx = _get_loc(big_cutoff_points_idx,
                                        small_cutoff_points_idx)

        # get the indices of the small cutoff points + extra points
        # in voronoi points that we want the volumes for. The extra points
        # were just added at the end of big_cutoff_points, so getting their
        # indices is simple
        self._volume_points_idx = np.concatenate(
            (small_in_voronoi_idx,
             np.arange(len(extra_points)) + len(big_cutoff_points_idx)))

        # get the indices of the small_cutoff_points (not including the extra
        # points) in the original mesh. this works because the supercell
        # points are in the same order as the original mesh, just repeated for
        # each cell in the supercell
        small_in_original_idx = (small_cutoff_points_idx %
                                 len(original_points))

        # get the indices of the small cutoff points + extra points in the
        # final volume array. Note that the final volume array has the same
        # order as original_mesh + extra_points
        self._volume_in_final_idx = np.concatenate(
            (small_in_original_idx,
             np.arange(len(extra_points)) + len(original_points)))

        # prepopulate the final volumes array. By default, each point has the
        # volume of the original mesh. Note: at this point, the extra points
        # will have zero volume. This array will be updated by
        # compute_volumes.
        self._final_volumes = np.full(
            len(original_points) + len(extra_points), 1 / len(original_points))
        self._final_volumes[len(original_points):] = 0