Example #1
def correct_dot_masks(masks,
                      gain_map,
                      excluded_pixels=None,
                      allow_empty=False):
    mask_shape = masks.shape
    sig_shape = gain_map.shape
    masks = masks.reshape((-1, np.prod(sig_shape)))

    if excluded_pixels is not None:
        if is_sparse(masks):
            result = sparse.DOK(masks)
        else:
            result = masks.copy()
        desc = RepairDescriptor(sig_shape,
                                excluded_pixels=excluded_pixels,
                                allow_empty=allow_empty)
        for e, r, c in zip(desc.exclude_flat, desc.repair_flat,
                           desc.repair_counts):
            result[:, e] = 0
            rep = masks[:, e] / c
            # We have to loop because of sparse.pydata limitations
            for m in range(result.shape[0]):
                for rr in r[:c]:
                    result[m, rr] = result[m, rr] + rep[m]
        if is_sparse(result):
            result = sparse.COO(result)
    else:
        result = masks
    result = result * gain_map.flatten()
    return result.reshape(mask_shape)
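
The inner loop above works around the limitation noted in the comment: sparse.DOK supports per-element assignment while sparse.COO does not. A minimal sketch of that build-in-DOK, convert-to-COO pattern (shape and values here are illustrative only, not taken from the function above):

import numpy as np
import sparse

# Build a small mask stack in DOK format, which allows per-element writes,
# then convert to COO for arithmetic.
dok = sparse.DOK((2, 4), dtype=np.float64)
dok[0, 1] = 1.0
dok[1, 3] = 0.5
coo = sparse.COO(dok)             # COO supports broadcasting arithmetic
scaled = coo * np.ones(4)         # e.g. multiplying by a flattened gain map
print(scaled.todense())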
Example #2
def build_trans_tables(retrieved, key, L):
    (dt, tSim, N, S, p, num_fact, p_fact, dzeta, a_pf, eps, f_russo,
     cm, a, U, w, tau_1, tau_2, tau_3_A, tau_3_B, g_A, beta, tau, t_0,
     g, random_seed, p_0, n_p, nSnap, russo2008_mode, muted_prop) = \
         file_handling.load_parameters(key)

    n_seeds = len(retrieved)
    num_tables = [[] for order in range(L + 1)]
    proba_tables = [[] for order in range(L + 1)]
    print('Table creation')
    for order in tqdm(range(L)):
        chain_length = order + 1
        num_tables[order] = sparse.DOK(
            shape=tuple([p for ii in range(chain_length)]))
    print('Fill num_tables')
    for kick_seed in tqdm(range(n_seeds)):
        for cue_ind in range(p):
            if isinstance(retrieved[kick_seed][cue_ind], list) \
               and len(retrieved[kick_seed][cue_ind]) >= 3:
                # print(len(retrieved[kick_seed][cue_ind]))
                sequence = retrieved[kick_seed][cue_ind][3:]

                for ind_trans in range(len(sequence) - L - 1):
                    trans_string = sequence[ind_trans:ind_trans + L + 1]
                    for order in range(L):
                        string = trans_string[:order + 1]
                        num_tables[order][tuple(string)] += 1
    print('Table conversion')
    for order in tqdm(range(L)):
        num_tables[order] = sparse.COO(num_tables[order])
        proba_tables[order] = num_tables[order] / num_tables[order].sum()
    return num_tables, proba_tables
Example #3
    def _get_k1(self, system):
        """Calculates the second order terms where the scalar mapping is the
        inverse distance between atoms.

        Returns:
            1D ndarray: flattened K2 values.
        """
        grid = self.k1["grid"]
        start = grid["min"]
        stop = grid["max"]
        n = grid["n"]
        sigma = grid["sigma"]

        # Determine the geometry function
        geom_func_name = self.k1["geometry"]["function"]

        cmbtr = MBTRWrapper(
            self.atomic_number_to_index,
            self._interaction_limit,
            np.zeros((len(system), 3), dtype=int),
        )

        k1_map = cmbtr.get_k1(
            system.get_atomic_numbers(),
            geom_func_name.encode(),
            b"unity",
            {},
            start,
            stop,
            sigma,
            n,
        )

        k1_map = self._make_new_k1map(k1_map)

        # Depending on flattening, use either a sparse matrix or a dense one.
        n_elem = self.n_elements
        if self.flatten:
            k1 = sparse.DOK((n_elem * n), dtype=np.float32)
        else:
            k1 = np.zeros((n_elem, n), dtype=np.float32)

        for key, gaussian_sum in k1_map.items():
            i = key[0]

            # Denormalize if requested
            if not self.normalize_gaussians:
                max_val = 1 / (sigma * math.sqrt(2 * math.pi))
                gaussian_sum /= max_val

            if self.flatten:
                start = i * n
                end = (i + 1) * n
                k1[start:end] = gaussian_sum
            else:
                k1[i, :] = gaussian_sum
        if self.flatten:
            k1 = k1.to_coo()

        return k1
Example #4
def test_mask_patch_sparse():
    for i in range(REPEATS):
        print(f"Loop number {i}")
        num_nav_dims = np.random.choice([1, 2, 3])
        num_sig_dims = np.random.choice([2, 3])

        nav_dims = tuple(np.random.randint(low=8, high=16, size=num_nav_dims))
        sig_dims = tuple(np.random.randint(low=8, high=16, size=num_sig_dims))

        # The mask-based correction is performed as float64 since it creates
        # numerical instabilities otherwise
        data = gradient_data(nav_dims, sig_dims).astype(np.float64)

        gain_map = (np.random.random(sig_dims) + 1).astype(np.float64)
        dark_image = np.random.random(sig_dims).astype(np.float64)

        exclude = exclude_pixels(sig_dims=sig_dims, num_excluded=3)

        damaged_data = data.copy()
        damaged_data /= gain_map
        damaged_data += dark_image
        damaged_data[(Ellipsis, *exclude)] = 1e24

        print("Nav dims: ", nav_dims)
        print("Sig dims:", sig_dims)
        print("Exclude: ", exclude)

        masks = sparse.DOK(sparse.zeros((20, ) + sig_dims, dtype=np.float64))
        indices = [
            np.random.randint(low=0, high=s, size=s // 2)
            for s in (20, ) + sig_dims
        ]
        for tup in zip(*indices):
            masks[tup] = 1
        masks = masks.to_coo()

        data_flat = data.reshape((np.prod(nav_dims), np.prod(sig_dims)))
        damaged_flat = damaged_data.reshape(
            (np.prod(nav_dims), np.prod(sig_dims)))

        correct_dot = sparse.dot(data_flat,
                                 masks.reshape((-1, np.prod(sig_dims))).T)
        corrected_masks = detector.correct_dot_masks(masks, gain_map, exclude)
        assert is_sparse(corrected_masks)

        reconstructed_dot =\
            sparse.dot(damaged_flat, corrected_masks.reshape((-1, np.prod(sig_dims))).T)\
            - sparse.dot(dark_image.flatten(), corrected_masks.reshape((-1, np.prod(sig_dims))).T)

        _check_result(data=correct_dot,
                      corrected=reconstructed_dot,
                      atol=1e-8,
                      rtol=1e-5)
Example #5
    def matrix_load(self, path):
        ''' Loads a previously saved matrix from a .npz file (containing the matrix) and a .nfo file (containing the matrix tags). '''

        # load matrix
        matrix = sparse.load_npz(os.path.splitext(path)[0] + '.npz')
        matrix = sparse.DOK(
            matrix)  # convert to dict-of-keys for faster indexing

        # load matrix tags
        with open(os.path.splitext(path)[0] + '.nfo', 'rb') as f:
            tags = pickle.load(f)

        return matrix, tags
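
matrix_load expects a .npz file written by sparse.save_npz and a pickled .nfo file holding the tags. A minimal sketch of the matching save step (matrix_save is a hypothetical name; the pattern mirrors the save logic at the end of Example #21):

import os
import pickle
import sparse

def matrix_save(matrix, tags, path):
    # Hypothetical counterpart to matrix_load: write the COO matrix as a .npz
    # file and the matrix tags as a pickled .nfo file next to it.
    base = os.path.splitext(path)[0]
    sparse.save_npz(base + '.npz', matrix)   # matrix is assumed to be sparse.COO
    with open(base + '.nfo', 'wb') as f:
        pickle.dump(tags, f)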
Example #6
File: sMPX.py Project: zhcui/watermelon
def overwrite(mpx, out=None):
    """
    Overwrites the tensors of out with the tensors of mpx,
    keeping the shapes of the out tensors fixed.

    Parameters
    ----------
    mpx : MPX (source)
    out : MPX (target) [modified]
    """
    L = len(mpx)
    for i in range(L):
        m1 = sp.DOK.from_coo(out[i])
        m2 = sp.DOK(mpx[i])
        for coord in m2.data:
            m1[coord] = m2[coord]
        out[i] = m1.to_coo()
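
A minimal usage sketch, assuming mpx and out are plain lists of sparse.COO tensors of matching shapes (the real MPX type in zhcui/watermelon may differ):

import numpy as np
import sparse as sp

# Two toy "MPX" objects represented as plain lists of COO tensors.
out = [sp.COO.from_numpy(np.zeros((2, 2))) for _ in range(3)]
mpx = [sp.COO.from_numpy(np.eye(2)) for _ in range(3)]

overwrite(mpx, out=out)      # copies the nonzeros of each mpx[i] into out[i]
print(out[0].todense())      # now the identity matrix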
Example #7
def resolve_relations(db_file, rel_file, meta_file, id_file):
    """
	"""
    conn = open_db_connection(db_file)
    c = conn.cursor()

    # load or compute unique IDs
    if os.path.isfile(meta_file):
        meta = np.load(meta_file)
        off = meta[0]
        num_unique = meta[1]
        unique_ids = np.load(id_file)
    else:
        off = 0
        c.execute("SELECT DISTINCT event1_id FROM Relations;")
        event_ids = set(c.fetchall())
        for id2 in c.execute("SELECT event2_id FROM Relations;"):
            if id2 not in event_ids:
                event_ids.add(id2)
        unique_ids = np.char.array(list(event_ids))
        num_unique = len(event_ids)
        np.save(id_file, unique_ids)
        np.save(meta_file, np.array([off, num_unique]))

    id_lookup = dict()
    for i, id_entr in enumerate(unique_ids):
        id_lookup[id_entr[0]] = i

    # load or compute (compressed) relations
    if os.path.isfile(rel_file):
        relations = sparse.load_npz(rel_file)
    else:
        relations = sparse.DOK((num_unique, num_unique, RELATION_COUNT),
                               dtype=np.float32)
        for row in c.execute("SELECT * FROM Relations;"):
            id_out = row[1]
            id_in = row[2]
            relations[id_lookup[id_out], id_lookup[id_in], :] = row[3:]
        relations = sparse.COO(relations)
        sparse.save_npz(rel_file, relations)

    conn.close()
Example #8
def bench_test_fused_pydata(tacoBench, num, pt1):
        loader = ImagePydataSparseTensorLoader()
        sparse_bin_img1 = safeCastPydataTensorToInts(loader.sparse_image(num, pt1, 1))
        sparse_bin_img2 = safeCastPydataTensorToInts(loader.sparse_image(num, pt1+0.05, 2))
        sparse_bin_window = loader.sparse_window(num, 3)
        bin_img1 = loader.dense_image(num, pt1, 1) 
        bin_img2 = loader.dense_image(num, pt1 + 0.05, 2)
        bin_window = loader.dense_window(num)

        def sparse_bench():
            return testOp(sparse_bin_img1, sparse_bin_img2, sparse_bin_window).astype('int')

        def dense_bench():
            return testOp(bin_img1, bin_img2, bin_window).astype('int')

        ret = tacoBench(sparse_bench)
        sparse_xor_img = sparse_bench()
        xor_img = dense_bench()

        # Write result to TNS file to see what's different
        shape = xor_img.shape
        result = sparse.COO.from_numpy(xor_img, fill_value=0)
        dok = sparse.DOK(result)
        TnsFileDumper().dump_dict_to_file(shape, dok.data, os.path.join("temp", "numpy-result-{}.tns".format(num)))
        
    
        num_elements = float(np.prod(bin_img1.shape))
        f = sparse_xor_img.fill_value
        print("shape1", sparse_bin_img1.shape)
        print("shape2", sparse_bin_img2.shape)
        print("sparse img1 nnz =", sparse_bin_img1.nnz, "    ", np.sum(bin_img1 != 0))
        print("sparse img2 nnz =", sparse_bin_img2.nnz, "    ", np.sum(bin_img2 != 0))
        print("sparse win nnz =", sparse_bin_window.nnz, "    ", np.sum(bin_window != 0))
        print("Total num elements", num_elements)
        print("Fill value", f)
        print("Sparse xor NNF = ", sparse_xor_img.nnz, "\t", "Dense xor NNF = ", np.sum(xor_img != int(f)))
        print("Dense xor NNZ = ", np.sum(xor_img != 0))
        assert(sparse_xor_img.nnz == np.sum(xor_img != 1))
Example #9
def correct_dot_masks(masks, gain_map, excluded_pixels=None):
    mask_shape = masks.shape
    sig_shape = gain_map.shape
    masks = masks.reshape((-1, np.prod(sig_shape)))

    if excluded_pixels is not None:
        if is_sparse(masks):
            result = sparse.DOK(masks)
        else:
            result = masks.copy()
        repairs = environments(excluded_pixels, sig_shape)
        for e, r in zip(*flatten_filter(excluded_pixels, repairs, sig_shape)):
            result[:, e] = 0
            rep = masks[:, e] / len(r)
            # We have to loop because of sparse.pydata limitations
            for m in range(result.shape[0]):
                for rr in r:
                    result[m, rr] = result[m, rr] + rep[m]
        if is_sparse(result):
            result = sparse.COO(result)
    else:
        result = masks
    result = result * gain_map.flatten()
    return result.reshape(mask_shape)
Example #10
def get_rel_counts(ds_name, must_overlap=True):
    """
    Get counts of all of the relations. Used for modeling directly P(rel | o1, o2).
    :param ds_name: dataset name ('vg' or 'vrd')
    :param must_overlap:
    :return:
    """

    if ds_name.find('vg') >= 0:
        with open(cfg.DATA_DIR + '/vg/rel_annotations_train.json') as f:
            train_data = json.load(f)
    elif ds_name.find('vrd') >= 0:
        with open(cfg.DATA_DIR + '/vrd/new_annotations_train.json') as f:
            train_data = json.load(f)
    else:
        raise NotImplementedError

    sparse_fg_matrix = sparse.DOK(
        (
            cfg.MODEL.NUM_CLASSES - 1,  # not include background
            cfg.MODEL.NUM_CLASSES - 1,  # not include background
            cfg.MODEL.NUM_PRD_CLASSES + 1,  # include background
        ),
        dtype=np.int64)

    sparse_bg_matrix = sparse.DOK(
        (
            cfg.MODEL.NUM_CLASSES - 1,  # not include background
            cfg.MODEL.NUM_CLASSES - 1,  # not include background
        ),
        dtype=np.int64)

    for _, im_rels in train_data.items():
        # get all object boxes
        gt_box_to_label = {}
        for i, rel in enumerate(im_rels):
            sbj_box = box_utils.y1y2x1x2_to_x1y1x2y2(rel['subject']['bbox'])
            obj_box = box_utils.y1y2x1x2_to_x1y1x2y2(rel['object']['bbox'])
            sbj_lbl = rel['subject']['category']  # not include background
            obj_lbl = rel['object']['category']  # not include background
            prd_lbl = rel['predicate']  # not include background
            if tuple(sbj_box) not in gt_box_to_label:
                gt_box_to_label[tuple(sbj_box)] = sbj_lbl
            if tuple(obj_box) not in gt_box_to_label:
                gt_box_to_label[tuple(obj_box)] = obj_lbl

            sparse_fg_matrix[sbj_lbl, obj_lbl, prd_lbl + 1] += 1

        if cfg.MODEL.USE_OVLP_FILTER:
            if len(gt_box_to_label):
                gt_boxes = np.array(list(gt_box_to_label.keys()),
                                    dtype=np.int32)
                gt_classes = np.array(list(gt_box_to_label.values()),
                                      dtype=np.int32)
                o1o2_total = gt_classes[np.array(box_filter(
                    gt_boxes, must_overlap=must_overlap),
                                                 dtype=int)]
                for (o1, o2) in o1o2_total:
                    sparse_bg_matrix[o1, o2] += 1

        else:
            # consider all pairs of boxes, overlapped or non-overlapped
            for b1, l1 in gt_box_to_label.items():
                for b2, l2 in gt_box_to_label.items():
                    if b1 == b2:
                        continue
                    sparse_bg_matrix[l1, l2] += 1

    return sparse_fg_matrix.to_coo(), sparse_bg_matrix.to_coo()
Example #11
def test_empty_dok_dtype():
    d = sparse.DOK(5, dtype=np.uint8)
    s = sparse.COO(d)
    assert s.dtype == d.dtype
Example #12
def _dok_like(a, drop_dims=("c", ), dtype="uint8"):
    dims = tuple(d for d in a.dims if d not in drop_dims)
    shape = tuple(a.sizes[d] for d in dims)
    return xarray.DataArray(sparse.DOK(shape=shape, dtype=dtype), dims=dims)
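A minimal usage sketch, assuming a is any xarray.DataArray whose dims include the channel dimension "c" (the array below is made up for illustration):

import numpy as np
import xarray

# Hypothetical input with a trailing channel dimension "c" to be dropped.
a = xarray.DataArray(np.zeros((4, 5, 3)), dims=("y", "x", "c"))
mask = _dok_like(a)                        # DOK-backed DataArray over ("y", "x")
print(mask.dims, mask.shape, mask.dtype)   # ('y', 'x') (4, 5) uint8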
Example #13
def create_interaction_list(interaction_df,
                            num_individuals,
                            fps=3,
                            ringbuffer_size=5):
    ts = interaction_df.timestamp.min()

    rbs = ringbuffer_size
    rbs_hp = (rbs // 2) + 1

    # cumulative interactions over whole period
    previous_interactions = sparse.COO([],
                                       shape=(num_individuals,
                                              num_individuals))
    # frame sliding window
    interaction_ringbuffer = [
        sparse.COO([], shape=(num_individuals, num_individuals))
        for i in range(rbs)
    ]
    # cumulative interactions for all cameras within a 1/fps frame period
    # == 1 frame combined for all cameras
    current_interactions = sparse.COO([],
                                      shape=(num_individuals, num_individuals))

    interval_counter = 0

    events = []

    print("Number of events {}".format(len(interaction_df)), flush=True)
    print("Number of timestamps {}".format(
        len(interaction_df.timestamp.unique())),
          flush=True)

    for timestamp, group in list(
            interaction_df.sort_values("timestamp").groupby("timestamp")):
        # still within current time interval
        if (timestamp - ts) < datetime.timedelta(milliseconds=int(900 / fps)):
            pass
        # end of current time interval
        else:
            # count as interaction if more than half of rbs consecutive frames had interactions
            # == median filter over temporal dimension with kernel size rbs
            if interval_counter >= rbs:
                new_interactions = sparse.stack(interaction_ringbuffer).sum(
                    axis=0) > rbs_hp
                stopped_interactions = (previous_interactions.astype(int) -
                                        new_interactions.astype(int)) == 1

                if stopped_interactions.sum() > 0:
                    for bee_id_a, bee_id_b in np.argwhere(
                            stopped_interactions):
                        events.append((timestamp, bee_id_a, bee_id_b))

                previous_interactions = new_interactions

            interaction_ringbuffer[interval_counter %
                                   rbs] = current_interactions

            # new time interval => reset adjacency matrix and timestamp
            current_interactions = sparse.COO([],
                                              shape=(num_individuals,
                                                     num_individuals))
            ts = group.timestamp.min()

            interval_counter += 1

        # interaction adjacency matrix
        adj_data = {(min(k), max(k)): 1 for k in tuple(group["bee_id"].values)}
        adj = sparse.DOK(shape=(num_individuals, num_individuals),
                         data=adj_data)

        # logical or => accumulate interactions from different cameras
        # for current time interval (~1/fps of a second)
        current_interactions += adj
        current_interactions.clip(0, 1, current_interactions)

    return events
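
The per-frame adjacency update above builds a DOK directly from a dict of index pairs and accumulates it into a COO matrix; a minimal sketch of that construction, with made-up ids:

import sparse

num_individuals = 4
adj_data = {(0, 2): 1, (1, 3): 1}      # illustrative (bee_id_a, bee_id_b) pairs
adj = sparse.DOK(shape=(num_individuals, num_individuals), data=adj_data)

acc = sparse.COO([], shape=(num_individuals, num_individuals))
acc = acc + adj.to_coo()               # accumulate interactions across cameras
acc = acc.clip(0, 1)                   # keep it a 0/1 adjacency matrix
print(acc.todense())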
Example #14
    def _get_k2(self, system, new_system, indices):
        """Calculates the second order terms where the scalar mapping is the
        inverse distance between atoms.

        Returns:
            1D ndarray: flattened K2 values.
        """
        grid = self.k2["grid"]
        start = grid["min"]
        stop = grid["max"]
        n = grid["n"]
        sigma = grid["sigma"]

        # Determine the weighting function and possible radial cutoff
        radial_cutoff = None
        weighting = self.k2.get("weighting")
        parameters = {}
        if weighting is not None:
            weighting_function = weighting["function"]
            if weighting_function == "exponential" or weighting_function == "exp":
                scale = weighting["scale"]
                threshold = weighting["threshold"]
                if scale != 0:
                    radial_cutoff = -math.log(threshold) / scale
                parameters = {
                    b"scale": weighting["scale"],
                    b"threshold": weighting["threshold"],
                }
        else:
            weighting_function = "unity"

        # Determine the geometry function
        geom_func_name = self.k2["geometry"]["function"]

        # Calculate extended system
        if self.periodic:
            centers = new_system.get_positions()
            ext_system, cell_indices = dscribe.utils.geometry.get_extended_system(
                system,
                radial_cutoff,
                centers,
                return_cell_indices=True,
            )
            ext_system = System.from_atoms(ext_system)
        else:
            ext_system = system
            cell_indices = np.zeros((len(system), 3), dtype=int)

        cmbtr = MBTRWrapper(self.atomic_number_to_index,
                            self._interaction_limit, cell_indices)

        # If radial cutoff is finite, use it to calculate the sparse distance
        # matrix to reduce computational complexity from O(n^2) to O(n log(n)).
        # If radial cutoff is not available, calculate full matrix.
        n_atoms_ext = len(ext_system)
        n_atoms_new = len(new_system)
        ext_pos = ext_system.get_positions()
        new_pos = new_system.get_positions()
        if radial_cutoff is not None:
            dmat = new_system.get_distance_matrix_within_radius(radial_cutoff,
                                                                pos=ext_pos)
            adj_list = dscribe.utils.geometry.get_adjacency_list(dmat)
            dmat_dense = np.full(
                (n_atoms_new, n_atoms_ext), sys.float_info.max
            )  # The non-neighbor values are treated as "infinitely far".
            dmat_dense[dmat.row, dmat.col] = dmat.data
        else:
            dmat_dense = scipy.spatial.distance.cdist(new_pos, ext_pos)
            adj_list = np.tile(np.arange(n_atoms_ext), (n_atoms_new, 1))

        # Form new indices that include the existing atoms and the newly added
        # ones
        indices = np.array(
            np.append(
                indices,
                [n_atoms_ext + i for i in range(n_atoms_new - len(indices))]),
            dtype=int,
        )

        k2_list = cmbtr.get_k2_local(
            indices,
            ext_system.get_atomic_numbers(),
            dmat_dense,
            adj_list,
            geom_func_name.encode(),
            weighting_function.encode(),
            parameters,
            start,
            stop,
            sigma,
            n,
        )
        k2_list = self._make_new_klist_local(k2_list)

        # Depending on flattening, use either a sparse matrix or a dense one.
        n_elem = self.n_elements
        n_loc = len(indices)
        if self.flatten:
            k2 = sparse.DOK((n_loc, n_elem * n), dtype=np.float32)

            for i_loc, k2_map in enumerate(k2_list):
                for key, gaussian_sum in k2_map.items():
                    i = key[1]
                    m = i
                    start = int(m * n)
                    end = int((m + 1) * n)

                    # Denormalize if requested
                    if not self.normalize_gaussians:
                        max_val = 1 / (sigma * math.sqrt(2 * math.pi))
                        gaussian_sum /= max_val

                    k2[i_loc, start:end] = gaussian_sum
            k2 = k2.to_coo()
        else:
            k2 = np.zeros((n_loc, n_elem, n), dtype=np.float32)
            for i_loc, k2_map in enumerate(k2_list):
                for key, gaussian_sum in k2_map.items():
                    i = key[1]

                    # Denormalize if requested
                    if not self.normalize_gaussians:
                        max_val = 1 / (sigma * math.sqrt(2 * math.pi))
                        gaussian_sum /= max_val

                    k2[i_loc, i, :] = gaussian_sum

        return k2
Example #15
    def _get_k3(self, system, new_system, indices):
        """Calculates the second order terms where the scalar mapping is the
        inverse distance between atoms.

        Returns:
            1D ndarray: flattened K2 values.
        """
        grid = self.k3["grid"]
        start = grid["min"]
        stop = grid["max"]
        n = grid["n"]
        sigma = grid["sigma"]

        # Determine the weighting function and possible radial cutoff
        radial_cutoff = None
        weighting = self.k3.get("weighting")
        parameters = {}
        if weighting is not None:
            weighting_function = weighting["function"]
            if weighting_function == "exponential" or weighting_function == "exp":
                scale = weighting["scale"]
                threshold = weighting["threshold"]
                if scale != 0:
                    radial_cutoff = -0.5 * math.log(threshold) / scale
                parameters = {b"scale": scale, b"threshold": threshold}
        else:
            weighting_function = "unity"

        # Determine the geometry function
        geom_func_name = self.k3["geometry"]["function"]

        # Calculate extended system
        if self.periodic:
            centers_new = new_system.get_positions()
            centers_existing = system.get_positions()[indices]
            centers = np.concatenate((centers_new, centers_existing), axis=0)
            ext_system, cell_indices = dscribe.utils.geometry.get_extended_system(
                system,
                radial_cutoff,
                centers,
                return_cell_indices=True,
            )
            ext_system = System.from_atoms(ext_system)
        else:
            ext_system = system
            cell_indices = np.zeros((len(system), 3), dtype=int)

        cmbtr = MBTRWrapper(self.atomic_number_to_index,
                            self._interaction_limit, cell_indices)

        # If radial cutoff is finite, use it to calculate the sparse
        # distance matrix to reduce computational complexity from O(n^2) to
        # O(n log(n))
        fin_system = ext_system + new_system
        n_atoms_ext = len(ext_system)
        n_atoms_fin = len(fin_system)
        n_atoms_new = len(new_system)
        ext_pos = ext_system.get_positions()
        new_pos = new_system.get_positions()
        if radial_cutoff is not None:

            # Calculate distance within the extended system
            dmat_ext_to_ext = ext_system.get_distance_matrix_within_radius(
                radial_cutoff, pos=ext_pos)
            col = dmat_ext_to_ext.col
            row = dmat_ext_to_ext.row
            data = dmat_ext_to_ext.data
            dmat = scipy.sparse.coo_matrix((data, (row, col)),
                                           shape=(n_atoms_fin, n_atoms_fin))

            # Calculate the distances from the new positions to atoms in the
            # extended system using the cutoff
            if len(new_pos) != 0:
                dmat_ext_to_new = ext_system.get_distance_matrix_within_radius(
                    radial_cutoff, pos=new_pos)
                col = dmat_ext_to_new.col
                row = dmat_ext_to_new.row
                data = dmat_ext_to_new.data
                dmat.col = np.append(dmat.col, col + n_atoms_ext)
                dmat.row = np.append(dmat.row, row)
                dmat.data = np.append(dmat.data, data)
                dmat.col = np.append(dmat.col, row)
                dmat.row = np.append(dmat.row, col + n_atoms_ext)
                dmat.data = np.append(dmat.data, data)

            # Calculate adjacencies and transform to the dense matrix for
            # sending information to C++
            adj_list = dscribe.utils.geometry.get_adjacency_list(dmat)
            dmat_dense = np.full(
                (n_atoms_fin, n_atoms_fin), sys.float_info.max
            )  # The non-neighbor values are treated as "infinitely far".
            dmat_dense[dmat.row, dmat.col] = dmat.data

        # If no weighting is used, the full distance matrix is calculated
        else:
            dmat = scipy.sparse.lil_matrix((n_atoms_fin, n_atoms_fin))

            # Fill in block for extended system
            dmat_ext_to_ext = ext_system.get_distance_matrix()
            dmat[0:n_atoms_ext, 0:n_atoms_ext] = dmat_ext_to_ext

            # Fill in block for extended system to new system
            dmat_ext_to_new = scipy.spatial.distance.cdist(ext_pos, new_pos)
            dmat[0:n_atoms_ext,
                 n_atoms_ext:n_atoms_ext + n_atoms_new] = dmat_ext_to_new
            dmat[n_atoms_ext:n_atoms_ext + n_atoms_new,
                 0:n_atoms_ext] = dmat_ext_to_new.T

            # Calculate adjacencies and the dense version
            dmat = dmat.tocoo()
            adj_list = dscribe.utils.geometry.get_adjacency_list(dmat)
            dmat_dense = np.full(
                (n_atoms_fin, n_atoms_fin), sys.float_info.max
            )  # The non-neighbor values are treated as "infinitely far".
            dmat_dense[dmat.row, dmat.col] = dmat.data

        # Form new indices that include the existing atoms and the newly added
        # ones
        indices = np.array(np.append(
            indices, [n_atoms_ext + i for i in range(n_atoms_new)]),
                           dtype=int)

        k3_list = cmbtr.get_k3_local(
            indices,
            fin_system.get_atomic_numbers(),
            dmat_dense,
            adj_list,
            geom_func_name.encode(),
            weighting_function.encode(),
            parameters,
            start,
            stop,
            sigma,
            n,
        )

        k3_list = self._make_new_klist_local(k3_list)
        # Depending on flattening, use either a sparse matrix or a dense one.
        n_elem = self.n_elements
        n_loc = len(indices)
        if self.flatten:
            k3 = sparse.DOK((n_loc, int((n_elem * (3 * n_elem - 1) * n / 2))),
                            dtype=np.float32)

            for i_loc, k3_map in enumerate(k3_list):
                for key, gaussian_sum in k3_map.items():
                    i = key[0]
                    j = key[1]
                    k = key[2]

                    # This is the index of the spectrum. It is given by enumerating the
                    # elements of a three-dimensional array and only considering
                    # elements for which k>=i and i || j == 0. The enumeration begins
                    # from [0, 0, 0], and ends at [n_elem, n_elem, n_elem], looping the
                    # elements in the order k, i, j.
                    if j == 0:
                        m = k + i * n_elem - i * (i + 1) / 2
                    else:
                        m = n_elem * (n_elem + 1) / 2 + (j - 1) * n_elem + k
                    start = int(m * n)
                    end = int((m + 1) * n)

                    # Denormalize if requested
                    if not self.normalize_gaussians:
                        max_val = 1 / (sigma * math.sqrt(2 * math.pi))
                        gaussian_sum /= max_val

                    k3[i_loc, start:end] = gaussian_sum
            k3 = k3.to_coo()
        else:
            k3 = np.zeros((n_loc, n_elem, n_elem, n_elem, n), dtype=np.float32)
            for i_loc, k3_map in enumerate(k3_list):
                for key, gaussian_sum in k3_map.items():
                    i = key[0]
                    j = key[1]
                    k = key[2]

                    # Denormalize if requested
                    if not self.normalize_gaussians:
                        max_val = 1 / (sigma * math.sqrt(2 * math.pi))
                        gaussian_sum /= max_val

                    k3[i_loc, i, j, k, :] = gaussian_sum
        return k3
Example #16
    def _get_k3(self, system):
        """Calculates the third order terms.

        Returns:
            1D ndarray: flattened K3 values.
        """
        grid = self.k3["grid"]
        start = grid["min"]
        stop = grid["max"]
        n = grid["n"]
        sigma = grid["sigma"]

        # Determine the weighting function and possible radial cutoff
        radial_cutoff = None
        weighting = self.k3.get("weighting")
        parameters = {}
        if weighting is not None:
            weighting_function = weighting["function"]
            if weighting_function == "exp" or weighting_function == "exponential":
                scale = weighting["scale"]
                threshold = weighting["threshold"]
                if scale != 0:
                    radial_cutoff = -0.5 * math.log(threshold) / scale
                parameters = {b"scale": scale, b"threshold": threshold}
        else:
            weighting_function = "unity"

        # Determine the geometry function
        geom_func_name = self.k3["geometry"]["function"]

        # If needed, create the extended system
        if self.periodic:
            centers = system.get_positions()
            ext_system, cell_indices = dscribe.utils.geometry.get_extended_system(
                system, radial_cutoff, centers, return_cell_indices=True)
            ext_system = System.from_atoms(ext_system)
        else:
            ext_system = system
            cell_indices = np.zeros((len(system), 3), dtype=int)

        cmbtr = MBTRWrapper(self.atomic_number_to_index,
                            self._interaction_limit, cell_indices)

        # If radial cutoff is finite, use it to calculate the sparse
        # distance matrix to reduce computational complexity from O(n^2) to
        # O(n log(n))
        n_atoms = len(ext_system)
        if radial_cutoff is not None:
            dmat = ext_system.get_distance_matrix_within_radius(radial_cutoff)
            adj_list = dscribe.utils.geometry.get_adjacency_list(dmat)
            dmat_dense = np.full(
                (n_atoms, n_atoms), sys.float_info.max
            )  # The non-neighbor values are treated as "infinitely far".
            dmat_dense[dmat.col, dmat.row] = dmat.data
        # If no weighting is used, the full distance matrix is calculated
        else:
            dmat_dense = ext_system.get_distance_matrix()
            adj_list = np.tile(np.arange(n_atoms), (n_atoms, 1))

        k3_map = cmbtr.get_k3(
            ext_system.get_atomic_numbers(),
            dmat_dense,
            adj_list,
            geom_func_name.encode(),
            weighting_function.encode(),
            parameters,
            start,
            stop,
            sigma,
            n,
        )

        k3_map = self._make_new_kmap(k3_map)
        # Depending on flattening, use either a sparse matrix or a dense one.
        n_elem = self.n_elements
        if self.flatten:
            k3 = sparse.DOK((int(n_elem * n_elem * (n_elem + 1) / 2 * n)),
                            dtype=np.float32)
        else:
            k3 = np.zeros((n_elem, n_elem, n_elem, n), dtype=np.float32)

        for key, gaussian_sum in k3_map.items():
            i = key[0]
            j = key[1]
            k = key[2]

            # This is the index of the spectrum. It is given by enumerating the
            # elements of a three-dimensional array where for valid elements
            # k>=i. The enumeration begins from [0, 0, 0], and ends at [n_elem,
            # n_elem, n_elem], looping the elements in the order j, i, k.
            m = int(j * n_elem * (n_elem + 1) / 2 + k + i * n_elem - i *
                    (i + 1) / 2)

            # Denormalize if requested
            if not self.normalize_gaussians:
                max_val = 1 / (sigma * math.sqrt(2 * math.pi))
                gaussian_sum /= max_val

            if self.flatten:
                start = m * n
                end = (m + 1) * n
                k3[start:end] = gaussian_sum
            else:
                k3[i, j, k, :] = gaussian_sum
        if self.flatten:
            k3 = k3.to_coo()

        return k3
Example #17
File: test_dok.py Project: s-fleck/sparse
def test_coo_fv_interface():
    s1 = sparse.full((5, 5), fill_value=1+np.random.rand())
    s2 = sparse.DOK(s1)
    assert_eq(s1, s2)
    s3 = sparse.COO(s2)
    assert_eq(s1, s3)
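
The test above exercises the fill-value round trip between formats; a minimal sketch of the same idea with a fixed fill value:

import sparse

# Round-trip a non-zero fill value through the DOK and COO constructors.
s1 = sparse.full((3, 3), fill_value=2.0)
s2 = sparse.DOK(s1)
s3 = sparse.COO(s2)
assert s1.fill_value == s2.fill_value == s3.fill_value == 2.0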
Example #18
  def fit(self, training_caption_dict, image_object_dict, num_categories,
          train_markov = True, train_object_word = True):
    def create_ngram(tokens, n):
      """enumerate all ngrams from the list of tokens with automatic start and end paddings"""
      tokens_with_end = tokens + [self.end_token_index]
      return [tuple([self.start_token_index] * max(0, n - i - 1)
                    + tokens_with_end[max(0, i + 1 - n): i + 1]) for i in range(len(tokens_with_end))]

    # captions are yet to have start/end tokens added
    # unknown token depends on the data. Do not add artificially
    unique_words = {Constant.start_token, Constant.end_token}

    unmatch_count = 0
    matched_count = 0

    for img_id, ngram_lists in training_caption_dict.items():
      # make sure that the training data exists from both datasets
      if img_id not in image_object_dict:
        unmatch_count += 1
        continue

      matched_count += 1

      for ngrams in ngram_lists:
        unique_words.update(ngrams)

    print(f"{matched_count} images will be used for training")
    print(f"{unmatch_count} images unmatched")
    print(len(unique_words), "unique words")

    word_encoder = LabelEncoder()
    word_encoder.fit(list(unique_words))

    self.word_encoder = word_encoder

    self.start_token_index, self.end_token_index = word_encoder.transform([Constant.start_token,
                                                                           Constant.end_token])

    self.num_words = len(unique_words)
    self.num_obj_cats = num_categories

    if train_markov:
      # Count(w_t)
      word_count = np.zeros(self.num_words)
      # Count(w_t-2, w_t-1, w_t)
      state_transition_occurrence_matrix = sparse.DOK([self.num_words] * self.ngram_n)

    if train_object_word:
      # P(obj_cat | w_t)
      # flatten grid index dimension
      object_word_occurrence = np.zeros((self.num_obj_cats * self.grid_size ** 2, self.num_words))

    # MLE
    for img_id, sentence_lists in training_caption_dict.items():

      # make sure that the training data exists from both datasets
      if img_id not in image_object_dict:
        continue

      object_list = image_object_dict[img_id]

      for sentence in sentence_lists:
        encoded_sentence = word_encoder.transform(sentence).tolist()

        # add 1 start and end token per sentence for counting purpose
        for word in [self.start_token_index, self.end_token_index] + encoded_sentence:
          if train_markov:
            # add to word prob
            word_count[word] += 1

          if train_object_word:
            # add to object-word prob
            for object_id, grid_ids in object_list.items():
              for grid_id in grid_ids:
                object_word_occurrence[self.num_obj_cats * grid_id + object_id][word] += 1

        if train_markov:
          # add to markov chain prob
          # create_ngram automatically pads the start and end of the encoded sentence
          for ngram in create_ngram(encoded_sentence, self.ngram_n):
            state_transition_occurrence_matrix[ngram] += 1

    if train_markov:
      # P(w_t-2, w_t-1 | w_t)
      self.state_transition_prob_matrix = state_transition_occurrence_matrix / word_count # automatically converts from DOK to COO
      # P(w_t-2 | w_t-1)
      self.denominator_conditional_prob_matrix = state_transition_occurrence_matrix.to_coo().sum(-1) / word_count

      # impute the count of <start> and <end> token as the average count of all other regular words
      # this alleviates the problem of <end> token being generated too soon
      word_count_copy = word_count.copy()
      mask = np.ones(len(word_count_copy), dtype=bool)
      mask[[self.start_token_index, self.end_token_index]] = False
      word_count_copy[[self.start_token_index, self.end_token_index]] = word_count_copy[mask].mean()

      # P(w_t)
      self.word_log_prob = np.log(word_count_copy / word_count_copy.sum())
      # for debugging purpose
      self.word_count = word_count

    if train_object_word:
      self.object_word_prob = object_word_occurrence / word_count
Example #19
def test_dok_dask_array_is_sparse():
    assert utils.is_dask_array_sparse(da.from_array(sparse.DOK((10, 10))))
Example #20
def test_dok_indexing():
    s = sparse.DOK((3, 3))
    s[1, 2] = 0.5
    x = s.todense()
    assert_eq(x[1::-1], s[1::-1])
Example #21
    def matrix(self, lastfm, tags=None, dim=3, save_to=None):
        ''' Computes an n-dimensional matrix where the (i_1, ..., i_n)-th entry contains the number of tracks having all of the i_1-th, ..., i_n-th tags (where the i's are indices into self.m_tags).

        Notes
        -----
        To optimize performance, values are computed only for indices in increasing order (that is, we only compute the number of tracks having tag-0
        and tag-1, not vice-versa). This is something to keep in mind when indexing the matrix.

        To optimize memory, the matrix is saved in sparse format. DOK is the preferred sparse format for building and indexing, while COO is the preferred
        sparse format for performing mathematical operations.

        The dimension of the matrix captures the kind of queries which you will be able to perform. A matrix of dim=2 on tags=['rock', 'pop', 'hip-hop'] will
        capture how many tracks have tags rock and pop, or pop and hip-hop, but not rock, pop and hip-hop at the same time.
        
        A matrix of dim=len(tags) will fully describe the database (or the subset of the database having the given tags).
        A matrix of dim>len(tags) will be rather pointless (but we won't prevent you from doing it).

        Parameters
        ----------
        lastfm: LastFm, LastFm2Pandas
            Instance of tags database. Using LastFm2Pandas is strongly recommended here.

        tags: list
            List of tags to use. If None, all the tags will be used.

        dim: int
            The dimension of the matrix.

        save_to: str
            Filename or full path of the .npz file in which to save the matrix and matrix tags. Use it with load_from in the future.
        '''

        # initialize matrix tags
        if tags is None:
            tags = lastfm.get_tags()
        else:
            tags = [tag for tag in tags if tag in lastfm.get_tags()
                    ]  # purge nonexistent tags, if any

        # initialize matrix
        matrix = sparse.DOK(
            (len(tags), ) * dim, dtype=np.int32
        )  # sparse dict-of-keys matrix (for easy creation, awful for calculations)

        # compute total number of steps to completion (see http://www.iosrjournals.org/iosr-jm/papers/Vol8-issue3/A0830110.pdf)
        n_steps = crazysum(n=len(tags), s=3, k=dim - 1)

        # check whether a progress bar is needed
        verbose = n_steps > 100
        if verbose:
            progbar = Progbar(n_steps)  # instantiate progress bar

        def count_intersect_tags(tags):
            tids_list = [lastfm.with_tag(tag) for tag in tags]
            tids_list.sort(key=len, reverse=True)
            tids = set(
                tids_list.pop()
            )  # start with shortest list of tids to improve performance; convert to set to be able to intersect
            for _ in range(len(tids_list)):
                tids = tids.intersection(tids_list.pop(
                ))  # intersections performed from shortest list to longest
            return len(tids)  # how many tids have all tags

        def count_intersect_tags_recursive(
            tags_idxs, dim
        ):  # recursively iterate count_intersect_tags dim times; avoid repetitions such as 'rock AND pop AND folk' vs. 'rock AND folk AND pop' vs. 'folk AND pop AND rock'
            if dim >= 1:
                for i in range(tags_idxs[-1] + 1):
                    count_intersect_tags_recursive(tags_idxs + (i, ), dim - 1)
            else:
                matrix[tags_idxs] = count_intersect_tags(
                    np.take(tags, tags_idxs))  # add count to sparse matrix
                if verbose:
                    progbar.add(1)

        # instantiate recursive loop
        for i in range(len(tags)):
            count_intersect_tags_recursive((i, ), dim - 1)

        matrix = matrix.to_coo()  # convert to coordinate matrix

        if save_to is not None:
            # save matrix
            sparse.save_npz(save_to, matrix)  # matrix is already COO; defaults to compressed (sparse) format

            # save matrix tags in serialized format
            with open(os.path.splitext(save_to)[0] + '.nfo', 'wb') as f:
                pickle.dump(tags, f)

        return matrix, tags