Example #1
    def _generate_mini_batches(
        self,
        id_list: List[str],
        batches: Dict[str, List[torch.Tensor]],
        shuffle: bool,
        state: np.random.RandomState,
    ):
        if shuffle:
            indices = np.arange(0, len(id_list))
            state.shuffle(indices)
            batches = {k: [v[i] for i in indices] for k, v in batches.items()}
            id_list = [id_list[i] for i in indices]

        bs = self.batch_size
        while len(id_list) >= bs:
            # Make mini-batch and yield
            yield (
                id_list[:bs],
                {k: torch.stack(v[:bs], 0)
                 for k, v in batches.items()},
            )
            id_list = id_list[bs:]
            batches = {k: v[bs:] for k, v in batches.items()}

        return id_list, batches
Example #2
    def _make_logical(
        n_tiles: int = 1,
        shuffle: bool = True,
        random_state: np.random.RandomState = None,
    ):
        """Make toy dataset"""
        base_pattern = np.array(
            [
                # A  B  OR  XOR  AND
                [0, 0, 0, 0, 0],
                [0, 1, 1, 1, 0],
                [1, 0, 1, 1, 0],
                [1, 1, 1, 0, 1],
            ],
            dtype=int,
        )

        N, E = base_pattern.shape
        D = 2
        L = E - D

        pattern = np.zeros((N, E))
        pattern[:, 0:L] = base_pattern[:, D:E]
        pattern[:, L:E] = base_pattern[:, 0:D]
        pattern = np.tile(pattern, (n_tiles, 1))
        if shuffle:
            if random_state is None:
                # shuffle defaults to True but random_state defaults to None, so
                # fall back to a freshly seeded generator to avoid an AttributeError.
                random_state = np.random.RandomState()
            random_state.shuffle(pattern)
        # return X, Y
        return (
            np.array(pattern[:, L:E], dtype=int),
            np.array(pattern[:, 0:L], dtype=int),
        )
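
A minimal usage sketch for the helper above; the seed and n_tiles value are arbitrary choices, not part of the original snippet:

import numpy as np

rs = np.random.RandomState(0)
X, Y = _make_logical(n_tiles=2, shuffle=True, random_state=rs)
# X holds the A/B input columns, Y holds the OR/XOR/AND target columns.
print(X.shape, Y.shape)  # (8, 2) (8, 3)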
Example #3
    def InsertBlockIntoKernel(self, rand: np.random.RandomState,
                              block_to_insert: str) -> None:
        """Insert a code block at a random position in the kernel.

        Args:
          rand: A random number generator.
          block_to_insert: The code block to insert, as a string.
        """
        if not self.is_kernel:
            raise TypeError("Cannot insert block into non-kernel.")

        lines = self.src.split("\n")
        if len(lines) < 2:
            raise ValueError("OpenCL kernel is less than two lines long.")
        # Try to find a point at which to insert the block.
        indices = list(range(1, len(lines)))
        rand.shuffle(indices)
        for insertion_line_idx in indices:
            previous_line = lines[insertion_line_idx - 1]
            if previous_line[-1] == ";" or previous_line[-1] == "{":
                # The previous line was either a statement or the start of a new block: we
                # can insert the block here.
                break
            else:
                app.Log(
                    2,
                    'Previous line "%s" not valid as a code block insertion '
                    "point",
                    previous_line,
                )
        else:
            raise ValueError(
                f"Failed to find a position to insert block in function '{self.src}'"
            )

        pre = lines[:insertion_line_idx]
        post = lines[insertion_line_idx:]

        indentation_at_point_of_insertion = 0
        for c in pre[-1]:
            if c == " ":
                indentation_at_point_of_insertion += 1
            else:
                break
        else:
            raise ValueError(
                f"Line contains nothing but whitespace: '{pre[-1]}'")

        if previous_line[-1] == "{":
            # Inserting block at the start of a new block, increase indentation.
            indentation_at_point_of_insertion += 2

        if indentation_at_point_of_insertion < 2:
            raise ValueError(
                "Line has insufficient indentation "
                f"({indentation_at_point_of_insertion}): '{pre[-1]}'")

        block = fmt.Indent(indentation_at_point_of_insertion, block_to_insert)

        self.src = "\n".join(pre + [block] + post)
Example #4
def _get_balanced_index(
        y: np.ndarray,
        test_split: float,
        dev_split: float = 0,
        random_state: np.random.RandomState = None) -> Tuple[list, list, list]:
    train = []
    test = []
    dev = []

    d = defaultdict(list)

    for i, _y in enumerate(y):
        d[_y].append(i)

    for index_list in d.values():

        if random_state is not None:
            random_state.shuffle(index_list)
        else:
            np.random.shuffle(index_list)

        ln = len(index_list)

        _test_split = int(ln * test_split)
        _dev_split = int(ln * dev_split)

        train.extend(index_list[_test_split + _dev_split:])
        test.extend(index_list[:_test_split])
        dev.extend(index_list[_test_split:_test_split + _dev_split])

    assert sum(map(len, [train, test, dev])) == len(y)

    return train, dev, test
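
A quick usage sketch; the label array and seed are made up purely for illustration:

import numpy as np

y = np.array([0] * 10 + [1] * 10)
train, dev, test = _get_balanced_index(y, test_split=0.2, dev_split=0.1,
                                       random_state=np.random.RandomState(42))
# Each class contributes proportionally: 2 test and 1 dev index per class here.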
Example #5
def _create_splits(size: int,
                   importance_estimators: int,
                   random_generator: np.random.RandomState) -> List[np.ndarray]:
    """Create shuffled splits of the data indices."""
    # Create the splits on the indices, shuffled.
    ind = np.arange(size)
    # Shuffle the index array in place.
    random_generator.shuffle(ind)
    return np.array_split(ind, importance_estimators)
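
For example (size, number of estimators, and seed chosen arbitrarily):

import numpy as np

splits = _create_splits(size=10, importance_estimators=3,
                        random_generator=np.random.RandomState(7))
# Three roughly equal index chunks (sizes 4, 3, 3) covering 0..9 exactly once.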
Example #6
File: data.py Project: xiaoyubing/sigver
def split_train_test(exp_set: Tuple[np.ndarray, np.ndarray, np.ndarray],
                     num_gen_train: int,
                     num_gen_test: int,
                     rng: np.random.RandomState) -> Tuple[Tuple[np.ndarray, np.ndarray, np.ndarray],
                                                          Tuple[np.ndarray, np.ndarray, np.ndarray]]:
    """ Splits a set into training and testing. Both sets contains the same users. The
        training set contains only genuine signatures, while the testing set contains
        genuine signatures and forgeries. Note that the number of genuine signatures for
        training plus the number of genuine signatures for test must be smaller or equal to
        the total number of genuine signatures (to ensure no overlap)

    Parameters
    ----------
    exp_set: tuple of np.ndarray (x, y, yforg)
        The dataset
    num_gen_train: int
        The number of genuine signatures to be used for training
    num_gen_test: int
        The number of genuine signatures to be used for testing
    rng: np.random.RandomState
        The random number generator (for reproducibility)

    Returns
    -------
    tuple of np.ndarray (x, y, yforg)
        The training set

    tuple of np.ndarray (x, y, yforg)
        The testing set
    """
    x, y, yforg = exp_set
    users = np.unique(y)

    train_idx = []
    test_idx = []

    for user in users:
        user_genuines = np.flatnonzero((y == user) & (yforg == False))
        rng.shuffle(user_genuines)
        user_train_idx = user_genuines[0:num_gen_train]
        user_test_idx = user_genuines[-num_gen_test:]

        # Sanity check to ensure training samples are not used in test:
        assert len(set(user_train_idx).intersection(user_test_idx)) == 0

        train_idx += user_train_idx.tolist()
        test_idx += user_test_idx.tolist()

        user_forgeries = np.flatnonzero((y == user) & (yforg == True))
        test_idx += user_forgeries.tolist()

    exp_train = x[train_idx], y[train_idx], yforg[train_idx]
    exp_test = x[test_idx], y[test_idx], yforg[test_idx]

    return exp_train, exp_test
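
A self-contained usage sketch with synthetic data; the shapes, per-user counts, and seed are assumptions for illustration only:

import numpy as np

rng = np.random.RandomState(0)
x = rng.normal(size=(100, 8))                   # 10 users x 10 signatures
y = np.repeat(np.arange(10), 10)                # user labels
yforg = np.tile([0] * 7 + [1] * 3, 10)          # 7 genuine + 3 forgeries per user
(train_x, train_y, train_f), (test_x, test_y, test_f) = split_train_test(
    (x, y, yforg), num_gen_train=5, num_gen_test=2, rng=rng)
# Training keeps 5 genuine signatures per user; testing keeps 2 genuine plus all forgeries.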
Example #7
def buffered_shuffle(source: Iterable[Any], buffer_size: int,
                     rng: np.random.RandomState) -> Iterator[Any]:
  """Shuffles an iterable via buffered shuffling."""
  it = iter(source)
  buf = list(itertools.islice(it, buffer_size))
  rng.shuffle(buf)
  for i in it:
    r, buf[0] = buf[0], i
    swap = rng.randint(buffer_size)
    if swap < buffer_size - 1:
      buf[swap], buf[0] = buf[0], buf[swap]
    yield r
  for i in buf:
    yield i
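
A small usage sketch (buffer size and seed chosen arbitrarily):

import numpy as np

rng = np.random.RandomState(0)
shuffled = list(buffered_shuffle(range(10), buffer_size=4, rng=rng))
# Same 10 elements, approximately shuffled within a sliding window of 4.
print(shuffled)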
Example #8
def _get_split_index(
        y: np.ndarray,
        test_split: float,
        dev_split: float = 0,
        random_state: np.random.RandomState = None) -> Tuple[list, list, list]:
    index_list = np.arange(len(y))

    if random_state is not None:
        random_state.shuffle(index_list)
    else:
        np.random.shuffle(index_list)

    ln = len(index_list)

    _test_split = int(ln * test_split)
    _dev_split = int(ln * dev_split)

    train = index_list[_test_split + _dev_split:]
    test = index_list[:_test_split]
    dev = index_list[_test_split:_test_split + _dev_split]

    return train, dev, test
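
Usage mirrors the balanced variant in Example #4; a quick sketch with arbitrary values:

import numpy as np

y = np.arange(50)
train, dev, test = _get_split_index(y, test_split=0.2, dev_split=0.1,
                                    random_state=np.random.RandomState(1))
print(len(train), len(dev), len(test))  # 35 5 10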
Example #9
def simulate_data(covariates: int, scales: Sequence[int],
                  levels: Sequence[int], singletons: float,
                  state: np.random.RandomState) -> Tuple[Array, Array, Array]:
    """Simulate IDs and data matrices."""

    # simulate fixed effects
    ids = np.array(
        list(
            itertools.product(*(np.repeat(np.arange(l), s)
                                for s, l in zip(scales, levels)))))
    fe = np.array(
        list(
            itertools.product(*(np.repeat(state.normal(size=l), s)
                                for s, l in zip(scales, levels)))))

    # count dimensions
    N, M = ids.shape

    # shuffle the IDs
    for index in range(M):
        indices = np.arange(N)
        state.shuffle(indices)
        ids[indices, index] = ids.copy()[:, index]

    # shuffle and replace shares of the data with singletons
    indices = np.arange(N)
    for index in range(M):
        state.shuffle(indices)
        singleton_indices = indices[:int(singletons * N / M)]
        ids[indices, index] = ids.copy()[:, index]
        ids[singleton_indices, index] = -np.arange(singleton_indices.size)

    # simulate remaining data
    error = state.normal(size=(N, 1))
    X = state.normal(size=(N, covariates))
    y = X.sum(axis=1, keepdims=True) + fe.sum(axis=1, keepdims=True) + error
    return ids, X, y
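
A quick call sketch with arbitrary parameter values (Array in the signature is a project-specific type alias; plain numpy arrays come back here):

import numpy as np

state = np.random.RandomState(0)
ids, X, y = simulate_data(covariates=3, scales=[2, 2], levels=[3, 4],
                          singletons=0.1, state=state)
# ids: two columns of (partially singleton) group IDs, X: covariates, y: outcome.
print(ids.shape, X.shape, y.shape)  # (48, 2) (48, 3) (48, 1)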
Example #10
def generate_trajectories(
    policy,
    venv: VecEnv,
    sample_until: GenTrajTerminationFn,
    *,
    deterministic_policy: bool = False,
    rng: np.random.RandomState = np.random,
) -> Sequence[types.TrajectoryWithRew]:
    """Generate trajectory dictionaries from a policy and an environment.

    Args:
      policy: A function mapping observations to actions, a stable_baselines3 policy
          (BasePolicy), or an algorithm (BaseAlgorithm) trained on the gym environment.
      venv: The vectorized environments to interact with.
      sample_until: A function determining the termination condition.
          It takes a sequence of trajectories, and returns a bool.
          Most users will want to use one of `min_episodes` or `min_timesteps`.
      deterministic_policy: If True, asks policy to deterministically return
          action. Note the trajectories might still be non-deterministic if the
          environment has non-determinism!
      rng: used for shuffling trajectories.

    Returns:
      Sequence of trajectories, satisfying `sample_until`. Additional trajectories
      may be collected to avoid biasing process towards short episodes; the user
      should truncate if required.
    """
    if isinstance(policy, BaseAlgorithm):
        policy.set_env(venv)

    # Collect rollout tuples.
    trajectories = []
    # accumulator for incomplete trajectories
    trajectories_accum = TrajectoryAccumulator()
    obs = venv.reset()
    for env_idx, ob in enumerate(obs):
        # Seed with first obs only. Inside loop, we'll only add second obs from
        # each (s,a,r,s') tuple, under the same "obs" key again. That way we still
        # get all observations, but they're not duplicated into "next obs" and
        # "previous obs" (this matters for, e.g., Atari, where observations are
        # really big).
        trajectories_accum.add_step(dict(obs=ob), env_idx)

    # Now, we sample until `sample_until(trajectories)` is true.
    # If we just stopped then this would introduce a bias towards shorter episodes,
    # since longer episodes are more likely to still be active, i.e. in the process
    # of being sampled from. To avoid this, we continue sampling until all episodes
    # are complete.
    #
    # To start with, all environments are active.
    active = np.ones(venv.num_envs, dtype=bool)
    while np.any(active):
        if isinstance(policy, Callable):
            acts = policy(obs)
        else:
            acts, _ = policy.predict(obs, deterministic=deterministic_policy)
        obs, rews, dones, infos = venv.step(acts)

        # If an environment is inactive, i.e. the episode completed for that
        # environment after `sample_until(trajectories)` was true, then we do
        # *not* want to add any subsequent trajectories from it. We avoid this
        # by just making it never done.
        dones &= active

        new_trajs = trajectories_accum.add_steps_and_auto_finish(
            acts, obs, rews, dones, infos)
        trajectories.extend(new_trajs)

        if sample_until(trajectories):
            # Termination condition has been reached. Mark as inactive any environments
            # where a trajectory was completed this timestep.
            active &= ~dones

    # Note that we just drop partial trajectories. This is not ideal for some
    # algos; e.g. BC can probably benefit from partial trajectories, too.

    # Each trajectory is sampled i.i.d.; however, shorter episodes are added to
    # `trajectories` sooner. Shuffle to avoid bias in order. This is important
    # when callees end up truncating the number of trajectories or transitions.
    # It is also cheap, since we're just shuffling pointers.
    rng.shuffle(trajectories)

    # Sanity checks.
    for trajectory in trajectories:
        n_steps = len(trajectory.acts)
        # extra 1 for the end
        exp_obs = (n_steps + 1, ) + venv.observation_space.shape
        real_obs = trajectory.obs.shape
        assert real_obs == exp_obs, f"expected shape {exp_obs}, got {real_obs}"
        exp_act = (n_steps, ) + venv.action_space.shape
        real_act = trajectory.acts.shape
        assert real_act == exp_act, f"expected shape {exp_act}, got {real_act}"
        exp_rew = (n_steps, )
        real_rew = trajectory.rews.shape
        assert real_rew == exp_rew, f"expected shape {exp_rew}, got {real_rew}"

    return trajectories
Example #11
def _init_param_random(x: np.ndarray, k: int,
                       random_state: np.random.RandomState, repeats: int,
                       tol: float, iterations: int):
    """Initialize the parameters for Gaussian mixture model fitting using random
    cluster assignment and constrained EM iterations.

    Parameters
    ----------
    x : numpy.ndarray
        Sample matrix of shape (n, p) (n=number of observations, p=number of
        features).
    k : int
        Number of components in the Gaussian mixture model.
    random_state : numpy.random.RandomState
        Random number generator.
    repeats : int
        Number of times to repeat the constrained EM iterations.
    tol : float
        Tolerance to determine early stopping of the EM algorithm (based on
        convergence of the log-likelihood).
    iterations: int
        Number of iterations of the EM algorithm.

    Returns
    -------
    means : numpy.ndarray of shape (k, p)
        Initial mean vectors for the EM algorithm.
    covs : numpy.ndarray of shape (k, p, p)
        Initial covariance matrices for the EM algorithm.
    weights : numpy.ndarray of shape (k,)
        Initial weights for the EM algorithm.
    """
    # n = number of observations, p = number of features
    n, p = x.shape

    # Initialize arrays for the parameters
    means = np.empty(shape=(k, p), dtype=np.float64)
    covs = np.empty(shape=(k, p, p), dtype=np.float64)
    weights = np.empty(shape=(k,), dtype=np.float64)

    # Repeat the EM algorithm with different initial parameter values. At the
    # end, the parameter estimates yielding the highest log-likelihood will be
    # selected.
    log_likelihood = -np.inf
    for _ in range(repeats):
        # Randomly divide the data into k clusters
        clusters = np.tile(np.arange(k), reps=(int(n / k) + 1))[:n]
        random_state.shuffle(clusters)

        # Initialize the parameters using per-cluster estimates
        means_ = np.empty(shape=(k, p), dtype=np.float64)
        covs_ = np.empty(shape=(k, p, p), dtype=np.float64)
        weights_ = np.empty(shape=(k,), dtype=np.float64)
        for i in range(k):
            means_[i] = np.mean(x[clusters == i], axis=0)
            covs_[i] = np.cov(x[clusters == i], rowvar=False)
            weights_[i] = np.sum(clusters == i) / n

        # Run the EM algorithm a restricted number of times and see which gives
        # the best log-likelihood at the end.
        *params, log_likelihoods = \
            _gmm_fit_em(x, means_, covs_, weights_, tol, iterations)

        # Compare the new log-likelihood with the best log-likelihood so far and
        # update the parameters if necessary
        if log_likelihoods[-1] > log_likelihood:
            log_likelihood = log_likelihoods[-1]
            means, covs, weights = params

    return means, covs, weights
Example #12
File: utils.py Project: zzyunzhi/robogym
def place_objects_in_grid(
    object_bounding_boxes: np.ndarray,
    table_dimensions: Tuple[np.ndarray, np.ndarray, float],
    placement_area: PlacementArea,
    random_state: np.random.RandomState,
    max_num_trials: int = 5,
) -> Tuple[np.ndarray, bool]:
    """
    Place objects within rectangular boundaries by dividing the placement area into a grid of cells
    of equal size, and then randomly sampling cells for each object to be placed in.

    :param object_bounding_boxes: matrix of bounding boxes (num_objects, 2, 3) where [:, 0, :]
        contains the center position of the bounding box in Cartesian space relative to the body's
        frame of reference and where [:, 1, :] contains the half-width, half-height, and half-depth
        of the object.
    :param table_dimensions: Tuple (table_pos, table_size, table_height) defining dimension of
        the table where
            table_pos: position of the table.
            table_size: half-size of the table along (x, y, z).
            table_height: height of the table.
    :param placement_area: the placement area in which to place objects.
    :param random_state: numpy random state to use to shuffle placement positions
    :param max_num_trials: maximum number of trials to run (a trial will fail if there is overlap
        detected between any two placements; generally this shouldn't happen with this algorithm)
    :return: Tuple[np.ndarray, bool], where the array is of size (num_objects, 3) with columns set
        to the x, y, z coordinates of objects relative to the world frame, and the boolean
        indicates whether the placement is valid.
    """
    offset_x, offset_y, _ = placement_area.offset
    width, height, _ = placement_area.size
    table_pos, table_size, table_height = table_dimensions

    def _get_global_placement(placement: np.ndarray):
        return placement + [offset_x, offset_y, 0.0] - table_size + table_pos

    # 1. Determine the number of rows and columns of the grid, based on the largest object width
    # and height.
    total_object_area = 0.0
    n_objects = object_bounding_boxes.shape[0]
    max_obj_height = 0.0
    max_obj_width = 0.0
    for i in range(n_objects):
        # Bounding boxes are in half-sizes.
        obj_width = object_bounding_boxes[i, 1, 0] * 2
        obj_height = object_bounding_boxes[i, 1, 1] * 2

        max_obj_height = max(max_obj_height, obj_height)
        max_obj_width = max(max_obj_width, obj_width)

        object_area = obj_width * obj_height
        total_object_area += object_area

    n_columns = int(width // max_obj_width)
    n_rows = int(height // max_obj_height)
    n_cells = n_columns * n_rows

    if n_cells < n_objects:
        # Cannot find a valid placement via this method; give up. Checking this before
        # computing the cell sizes also avoids a division by zero when no cell fits.
        logging.warning(
            f"Unable to fit {n_objects} objects into placement area with {n_cells} cells"
        )
        return np.zeros(shape=(n_objects, 3)), False

    cell_width = width / n_columns
    cell_height = height / n_rows

    for trial_i in range(max_num_trials):
        placement_valid = True
        placements: List[Tuple[NumType, NumType, NumType]] = []

        # 2. Initialize an array with all valid cell coordinates.

        # Create an array of shape (n_rows, n_columns, 2) where each element contains its
        # (row, col) coordinate.
        coords = np.dstack(np.mgrid[0:n_rows, 0:n_columns])
        # Create a shuffled list where every entry is a valid (row, column) coordinate.
        coords = np.reshape(coords, (n_rows * n_columns, 2))
        random_state.shuffle(coords)
        coords = list(coords)

        # 3. Place each object into a randomly selected cell.
        for object_idx in range(n_objects):
            row, col = coords.pop()
            pos, size = object_bounding_boxes[object_idx]

            prop_x = cell_width * col + size[0] - pos[0]
            prop_y = cell_height * row + size[1] - pos[1]

            # Reference is to (xmin, ymin, zmin) of table.
            prop_z = object_bounding_boxes[object_idx, 1, -1] + 2 * table_size[-1]
            prop_z -= object_bounding_boxes[object_idx, 0, -1]

            placement = _get_global_placement(np.array([prop_x, prop_y, prop_z]))

            b1_x, b1_y = placement[:2]
            if not _is_valid_proposal(
                b1_x, b1_y, object_idx, object_bounding_boxes, placements
            ):
                placement_valid = False
                logging.warning(f"Trial {trial_i} failed on object {object_idx}")
                break

            placements.append(placement)

        if placement_valid:
            assert (
                len(placements) == n_objects
            ), "There should be a placement for every object"
            break

    return np.array(placements), placement_valid
Example #13
def _split_list(ids: List[Any], test_size: int,
                rns: np.random.RandomState) -> Tuple[List[Any], List[Any]]:
    rns.shuffle(ids)
    return ids[test_size:], ids[:test_size]
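
For example (note that the input list is shuffled in place):

import numpy as np

ids = list(range(100))
train_ids, test_ids = _split_list(ids, test_size=20, rns=np.random.RandomState(0))
print(len(train_ids), len(test_ids))  # 80 20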