예제 #1
0
파일: sub.py 프로젝트: CRSilkworth/wtrwrks
  def _pump(self, target, smaller_size_array, a_is_smaller):
    """Run the Sub tank backward, recovering 'a' and 'b' from a - b.

    Parameters
    ----------
    target: np.ndarray
      The result of a-b.
    smaller_size_array: np.ndarray
      Whichever of 'a' or 'b' has fewer elements.
    a_is_smaller: bool
      True when the saved smaller array is 'a'.

    Returns
    -------
    dict(
      a: np.ndarray
        The array something was subtracted from.
      b: np.ndarray
        The array that was subtracted.
    )

    """
    saved = ut.maybe_copy(smaller_size_array)

    # The array that was not saved is rebuilt from target = a - b.
    if not a_is_smaller:
      return {'a': np.array(target + smaller_size_array), 'b': saved}
    return {'a': saved, 'b': np.array(saved - target)}
예제 #2
0
    def _pump(self, target, smaller_size_array, a_is_smaller, missing_vals):
        """Run the Mul tank backward, recovering 'a' and 'b' from a * b.

        Parameters
        ----------
        target: np.ndarray
          The result of a*b
        smaller_size_array: np.ndarray
          Whichever of 'a' or 'b' has fewer elements.
        a_is_smaller: bool
          True when the saved smaller array is 'a'.
        missing_vals: np.ndarray
          Values written back wherever target is zero, since the division
          cannot recover them there.

        Returns
        -------
        dict(
          a: np.ndarray
            The first array that was multiplied
          b: np.ndarray
            The second array that was multiplied
        )

        """
        saved = ut.maybe_copy(smaller_size_array)

        # Positions with a zero product are filled from missing_vals instead
        # of from the division result.
        lost = target == 0

        if a_is_smaller:
            rebuilt = np.array(target / saved)
            rebuilt[lost] = missing_vals
            return {'a': saved, 'b': rebuilt}

        rebuilt = np.array(target / smaller_size_array)
        rebuilt[lost] = missing_vals
        return {'a': rebuilt, 'b': saved}
예제 #3
0
파일: add.py 프로젝트: CRSilkworth/wtrwrks
    def _pump(self, target, smaller_size_array, a_is_smaller):
        """Run the Add tank backward, recovering 'a' and 'b' from a + b.

        Parameters
        ----------
        target: np.ndarray
          The result of a+b.
        smaller_size_array: np.ndarray
          Whichever of 'a' or 'b' has fewer elements.
        a_is_smaller: bool
          True when the saved smaller array is 'a'.

        Returns
        -------
        dict(
          a: np.ndarray
            The first array that was added.
          b: np.ndarray
            The second array that was added.
        )

        """
        saved = ut.maybe_copy(smaller_size_array)

        # The array that was not saved is the target minus the saved one.
        if not a_is_smaller:
            return {'a': np.array(target - smaller_size_array), 'b': saved}
        return {'a': saved, 'b': np.array(target - saved)}
예제 #4
0
    def _pour(self, a, b):
        """Run the Mul tank forward, computing a * b.

        Parameters
        ----------
        a: np.ndarray
          The first array to be multiplied
        b: np.ndarray
          The second array to be multiplied

        Returns
        -------
        dict(
          target: np.ndarray
            The result of a*b
          smaller_size_array: np.ndarray
            A copy of whichever of 'a' or 'b' has fewer elements.
          a_is_smaller: bool
            Whether a has strictly fewer elements than b.
          missing_vals: np.ndarray
            The values of the array that was not saved, taken at the
            positions where the product is zero.
        )

        """
        # Anything that is not exactly an ndarray gets cast to one.
        a = a if type(a) is np.ndarray else np.array(a)
        b = b if type(b) is np.ndarray else np.array(b)

        # Only the array with fewer elements is kept; the other one is
        # reconstructed from the target when pumping.
        a_is_smaller = a.size < b.size
        smaller_size_array = ut.maybe_copy(a if a_is_smaller else b)

        target = np.array(a * b)

        # Where the product is zero the unsaved array cannot be recovered by
        # division, so its values at those positions are stored explicitly.
        unsaved = b if a_is_smaller else a
        missing_vals = unsaved[target == 0]

        return dict(
            target=target,
            smaller_size_array=smaller_size_array,
            a_is_smaller=a_is_smaller,
            missing_vals=missing_vals,
        )
예제 #5
0
    def _pump(self, target):
        """Pump direction: hand a copy of 'target' to every slot.

        Parameters
        ----------
        target: object
          The merged object.

        Returns
        -------
        dict(
          <slot key>: object
            One entry per key in self.slot_keys, each holding
            ut.maybe_copy(target).
        )

        """
        return {key: ut.maybe_copy(target) for key in self.slot_keys}
예제 #6
0
    def _pour(self, **kwargs):
        """Run the MergeEqual tank forward, merging equal inputs into one.

        Parameters
        ----------
        a0: object
          zeroth equal object
        a1: object
          first equal object,
        .
        .
        .

        Returns
        -------
        dict(
          target: object
            The merged object. Simply a copy of the 'a0' argument.
        )

        Raises
        ------
        ValueError
          If self.test_equal is set and some argument does not compare equal
          (element-wise, via np.all) to 'a0'.

        """
        if self.test_equal:
            reference = kwargs['a0']
            for value in kwargs.values():
                if np.all(value == reference):
                    continue
                raise ValueError(
                    "All arguments passed to merge_equal must be equal. Got "
                    + str(value) + ' and ' + str(reference))

        merged = ut.maybe_copy(kwargs['a0'])
        return {'target': merged}
예제 #7
0
    def _pump(self, **kwargs):
        """Run the Clone tank backward, collapsing the clones back into 'a'.

        Parameters
        ----------
        a0: object
          zeroth clone
        a1: object
          first clone,
        .
        .
        .

        Returns
        -------
        dict(
          a: object
            A copy of the keyword argument named by the first entry of
            self.slot_keys. The dict is empty when self.slot_keys is empty.
        )

        """
        # NOTE(review): kwargs is indexed with the first key of
        # self.slot_keys - confirm those keys match the names passed in.
        for first_key in self.slot_keys:
            return {'a': ut.maybe_copy(kwargs[first_key])}
        return {}
예제 #8
0
    def _pour(self, a):
        """Run the Clone tank forward, producing one copy of 'a' per tube.

        Parameters
        ----------
        a: object
          The object to be cloned.

        Returns
        -------
        dict(
          <tube key>: object
            One entry per key in self.tube_keys, each holding
            ut.maybe_copy(a).
        )

        """
        return {key: ut.maybe_copy(a) for key in self.tube_keys}
예제 #9
0
    def _pump(self, target, smaller_size_array, a_is_smaller, missing_vals,
              remainder):
        """Run the Div tank backward, recovering 'a' and 'b' from a / b.

        Parameters
        ----------
        target: np.ndarray
          The result of a/b
        smaller_size_array: np.ndarray
          Whichever of 'a' or 'b' has fewer elements.
        a_is_smaller: bool
          True when the saved smaller array is 'a'.
        missing_vals: np.ndarray
          Values of the unsaved array that cannot be recovered arithmetically.
          They are written back where target is zero (when 'a' was saved) or
          where target is nan or +/-inf (when 'b' was saved).
        remainder: np.ndarray
          The remainder of a/b. Only added back when target has dtype int32
          or int64.

        Returns
        -------
        dict(
          a: np.ndarray
            The numerator array.
          b: np.ndarray
            The denominator array
        )

        """
        if a_is_smaller:
            # 'a' was saved, so the remainder is not needed to rebuild it.
            a = ut.maybe_copy(smaller_size_array)
            b = np.array(a / target)
            b[(target == 0)] = missing_vals
        else:
            # Wrap in np.array so 'a' is always an ndarray. The product of
            # two 0-d arrays is a numpy scalar, which does not support the
            # item assignment done below.
            a = np.array(target * smaller_size_array)
            if target.dtype in (np.int32, np.int64):
                a = np.array(a + remainder)
            b = ut.maybe_copy(smaller_size_array)

            # 'b' was saved. A nan, negative infinity or positive infinity in
            # the target (i.e. zeros in b) marks where the values of 'a' have
            # to come from missing_vals.
            a[np.isposinf(target) | np.isneginf(target)
              | np.isnan(target)] = missing_vals
        return {'a': a, 'b': b}
예제 #10
0
파일: add.py 프로젝트: CRSilkworth/wtrwrks
    def _pour(self, a, b):
        """Run the Add tank forward, computing a + b.

        Parameters
        ----------
        a: np.ndarray
          The first array to add.
        b: np.ndarray
          The second array to add.

        Returns
        -------
        dict(
          target: np.ndarray
            The result of a+b.
          smaller_size_array: np.ndarray
            A copy of whichever of 'a' or 'b' has fewer elements.
          a_is_smaller: bool
            Whether 'a' has strictly fewer elements than 'b'.
        )

        """
        # Anything that is not exactly an ndarray gets cast to one.
        a = a if type(a) is np.ndarray else np.array(a)
        b = b if type(b) is np.ndarray else np.array(b)

        # Keep a copy of whichever input has fewer elements.
        a_is_smaller = a.size < b.size
        smaller_size_array = ut.maybe_copy(a if a_is_smaller else b)

        return dict(
            target=np.array(a + b),
            smaller_size_array=smaller_size_array,
            a_is_smaller=a_is_smaller,
        )
예제 #11
0
    def _pour(self, a, bs, selector):
        """Test membership of each element of 'a' in a selected entry of 'bs'.

        For every distinct value found in 'selector', the elements of 'a' at
        the positions where 'selector' equals that value are tested with
        np.isin against bs[value].

        Parameters
        ----------
        a: np.ndarray
          The values to test. Must have the same shape as 'selector'.
        bs: indexable
          Indexed by the values of 'selector'; each entry is passed to
          np.isin as the set of test elements.
        selector: np.ndarray
          Picks, per element of 'a', which entry of 'bs' to test against.

        Returns
        -------
        dict(
          target: np.ndarray of bools
            The membership results, same shape as 'a'.
          a, bs, selector:
            Copies (via ut.maybe_copy) of the inputs.
        )

        Raises
        ------
        ValueError
          If 'a' and 'selector' differ in shape.

        """
        shapes_match = a.shape == selector.shape
        if not shapes_match:
            raise ValueError(
                "Shape of a and selector must match. Got {} and {}".format(
                    a.shape, selector.shape))

        target = np.zeros(a.shape, dtype=bool)
        for choice in np.unique(selector):
            chosen = selector == choice
            target[chosen] = np.isin(a[chosen], bs[choice])

        return dict(
            target=target,
            a=ut.maybe_copy(a),
            bs=ut.maybe_copy(bs),
            selector=ut.maybe_copy(selector),
        )
예제 #12
0
    def _pour(self, a):
        """Run the Clone tank forward, producing two copies of 'a'.

        Parameters
        ----------
        a: object
          The object to be cloned into two.

        Returns
        -------
        dict(
          a: type of slot 'a'
            The first of the two copies.
          b: type of slot 'a'
            The second of the two copies.
        )

        """
        first = ut.maybe_copy(a)
        second = ut.maybe_copy(a)
        return dict(a=first, b=second)
예제 #13
0
파일: sub.py 프로젝트: chrisTripla/wtrwrks
    def _pour(self, a, b):
        """Run the Sub tank forward, computing a - b.

        Parameters
        ----------
        a: np.ndarray
          The array to subtract from.
        b: np.ndarray
          The array to subtract.

        Returns
        -------
        dict(
          target: np.ndarray
            The result of a-b.
          smaller_size_array: np.ndarray
            A copy of whichever of 'a' or 'b' has fewer elements.
          a_is_smaller: bool
            Whether 'a' has strictly fewer elements than 'b'.
        )

        """
        # Anything that is not exactly an ndarray gets cast to one.
        a = a if type(a) is np.ndarray else np.array(a)
        b = b if type(b) is np.ndarray else np.array(b)

        # Keep a copy of whichever input has fewer elements.
        a_is_smaller = a.size < b.size
        smaller_size_array = ut.maybe_copy(a if a_is_smaller else b)

        return dict(
            target=np.array(a - b),
            smaller_size_array=smaller_size_array,
            a_is_smaller=a_is_smaller,
        )
예제 #14
0
        def _pour(self, a, axis):
            """Reduce 'a' along 'axis' with np_func (forward direction).

            Parameters
            ----------
            a: np.ndarray
              The array to reduce.
            axis:
              The axis or axes handed to np_func. An empty value (e.g. an
              empty tuple) is passed on as None.

            Returns
            -------
            dict(
              target:
                Whatever np_func(a, axis=...) returns.
              a:
                A copy of 'a' via ut.maybe_copy.
              axis: np.ndarray
                The 'axis' argument converted with np.array.
            )

            """
            # NOTE(review): np_func is not defined in this block; presumably
            # it comes from an enclosing scope - confirm.
            axis_array = np.array(axis)

            # An axis with no elements is handed to np_func as None.
            input_axis = axis if axis_array.size else None

            reduced = np_func(a, axis=input_axis)
            return dict(target=reduced, a=ut.maybe_copy(a), axis=axis_array)
예제 #15
0
    def _pour(self, a, mask, replace_with):
        """Run the Replace tank forward, overwriting the masked values of 'a'.

        Parameters
        ----------
        a: np.ndarray
          The array which has values that are to be replaced.
        mask: np.ndarray of bools
          True marks the values of 'a' that get replaced. Also stored on the
          instance as self.mask.
        replace_with: np.ndarray
          The values written into the masked positions.

        Returns
        -------
        dict(
          target: np.ndarray
            A copy of 'a' with the masked values replaced.
          mask: np.ndarray of bools
            The mask that was passed in.
          replaced_vals: np.ndarray
            Built with af.empty_array_like(a); holds the original values of
            'a' at the masked positions.
          replace_with: np.ndarray
            The replace_with argument converted with np.array.
        )

        """
        self.mask = mask

        new_vals = np.array(replace_with)
        target = ut.maybe_copy(a)

        # Remember what is about to be overwritten before overwriting it.
        overwritten = af.empty_array_like(a)
        overwritten[mask] = target[mask]
        target[mask] = new_vals

        return dict(
            target=target,
            mask=mask,
            replaced_vals=overwritten,
            replace_with=new_vals,
        )
예제 #16
0
    def _pump(self, target, removed, num_tries, ends, random_seed, segment_ids,
              is_random_next):
        """Pump direction: undo the [CLS]/[SEP] tagging and overwrites.

        Overwritten tokens are put back from 'removed', every '[CLS]' and
        '[SEP]' entry is dropped, and the result is reshaped so its last
        dimension is three shorter than that of 'target'.

        Parameters
        ----------
        target: np.ndarray
          The array with the [SEP] and [CLS] tags and possibly overwritten
          tokens.
        removed: np.ndarray
          Same size as target. Entries different from '[NA]' are the tokens
          that were overwritten.
        num_tries: int
          Passed through unchanged.
        ends: np.ndarray of bools
          Passed through unchanged.
        random_seed: int
          Passed through unchanged.
        segment_ids: np.ndarray
          Not used by this method.
        is_random_next: np.ndarray
          Not used by this method.

        Returns
        -------
        dict(
          a: np.ndarray
            The restored array without the tags.
          num_tries: int
          ends: np.ndarray of bools
          random_seed: int
        )

        """
        restored = ut.maybe_copy(target)

        # Put back every token that was overwritten.
        overwritten = removed != '[NA]'
        restored[overwritten] = removed[overwritten]

        # Drop the tags. This flattens the array, so reshape afterwards.
        is_tag = np.isin(restored, ['[CLS]', '[SEP]'])
        restored = restored[~is_tag]
        untagged_shape = list(target.shape[:-1]) + [target.shape[-1] - 3]
        a = np.reshape(restored, untagged_shape)

        return dict(a=a, num_tries=num_tries, ends=ends,
                    random_seed=random_seed)
예제 #17
0
  def _pump(self, target, mask, replaced_vals, replace_with_shape):
    """Run the Replace tank backward, restoring 'a' and 'replace_with'.

    Parameters
    ----------
    target: np.ndarray of same type as 'a'
      The array with the necessary values replaced.
    mask: np.ndarray of bools
      True marks the values of 'a' that were replaced.
    replaced_vals: np.ndarray of same type as 'a'
      The values that were overwritten by the replace_with values.
    replace_with_shape: list of ints
      The original shape of the replace_with array. When the mask is all
      False its first element is used as the replace_with value itself.

    Returns
    -------
    dict(
      a: np.ndarray
        The array with its original values restored.
      mask: np.ndarray of bools
        The mask that was passed in.
      replace_with: np.ndarray
        The values that had been used to replace the values in 'a'.
    )

    """
    a = ut.maybe_copy(target)

    # Read the replacement values out of the target before restoring.
    replace_with = a[mask]
    a[mask] = replaced_vals[mask]

    if not mask.any():
      # Nothing was replaced, so the values cannot be read from the target.
      # In this case replace_with_shape carries the replace_with values.
      replace_with = replace_with_shape[0]
    else:
      # An empty shape means a scalar, which has exactly one element.
      num_elements = np.prod(replace_with_shape) if replace_with_shape else 1

      # A single-element replace_with is restored from the first replaced
      # value and reshaped to its former shape.
      if num_elements == 1:
        replace_with = replace_with.flatten()[0].reshape(replace_with_shape)

    a = a.astype(replaced_vals.dtype.type)
    return dict(a=a, mask=mask, replace_with=replace_with)
예제 #18
0
  def _pour(self, a, mask, replace_with):
    """Run the Replace tank forward, overwriting the masked values of 'a'.

    Parameters
    ----------
    a: np.ndarray
      The array which has values that are to be replaced.
    mask: np.ndarray of bools
      True marks the values of 'a' that get replaced.
    replace_with: np.ndarray
      The values written into the masked positions.

    Returns
    -------
    dict(
      target: np.ndarray of same type as 'a'
        A copy of 'a' with the masked values replaced.
      mask: np.ndarray of bools
        The mask that was passed in.
      replaced_vals: np.ndarray
        Built with af.empty_array_like(a); holds the original values of 'a'
        at the masked positions.
      replace_with_shape: tuple
        The shape of replace_with, or a one-element tuple holding the
        replace_with array itself when the mask is all False.
    )

    """
    new_vals = np.array(replace_with)
    target = ut.maybe_copy(a)

    # Remember what is about to be overwritten before overwriting it.
    replaced_vals = af.empty_array_like(a)
    replaced_vals[mask] = target[mask]
    target[mask] = new_vals

    # With an all-False mask nothing lands in the target, so the values
    # themselves are stored. Otherwise the shape is enough.
    replace_with_shape = new_vals.shape if mask.any() else (new_vals,)

    return dict(
      target=target,
      mask=mask,
      replaced_vals=replaced_vals,
      replace_with_shape=replace_with_shape,
    )
예제 #19
0
  def _pour(self, a):
    """Run the Shape tank forward, returning the shape of 'a'.

    Parameters
    ----------
    a: np.ndarray
      The array to get the shape of

    Returns
    -------
    dict(
      target: list of ints
        The shape of the array.
      a: np.ndarray
        A copy of 'a' via ut.maybe_copy.
    )

    """
    shape = list(a.shape)
    return dict(target=shape, a=ut.maybe_copy(a))
예제 #20
0
  def _pump(self, target, a):
    """Run the Shape tank backward; 'a' is handed back as a copy.

    Parameters
    ----------
    target: list of ints
      The shape of the array. Not used by this method.
    a: np.ndarray
      The array the shape was taken from.

    Returns
    -------
    dict(
      a: np.ndarray
        A copy of 'a' via ut.maybe_copy.
    )

    """
    restored = ut.maybe_copy(a)
    return dict(a=restored)
예제 #21
0
    def _pour(self, a, default_val):
        """Compute the effective length of the last dimension of 'a'.

        The effective length of a row is the position just past its last
        element that differs from default_val. It is zero when every element
        equals default_val.

        Parameters
        ----------
        a: np.ndarray
          The array to get the effective length of.
        default_val:
          The value to not count

        Returns
        -------
        dict(
          target: np.ndarray
            An array of the same shape as 'a' except missing the last
            dimension. The values are the effective lengths of the last
            dimension of a.
          a: np.ndarray
            A copy of 'a' via ut.maybe_copy.
          default_val:
            The value to not count
        )

        """
        # Work on one array view of 'a' throughout, so that array-likes such
        # as lists are handled the same way as ndarrays.
        a_array = np.array(a)
        is_default = (a_array == default_val)
        all_default = np.all(is_default, axis=-1)

        # On the reversed last axis, argmax finds the first non-default
        # element, i.e. how many trailing defaults there are.
        num_trailing = np.argmax(~is_default[..., ::-1], axis=-1)

        # Wrap in np.array because a 1-D input reduces to a numpy scalar,
        # which does not support the masked assignment below.
        lengths = np.array(a_array.shape[-1] - num_trailing)

        # argmax returns 0 for rows without any non-default element, so
        # those rows have to be zeroed explicitly.
        lengths[all_default] = 0

        return {
            'target': lengths,
            'a': ut.maybe_copy(a),
            'default_val': default_val
        }
예제 #22
0
    def _pump(self, target, a, axis):
        """Pump direction: hand back a copy of 'a' together with 'axis'.

        Parameters
        ----------
        target:
          The size of 'a' along 'axis'. Not used by this method.
        a: np.ndarray
          The array the size was taken from.
        axis: int
          The axis the size was taken from.

        Returns
        -------
        dict(
          a: np.ndarray
            A copy of 'a' via ut.maybe_copy.
          axis: int
            The axis that was passed in.
        )

        """
        restored = ut.maybe_copy(a)
        return dict(a=restored, axis=axis)
예제 #23
0
    def _pour(self, a, axis):
        """Pour direction: return the size of 'a' along 'axis'.

        Parameters
        ----------
        a: np.ndarray
          The array to get the size from.
        axis: int
          The axis to get the size from.

        Returns
        -------
        dict(
          target:
            a.shape[axis].
          a: np.ndarray
            A copy of 'a' via ut.maybe_copy.
          axis: int
            The axis that was passed in.
        )

        """
        dim_size = a.shape[axis]
        return dict(target=dim_size, a=ut.maybe_copy(a), axis=axis)
예제 #24
0
  def _pump(self, target, a, key):
    """Pump direction: hand back a copy of 'a' together with 'key'.

    Parameters
    ----------
    target: object
      The value returned from the __getitem__ call to 'a'. Not used by this
      method.
    a: object
      The object the item was taken from.
    key: hashable
      The key that was passed to the getitem.

    Returns
    -------
    dict(
      a: object
        A copy of 'a' via ut.maybe_copy.
      key: hashable
        The key that was passed in.
    )

    """
    restored = ut.maybe_copy(a)
    return dict(a=restored, key=key)
예제 #25
0
  def _pour(self, a, key):
    """Pour direction: look up a[key].

    Parameters
    ----------
    a: object
      The object to getitem from.
    key: hashable
      The key to pass to the getitem

    Returns
    -------
    dict(
      target: object
        The value returned from the __getitem__ call to 'a'.
      a: object
        A copy of 'a' via ut.maybe_copy.
    )

    """
    # NOTE(review): the previous docstring also listed 'key' among the
    # outputs, but this method does not return it - confirm whether 'key' is
    # supplied to the pump direction some other way.
    item = a[key]
    return dict(target=item, a=ut.maybe_copy(a))
예제 #26
0
    def _pump(self, target, mask, replaced_vals, replace_with):
        """Run the Replace tank backward, restoring the values of 'a'.

        Parameters
        ----------
        target: np.ndarray of same type as 'a'
          The array with the necessary values replaced.
        mask: np.ndarray of bools
          True marks the values of 'a' that were replaced.
        replaced_vals: np.ndarray of same type as 'a'
          The values that were overwritten by the replace_with values.
        replace_with: np.ndarray
          The values that were used as replacements. Passed through
          unchanged.

        Returns
        -------
        dict(
          a: np.ndarray
            The array with its original values restored, cast to the dtype
            of replaced_vals.
          mask: np.ndarray of bools
            The mask that was passed in.
          replace_with: np.ndarray
            The replace_with that was passed in.
        )

        """
        a = ut.maybe_copy(target)
        saved = np.array(replaced_vals)

        # Widen 'a' first when the saved values need more bytes per item, so
        # that nothing gets cut off when they are written back.
        if saved.dtype.itemsize > a.dtype.itemsize:
            a = a.astype(saved.dtype)

        # A single saved value is written to all masked positions; otherwise
        # the saved values are taken at the masked positions.
        a[mask] = saved if saved.size == 1 else saved[mask]

        a = a.astype(saved.dtype.type)
        return dict(a=a, mask=mask, replace_with=replace_with)
예제 #27
0
    def _pump(self, target, a, default_val):
        """Pump direction: hand back a copy of 'a' and the default value.

        Parameters
        ----------
        target: np.ndarray
          The effective lengths of the last dimension of 'a'. Not used by
          this method.
        a: np.ndarray
          The array the effective lengths were taken from.
        default_val:
          The value to not count

        Returns
        -------
        dict(
          a: np.ndarray
            A copy of 'a' via ut.maybe_copy.
          default_val:
            The default_val that was passed in.
        )

        """
        restored = ut.maybe_copy(a)
        return dict(a=restored, default_val=default_val)
예제 #28
0
  def _pour(self, strings, tokenizer, max_len, detokenizer):
    """Run the Tokenize tank forward, splitting every string into tokens.

    Parameters
    ----------
    strings: np.ndarray of strings
      The array of strings to tokenize.
    tokenizer: func
      Function which converts a string into a list of strings.
    max_len: int
      The maximum number of tokens. Defines the size of the added dimension.
    detokenizer: func
      Function which takes in a list of tokens and returns a string. Not
      strictly necessary but it makes the tube 'diff' much smaller if it's
      close to the real method of detokenizing.

    Returns
    -------
    dict(
      target: np.ndarray
        The array of tokenized strings. Has the shape of 'strings' plus one
        trailing dimension of size max_len.
      tokenizer: func
        The tokenizer that was passed in.
      detokenizer: func
        The detokenizer that was passed in.
      diff: np.ndarray of strings
        The diff strings, as produced by di.get_diff_string, between the
        detokenized tokens and the original string. Same shape as 'strings'.
    )

    """
    strings = np.array(strings)

    # An empty array has nothing to tokenize and is passed through as is.
    if not strings.size:
      return {'target': ut.maybe_copy(strings), 'diff': ut.maybe_copy(strings), 'tokenizer': tokenizer, 'detokenizer': detokenizer}

    all_tokens = []
    all_diffs = []

    for string in strings.flatten():
      # Tokenize the string and bring the tokens to exactly max_len entries:
      # pad with '' when there are too few, truncate when there are too many.
      tokens = np.array(tokenizer(string))
      if tokens.size < max_len:
        num = max_len - tokens.size
        tokens = np.concatenate([tokens, np.full([num], '')])
      else:
        tokens = tokens[:max_len]

      all_tokens.append(tokens)

      # Detokenize the tokens and record how the result differs from the
      # original string.
      processed = detokenizer(tokens)
      diff = di.get_diff_string(processed, string)

      # 'U' is numpy's unicode string dtype. The former np.unicode alias no
      # longer exists in current numpy releases.
      all_diffs.append(np.array(diff, dtype='U'))

    # Stack the token rows and restore the shape of the strings array, with
    # one additional dimension of size max_len.
    token_array = np.stack(all_tokens)
    target = np.reshape(token_array, list(strings.shape) + [max_len])

    # The diffs get the same shape as the strings array.
    diff_array = np.stack(all_diffs)
    diff = np.reshape(diff_array, strings.shape)

    return {'target': target, 'diff': diff, 'tokenizer': tokenizer, 'detokenizer': detokenizer}
예제 #29
0
    def _pour(self, a, b):
        """Run the Div tank forward, computing a / b.

        Parameters
        ----------
        a: np.ndarray
          The numerator array.
        b: np.ndarray
          The denominator array

        Returns
        -------
        dict(
          target: np.ndarray
            The result of a/b
          smaller_size_array: np.ndarray
            A copy of whichever of 'a' or 'b' has fewer elements.
          a_is_smaller: bool
            Whether a has strictly fewer elements than b.
          missing_vals: np.ndarray
            Values of the unsaved array that cannot be recovered from the
            target: those of 'b' where the target is zero, or those of 'a'
            where the target is nan or +/-inf.
          remainder: np.ndarray
            np.remainder(a, b) when both dtypes are int32 or int64, otherwise
            an empty array with the dtype of the target.
        )

        Raises
        ------
        ZeroDivisionError
          If both arrays have dtype int32 or int64 and 'b' contains a zero.

        """
        # Anything that is not exactly an ndarray gets cast to one.
        a = a if type(a) is np.ndarray else np.array(a)
        b = b if type(b) is np.ndarray else np.array(b)

        # Only the array with fewer elements is saved.
        a_is_smaller = a.size < b.size
        smaller_size_array = ut.maybe_copy(a if a_is_smaller else b)

        target = np.array(a / b)

        # Save the values of the unsaved array that the division wiped out.
        if a_is_smaller:
            missing_vals = b[(target == 0)]
        else:
            undefined = (np.isposinf(target) | np.isneginf(target)
                         | np.isnan(target))
            missing_vals = a[undefined]

        int_types = (np.int32, np.int64)
        both_ints = a.dtype in int_types and b.dtype in int_types
        if not both_ints:
            remainder = np.array([], dtype=target.dtype)
        elif (b == 0).any():
            # Integer division by zero is rejected outright.
            raise ZeroDivisionError(
                "Integer division by zero is not supported.")
        else:
            remainder = np.array(np.remainder(a, b))

        return dict(
            target=target,
            smaller_size_array=smaller_size_array,
            a_is_smaller=a_is_smaller,
            missing_vals=missing_vals,
            remainder=remainder,
        )
예제 #30
0
    def _pour(self, strings, ids, tokenizer,
              detokenizer=lambda a: ' '.join(a)):
        """Run the FlatTokenize tank forward, giving one flat token array.

        Parameters
        ----------
        strings: np.ndarray of strings
          The array of strings to tokenize.
        ids: np.ndarray
          An array of ids, one per element of 'strings' (paired up in
          flattened order). Needed to reconstruct the strings, since all
          information about axes is lost when flattening.
        tokenizer: func
          Function which converts a string into a list of strings.
        detokenizer: func
          Function which takes in a list of tokens and returns a string. Not
          strictly necessary but it makes the tube 'diff' much smaller if
          it's close to the real method of detokenizing. Joins the tokens
          with single spaces by default.

        Returns
        -------
        dict(
          target: np.ndarray
            A one dimensional array of tokens.
          tokenizer: func
            The tokenizer that was passed in.
          detokenizer: func
            The detokenizer that was passed in.
          diff: np.ndarray of strings
            For every token, the diff string (from di.get_diff_string)
            between the detokenized tokens and the string it came from.
          shape: tuple of ints
            The shape of the inputted strings array.
          ids: np.ndarray
            For every token, the id of the string it came from.
        )

        For an empty 'strings' array, 'target' and 'diff' are copies of that
        empty array and 'ids' is a copy of the given ids.

        """
        strings = np.array(strings)
        # The ids get flattened below, so make sure they are an array too.
        ids = np.array(ids)

        # Guard for the empty array case. All keys of the regular return
        # value are provided here as well.
        if not strings.size:
            return {
                'target': ut.maybe_copy(strings),
                'diff': ut.maybe_copy(strings),
                'tokenizer': tokenizer,
                'detokenizer': detokenizer,
                'ids': ut.maybe_copy(ids),
                'shape': strings.shape
            }

        all_tokens = []
        all_diffs = []

        # Go through each element of the string array and its matching id.
        r_ids = []
        for string_id, string in zip(ids.flatten(), strings.flatten()):
            # Tokenize the string and add it to the long list of all tokens.
            tokens = tokenizer(string)
            all_tokens.extend(tokens)

            # Repeat the string id once per token so that the ids always
            # line up with the tokens.
            r_ids.extend([string_id] * len(tokens))

            # Find the string diff after detokenizing the tokens.
            processed = detokenizer(tokens)
            diff = di.get_diff_string(processed, string)

            # Repeat the diff once per token as well, for the same reason.
            all_diffs.extend([diff] * len(tokens))

        target = np.array(all_tokens).astype(strings.dtype)
        diff = np.array(all_diffs).astype(strings.dtype)
        r_ids = np.array(r_ids)

        return {
            'target': target,
            'diff': diff,
            'tokenizer': tokenizer,
            'detokenizer': detokenizer,
            'ids': r_ids,
            'shape': strings.shape
        }