예제 #1
0
    def define_waterwork(self, array=empty, return_tubes=None):
        """Add the tanks that turn an array of datetimes into normalized numbers.

        The tanks are created in this order: NaT detection, NaT replacement,
        datetime-to-number conversion and, depending on ``self.norm_mode``, a
        subtraction followed by a division.

        Parameters
        ----------
        array : np.ndarray or empty
          The array to be transformed.
        return_tubes : list or None
          Keys that are each looked up with ``maybe_get_tube`` on the waterwork
          owning the final tube. Nothing is looked up when this is None.

        Returns
        -------
        list or None
          One ``maybe_get_tube`` result per key of ``return_tubes``, in the same
          order, or None when ``return_tubes`` is None.

        """
        # Detect the NaT's and overwrite them.
        nat_tubes, _ = td.isnat(array)
        filled, _ = td.replace(nat_tubes['a'], nat_tubes['target'])

        filled['replaced_vals'].set_name('replaced_vals')
        filled['mask'].set_name('nats')

        # Express the datetimes as numbers.
        num_tubes, _ = td.datetime_to_num(
            filled['target'],
            self.zero_datetime,
            self.num_units,
            self.time_unit,
            name='dtn')
        num_tubes['diff'].set_name('diff')

        # Optional normalization: shift first, then scale.
        mode = self.norm_mode
        if mode == 'mean_std':
            num_tubes, _ = num_tubes['target'] - self.mean
            num_tubes, _ = num_tubes['target'] / self.std
        elif mode == 'min_max':
            num_tubes, _ = num_tubes['target'] - self.min
            num_tubes, _ = num_tubes['target'] / (self.max - self.min)

        num_tubes['target'].set_name('nums')

        if return_tubes is None:
            return None

        waterwork = num_tubes['target'].waterwork
        return [waterwork.maybe_get_tube(key) for key in return_tubes]
예제 #2
0
    def define_waterwork(self, array=empty, return_tubes=None):
        """Add the tanks that fill in NaN's and normalize a numeric array.

        The tanks are created in this order: NaN detection, NaN replacement
        and, depending on ``self.norm_mode``, a subtraction followed by a
        division.

        Parameters
        ----------
        array : np.ndarray or empty
          The array to be transformed.
        return_tubes : list or None
          Keys that are each looked up with ``maybe_get_tube`` on the waterwork
          owning the final tube. Nothing is looked up when this is None.

        Returns
        -------
        list or None
          One ``maybe_get_tube`` result per key of ``return_tubes``, in the same
          order, or None when ``return_tubes`` is None.

        """
        # Detect the NaN's and overwrite them.
        nan_tubes, _ = td.isnan(array)
        values, _ = td.replace(nan_tubes['a'], nan_tubes['target'])

        values['replaced_vals'].set_name('replaced_vals')
        values['mask'].set_name('nans')

        # Optional normalization: shift first, then scale.
        mode = self.norm_mode
        if mode == 'mean_std':
            values, _ = values['target'] - self.mean
            values, _ = values['target'] / self.std
        elif mode == 'min_max':
            values, _ = values['target'] - self.min
            values, _ = values['target'] / (self.max - self.min)

        values['target'].set_name('nums')

        if return_tubes is None:
            return None

        waterwork = values['target'].waterwork
        return [waterwork.maybe_get_tube(key) for key in return_tubes]
예제 #3
0
    def define_waterwork(self, array=empty, return_tubes=None, prefix=''):
        """Add the tanks that turn (string, language) rows into word indices.

        ``array`` is split along axis 1 into two pieces. The first piece is
        passed on as the strings and the second as the selector of the
        ``multi_*`` tanks. The strings are tokenized, optionally lower cased,
        half-widthed and lemmatized, tokens not found by ``multi_isin`` are
        replaced with ``'[UNK]'`` and the result is converted to indices.

        Parameters
        ----------
        array : np.ndarray or empty
          The array to be transformed.
        return_tubes : list or None
          Keys that are each looked up with ``maybe_get_tube`` on the waterwork
          owning the final tube. Nothing is looked up when this is None.
        prefix : str
          Passed to ``self._pre`` together with a tube or slot name to build
          the keys the plug functions use to read from their input dictionary.

        Returns
        -------
        list or None
          One ``maybe_get_tube`` result per key of ``return_tubes``, in the same
          order, or None when ``return_tubes`` is None.

        """
        splits, splits_slots = td.split(array, [1], axis=1)
        splits_slots['a'].unplug()
        splits_slots['a'].set_name('array')

        splits, _ = td.iter_list(splits['target'], 2)

        # Tokenize the full strings into words
        tokens, tokens_slots = td.multi_tokenize(
            strings=splits[0],
            selector=splits[1],
            tokenizers=self.word_tokenizers,
            detokenizers=self.word_detokenizers,
            max_len=self.max_sent_len)

        # Set the names of various tubes and slots to make it easier to
        # reference them further downstream.
        tokens['diff'].set_name('tokenize_diff')
        tokens_slots['max_len'].set_name('max_sent_len')
        tokens_slots['tokenizers'].set_name('tokenizers')
        tokens_slots['detokenizers'].set_name('detokenizers')

        # Lower case the strings, and name the diff tube of the tank
        # 'lower_case_diff' for easier referencing.
        if self.lower_case:
            tokens, tokens_slots = td.lower_case(tokens['target'])
            tokens['diff'].set_name('lower_case_diff')

        # Half width the strings, and name the diff tube of the tank
        # 'half_width_diff' for easier referencing.
        if self.half_width:
            tokens, tokens_slots = td.half_width(tokens['target'])
            tokens['diff'].set_name('half_width_diff')

        # Lemmatize the strings, and name the diff tube of the tank
        # 'lemmatize_diff' for easier referencing.
        if self.lemmatize:
            tokens, tokens_slots = td.lemmatize(tokens['target'])
            tokens['diff'].set_name('lemmatize_diff')
            tokens_slots['lemmatizer'].set_name('lemmatizer')

        languages, _ = td.clone(splits[1])
        languages['b'].set_name('languages')

        # Reshape and tile the languages so they can be used as the selector
        # of the multi_isin and multi_cat_to_index tanks below.
        dim_size, _ = td.dim_size(languages['a'], axis=0)
        shape, _ = td.tube_list(dim_size['target'], 1, 1)
        tile, _ = td.reshape(
            languages['a'],
            shape['target'],
            tube_plugs={
                'old_shape':
                lambda z: (z[self._pre('languages', prefix)].shape[0], 1)
            })
        tile, _ = td.tile(
            tile['target'], (1, 1, self.max_sent_len),
            tube_plugs={
                'old_shape':
                lambda z: (z[self._pre('languages', prefix)].shape[0], 1, 1)
            })

        # Find all the strings which are not in the list of known words and
        # replace them with the 'unknown token'. The empty string is appended
        # to every word list before the membership test.
        # dict.items() is used because it exists on both Python 2 and 3.
        maps_with_empty_strings = {
            k: v + ['']
            for k, v in self.index_to_word_maps.items()
        }
        isin, isin_slots = td.multi_isin(tokens['target'],
                                         maps_with_empty_strings,
                                         tile['target'])

        mask, _ = td.logical_not(isin['target'])
        tokens, _ = td.replace(
            isin['a'],
            mask['target'],
            '[UNK]',
            tube_plugs={
                'mask': lambda z: z[self._pre('indices', prefix)] == 0
            })

        # Keep track of the values that were overwritten with the 'unknown
        # token'.
        tokens['replaced_vals'].set_name('missing_vals')
        isin_slots['bs'].set_name('index_to_word_maps')

        # Convert the tokens into indices. The plug functions look their
        # inputs up with the same prefixed keys as the other plugs of this
        # method.
        indices, indices_slots = td.multi_cat_to_index(
            tokens['target'],
            tile['target'],
            self.word_to_index_maps,
            tube_plugs={
                'selector':
                lambda z: np.tile(
                    np.reshape(
                        z[self._pre('languages', prefix)],
                        (z[self._pre('languages', prefix)].shape[0], 1, 1)),
                    (1, 1, self.max_sent_len)),
                # 'U' is the unicode dtype code; it is used instead of the
                # np.unicode alias, which recent NumPy releases do not have.
                'missing_vals':
                lambda z: np.full(
                    z[self._pre('indices', prefix)].shape, '', dtype='U'),
                'input_dtype':
                self.input_dtype
            })

        # Set the names of the slots and tubes of this tank for easier
        # referencing.
        indices['target'].set_name('indices')
        indices_slots['cat_to_index_maps'].set_name('word_to_index_maps')

        if return_tubes is None:
            return None

        ww = indices['target'].waterwork
        return [ww.maybe_get_tube(r_tube_key) for r_tube_key in return_tubes]
예제 #4
0
    def define_waterwork(self, array=empty, return_tubes=None, prefix=''):
        """Add the tanks that fill in NaN's and normalize a numeric array.

        The tanks are created in this order: NaN detection, NaN replacement
        and, depending on ``self.norm_mode``, a ``td.sub`` followed by a
        ``td.div``.

        Parameters
        ----------
        array : np.ndarray or empty
          The array to be transformed.
        return_tubes : list or None
          Keys that are each looked up with ``maybe_get_tube`` on the waterwork
          owning the final tube. Nothing is looked up when this is None.
        prefix : str
          Passed to ``self._pre`` together with ``'array'`` to build the key
          the ``replace_with`` plug uses to read from its input dictionary.

        Returns
        -------
        list or None
          One ``maybe_get_tube`` result per key of ``return_tubes``, in the same
          order, or None when ``return_tubes`` is None.

        """
        # Detect the NaN's; the input slot gets a name so that the plug below
        # can look the array up.
        nan_tubes, nan_slots = td.isnan(array)
        nan_slots['a'].set_name('array')

        # The value written over the NaN's comes from self.fill_nan_func.
        fill_slot_plugs = {
            'replace_with':
            lambda z: self.fill_nan_func(z[self._pre('array', prefix)])
        }
        fill_tube_plugs = {
            'replace_with': np.array([]),
            'replaced_vals': np.array(np.nan)
        }
        values, _ = td.replace(
            nan_tubes['a'],
            nan_tubes['target'],
            slot_plugs=fill_slot_plugs,
            tube_plugs=fill_tube_plugs)

        values['replaced_vals'].set_name('replaced_vals')
        values['mask'].set_name('nans')

        # Pick the (offset, scale) pair for the requested normalization.
        mode = self.norm_mode
        norm = None
        if mode == 'mean_std':
            norm = (self.mean, self.std)
        elif mode == 'min_max':
            norm = (self.min, self.max - self.min)

        # Shift first, then scale.
        if norm is not None:
            offset, scale = norm
            values, _ = td.sub(
                values['target'],
                offset,
                tube_plugs={
                    'a_is_smaller': False,
                    'smaller_size_array': offset
                })
            values, _ = td.div(
                values['target'],
                scale,
                tube_plugs={
                    'a_is_smaller': False,
                    'smaller_size_array': scale,
                    'missing_vals': np.array([]),
                    'remainder': np.array([])
                })

        values['target'].set_name('nums')

        if return_tubes is None:
            return None

        waterwork = values['target'].waterwork
        return [waterwork.maybe_get_tube(key) for key in return_tubes]
예제 #5
0
    def define_waterwork(self, array=empty, return_tubes=None, prefix=''):
        """Add the tanks that turn an array of strings into word indices.

        The strings are tokenized, optionally lower cased and half-widthed,
        tokens not found in ``self.index_to_word`` (plus the empty string) are
        replaced with ``self.index_to_word[0]`` and the result is converted to
        indices with ``self.word_to_index``.

        Parameters
        ----------
        array : np.ndarray or empty
          The array to be transformed.
        return_tubes : list or None
          Keys that are each looked up with ``maybe_get_tube`` on the waterwork
          owning the final tube. Nothing is looked up when this is None.
        prefix : str
          Passed to ``self._pre`` together with ``'indices'`` to build the key
          the plug functions use to read from their input dictionary.

        Returns
        -------
        list or None
          One ``maybe_get_tube`` result per key of ``return_tubes``, in the same
          order, or None when ``return_tubes`` is None.

        """
        # Tokenize the full strings into words
        tokens, tokens_slots = td.tokenize(strings=array,
                                           tokenizer=self.word_tokenizer,
                                           detokenizer=self.word_detokenizer,
                                           max_len=self.max_sent_len)
        tokens_slots['strings'].unplug()

        # Set the names of various tubes and slots to make it easier to
        # reference them further downstream.
        tokens['diff'].set_name('tokenize_diff')
        tokens_slots['max_len'].set_name('max_sent_len')
        tokens_slots['strings'].set_name('array')
        tokens_slots['tokenizer'].set_name('tokenizer')
        tokens_slots['detokenizer'].set_name('detokenizer')

        # Lower case the strings, and name the diff tube of the tank
        # 'lower_case_diff' for easier referencing.
        if self.lower_case:
            tokens, tokens_slots = td.lower_case(tokens['target'])
            tokens['diff'].set_name('lower_case_diff')

        # Half width the strings, and name the diff tube of the tank
        # 'half_width_diff' for easier referencing.
        if self.half_width:
            tokens, tokens_slots = td.half_width(tokens['target'])
            tokens['diff'].set_name('half_width_diff')

        # Find all the strings which are not in the list of known words and
        # replace them with the 'unknown token', i.e. the first known word.
        isin, isin_slots = td.isin(tokens['target'], self.index_to_word + [''])
        mask, _ = td.logical_not(isin['target'])
        tokens, _ = td.replace(
            isin['a'],
            mask['target'],
            self.index_to_word[0],
            tube_plugs={
                'mask': lambda z: z[self._pre('indices', prefix)] == 0
            })

        # Keep track of the values that were overwritten with the 'unknown
        # token'.
        tokens['replaced_vals'].set_name('missing_vals')
        isin_slots['b'].set_name('index_to_word')

        # Convert the tokens into indices.
        indices, indices_slots = td.cat_to_index(
            tokens['target'],
            self.word_to_index,
            tube_plugs={
                # 'U' is the unicode dtype code; it is used instead of the
                # np.unicode alias, which recent NumPy releases do not have.
                'missing_vals':
                lambda z: np.full(z[self._pre('indices', prefix)].shape,
                                  '',
                                  dtype='U'),
                'input_dtype':
                self.input_dtype
            })

        # Set the names of the slots and tubes of this tank for easier
        # referencing.
        indices['target'].set_name('indices')
        indices_slots['cat_to_index_map'].set_name('word_to_index')

        if return_tubes is None:
            return None

        ww = indices['target'].waterwork
        return [ww.maybe_get_tube(r_tube_key) for r_tube_key in return_tubes]
예제 #6
0
  def define_waterwork(self, array=empty, return_tubes=None, prefix=''):
    """Add the tanks that turn an array of datetimes into normalized numbers.

    The tanks are created in this order: NaT detection, NaT replacement,
    datetime-to-number conversion and, depending on ``self.norm_mode``, a
    ``td.sub`` followed by a ``td.div``.

    Parameters
    ----------
    array : np.ndarray or empty
      The array to be transformed.
    return_tubes : list or None
      Keys that are each looked up with ``maybe_get_tube`` on the waterwork
      owning the final tube. Nothing is looked up when this is None.
    prefix : str
      Passed to ``self._pre`` together with ``'array'`` to build the key the
      ``replace_with`` plug uses to read from its input dictionary.

    Returns
    -------
    list or None
      One ``maybe_get_tube`` result per key of ``return_tubes``, in the same
      order, or None when ``return_tubes`` is None.

    """
    # Detect the NaT's; the input slot gets a name so that the plug below can
    # look the array up.
    nat_tubes, nat_slots = td.isnat(array)
    nat_slots['a'].set_name('array')

    # The value written over the NaT's comes from self.fill_nat_func.
    fill_slot_plugs = {
      'replace_with': lambda z: self.fill_nat_func(z[self._pre('array', prefix)])
    }
    fill_tube_plugs = {
      'replace_with': np.array([]),
      'replaced_vals': np.array([None], dtype=np.datetime64)
    }
    filled, _ = td.replace(
      nat_tubes['a'], nat_tubes['target'],
      slot_plugs=fill_slot_plugs,
      tube_plugs=fill_tube_plugs
    )

    filled['replaced_vals'].set_name('replaced_vals')
    filled['mask'].set_name('nats')

    # Express the datetimes as numbers.
    values, _ = td.datetime_to_num(
      filled['target'],
      self.zero_datetime,
      self.num_units,
      self.time_unit,
      name='dtn'
    )
    values['diff'].set_name('diff')

    # Pick the (offset, scale) pair for the requested normalization.
    mode = self.norm_mode
    norm = None
    if mode == 'mean_std':
      norm = (self.mean, self.std)
    elif mode == 'min_max':
      norm = (self.min, self.max - self.min)

    # Shift first, then scale.
    if norm is not None:
      offset, scale = norm
      values, _ = td.sub(
        values['target'], offset,
        tube_plugs={'a_is_smaller': False, 'smaller_size_array': offset}
      )
      values, _ = td.div(
        values['target'], scale,
        tube_plugs={
          'a_is_smaller': False,
          'smaller_size_array': scale,
          'missing_vals': np.array([]),
          'remainder': np.array([])
        }
      )

    values['target'].set_name('nums')

    if return_tubes is None:
      return None

    waterwork = values['target'].waterwork
    return [waterwork.maybe_get_tube(key) for key in return_tubes]
예제 #7
0
  def define_waterwork(self, array=empty, return_tubes=None, prefix=''):
    """Add the tanks that turn a (time, amp) array into phase decomposed times.

    ``array`` is split along axis 1 into two pieces. The second piece is only
    named ``'amps'``. The first piece is reshaped, cast to ``np.datetime64``,
    has its NaT's replaced, is converted to numbers, divided by the number of
    ``(self.num_units, self.time_unit)`` steps between ``self.zero_datetime``
    and ``self.end_datetime`` and finally handed to ``td.phase_decomp``
    together with the first ``self.top_frequencies`` entries of ``self.w_k``.

    Parameters
    ----------
    array : np.ndarray or empty
      The array to be transformed.
    return_tubes : list or None
      Keys that are each looked up with ``maybe_get_tube`` on the waterwork
      owning the final tube. Nothing is looked up when this is None.
    prefix : str
      Passed to ``self._pre`` together with a tube or slot name to build the
      keys the plug functions use to read from their input dictionary.

    Returns
    -------
    list or None
      One ``maybe_get_tube`` result per key of ``return_tubes``, in the same
      order, or None when ``return_tubes`` is None.

    """
    # Separate the two pieces of the input.
    pieces, split_slots = td.split(array, [1], axis=1)
    split_slots['a'].unplug()
    split_slots['a'].set_name('array')

    pieces, _ = td.iter_list(pieces['target'], 2)
    pieces[1].set_name('amps')

    # Reshape the first piece and cast it to datetimes.
    reshape_slot_plugs = {
      'shape': lambda r: r[self._pre('array', prefix)].shape[:1]
    }
    reshape_tube_plugs = {
      'old_shape': lambda r: list(r[self._pre('nums', prefix)].shape[:1]) + [1]
    }
    times, _ = td.reshape(
      pieces[0],
      slot_plugs=reshape_slot_plugs,
      tube_plugs=reshape_tube_plugs
    )
    times, _ = td.cast(
      times['target'], np.datetime64,
      tube_plugs={
        'input_dtype': self.input_dtype,
        'diff': np.array([], dtype=self.input_dtype)
      }
    )

    # Detect the NaT's and overwrite them with what self.fill_nat_func gives.
    nat_tubes, _ = td.isnat(times['target'])

    fill_slot_plugs = {
      'replace_with': lambda z: self.fill_nat_func(z[self._pre('array', prefix)])
    }
    fill_tube_plugs = {
      'replace_with': np.array([]),
      'replaced_vals': np.array([None], dtype=np.datetime64)
    }
    filled, _ = td.replace(
      nat_tubes['a'], nat_tubes['target'],
      slot_plugs=fill_slot_plugs,
      tube_plugs=fill_tube_plugs
    )

    filled['replaced_vals'].set_name('replaced_vals')
    filled['mask'].set_name('nats')

    # Number of time steps between the zero and the end datetime; the numeric
    # times are divided by it below.
    span = self.end_datetime - self.zero_datetime
    step = np.timedelta64(self.num_units, self.time_unit)
    end = (span / step).astype(self.dtype)

    # Express the datetimes as numbers.
    values, _ = td.datetime_to_num(
      filled['target'],
      self.zero_datetime,
      self.num_units,
      self.time_unit,
      name='dtn'
    )
    values['diff'].set_name('diff')

    values, _ = td.div(
      values['target'], end,
      tube_plugs={
        'a_is_smaller': False,
        'smaller_size_array': end,
        'missing_vals': np.array([]),
        'remainder': np.array([])
      }
    )

    phases, _ = td.phase_decomp(values['target'], self.w_k[:self.top_frequencies])

    phases['div'].set_name('div')
    phases['target'].set_name('nums')

    if return_tubes is None:
      return None

    waterwork = phases['target'].waterwork
    return [waterwork.maybe_get_tube(key) for key in return_tubes]