def define_waterwork(self, array=empty, return_tubes=None):
    """Assemble the tanks that turn a datetime array into (normalized) numbers.

    Tanks are added in this order: NaT detection, NaT replacement,
    datetime-to-number conversion and, depending on ``self.norm_mode``, a
    subtraction followed by a division.

    Parameters
    ----------
    array : np.ndarray or empty
        The array to be transformed.
    return_tubes : iterable of tube keys or None
        Keys handed one by one to ``maybe_get_tube`` of the waterwork that
        owns the final tube. Nothing is looked up when this is None.

    Returns
    -------
    list or None
        The ``maybe_get_tube`` result for each key of ``return_tubes``, in the
        same order, or None when ``return_tubes`` is None.

    """
    # Find the NaT's and overwrite them, keeping the original values and the
    # mask reachable under fixed names.
    nat_tubes, _ = td.isnat(array)
    filled, _ = td.replace(nat_tubes['a'], nat_tubes['target'])
    filled['replaced_vals'].set_name('replaced_vals')
    filled['mask'].set_name('nats')

    # Turn the datetimes into numbers.
    num_tubes, _ = td.datetime_to_num(
        filled['target'],
        self.zero_datetime,
        self.num_units,
        self.time_unit,
        name='dtn')
    num_tubes['diff'].set_name('diff')

    # Pick the (shift, scale) pair of the requested normalization, if any.
    norm_params = None
    if self.norm_mode == 'mean_std':
        norm_params = (self.mean, self.std)
    elif self.norm_mode == 'min_max':
        norm_params = (self.min, self.max - self.min)

    if norm_params is not None:
        shift, scale = norm_params
        num_tubes, _ = num_tubes['target'] - shift
        num_tubes, _ = num_tubes['target'] / scale

    final_tube = num_tubes['target']
    final_tube.set_name('nums')

    if return_tubes is None:
        return None

    waterwork = final_tube.waterwork
    return [waterwork.maybe_get_tube(key) for key in return_tubes]
def define_waterwork(self, array=empty, return_tubes=None):
    """Assemble the tanks that clean and (optionally) normalize a numeric array.

    Tanks are added in this order: NaN detection, NaN replacement and,
    depending on ``self.norm_mode``, a subtraction followed by a division.

    Parameters
    ----------
    array : np.ndarray or empty
        The array to be transformed.
    return_tubes : iterable of tube keys or None
        Keys handed one by one to ``maybe_get_tube`` of the waterwork that
        owns the final tube. Nothing is looked up when this is None.

    Returns
    -------
    list or None
        The ``maybe_get_tube`` result for each key of ``return_tubes``, in the
        same order, or None when ``return_tubes`` is None.

    """
    # Find the NaN's and overwrite them, keeping the original values and the
    # mask reachable under fixed names.
    nan_tubes, _ = td.isnan(array)
    num_tubes, _ = td.replace(nan_tubes['a'], nan_tubes['target'])
    num_tubes['replaced_vals'].set_name('replaced_vals')
    num_tubes['mask'].set_name('nans')

    # Pick the (shift, scale) pair of the requested normalization, if any.
    norm_params = None
    if self.norm_mode == 'mean_std':
        norm_params = (self.mean, self.std)
    elif self.norm_mode == 'min_max':
        norm_params = (self.min, self.max - self.min)

    if norm_params is not None:
        shift, scale = norm_params
        num_tubes, _ = num_tubes['target'] - shift
        num_tubes, _ = num_tubes['target'] / scale

    final_tube = num_tubes['target']
    final_tube.set_name('nums')

    if return_tubes is None:
        return None

    waterwork = final_tube.waterwork
    return [waterwork.maybe_get_tube(key) for key in return_tubes]
def define_waterwork(self, array=empty, return_tubes=None, prefix=''):
    """Assemble the tanks that turn (string, language) rows into word indices.

    The input is split along axis 1 at column 1 into two pieces; the first is
    tokenized using the second as the selector. Tokens are then optionally
    lower cased, half-widthed and lemmatized, unknown tokens are replaced by
    ``'[UNK]'`` and finally every token is mapped to an index.

    Parameters
    ----------
    array : np.ndarray or empty
        The array to be transformed.
    return_tubes : iterable of tube keys or None
        Keys handed one by one to ``maybe_get_tube`` of the waterwork that
        owns the final tube. Nothing is looked up when this is None.
    prefix : str
        Passed to ``self._pre`` together with a tube/slot name to build the
        keys the plug callables use to index their argument.

    Returns
    -------
    list or None
        The ``maybe_get_tube`` result for each key of ``return_tubes``, in the
        same order, or None when ``return_tubes`` is None.

    """
    splits, splits_slots = td.split(array, [1], axis=1)
    splits_slots['a'].unplug()
    splits_slots['a'].set_name('array')
    splits, _ = td.iter_list(splits['target'], 2)

    # Tokenize the full strings into words.
    tokens, tokens_slots = td.multi_tokenize(
        strings=splits[0],
        selector=splits[1],
        tokenizers=self.word_tokenizers,
        detokenizers=self.word_detokenizers,
        max_len=self.max_sent_len)

    # Name the tubes and slots so they can be referenced further downstream.
    tokens['diff'].set_name('tokenize_diff')
    tokens_slots['max_len'].set_name('max_sent_len')
    tokens_slots['tokenizers'].set_name('tokenizers')
    tokens_slots['detokenizers'].set_name('detokenizers')

    # Lower case the strings; the tank's diff is named 'lower_case_diff'.
    if self.lower_case:
        tokens, tokens_slots = td.lower_case(tokens['target'])
        tokens['diff'].set_name('lower_case_diff')

    # Half width the strings; the tank's diff is named 'half_width_diff'.
    if self.half_width:
        tokens, tokens_slots = td.half_width(tokens['target'])
        tokens['diff'].set_name('half_width_diff')

    # Lemmatize the strings; the tank's diff is named 'lemmatize_diff'.
    if self.lemmatize:
        tokens, tokens_slots = td.lemmatize(tokens['target'])
        tokens['diff'].set_name('lemmatize_diff')
        tokens_slots['lemmatizer'].set_name('lemmatizer')

    # Keep one copy of the language column under the name 'languages' and
    # reshape/tile the other copy to (rows, 1, max_sent_len) so it can act as
    # the selector of the multi_* tanks below.
    languages, _ = td.clone(splits[1])
    languages['b'].set_name('languages')

    dim_size, _ = td.dim_size(languages['a'], axis=0)
    shape, _ = td.tube_list(dim_size['target'], 1, 1)
    tile, _ = td.reshape(
        languages['a'], shape['target'],
        tube_plugs={
            'old_shape': lambda z: (z[self._pre('languages', prefix)].shape[0], 1)
        })
    tile, _ = td.tile(
        tile['target'], (1, 1, self.max_sent_len),
        tube_plugs={
            'old_shape': lambda z: (z[self._pre('languages', prefix)].shape[0], 1, 1)
        })

    # Find all the strings which are not in the list of known words and
    # replace them with the 'unknown token'. The empty string is added to
    # every vocabulary for the membership test.
    # `.items()` (rather than the Python 2 only `.iteritems()`) works on both
    # Python 2 and Python 3.
    maps_with_empty_strings = {
        k: v + [''] for k, v in self.index_to_word_maps.items()
    }
    isin, isin_slots = td.multi_isin(
        tokens['target'], maps_with_empty_strings, tile['target'])
    mask, _ = td.logical_not(isin['target'])
    tokens, _ = td.replace(
        isin['a'], mask['target'], '[UNK]',
        tube_plugs={
            'mask': lambda z: z[self._pre('indices', prefix)] == 0
        })

    # Keep track of the values that were overwritten with the 'unknown token'.
    tokens['replaced_vals'].set_name('missing_vals')
    isin_slots['bs'].set_name('index_to_word_maps')

    # Convert the tokens into indices. Every plug builds its lookup key with
    # the same `prefix` as the rest of this method, and the empty-string
    # array uses the 'U' dtype code instead of the removed `np.unicode`
    # alias.
    indices, indices_slots = td.multi_cat_to_index(
        tokens['target'], tile['target'], self.word_to_index_maps,
        tube_plugs={
            'selector': lambda z: np.tile(
                np.reshape(
                    z[self._pre('languages', prefix)],
                    (z[self._pre('languages', prefix)].shape[0], 1, 1)),
                (1, 1, self.max_sent_len)),
            'missing_vals': lambda z: np.full(
                z[self._pre('indices', prefix)].shape, '', dtype='U'),
            'input_dtype': self.input_dtype
        })

    # Name the slots and tubes of this tank for easier referencing.
    indices['target'].set_name('indices')
    indices_slots['cat_to_index_maps'].set_name('word_to_index_maps')

    if return_tubes is None:
        return None

    ww = indices['target'].waterwork
    return [ww.maybe_get_tube(r_tube_key) for r_tube_key in return_tubes]
def define_waterwork(self, array=empty, return_tubes=None, prefix=''):
    """Assemble the tanks that clean and (optionally) normalize a numeric array.

    Tanks are added in this order: NaN detection, NaN replacement and,
    depending on ``self.norm_mode``, a ``td.sub`` followed by a ``td.div``.

    Parameters
    ----------
    array : np.ndarray or empty
        The array to be transformed.
    return_tubes : iterable of tube keys or None
        Keys handed one by one to ``maybe_get_tube`` of the waterwork that
        owns the final tube. Nothing is looked up when this is None.
    prefix : str
        Passed to ``self._pre`` together with a tube/slot name to build the
        keys the plug callables use to index their argument.

    Returns
    -------
    list or None
        The ``maybe_get_tube`` result for each key of ``return_tubes``, in the
        same order, or None when ``return_tubes`` is None.

    """
    # Find the NaN's; the input slot is exposed under the name 'array'.
    nan_tubes, nan_slots = td.isnan(array)
    nan_slots['a'].set_name('array')

    # The replacement value is computed by self.fill_nan_func from whatever
    # is stored under the 'array' key.
    fill_slot_plugs = {
        'replace_with': lambda z: self.fill_nan_func(z[self._pre('array', prefix)])
    }
    fill_tube_plugs = {
        'replace_with': np.array([]),
        'replaced_vals': np.array(np.nan)
    }
    num_tubes, _ = td.replace(
        nan_tubes['a'],
        nan_tubes['target'],
        slot_plugs=fill_slot_plugs,
        tube_plugs=fill_tube_plugs)
    num_tubes['replaced_vals'].set_name('replaced_vals')
    num_tubes['mask'].set_name('nans')

    # Pick the (shift, scale) pair of the requested normalization, if any.
    norm_params = None
    if self.norm_mode == 'mean_std':
        norm_params = (self.mean, self.std)
    elif self.norm_mode == 'min_max':
        norm_params = (self.min, self.max - self.min)

    if norm_params is not None:
        shift, scale = norm_params
        num_tubes, _ = td.sub(
            num_tubes['target'], shift,
            tube_plugs={
                'a_is_smaller': False,
                'smaller_size_array': shift
            })
        num_tubes, _ = td.div(
            num_tubes['target'], scale,
            tube_plugs={
                'a_is_smaller': False,
                'smaller_size_array': scale,
                'missing_vals': np.array([]),
                'remainder': np.array([])
            })

    final_tube = num_tubes['target']
    final_tube.set_name('nums')

    if return_tubes is None:
        return None

    waterwork = final_tube.waterwork
    return [waterwork.maybe_get_tube(key) for key in return_tubes]
def define_waterwork(self, array=empty, return_tubes=None, prefix=''):
    """Assemble the tanks that turn an array of strings into word indices.

    The strings are tokenized, optionally lower cased and half-widthed,
    tokens that are not in ``self.index_to_word`` (or the empty string) are
    replaced by ``self.index_to_word[0]`` and finally every token is mapped
    to an index with ``self.word_to_index``.

    Parameters
    ----------
    array : np.ndarray or empty
        The array to be transformed.
    return_tubes : iterable of tube keys or None
        Keys handed one by one to ``maybe_get_tube`` of the waterwork that
        owns the final tube. Nothing is looked up when this is None.
    prefix : str
        Passed to ``self._pre`` together with a tube/slot name to build the
        keys the plug callables use to index their argument.

    Returns
    -------
    list or None
        The ``maybe_get_tube`` result for each key of ``return_tubes``, in the
        same order, or None when ``return_tubes`` is None.

    """
    # Tokenize the full strings into words.
    tokens, tokens_slots = td.tokenize(
        strings=array,
        tokenizer=self.word_tokenizer,
        detokenizer=self.word_detokenizer,
        max_len=self.max_sent_len)
    tokens_slots['strings'].unplug()

    # Name the tubes and slots so they can be referenced further downstream.
    tokens['diff'].set_name('tokenize_diff')
    tokens_slots['max_len'].set_name('max_sent_len')
    tokens_slots['strings'].set_name('array')
    tokens_slots['tokenizer'].set_name('tokenizer')
    tokens_slots['detokenizer'].set_name('detokenizer')

    # Lower case the strings; the tank's diff is named 'lower_case_diff'.
    if self.lower_case:
        tokens, tokens_slots = td.lower_case(tokens['target'])
        tokens['diff'].set_name('lower_case_diff')

    # Half width the strings; the tank's diff is named 'half_width_diff'.
    if self.half_width:
        tokens, tokens_slots = td.half_width(tokens['target'])
        tokens['diff'].set_name('half_width_diff')

    # Find all the strings which are not in the list of known words and
    # replace them with the 'unknown token' (the first vocabulary entry).
    isin, isin_slots = td.isin(tokens['target'], self.index_to_word + [''])
    mask, _ = td.logical_not(isin['target'])
    tokens, _ = td.replace(
        isin['a'], mask['target'], self.index_to_word[0],
        tube_plugs={
            'mask': lambda z: z[self._pre('indices', prefix)] == 0
        })

    # Keep track of the values that were overwritten with the 'unknown token'.
    tokens['replaced_vals'].set_name('missing_vals')
    isin_slots['b'].set_name('index_to_word')

    # Convert the tokens into indices. The empty-string array uses the 'U'
    # dtype code: it yields the same unicode dtype as the `np.unicode` alias
    # did, but that alias no longer exists in current NumPy releases.
    indices, indices_slots = td.cat_to_index(
        tokens['target'], self.word_to_index,
        tube_plugs={
            'missing_vals': lambda z: np.full(
                z[self._pre('indices', prefix)].shape, '', dtype='U'),
            'input_dtype': self.input_dtype
        })

    # Name the slots and tubes of this tank for easier referencing.
    indices['target'].set_name('indices')
    indices_slots['cat_to_index_map'].set_name('word_to_index')

    if return_tubes is None:
        return None

    ww = indices['target'].waterwork
    return [ww.maybe_get_tube(r_tube_key) for r_tube_key in return_tubes]
def define_waterwork(self, array=empty, return_tubes=None, prefix=''):
    """Assemble the tanks that turn a datetime array into (normalized) numbers.

    Tanks are added in this order: NaT detection, NaT replacement,
    datetime-to-number conversion and, depending on ``self.norm_mode``, a
    ``td.sub`` followed by a ``td.div``.

    Parameters
    ----------
    array : np.ndarray or empty
        The array to be transformed.
    return_tubes : iterable of tube keys or None
        Keys handed one by one to ``maybe_get_tube`` of the waterwork that
        owns the final tube. Nothing is looked up when this is None.
    prefix : str
        Passed to ``self._pre`` together with a tube/slot name to build the
        keys the plug callables use to index their argument.

    Returns
    -------
    list or None
        The ``maybe_get_tube`` result for each key of ``return_tubes``, in the
        same order, or None when ``return_tubes`` is None.

    """
    # Find the NaT's; the input slot is exposed under the name 'array'.
    nat_tubes, nat_slots = td.isnat(array)
    nat_slots['a'].set_name('array')

    # The replacement value is computed by self.fill_nat_func from whatever
    # is stored under the 'array' key.
    fill_slot_plugs = {
        'replace_with': lambda z: self.fill_nat_func(z[self._pre('array', prefix)])
    }
    fill_tube_plugs = {
        'replace_with': np.array([]),
        'replaced_vals': np.array([None], dtype=np.datetime64)
    }
    filled, _ = td.replace(
        nat_tubes['a'],
        nat_tubes['target'],
        slot_plugs=fill_slot_plugs,
        tube_plugs=fill_tube_plugs)
    filled['replaced_vals'].set_name('replaced_vals')
    filled['mask'].set_name('nats')

    # Turn the datetimes into numbers.
    num_tubes, _ = td.datetime_to_num(
        filled['target'],
        self.zero_datetime,
        self.num_units,
        self.time_unit,
        name='dtn')
    num_tubes['diff'].set_name('diff')

    # Pick the (shift, scale) pair of the requested normalization, if any.
    norm_params = None
    if self.norm_mode == 'mean_std':
        norm_params = (self.mean, self.std)
    elif self.norm_mode == 'min_max':
        norm_params = (self.min, self.max - self.min)

    if norm_params is not None:
        shift, scale = norm_params
        num_tubes, _ = td.sub(
            num_tubes['target'], shift,
            tube_plugs={
                'a_is_smaller': False,
                'smaller_size_array': shift
            })
        num_tubes, _ = td.div(
            num_tubes['target'], scale,
            tube_plugs={
                'a_is_smaller': False,
                'smaller_size_array': scale,
                'missing_vals': np.array([]),
                'remainder': np.array([])
            })

    final_tube = num_tubes['target']
    final_tube.set_name('nums')

    if return_tubes is None:
        return None

    waterwork = final_tube.waterwork
    return [waterwork.maybe_get_tube(key) for key in return_tubes]
def define_waterwork(self, array=empty, return_tubes=None, prefix=''):
    """Assemble the tanks that phase-decompose the time column of an array.

    The input is split along axis 1 at column 1. The second piece is only
    named ('amps'); the first piece is reshaped, cast to ``np.datetime64``,
    has its NaT's replaced, is converted to numbers, divided by the numeric
    value of ``self.end_datetime`` and finally passed to ``td.phase_decomp``
    with the first ``self.top_frequencies`` entries of ``self.w_k``.

    Parameters
    ----------
    array : np.ndarray or empty
        The array to be transformed.
    return_tubes : iterable of tube keys or None
        Keys handed one by one to ``maybe_get_tube`` of the waterwork that
        owns the final tube. Nothing is looked up when this is None.
    prefix : str
        Passed to ``self._pre`` together with a tube/slot name to build the
        keys the plug callables use to index their argument.

    Returns
    -------
    list or None
        The ``maybe_get_tube`` result for each key of ``return_tubes``, in the
        same order, or None when ``return_tubes`` is None.

    """
    # Separate the first column from the rest of the input.
    pieces, split_slots = td.split(array, [1], axis=1)
    split_slots['a'].unplug()
    split_slots['a'].set_name('array')

    pieces, _ = td.iter_list(pieces['target'], 2)
    pieces[1].set_name('amps')

    # Reshape the first piece to the leading dimension of the input.
    times, _ = td.reshape(
        pieces[0],
        slot_plugs={
            'shape': lambda r: r[self._pre('array', prefix)].shape[:1]
        },
        tube_plugs={
            'old_shape': lambda r: list(r[self._pre('nums', prefix)].shape[:1]) + [1]
        })
    times, _ = td.cast(
        times['target'], np.datetime64,
        tube_plugs={
            'input_dtype': self.input_dtype,
            'diff': np.array([], dtype=self.input_dtype)
        })

    # Find the NaT's and overwrite them; the replacement value is computed by
    # self.fill_nat_func from whatever is stored under the 'array' key.
    nat_tubes, nat_slots = td.isnat(times['target'])
    fill_slot_plugs = {
        'replace_with': lambda z: self.fill_nat_func(z[self._pre('array', prefix)])
    }
    fill_tube_plugs = {
        'replace_with': np.array([]),
        'replaced_vals': np.array([None], dtype=np.datetime64)
    }
    filled, _ = td.replace(
        nat_tubes['a'],
        nat_tubes['target'],
        slot_plugs=fill_slot_plugs,
        tube_plugs=fill_tube_plugs)
    filled['replaced_vals'].set_name('replaced_vals')
    filled['mask'].set_name('nats')

    # Express the end datetime in the same units as the converted numbers.
    unit_step = np.timedelta64(self.num_units, self.time_unit)
    end_num = (self.end_datetime - self.zero_datetime) / unit_step
    end_num = end_num.astype(self.dtype)

    # Turn the datetimes into numbers.
    num_tubes, _ = td.datetime_to_num(
        filled['target'],
        self.zero_datetime,
        self.num_units,
        self.time_unit,
        name='dtn')
    num_tubes['diff'].set_name('diff')

    # Scale by the end value.
    num_tubes, _ = td.div(
        num_tubes['target'], end_num,
        tube_plugs={
            'a_is_smaller': False,
            'smaller_size_array': end_num,
            'missing_vals': np.array([]),
            'remainder': np.array([])
        })

    decomp, _ = td.phase_decomp(
        num_tubes['target'],
        self.w_k[:self.top_frequencies])
    decomp['div'].set_name('div')

    final_tube = decomp['target']
    final_tube.set_name('nums')

    if return_tubes is None:
        return None

    waterwork = final_tube.waterwork
    return [waterwork.maybe_get_tube(key) for key in return_tubes]