def test_multi(self):
    """Round-trip a batch of funnel dicts through multi_pour / multi_pump.

    Builds a small waterwork of three 'add' tanks and one 'clone' tank in
    which several slots and tubes are plugged, pours the same funnel dict
    three times in one batch, checks every resulting tap dict, pumps the tap
    dicts back and checks that the original funnel dict is recovered.
    """
    with wa.Waterwork() as ww:
        add0_tubes, add0_slots = empty + empty
        add0_slots['b'].set_plug(np.array([3, 4]))
        add0_tubes['a_is_smaller'].set_plug(False)

        add1_tubes, add1_slots = add0_tubes['target'] + empty
        # Plugs may be callables that receive the dict of values.
        add1_slots['b'].set_plug(
            lambda d: 0.5 * d['Add_0/slots/a'] + np.array([0.5, 1.0]))
        add1_tubes['a_is_smaller'].set_plug(
            lambda d: not d['output_1'].any())

        cl0_tubes, _ = td.clone(a=add0_tubes['smaller_size_array'])
        add2_tubes, _ = td.add(
            a=cl0_tubes['a'], b=add1_tubes['smaller_size_array'])
        add2_tubes['target'].set_plug(
            lambda d: d['output_1'] + np.array([1, 2]))
        cl0_tubes['b'].set_name('output_1')

    true_funnel_dict = {('Add_0', 'a'): np.array([1, 2])}
    funnel_dicts = [true_funnel_dict] * 3

    # The expected taps are identical for every element of the batch, so the
    # dict is built once. 'Add_2/tubes/target' is deliberately absent: that
    # tube is plugged above (presumably plugged tubes are not returned as
    # taps -- confirm against Waterwork.pour).
    true_tap_dict = {
        'output_1': np.array([3, 4]),
        'Add_1/tubes/target': np.array([5, 8]),
        'Add_2/tubes/a_is_smaller': False,
        'Add_2/tubes/smaller_size_array': np.array([1, 2]),
    }

    tap_dicts = ww.multi_pour(funnel_dicts, key_type='str')
    for tap_dict in tap_dicts:
        self.assertEqual(set(tap_dict.keys()), set(true_tap_dict.keys()))
        for tap in tap_dict:
            th.assert_arrays_equal(self, tap_dict[tap], true_tap_dict[tap])

    # The pump order does not depend on the individual tap dict.
    self.assertEqual(
        ww._pump_tank_order(),
        [ww.tanks[k] for k in ['Add_2', 'Add_1', 'Clone_0', 'Add_0']])

    funnel_dicts = ww.multi_pump(tap_dicts, key_type='tuple')
    for funnel_dict in funnel_dicts:
        self.assertEqual(sorted(funnel_dict.keys()),
                         sorted(true_funnel_dict.keys()))
        for funnel in funnel_dict:
            th.assert_arrays_equal(
                self, funnel_dict[funnel], true_funnel_dict[funnel])

    # After clearing, no slot or tube may still hold a value.
    ww.clear_vals()
    for d in [ww.slots, ww.tubes]:
        for key in d:
            self.assertIsNone(d[key].get_val())

    # NOTE(review): unused in the visible code -- the test looks truncated
    # here (the sibling test saves and reloads the waterwork at this path).
    pickle_name = os.path.join(self.temp_dir, 'ww.pickle')
def test_pour_pump_non_eager(self):
    """Pour and pump a lazily-defined waterwork, before and after pickling.

    The waterwork is defined with empty inputs, so nothing is computed until
    `pour` is called. The pour/pump round trip is executed twice: once on
    the freshly built waterwork and once on the copy reloaded from the
    pickle file written at the end of the first pass.
    """
    with wa.Waterwork() as ww:
        add0_tubes, add0_slots = empty + empty
        add1_tubes, add1_slots = add0_tubes['target'] + empty
        cl0_tubes, _ = td.clone(a=add0_tubes['smaller_size_array'])
        add2_tubes, _ = td.add(
            a=cl0_tubes['a'], b=add1_tubes['smaller_size_array'])
        cl0_tubes['b'].set_name('output_1')

    true_funnel_dict = {
        ('Add_0', 'a'): np.array([1, 2]),
        ('Add_0', 'b'): np.array([3, 4]),
        ('Add_1', 'b'): np.array([1, 2]),
    }
    # The expected values are the same on both passes, so build them once.
    true_tap_dict = {
        'output_1': np.array([3, 4]),
        'Add_0/tubes/a_is_smaller': False,
        'Add_1/tubes/a_is_smaller': False,
        'Add_1/tubes/target': np.array([5, 8]),
        'Add_2/tubes/a_is_smaller': False,
        'Add_2/tubes/smaller_size_array': np.array([1, 2]),
        'Add_2/tubes/target': np.array([4, 6]),
    }
    pickle_name = os.path.join(self.temp_dir, 'ww.pickle')

    # `range` rather than `xrange` so the test runs on Python 2 and 3.
    for _ in range(2):
        self.assertEqual([str(t) for t in ww._pour_tank_order()],
                         ['Add_0', 'Add_1', 'Clone_0', 'Add_2'])

        tap_dict = ww.pour(true_funnel_dict, key_type='str')
        self.assertEqual(set(tap_dict.keys()), set(true_tap_dict.keys()))
        for tap in tap_dict:
            th.assert_arrays_equal(self, tap_dict[tap], true_tap_dict[tap])

        self.assertEqual(
            ww._pump_tank_order(),
            [ww.tanks[k] for k in ['Add_2', 'Add_1', 'Clone_0', 'Add_0']])

        funnel_dict = ww.pump(true_tap_dict, key_type='tuple')
        self.assertEqual(sorted(funnel_dict.keys()),
                         sorted(true_funnel_dict.keys()))
        for funnel in funnel_dict:
            th.assert_arrays_equal(
                self, funnel_dict[funnel], true_funnel_dict[funnel])

        # After clearing, no slot or tube may still hold a value.
        ww.clear_vals()
        for d in [ww.slots, ww.tubes]:
            for key in d:
                self.assertIsNone(d[key].get_val())

        # Save and reload, so the second pass exercises the restored copy.
        ww.save_to_file(pickle_name)
        ww = wa.Waterwork(from_file=pickle_name)
def test_pour_pump_eager(self):
    """Check eagerly computed tube values, then pump them back.

    The tanks are created from concrete arrays, so the tubes already hold
    values once the waterwork is defined. The two input slots are unplugged
    so that `pump` reports them as funnels.
    """
    with wa.Waterwork() as ww:
        cl0_tubes, cl0_slots = td.clone(a=np.array([1, 2]))
        cl0_slots['a'].unplug()

        add0_tubes, add0_slots = cl0_tubes['a'] + np.array([3, 4])
        add0_slots['b'].unplug()

        add1_tubes, _ = add0_tubes['target'] + cl0_tubes['b']
        cl1_tubes, _ = td.clone(a=add0_tubes['smaller_size_array'])
        add2_tubes, _ = cl1_tubes['a'] * add1_tubes['smaller_size_array']
        add2_tubes['target'].set_name('answer')

    true_funnel_dict = {
        cl0_slots['a']: np.array([1, 2]),
        add0_slots['b']: np.array([3, 4])
    }
    true_tap_dict = {
        cl1_tubes['b']: np.array([3, 4]),
        add1_tubes['target']: np.array([5, 8]),
        add2_tubes['smaller_size_array']: np.array([1, 2]),
        "answer": np.array([3, 8]),
    }

    # Same expectations, but keyed purely by tube objects: the entry stored
    # under the name 'answer' is re-keyed by the tube carrying that name.
    expected_by_tube = dict(true_tap_dict)
    answer_val = expected_by_tube['answer']
    expected_by_tube[add2_tubes['target']] = answer_val
    del expected_by_tube['answer']
    for tube, expected in expected_by_tube.items():
        th.assert_arrays_equal(self, tube.get_val(), expected)

    funnel_dict = ww.pump(true_tap_dict)
    self.assertEqual(sorted(funnel_dict.keys()),
                     sorted(true_funnel_dict.keys()))
    for funnel, value in funnel_dict.items():
        th.assert_arrays_equal(self, value, true_funnel_dict[funnel])
def define_waterwork(self, array=empty, return_tubes=None, prefix=''):
    """Add the tanks that turn category values into one-hot vectors.

    The chain is: cat_to_index -> clone -> one_hot, optionally followed by
    sub and div when `self.norm_mode` is 'mean_std'.

    Parameters
    ----------
    array : np.ndarray or empty
      The array to be transformed.
    return_tubes : list or None
      Keys handed to `maybe_get_tube` on the waterwork of the final tube.
      If None, nothing is returned.
    prefix : str
      Passed to `self._pre` when building the name under which the
      'indices' value is looked up inside the 'missing_vals' plug.

    Returns
    -------
    list or None
      The results of `maybe_get_tube` for each key in `return_tubes`, in
      order, or None when `return_tubes` is None.

    """
    # Category values -> indices.
    index_tubes, index_slots = td.cat_to_index(
        array, self.cat_val_to_index,
        tube_plugs={'input_dtype': lambda z: self.input_dtype}
    )
    index_slots['cats'].set_name('array')
    index_tubes['missing_vals'].set_name('missing_vals')

    # Clone so that one copy of the indices can be exposed under the name
    # 'indices' while the other feeds the one-hot tank.
    clone_tubes, _ = td.clone(index_tubes['target'])
    clone_tubes['a'].set_name('indices')

    # Indices -> one-hot vectors.
    num_cats = len(self.cat_val_to_index)
    result, _ = td.one_hot(
        clone_tubes['b'], num_cats,
        tube_plugs={
            'missing_vals': lambda z: np.ones(z[self._pre('indices', prefix)].shape)*-2
        }
    )

    # Optional normalization: (one_hots - mean) / std.
    if self.norm_mode == 'mean_std':
        sub_plugs = {'a_is_smaller': False, 'smaller_size_array': self.mean}
        result, _ = td.sub(result['target'], self.mean, tube_plugs=sub_plugs)

        div_plugs = {'a_is_smaller': False, 'smaller_size_array': self.std,
                     'missing_vals': np.array([]), 'remainder': np.array([])}
        result, _ = td.div(result['target'], self.std, tube_plugs=div_plugs)

    final_tube = result['target']
    final_tube.set_name('one_hots')

    if return_tubes is None:
        return None
    ww = final_tube.waterwork
    return [ww.maybe_get_tube(tube_key) for tube_key in return_tubes]
def define_waterwork(self, array=empty, return_tubes=None):
    """Add the tanks that turn category values into one-hot vectors.

    The chain is: cat_to_index -> clone -> one_hot, optionally followed by a
    subtraction of `self.mean` and a division by `self.std` when
    `self.norm_mode` is 'mean_std'.

    Parameters
    ----------
    array : np.ndarray or empty
      The array to be transformed.
    return_tubes : list or None
      Keys handed to `maybe_get_tube` on the waterwork of the final tube.
      If None, nothing is returned.

    Returns
    -------
    list or None
      The results of `maybe_get_tube` for each key in `return_tubes`, in
      order, or None when `return_tubes` is None.

    """
    # Category values -> indices.
    index_tubes, index_slots = td.cat_to_index(
        array, self.cat_val_to_index,
    )
    index_tubes['missing_vals'].set_name('missing_vals')

    # Clone so that one copy of the indices can be exposed under the name
    # 'indices' while the other feeds the one-hot tank.
    clone_tubes, _ = td.clone(index_tubes['target'])
    clone_tubes['a'].set_name('indices')

    # Indices -> one-hot vectors.
    num_cats = len(self.cat_val_to_index)
    result, _ = td.one_hot(clone_tubes['b'], num_cats)

    # Optional normalization: (one_hots - mean) / std.
    if self.norm_mode == 'mean_std':
        result, _ = result['target'] - self.mean
        result, _ = result['target'] / self.std

    final_tube = result['target']
    final_tube.set_name('one_hots')

    if return_tubes is None:
        return None
    ww = final_tube.waterwork
    return [ww.maybe_get_tube(tube_key) for tube_key in return_tubes]
def define_waterwork(self, array=empty, return_tubes=None):
    """Chain the waterworks of the sub-transforms in `self.transform_order`.

    Each sub-transform defines its tanks inside its own name space. For every
    transform except the last, one of its tubes (identified through
    `self.tap_keys`) is requested back, cloned, and one branch of the clone is
    used as the input array of the next transform.

    Parameters
    ----------
    array : np.ndarray or empty
      The array to be transformed by the first sub-transform.
    return_tubes : list or None
      Overwritten inside the loop before it is ever read, so the value passed
      by the caller has no effect in the visible code.

    Returns
    -------
    None
      The visible body contains no return statement.

    Raises
    ------
    AssertionError
      If `self.input_dtype` is None, i.e. calc_global_values was not run.

    """
    assert self.input_dtype is not None, (
        "Run calc_global_values before running the transform")
    with ns.NameSpace(self.name):
        for trans_num, trans_key in enumerate(self.transform_order):
            trans = self.transforms[trans_key]
            # NOTE(review): the source was flattened; the statements from
            # 'if tubes is None' onwards are assumed to sit inside this inner
            # name space -- confirm against the original indentation.
            with ns.NameSpace(trans.name):
                # Every transform but the last must hand back the tube that
                # feeds the next transform.
                if trans_num < len(self.transform_order) - 1:
                    tap_key = self.tap_keys[trans_num + 1]
                    return_tubes = [self._pre(tap_key)]
                else:
                    return_tubes = None
                tubes = trans.define_waterwork(array, return_tubes)
                if tubes is None:
                    continue
                # Clone the returned tube: branch 'a' becomes the next
                # transform's input, branch 'b' takes over the tube's original
                # name.
                old_name = tubes[0].name
                tubes[0].set_name("to_be_cloned")
                tube_dict, _ = td.clone(tubes[0])
                array = tube_dict['a']
                tube_dict['b'].set_name(old_name)
def define_waterwork(self, array=empty, return_tubes=None, prefix=''):
    """Add the tanks that turn (string, language) rows into word indices.

    The input is split along axis 1 into the strings and a language selector.
    The strings are tokenized, optionally lower-cased, half-widthed and
    lemmatized, tokens missing from the per-language vocabulary are replaced
    by '[UNK]', and the result is mapped to indices.

    Parameters
    ----------
    array : np.ndarray or empty
      The array to be transformed.
    return_tubes : list or None
      Keys handed to `maybe_get_tube` on the waterwork of the final tube.
      If None, nothing is returned.
    prefix : str
      Passed to `self._pre` when building the names under which values are
      looked up inside the tube plugs.

    Returns
    -------
    list or None
      The results of `maybe_get_tube` for each key in `return_tubes`, in
      order, or None when `return_tubes` is None.

    """
    # Split off the first column (strings) from the rest (language selector).
    splits, splits_slots = td.split(array, [1], axis=1)
    splits_slots['a'].unplug()
    splits_slots['a'].set_name('array')
    splits, _ = td.iter_list(splits['target'], 2)

    # Tokenize the full strings into words
    tokens, tokens_slots = td.multi_tokenize(
        strings=splits[0],
        selector=splits[1],
        tokenizers=self.word_tokenizers,
        detokenizers=self.word_detokenizers,
        max_len=self.max_sent_len)

    # Set the names of various tubes and slots to make it easier to reference
    # them further downstream.
    tokens['diff'].set_name('tokenize_diff')
    tokens_slots['max_len'].set_name('max_sent_len')
    tokens_slots['tokenizers'].set_name('tokenizers')
    tokens_slots['detokenizers'].set_name('detokenizers')

    # Lower-case the strings, and set the diff strings of the tank to
    # 'lower_case_diff' for easier referencing.
    if self.lower_case:
        tokens, tokens_slots = td.lower_case(tokens['target'])
        tokens['diff'].set_name('lower_case_diff')

    # Half-width the strings, and set the diff strings of the tank to
    # 'half_width_diff' for easier referencing.
    if self.half_width:
        tokens, tokens_slots = td.half_width(tokens['target'])
        tokens['diff'].set_name('half_width_diff')

    # Lemmatize the strings, and set the diff strings of the tank to
    # 'lemmatize_diff' for easier referencing.
    if self.lemmatize:
        tokens, tokens_slots = td.lemmatize(tokens['target'])
        tokens['diff'].set_name('lemmatize_diff')
        tokens_slots['lemmatizer'].set_name('lemmatizer')

    # Expose one copy of the language selector as 'languages'; reshape and
    # tile the other copy so it can act as a per-token selector.
    languages, _ = td.clone(splits[1])
    languages['b'].set_name('languages')
    dim_size, _ = td.dim_size(languages['a'], axis=0)
    shape, _ = td.tube_list(dim_size['target'], 1, 1)
    tile, _ = td.reshape(
        languages['a'], shape['target'],
        tube_plugs={
            'old_shape': lambda z: (z[self._pre('languages', prefix)].shape[0], 1)
        })
    tile, _ = td.tile(
        tile['target'], (1, 1, self.max_sent_len),
        tube_plugs={
            'old_shape': lambda z: (z[self._pre('languages', prefix)].shape[0], 1, 1)
        })

    # Find all the strings which are not in the list of known words and
    # replace them with the 'unknown token'. The empty string is added to
    # every vocabulary so that it is never replaced.
    maps_with_empty_strings = {
        k: v + [''] for k, v in self.index_to_word_maps.items()
    }
    isin, isin_slots = td.multi_isin(
        tokens['target'], maps_with_empty_strings, tile['target'])
    mask, _ = td.logical_not(isin['target'])
    tokens, _ = td.replace(
        isin['a'], mask['target'], '[UNK]',
        tube_plugs={
            'mask': lambda z: z[self._pre('indices', prefix)] == 0
        })

    # Keep track of the values that were overwritten with an 'unknown token'.
    tokens['replaced_vals'].set_name('missing_vals')
    isin_slots['bs'].set_name('index_to_word_maps')

    # Convert the tokens into indices. The plugs look values up under the
    # same prefixed names as the plugs above; previously `prefix` was
    # omitted here, so the names disagreed whenever a prefix was given.
    indices, indices_slots = td.multi_cat_to_index(
        tokens['target'], tile['target'], self.word_to_index_maps,
        tube_plugs={
            'selector': lambda z: np.tile(
                np.reshape(
                    z[self._pre('languages', prefix)],
                    (z[self._pre('languages', prefix)].shape[0], 1, 1)),
                (1, 1, self.max_sent_len)),
            'missing_vals': lambda z: np.full(
                z[self._pre('indices', prefix)].shape, '', dtype=np.unicode),
            'input_dtype': self.input_dtype
        })

    # Set the names of the slots and tubes of this tank for easier referencing
    indices['target'].set_name('indices')
    indices_slots['cat_to_index_maps'].set_name('word_to_index_maps')

    if return_tubes is not None:
        ww = indices['target'].waterwork
        return [ww.maybe_get_tube(r_tube_key) for r_tube_key in return_tubes]
def _join_tubes_to_slots(self, input_dict, waterwork):
    """Join the tubes incoming from other tanks to this tank's slots.

    If the slot was previously identified as a 'funnel', i.e. a slot not
    attached to another tube, then take it out of the list of funnels. If the
    tube was previously identified as a 'tap', i.e. a tube that is not
    connected to another slot, then take it out of the list of taps. A tube
    that already feeds another slot is routed through a newly created clone
    tank so that both slots get their own tube.

    Parameters
    ----------
    input_dict : dict(
      keys - Slot keys. Must be the same as the attribute slot_keys.
      values - Tube, None or some valid input data type.
    )
      The inputs to the tank. Values that are not Tubes are skipped.
    waterwork : Waterwork
      The waterwork whose `taps` and `funnels` are updated.

    Returns
    -------
    None

    Raises
    ------
    ValueError
      If a slot or tube that has just been connected carries a plug.

    """
    # NOTE(review): the source was flattened; the nesting below (plug checks
    # level with the membership checks, final block inside the else branch) is
    # a reconstruction -- confirm against the original indentation.
    for key in input_dict:
        slot = self.slots[key]
        tube = input_dict[key]
        # Only actual tubes need joining; plain values are left alone.
        if type(tube) is not tu.Tube:
            continue
        # If the tube was already used for another tank, then it'll have to be
        # cloned.
        if type(tube.slot) is not Empty:
            # Save the slot in order to connect it to the clone tube later.
            other_slot = tube.slot
            tube.slot = empty
            # Imported locally, presumably to avoid a circular import between
            # this module and tank_defs -- TODO confirm.
            import wtrwrks.tanks.tank_defs as td
            c_tubes, c_slots = td.clone(a=tube)
            # Join the other slot to the 'b' tube of the clone tank
            other_slot.tube = c_tubes['b']
            c_tubes['b'].slot = other_slot
            # Join this slot to the 'a' tube of the clone tank
            slot.tube = c_tubes['a']
            c_tubes['a'].slot = slot
            # Remove the newly created clone tubes from the taps, since they
            # are immediately connected to slots.
            del waterwork.taps[c_tubes['a'].name]
            del waterwork.taps[c_tubes['b'].name]
            # Remove the slots from the funnels, since the clone slot is now
            # the funnel.
            if slot.name in waterwork.funnels:
                del waterwork.funnels[slot.name]
            if slot.plug is not None:
                raise ValueError(
                    str(slot) + ' was plugged but is no longer a funnel. Only funnels can have plugs.'
                )
            if other_slot.name in waterwork.funnels:
                del waterwork.funnels[other_slot.name]
            if other_slot.plug is not None:
                raise ValueError(
                    str(other_slot) + ' was plugged but is no longer a funnel. Only funnels can have plugs.'
                )
        else:
            # The tube is still free: connect it directly to this slot.
            tube.slot = slot
            slot.tube = tube
            # Always true here, since non-Tube values were skipped above.
            if type(tube) is tu.Tube:
                if tube.name in waterwork.taps:
                    del waterwork.taps[tube.name]
                if slot.name in waterwork.funnels:
                    del waterwork.funnels[slot.name]
                if slot.plug is not None:
                    raise ValueError(
                        str(slot) + ' was plugged but is no longer a funnel. Only funnels can have plugs.'
                    )
                if tube.plug is not None:
                    raise ValueError(
                        str(tube) + ' was plugged but is no longer a tap. Only taps can have plugs.'
                    )