def char2vec_one_hot_fast(pairs, character_positions_in_vocabulary):
    """Encode a batch of (word_char, punc_char, ...) tuples as int32 id columns.

    Args:
        pairs: a list of tuples (or a single tuple, which is wrapped into a
            one-element batch). Element 0 of each tuple is looked up in the
            word vocabulary; the remaining elements in the punctuation one.
        character_positions_in_vocabulary: a pair of position maps —
            index 0 for word characters, index 1 for punctuation characters.

    Returns:
        An int32 array of shape (batch, 1 + MAX_NUM_PUNCTUATION_MARKS):
        the word id column followed by one column per punctuation slot.
        Punctuation ids are shifted by +1 so that 0 marks an empty slot.
    """
    if not isinstance(pairs[0], tuple):
        pairs = [pairs]
    b_size = len(pairs)
    num_punc_marks = [len(pair) - 1 for pair in pairs]
    word_char_positions = character_positions_in_vocabulary[0]
    punctuation_char_positions = character_positions_in_vocabulary[1]
    word_vec = np.zeros(shape=(b_size, 1), dtype=np.int32)
    punc_vecs = [
        np.zeros(shape=(b_size, 1), dtype=np.int32)
        for _ in range(MAX_NUM_PUNCTUATION_MARKS)
    ]
    for b, pair in enumerate(pairs):
        word_vec[b, 0] = char2id(pair[0], word_char_positions)
        for punc_idx, punc_vec in enumerate(punc_vecs):
            if punc_idx < num_punc_marks[b]:
                # +1 reserves id 0 for "no punctuation mark in this slot".
                punc_vec[b, 0] = char2id(
                    pair[punc_idx + 1], punctuation_char_positions) + 1
            else:
                punc_vec[b, 0] = 0
    # NOTE: removed the debug-era `np.set_printoptions(threshold=np.nan, ...)`:
    # np.nan is rejected as a threshold by modern NumPy (raises TypeError), and
    # mutating the process-wide print options is an unwanted side effect here.
    return np.concatenate(tuple([word_vec] + punc_vecs), axis=1)
def _next_batch(self):
    """Generate a single batch from the current cursor position in the data.

    Returns:
        (inputs, labels) where inputs columns are
        [char one-hot | speaker flags | bot-speaks flag] and labels columns
        are [char one-hot | bot-speaks flag]; both float32. Advances every
        cursor by one (wrapping at text end) and increments the counter.
    """
    base = np.zeros(shape=(self._batch_size, self._vocabulary_size),
                    dtype=np.float32)
    bot_speaks_flags = np.zeros(shape=(self._batch_size, 1), dtype=np.float32)
    speaker_flags = np.zeros(
        shape=(self._batch_size, self._number_of_speakers), dtype=np.float32)
    for b in range(self._batch_size):
        pos = self._cursor[b]
        # `ch`, not `chr` — the original shadowed the builtin. The old
        # try/except IndexError re-raised unconditionally (its debug prints
        # were commented out), so it is dropped as a no-op.
        ch = self._text[pos]
        base[b, char2id(ch, self._character_positions_in_vocabulary)] = 1.0
        speaker_flags[b, self._speaker_flags[pos]] = 1.0
        bot_speaks_flags[b, 0] = float(self._bot_speaks_flags[pos])
        self._cursor[b] = (pos + 1) % self._text_size
    inputs = np.concatenate((base, speaker_flags, bot_speaks_flags), 1)
    labels = np.concatenate((base, bot_speaks_flags), 1)
    self._counter += 1
    return inputs, labels
def _start_batch(self):
    """Return a one-hot batch of newline characters used to seed generation."""
    # np.float was removed in NumPy 1.24; builtin float (float64) is the
    # exact replacement for the old alias.
    batch = np.zeros(shape=(self._batch_size, self._vocabulary_size),
                     dtype=float)
    for b in range(self._batch_size):
        batch[b, char2id('\n', self.characters_positions_in_vocabulary)] = 1.0
    return batch
def _next_batch(self):
    """Generate a single batch of character ids from the current cursor positions.

    Returns a (batch_size, 1) integer array and advances every cursor by one,
    wrapping at the end of the text.
    """
    rows = []
    for b in range(self._batch_size):
        pos = self._cursor[b]
        rows.append(
            [char2id(self._text[pos], self.character_positions_in_vocabulary)])
        self._cursor[b] = (pos + 1) % self._text_size
    return np.array(rows)
def _next_batch(self):
    """Generate a single one-hot batch from the current cursor position.

    Returns a (batch_size, vocabulary_size) one-hot array and advances every
    cursor by one, wrapping at the end of the text.
    """
    # np.float was removed in NumPy 1.24; builtin float (float64) is the
    # exact replacement for the old alias.
    batch = np.zeros(shape=(self._batch_size, self._vocabulary_size),
                     dtype=float)
    for b in range(self._batch_size):
        batch[b, char2id(self._text[self._cursor[b]],
                         self.characters_positions_in_vocabulary)] = 1.0
        self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return batch
def _next_batch(self):
    """Generate a single one-hot batch of pairs from the current cursor position.

    Returns a (batch_size, vocabulary_size) one-hot array and advances every
    cursor by one, wrapping at the number of pairs.
    """
    # np.float was removed in NumPy 1.24; builtin float (float64) is the
    # exact replacement for the old alias.
    batch = np.zeros(shape=(self._batch_size, self._vocabulary_size),
                     dtype=float)
    for b in range(self._batch_size):
        batch[b, char2id(self._pairs[self._cursor[b]],
                         self.character_positions_in_vocabulary)] = 1.0
        self._cursor[b] = (self._cursor[b] + 1) % self._number_of_pairs
    return batch
def _create_id_array(pairs, character_positions_in_vocabulary):
    """Map each token tuple to a row of shifted vocabulary ids.

    Each token of a pair is looked up in the vocabulary position map at the
    same index. Ids are offset by +1 so that 0 can mark empty trailing slots.
    Returns an int16 array of shape (len(pairs), MAX_NUM_PUNCTUATION_MARKS).
    """
    ids = np.zeros(shape=(len(pairs), MAX_NUM_PUNCTUATION_MARKS),
                   dtype=np.int16)
    for row, pair in enumerate(pairs):
        for col, (token, vocab_positions) in enumerate(
                zip(pair, character_positions_in_vocabulary)):
            ids[row, col] = char2id(token, vocab_positions) + 1
    return ids
def _next_batch_with_tokens(self):
    """Return (ids, tokens) for the pairs under the cursors, advancing them.

    `ids` is the stacked (batch_size, 1) id array; `tokens` is the list of
    the raw pair entries in the same order.
    """
    tokens = []
    id_rows = []
    for b in range(self._batch_size):
        pos = self._cursor[b]
        entry = self._pairs[pos]
        tokens.append(entry)
        id_rows.append(
            np.array([char2id(entry, self.character_positions_in_vocabulary)]))
        self._cursor[b] = (pos + 1) % self._number_of_pairs
    return np.stack(id_rows), tokens
def _next_batch_with_tokens(self):
    """Return (one-hot batch, tokens) for the pairs under the batch cursors.

    Advances every cursor by one, wrapping at the number of pairs.
    """
    # np.float was removed in NumPy 1.24; builtin float (float64) is the
    # exact replacement for the old alias.
    batch = np.zeros(shape=(self._batch_size, self._vocabulary_size),
                     dtype=float)
    tokens = list()
    for b in range(self._batch_size):
        tokens.append(self._pairs[self._cursor[b]])
        batch[b, char2id(self._pairs[self._cursor[b]],
                         self.character_positions_in_vocabulary)] = 1.0
        self._cursor[b] = (self._cursor[b] + 1) % self._number_of_pairs
    return batch, tokens
def char2vec(char, character_positions_in_vocabulary, speaker_idx, speaker_flag_size):
    """Encode one character plus speaker information as a float32 row.

    Layout of the returned (1, voc_size + speaker_flag_size + 1) vector:
        [0, voc_size)                       one-hot character
        [voc_size, voc_size + flag_size)    one-hot speaker index
        last column                         1.0 when speaker_idx > 0, else 0.0
    """
    voc_size = len(character_positions_in_vocabulary)
    width = voc_size + speaker_flag_size + 1
    vec = np.zeros(shape=(1, width), dtype=np.float32)
    char_pos = char2id(char, character_positions_in_vocabulary)
    vec[0, char_pos] = 1.0
    vec[0, voc_size + speaker_idx] = 1.0
    if speaker_idx > 0:
        vec[0, width - 1] = 1.0
    return vec
def char2vec(character_positions_in_vocabulary, char, speaker_flag_size=2,
             speaker_idx=0, bot_answer_flag=0, eod=False):
    """Encode one character plus speaker / bot-answer / end-of-document flags.

    Layout of the returned (1, voc_size + speaker_flag_size + 2) float32 row:
        [0, voc_size)                       one-hot character
        [voc_size, voc_size + flag_size)    one-hot speaker index
        next column                         bot_answer_flag as float
        last column                         eod as float
    """
    voc_size = len(character_positions_in_vocabulary)
    vec = np.zeros(shape=(1, voc_size + speaker_flag_size + 2),
                   dtype=np.float32)
    vec[0, char2id(char, character_positions_in_vocabulary)] = 1.0
    vec[0, voc_size + speaker_idx] = 1.0
    tail = voc_size + speaker_flag_size
    vec[0, tail] = float(bot_answer_flag)
    vec[0, tail + 1] = float(eod)
    return vec
def _start_batch(self):
    """Build the seed (inputs, labels): newline one-hot, speaker 1, bot silent.

    Columns mirror `_next_batch`: inputs are
    [char one-hot | speaker flags | bot-speaks flag], labels are
    [char one-hot | bot-speaks flag].
    """
    base = np.zeros(shape=(self._batch_size, self._vocabulary_size),
                    dtype=np.float32)
    bot_speaks_flags = np.zeros(shape=(self._batch_size, 1), dtype=np.float32)
    # This array previously defaulted to float64 while every sibling array is
    # float32, which silently promoted the concatenated batch to float64 —
    # keep the whole batch float32 for consistency with _next_batch.
    speaker_flags = np.zeros(
        shape=(self._batch_size, self._number_of_speakers), dtype=np.float32)
    for b in range(self._batch_size):
        base[b, char2id('\n', self._character_positions_in_vocabulary)] = 1.0
        speaker_flags[b, 1] = 1.
        bot_speaks_flags[b, 0] = 0.
    start_inputs = np.concatenate((base, speaker_flags, bot_speaks_flags), 1)
    start_labels = np.concatenate((base, bot_speaks_flags), 1)
    return start_inputs, start_labels
def _start_batch(self):
    """Seed batch: one-hot newline word plus empty punctuation slots.

    Returns the concatenation of the word one-hot block and
    MAX_NUM_PUNCTUATION_MARKS copies of the "no punctuation" block.
    """
    # np.float was removed in NumPy 1.24; builtin float (float64) is the
    # exact replacement for the old alias.
    word_batch = np.zeros(shape=(self._batch_size, self._vocabulary_sizes[0]),
                          dtype=float)
    for b in range(self._batch_size):
        word_batch[
            b, char2id('\n', self.character_positions_in_vocabulary[0])] = 1.0
    # Punctuation blocks reserve index 0 for "no punctuation mark";
    # the per-row loop is replaced by an equivalent vectorized column fill.
    no_punc_batch = np.zeros(
        shape=(self._batch_size, self._vocabulary_sizes[1] + 1), dtype=float)
    no_punc_batch[:, 0] = 1.0
    return np.concatenate(
        tuple([word_batch] + [no_punc_batch] * MAX_NUM_PUNCTUATION_MARKS),
        axis=1)
def _next_batch(self):
    """Generate a single batch from the current cursor position in the data.

    inputs columns: [char one-hot | speaker flags | bot-answer flag | eod flag]
    labels columns: [char one-hot | bot-answer flag]
    Advances every cursor by one, wrapping at the end of the text.
    """
    bs = self._batch_size
    base = np.zeros(shape=(bs, self._vocabulary_size), dtype=np.float32)
    bot_answer_flags = np.zeros(shape=(bs, 1), dtype=np.float32)
    speaker_flags = np.zeros(shape=(bs, self._number_of_speakers),
                             dtype=np.float32)
    eod_flags = np.zeros(shape=(bs, 1), dtype=np.float32)
    for b in range(bs):
        pos = self._cursor[b]
        base[b, char2id(self._text[pos],
                        self._character_positions_in_vocabulary)] = 1.0
        speaker_flags[b, self._speaker_flags[pos]] = 1.0
        eod_flags[b, 0] = float(self._eod_flags[pos])
        bot_answer_flags[b, 0] = float(self._bot_answer_flags[pos])
        self._cursor[b] = (pos + 1) % self._text_size
    inputs = np.concatenate(
        (base, speaker_flags, bot_answer_flags, eod_flags), 1)
    labels = np.concatenate((base, bot_answer_flags), 1)
    return inputs, labels
def _start_batch(self):
    """Return a (batch_size, 1) array filled with the id of the newline char."""
    # char2id is a pure lookup, so it is hoisted out of the comprehension.
    newline_id = char2id('\n', self.character_positions_in_vocabulary)
    return np.array([[newline_id] for _ in range(self._batch_size)])
def _create_id_array(pairs, character_positions_in_vocabulary):
    """Return a 1-D int16 array with the vocabulary id of each pair element."""
    # The original called the np.ndarray constructor directly (discouraged by
    # the NumPy docs; also passed shape as a bare int, not a tuple). np.empty
    # is the idiomatic equivalent — every slot is filled in the loop below.
    ids = np.empty(len(pairs), dtype=np.int16)
    for p_idx, p in enumerate(pairs):
        ids[p_idx] = char2id(p, character_positions_in_vocabulary)
    return ids
def char2vec(char, characters_positions_in_vocabulary, speaker_idx, speaker_flag_size):
    """Return the vocabulary id of `char` wrapped in a (1, 1, 1) array.

    `speaker_idx` and `speaker_flag_size` are accepted for signature
    compatibility with the one-hot variants but are not used here.
    """
    char_id = char2id(char, characters_positions_in_vocabulary)
    return np.reshape(np.array([char_id]), (1, 1, 1))
def char_2_base_vec(character_positions_in_vocabulary, char):
    """One-hot encode `char` into a (1, vocabulary_size) float32 row."""
    hot = char2id(char, character_positions_in_vocabulary)
    vec = np.zeros((1, len(character_positions_in_vocabulary)),
                   dtype=np.float32)
    vec[0, hot] = 1.0
    return vec
def _start_batch(self):
    """Seed batch: each row is the newline word id followed by zeroed
    punctuation slots, one row per batch entry.
    """
    # char2id is a pure lookup, so the row template is built once; np.array
    # copies, so sharing the template list across rows is safe.
    newline_id = char2id('\n', self.character_positions_in_vocabulary[0])
    row = [newline_id] + [0] * MAX_NUM_PUNCTUATION_MARKS
    return np.array([row for _ in range(self._batch_size)])