def _init(self, loader: ResourceLoader, voc: Iterable[str]):
    # TODO we should not be building variables here
    if voc is not None:
        word_to_vec = loader.load_word_vec(self.vec_name, voc)
    else:
        word_to_vec = loader.load_word_vec(self.vec_name)
        voc = set(word_to_vec.keys())

    self._word_to_ix = {}
    dim = next(iter(word_to_vec.values())).shape[0]

    # Index 0 is reserved for padding, index 1 for unknown words
    null_embed = tf.zeros((1, dim), dtype=tf.float32)
    unk_embed = tf.get_variable(shape=(1, dim), name="unk_embed",
                                dtype=tf.float32, trainable=self.learn_unk,
                                initializer=tf.random_uniform_initializer(
                                    -self.word_vec_init_scale, self.word_vec_init_scale))
    ix = 2
    matrix_list = [null_embed, unk_embed]

    if self._special_tokens is not None and len(self._special_tokens) > 0:
        print("Building embeddings for %d special_tokens" % len(self._special_tokens))
        tok_embed = tf.get_variable(shape=(len(self._special_tokens), dim),
                                    name="token_embed", dtype=tf.float32, trainable=True,
                                    initializer=tf.random_uniform_initializer(
                                        -self.word_vec_init_scale, self.word_vec_init_scale))
        matrix_list.append(tok_embed)
        for token in self._special_tokens:
            self._word_to_ix[token] = ix
            ix += 1

    mat = []
    for word in voc:
        if word in self._word_to_ix:
            continue  # already added, e.g. after falling back from a capitalized version of `word`
        if word in word_to_vec:
            mat.append(word_to_vec[word])
            self._word_to_ix[word] = ix
            ix += 1
        else:
            lower = word.lower()  # Fall back to the lower-case version
            if lower in word_to_vec and lower not in self._word_to_ix:
                mat.append(word_to_vec[lower])
                self._word_to_ix[lower] = ix
                ix += 1

    print("Had pre-trained word embeddings for %d of %d words" % (len(mat), len(voc)))

    matrix_list.append(tf.constant(value=np.vstack(mat)))
    self._word_emb_mat = tf.concat(matrix_list, axis=0)
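# A minimal usage sketch, not part of the original class: how a caller might map
# tokens to rows of self._word_emb_mat built above. It assumes the module-level
# `np`/`tf` imports; `embedder`, `tokens`, and `words_to_ids` are hypothetical
# names introduced here for illustration only.
def words_to_ids(embedder, tokens):
    # Unknown words fall back to index 1, the "unk_embed" row
    return np.array([embedder._word_to_ix.get(t, 1) for t in tokens], dtype=np.int32)

# ids = words_to_ids(embedder, ["the", "cat", "zzzunseen"])
# vectors = tf.nn.embedding_lookup(embedder._word_emb_mat, ids)  # shape (n_tokens, dim)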
def _init(self, loader: ResourceLoader, voc: Iterable[str]):
    # TODO we should not be building variables here
    if voc is not None:
        word_to_vec = loader.load_word_vec(self.vec_name, voc)
    else:
        word_to_vec = loader.load_word_vec(self.vec_name)
        voc = set(word_to_vec.keys())

    self._word_to_ix = {}
    dim = next(iter(word_to_vec.values())).shape[0]
    if self.placeholder_flag:
        # Reserve an extra dimension that flags placeholder embeddings
        dim += 1

    # Index 0 is reserved for padding
    null_embed = tf.zeros((1, dim), dtype=tf.float32)
    ix = 1
    matrix_list = [null_embed]

    if self._special_tokens is not None and len(self._special_tokens) > 0:
        print("Building embeddings for %d special_tokens" % len(self._special_tokens))
        tok_embed = tf.get_variable(
            shape=(len(self._special_tokens), dim), name="token_embed",
            dtype=tf.float32, trainable=True,
            initializer=tf.random_uniform_initializer(
                -self.word_vec_init_scale, self.word_vec_init_scale))
        matrix_list.append(tok_embed)
        for token in self._special_tokens:
            self._word_to_ix[token] = ix
            ix += 1

    mat = []
    for word in voc:
        if word in self._word_to_ix:
            continue  # already added, e.g. after falling back from a capitalized version of `word`
        if word in word_to_vec:
            mat.append(word_to_vec[word])
            self._word_to_ix[word] = ix
            ix += 1
        else:
            lower = word.lower()  # Fall back to the lower-case version
            if lower in word_to_vec and lower not in self._word_to_ix:
                mat.append(word_to_vec[lower])
                self._word_to_ix[lower] = ix
                ix += 1

    print("Had pre-trained word embeddings for %d of %d words" % (len(mat), len(voc)))

    mat = np.vstack(mat)
    if self.placeholder_flag:
        # The flag dimension is zero for regular (pre-trained) words
        mat = np.concatenate([mat, np.zeros((len(mat), 1), dtype=np.float32)], axis=1)
    matrix_list.append(tf.constant(value=mat))

    self._placeholder_start = ix

    if self.placeholder_flag:
        def init(shape, dtype=None, partition_info=None):
            # Random placeholder vectors, with the flag dimension set to one
            out = tf.random_normal((self.n_placeholders, dim - 1),
                                   stddev=self.placeholder_stddev)
            return tf.concat([out, tf.ones((self.n_placeholders, 1))], axis=1)
        init_fn = init
    else:
        init_fn = tf.random_normal_initializer(stddev=self.placeholder_stddev)

    matrix_list.append(
        tf.get_variable("placeholders", (self.n_placeholders, mat.shape[1]),
                        tf.float32, trainable=False, initializer=init_fn))
    self._word_emb_mat = tf.concat(matrix_list, axis=0)
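# A hedged sketch of how a caller could use the placeholder block built above;
# the helper and its assignment policy are assumptions, not part of the original
# code. Out-of-vocabulary words get ids from the fixed block of random
# "placeholders" starting at self._placeholder_start; when placeholder_flag is
# set, those rows carry a trailing flag of 1 so downstream layers can tell them
# apart from pre-trained vectors. Assumes the module-level `np` import.
def words_to_ids_with_placeholders(embedder, tokens):
    ids, oov_assignment = [], {}
    for t in tokens:
        if t in embedder._word_to_ix:
            ids.append(embedder._word_to_ix[t])
        else:
            # Reuse one placeholder per distinct OOV word, wrapping past n_placeholders
            if t not in oov_assignment:
                oov_assignment[t] = embedder._placeholder_start + \
                    (len(oov_assignment) % embedder.n_placeholders)
            ids.append(oov_assignment[t])
    return np.array(ids, dtype=np.int32)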
def _init(self, loader: ResourceLoader, voc: Iterable[str],
          allow_update=False, do_update=False):
    # TODO we should not be building variables here
    if voc is not None:
        word_to_vec = loader.load_word_vec(self.vec_name, voc)
    else:
        word_to_vec = loader.load_word_vec(self.vec_name)
        voc = set(word_to_vec.keys())

    self._word_to_ix = {}
    dim = next(iter(word_to_vec.values())).shape[0]

    # Index 0 is reserved for padding, index 1 for unknown words
    null_embed = tf.zeros((1, dim), dtype=tf.float32)
    if not do_update:
        self.unk_embed = tf.get_variable(
            shape=(1, dim), name="unk_embed", dtype=tf.float32,
            trainable=self.learn_unk,
            initializer=tf.random_uniform_initializer(
                -self.word_vec_init_scale, self.word_vec_init_scale))
    ix = 2
    matrix_list = [null_embed, self.unk_embed]

    if self._special_tokens is not None and len(self._special_tokens) > 0:
        print("Building embeddings for %d special_tokens" % len(self._special_tokens))
        tok_embed = tf.get_variable(
            shape=(len(self._special_tokens), dim), name="token_embed",
            dtype=tf.float32, trainable=True,
            initializer=tf.random_uniform_initializer(
                -self.word_vec_init_scale, self.word_vec_init_scale))
        matrix_list.append(tok_embed)
        for token in self._special_tokens:
            self._word_to_ix[token] = ix
            ix += 1

    mat = []
    for word in voc:
        if word in self._word_to_ix:
            continue  # already added, e.g. after falling back from a capitalized version of `word`
        if word in word_to_vec:
            mat.append(word_to_vec[word])
            self._word_to_ix[word] = ix
            ix += 1
        else:
            lower = word.lower()  # Fall back to the lower-case version
            if lower in word_to_vec and lower not in self._word_to_ix:
                mat.append(word_to_vec[lower])
                self._word_to_ix[lower] = ix
                ix += 1

    print("Had pre-trained word embeddings for %d of %d words" % (len(mat), len(voc)))

    # The encoder will feed this as the value of self.common_word_mat,
    # which lets us quickly change the vocabulary at test time
    self.common_word_mat_np = np.vstack(mat)

    if not do_update:
        # Set up the tf graph only once
        if allow_update:
            self.common_word_mat = tf.placeholder(tf.float32, shape=(None, dim),
                                                  name='common_word_mat')
            matrix_list.append(self.common_word_mat)
        else:
            self.common_word_mat = None
            matrix_list.append(tf.constant(value=self.common_word_mat_np))
        self._word_emb_mat = tf.concat(matrix_list, axis=0)
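# A minimal sketch of the test-time vocabulary swap this variant enables;
# `embedder`, `sess`, and `ids` are assumed names, not from the original code.
# When built with allow_update=True, the common-word block is a tf.placeholder,
# so calling _init again with do_update=True refreshes common_word_mat_np for a
# new vocabulary without rebuilding graph variables, and the fresh matrix is
# simply fed at run time:
lookup = tf.nn.embedding_lookup(embedder._word_emb_mat, ids)
vectors = sess.run(lookup,
                   feed_dict={embedder.common_word_mat: embedder.common_word_mat_np})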