def __init__(self, kb, size, num_buckets, rel2seq, batch_size, learning_rate=1e-2):
    """Build a bucketed sequence-composition training graph.

    Each KB relation is mapped to a token sequence via ``rel2seq``.
    Sequences are grouped into ``num_buckets`` length buckets of roughly
    equal total frequency, and one gradient/update op is built per bucket
    so that batches of similar length can be trained together.

    Args:
        kb: knowledge base; must provide ``get_all_facts()`` yielding
            ``((rel, subj, obj), _, typ)`` entries (only ``rel`` is used here).
        size: dimensionality of the composed relation representation.
        num_buckets: number of sequence-length buckets to create.
        rel2seq: callable mapping a relation to a sequence of tokens.
        batch_size: fixed batch size used for the padded input buffers.
        learning_rate: initial Adam learning rate (held in a tf.Variable so
            it can be adjusted externally).
    """
    self._kb = kb
    self._size = size
    self._batch_size = batch_size
    self._rel2seq = rel2seq
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False, name="lr")
    # beta1=0.0 disables Adam's first-moment averaging, leaving only the
    # per-parameter second-moment scaling.
    self.opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.0)
    # First pass over the KB: build the token vocabulary (id 0 reserved for
    # padding) and a histogram of sequence lengths used for bucketing below.
    l_count = dict()  # sequence length -> number of facts with that length
    total = 0         # total number of facts seen
    max_l = 0         # longest sequence observed
    self._vocab = {"#PADDING#": 0}
    for (rel, _, _), _, typ in kb.get_all_facts():
        s = self._rel2seq(rel)
        l = len(s)
        for word in s:
            if word not in self._vocab:
                self._vocab[word] = len(self._vocab)
        max_l = max(max_l, l)
        if l not in l_count:
            l_count[l] = 0
        l_count[l] += 1
        total += 1
    # One int64 placeholder per time step (inputs are fed time-major).
    self._seq_inputs = [tf.placeholder(tf.int64, shape=[None], name="seq_input%d" % i)
                        for i in xrange(max_l)]
    with vs.variable_scope("composition", initializer=model.default_init()):
        # seq_outputs appears to hold one output tensor per sequence length,
        # indexed by length -- TODO confirm against _comp_f().
        seq_outputs = self._comp_f()
    # Greedily choose length cut-offs so each bucket covers roughly
    # total/num_buckets facts.
    # NOTE(review): `total / num_buckets` is Python 2 integer division and
    # raises ZeroDivisionError when total < num_buckets; under Python 3 the
    # true division would change the bucketing -- verify before porting.
    self._bucket_outputs = []
    ct = 0
    self._buckets = []
    for l in xrange(max_l):
        c = l_count.get(l)
        if c:
            ct += c
            if ct % (total / num_buckets) < c:
                self._bucket_outputs.append(seq_outputs[l])
                self._buckets.append(l)
    # Ensure the final bucket always covers the maximum sequence length.
    if len(self._buckets) >= num_buckets:
        self._buckets[-1] = max_l
        self._bucket_outputs[-1] = seq_outputs[-1]
    else:
        self._buckets.append(max_l)
        self._bucket_outputs.append(seq_outputs[-1])
    # Reusable time-major input buffer, pre-filled with padding id 0.
    self._input = [[0]*self._batch_size for _ in xrange(max_l)]  # fill input with padding
    self._feed_dict = dict()
    # Only variables created under the "composition" scope are trained here
    # (Python 2 filter returns a list).
    train_params = filter(lambda v: "composition" in v.name, tf.trainable_variables())
    # The gradient w.r.t. the composed relation representation is fed in by
    # the downstream model and back-propagated through the composition
    # function via tf.gradients' grad_ys argument.
    self._grad = tf.placeholder(tf.float32, shape=[None, self._size], name="rel_grad")
    self._grad_in = np.zeros((self._batch_size, self._size), dtype=np.float32)
    self._grads = [tf.gradients(o, train_params, self._grad) for o in self._bucket_outputs]
    # One parameter-update op per bucket, applying that bucket's gradients.
    self._bucket_update = [self.opt.apply_gradients(zip(grads, train_params))
                           for o, grads in zip(self._bucket_outputs, self._grads)]
def __init__(self, kb, size, batch_size, is_train=True, num_neg=200, learning_rate=1e-2, l2_lambda=0.0, is_batch_training=False):
    """Build the scoring and training graph for a KB embedding model.

    Triple scores come from the subclass hook ``_scoring_f()``.  During
    training, each batch is laid out as groups of (1 positive followed by
    ``num_neg`` negative) triples; a softmax cross-entropy over each group
    pushes the positive score above its negatives.

    Args:
        kb: the knowledge base backing this model.
        size: embedding dimensionality.
        batch_size: total triples per batch; must be a multiple of
            ``num_neg + 1`` when training.
        is_train: build online (Adam) training ops.
        num_neg: number of negative examples per positive example.
        learning_rate: initial learning rate (held in a tf.Variable).
        l2_lambda: if > 0, add L2 regularization over trainable params.
        is_batch_training: accumulate gradients across mini-batches and
            apply them with RProp instead of updating online.
    """
    self._kb = kb
    self._size = size
    self._batch_size = batch_size
    self._is_batch_training = is_batch_training
    self._is_train = is_train
    self._init = model.default_init()
    with vs.variable_scope(self.name(), initializer=self._init):
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False, name="lr")
        self.global_step = tf.Variable(0, trainable=False, name="step")
        # Keep optimizer creation (and its slot variables) on the CPU.
        with tf.device("/cpu:0"):
            if is_batch_training:
                self.opt = rprop.RPropOptimizer()  # tf.train.GradientDescentOptimizer(self.learning_rate)
            else:
                # beta1=0.0: Adam without first-moment averaging.
                self.opt = tf.train.AdamOptimizer(self.learning_rate, beta1=0.0)
        self._init_inputs()
        with vs.variable_scope("score", initializer=self._init):
            self._scores = self._scoring_f()
        if is_train or is_batch_training:
            assert batch_size % (num_neg+1) == 0, "Batch size must be multiple of num_neg+1 during training"
            #with vs.variable_scope("score", initializer=init):
            #    tf.get_variable_scope().reuse_variables()
            #    for i in xrange(num_neg):
            #        self.triple_inputs.append((tf.placeholder(tf.int64, shape=[None], name="rel_%d" % (i+1)),
            #                                   tf.placeholder(tf.int64, shape=[None], name="subj_%d" % (i+1)),
            #                                   tf.placeholder(tf.int64, shape=[None], name="obj_%d" % (i+1))))
            #        self.scores.append(
            #            self._scoring_f(self.triple_inputs[i+1][0], self.triple_inputs[i+1][1], self.triple_inputs[i+1][2]))
            # Reshape the flat score vector into one row per positive
            # example; column 0 is the positive triple of each group.
            num_pos = int(batch_size/(num_neg+1))
            scores = tf.reshape(self._scores, [num_pos, num_neg + 1])
            labels = np.zeros([num_pos, num_neg+1], dtype=np.float32)
            labels[:, 0] = 1
            labels = tf.constant(labels, name="labels_constant", dtype=tf.float32)
            # Legacy positional API: (logits, labels).
            loss = math_ops.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(scores, labels))
            # Train only variables created under this model's name scope
            # (Python 2 filter returns a list).
            train_params = filter(lambda v: self.name() in v.name, tf.trainable_variables())
            # Scalar weight multiplied into the gradients (grad_ys below).
            self.training_weight = tf.Variable(float(learning_rate), trainable=False, name="training_weight")
            # NOTE(review): self._feed_dict is presumably created by
            # _init_inputs() -- confirm.
            self._feed_dict[self.training_weight] = np.array([1.0], dtype=np.float32)
            with tf.device("/cpu:0"):
                #clipped_gradients = _clip_by_value(self.grads, -max_grad, max_grad)
                if is_batch_training:
                    self._grads = tf.gradients(loss, train_params, self.training_weight)
                    with vs.variable_scope("batch_gradient", initializer=self._init):
                        # One non-trainable accumulator variable per trained
                        # parameter, zero-initialized (Py2 map -> list).
                        self._acc_gradients = map(lambda param: tf.get_variable(param.name.split(":")[0], param.get_shape(), param.dtype, tf.constant_initializer(0.0), False), train_params)
                        self._loss = tf.get_variable("acc_loss", (), tf.float32, tf.constant_initializer(0.0), False)
                    # We abuse the gradient descent optimizer for accumulating gradients and loss (summing):
                    # with learning rate -1, apply_gradients(g, acc) does acc += g.
                    acc_opt = tf.train.GradientDescentOptimizer(-1.0)
                    self._accumulate_gradients = acc_opt.apply_gradients(zip(self._grads, self._acc_gradients))
                    self._acc_loss = acc_opt.apply_gradients([(loss, self._loss)])
                    # Apply the accumulated gradients with the real optimizer.
                    self._update = self.opt.apply_gradients(
                        zip(map(lambda v: v.value(), self._acc_gradients), train_params),
                        global_step=self.global_step)
                    # Running self._reset re-zeros accumulators and loss.
                    self._reset = map(lambda param: param.initializer, self._acc_gradients)
                    self._reset.append(self._loss.initializer)
                else:
                    # Online training: mean loss per positive example.
                    self._loss = loss / math_ops.cast(num_pos, dtypes.float32)
                    in_params = self._input_params()
                    if not in_params:
                        self._grads = tf.gradients(self._loss, train_params, self.training_weight)
                    else:
                        # Also compute gradients w.r.t. model inputs so they
                        # can be passed to an upstream composition model.
                        self._grads = tf.gradients(self._loss, train_params + in_params, self.training_weight)
                        self._input_grads = self._grads[len(train_params):]
                    if len(train_params) > 0:
                        self._update = self.opt.apply_gradients(
                            zip(self._grads[:len(train_params)], train_params),
                            global_step=self.global_step)
                if l2_lambda > 0.0:
                    l2 = tf.reduce_sum(array_ops.pack([tf.nn.l2_loss(t) for t in train_params]))
                    l2_loss = l2_lambda * l2
                    if is_batch_training:
                        # Fold the L2 gradients/loss into the same accumulators.
                        l2_grads = tf.gradients(l2_loss, train_params)
                        self._l2_accumulate_gradients = acc_opt.apply_gradients(zip(l2_grads, self._acc_gradients))
                        self._l2_acc_loss = acc_opt.apply_gradients([(l2_loss, self._loss)])
                    else:
                        # Separate plain-SGD step for the L2 penalty only.
                        self._l2_update = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(l2_loss, var_list=train_params)
    self.saver = tf.train.Saver(tf.all_variables())
def __init__(self, kb, size, num_buckets, rel2seq, batch_size, learning_rate=1e-2):
    """Build a bucketed sequence-composition training graph.

    Each KB relation is mapped to a token sequence via ``rel2seq``.
    Sequences are grouped into ``num_buckets`` length buckets of roughly
    equal total frequency, and one gradient/update op is built per bucket
    so that batches of similar length can be trained together.

    Args:
        kb: knowledge base; must provide ``get_all_facts()`` yielding
            ``((rel, subj, obj), _, typ)`` entries (only ``rel`` is used here).
        size: dimensionality of the composed relation representation.
        num_buckets: number of sequence-length buckets to create.
        rel2seq: callable mapping a relation to a sequence of tokens.
        batch_size: fixed batch size used for the padded input buffers.
        learning_rate: initial Adam learning rate (held in a tf.Variable so
            it can be adjusted externally).
    """
    self._kb = kb
    self._size = size
    self._batch_size = batch_size
    self._rel2seq = rel2seq
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False, name="lr")
    # beta1=0.0 disables Adam's first-moment averaging, leaving only the
    # per-parameter second-moment scaling.
    self.opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.0)
    # First pass over the KB: build the token vocabulary (id 0 reserved for
    # padding) and a histogram of sequence lengths used for bucketing below.
    l_count = dict()  # sequence length -> number of facts with that length
    total = 0         # total number of facts seen
    max_l = 0         # longest sequence observed
    self._vocab = {"#PADDING#": 0}
    for (rel, _, _), _, typ in kb.get_all_facts():
        s = self._rel2seq(rel)
        l = len(s)
        for word in s:
            if word not in self._vocab:
                self._vocab[word] = len(self._vocab)
        max_l = max(max_l, l)
        if l not in l_count:
            l_count[l] = 0
        l_count[l] += 1
        total += 1
    # One int64 placeholder per time step (inputs are fed time-major).
    self._seq_inputs = [
        tf.placeholder(tf.int64, shape=[None], name="seq_input%d" % i)
        for i in xrange(max_l)
    ]
    with vs.variable_scope("composition", initializer=model.default_init()):
        # seq_outputs appears to hold one output tensor per sequence length,
        # indexed by length -- TODO confirm against _comp_f().
        seq_outputs = self._comp_f()
    # Greedily choose length cut-offs so each bucket covers roughly
    # total/num_buckets facts.
    # NOTE(review): `total / num_buckets` is Python 2 integer division and
    # raises ZeroDivisionError when total < num_buckets; under Python 3 the
    # true division would change the bucketing -- verify before porting.
    self._bucket_outputs = []
    ct = 0
    self._buckets = []
    for l in xrange(max_l):
        c = l_count.get(l)
        if c:
            ct += c
            if ct % (total / num_buckets) < c:
                self._bucket_outputs.append(seq_outputs[l])
                self._buckets.append(l)
    # Ensure the final bucket always covers the maximum sequence length.
    if len(self._buckets) >= num_buckets:
        self._buckets[-1] = max_l
        self._bucket_outputs[-1] = seq_outputs[-1]
    else:
        self._buckets.append(max_l)
        self._bucket_outputs.append(seq_outputs[-1])
    # Reusable time-major input buffer, pre-filled with padding id 0.
    self._input = [[0] * self._batch_size for _ in xrange(max_l)]  # fill input with padding
    self._feed_dict = dict()
    # Only variables created under the "composition" scope are trained here
    # (Python 2 filter returns a list).
    train_params = filter(lambda v: "composition" in v.name, tf.trainable_variables())
    # The gradient w.r.t. the composed relation representation is fed in by
    # the downstream model and back-propagated through the composition
    # function via tf.gradients' grad_ys argument.
    self._grad = tf.placeholder(tf.float32, shape=[None, self._size], name="rel_grad")
    self._grad_in = np.zeros((self._batch_size, self._size), dtype=np.float32)
    self._grads = [
        tf.gradients(o, train_params, self._grad) for o in self._bucket_outputs
    ]
    # One parameter-update op per bucket, applying that bucket's gradients.
    self._bucket_update = [
        self.opt.apply_gradients(zip(grads, train_params))
        for o, grads in zip(self._bucket_outputs, self._grads)
    ]
def __init__(self, kb, size, batch_size, is_train=True, num_neg=200, learning_rate=1e-2, l2_lambda=0.0, is_batch_training=False):
    """Build the scoring and training graph for a KB embedding model.

    Triple scores come from the subclass hook ``_scoring_f()``.  During
    training, each batch is laid out as groups of (1 positive followed by
    ``num_neg`` negative) triples; a softmax cross-entropy over each group
    pushes the positive score above its negatives.

    Args:
        kb: the knowledge base backing this model.
        size: embedding dimensionality.
        batch_size: total triples per batch; must be a multiple of
            ``num_neg + 1`` when training.
        is_train: build online (Adam) training ops.
        num_neg: number of negative examples per positive example.
        learning_rate: initial learning rate (held in a tf.Variable).
        l2_lambda: if > 0, add L2 regularization over trainable params.
        is_batch_training: accumulate gradients across mini-batches and
            apply them with RProp instead of updating online.
    """
    self._kb = kb
    self._size = size
    self._batch_size = batch_size
    self._is_batch_training = is_batch_training
    self._is_train = is_train
    self._init = model.default_init()
    with vs.variable_scope(self.name(), initializer=self._init):
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False, name="lr")
        self.global_step = tf.Variable(0, trainable=False, name="step")
        # Keep optimizer creation (and its slot variables) on the CPU.
        with tf.device("/cpu:0"):
            if is_batch_training:
                self.opt = rprop.RPropOptimizer()  # tf.train.GradientDescentOptimizer(self.learning_rate)
            else:
                # beta1=0.0: Adam without first-moment averaging.
                self.opt = tf.train.AdamOptimizer(self.learning_rate, beta1=0.0)
        self._init_inputs()
        with vs.variable_scope("score", initializer=self._init):
            self._scores = self._scoring_f()
        if is_train or is_batch_training:
            assert batch_size % (num_neg + 1) == 0, "Batch size must be multiple of num_neg+1 during training"
            #with vs.variable_scope("score", initializer=init):
            #    tf.get_variable_scope().reuse_variables()
            #    for i in xrange(num_neg):
            #        self.triple_inputs.append((tf.placeholder(tf.int64, shape=[None], name="rel_%d" % (i+1)),
            #                                   tf.placeholder(tf.int64, shape=[None], name="subj_%d" % (i+1)),
            #                                   tf.placeholder(tf.int64, shape=[None], name="obj_%d" % (i+1))))
            #        self.scores.append(
            #            self._scoring_f(self.triple_inputs[i+1][0], self.triple_inputs[i+1][1], self.triple_inputs[i+1][2]))
            # Reshape the flat score vector into one row per positive
            # example; column 0 is the positive triple of each group.
            num_pos = int(batch_size / (num_neg + 1))
            scores = tf.reshape(self._scores, [num_pos, num_neg + 1])
            labels = np.zeros([num_pos, num_neg + 1], dtype=np.float32)
            labels[:, 0] = 1
            labels = tf.constant(labels, name="labels_constant", dtype=tf.float32)
            # Legacy positional API: (logits, labels).
            loss = math_ops.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(scores, labels))
            # Train only variables created under this model's name scope
            # (Python 2 filter returns a list).
            train_params = filter(lambda v: self.name() in v.name, tf.trainable_variables())
            # Scalar weight multiplied into the gradients (grad_ys below).
            self.training_weight = tf.Variable(float(learning_rate), trainable=False, name="training_weight")
            # NOTE(review): self._feed_dict is presumably created by
            # _init_inputs() -- confirm.
            self._feed_dict[self.training_weight] = np.array([1.0], dtype=np.float32)
            with tf.device("/cpu:0"):
                #clipped_gradients = _clip_by_value(self.grads, -max_grad, max_grad)
                if is_batch_training:
                    self._grads = tf.gradients(loss, train_params, self.training_weight)
                    with vs.variable_scope("batch_gradient", initializer=self._init):
                        # One non-trainable accumulator variable per trained
                        # parameter, zero-initialized (Py2 map -> list).
                        self._acc_gradients = map(
                            lambda param: tf.get_variable(
                                param.name.split(":")[0], param.get_shape(),
                                param.dtype, tf.constant_initializer(0.0), False),
                            train_params)
                        self._loss = tf.get_variable("acc_loss", (), tf.float32, tf.constant_initializer(0.0), False)
                    # We abuse the gradient descent optimizer for accumulating gradients and loss (summing):
                    # with learning rate -1, apply_gradients(g, acc) does acc += g.
                    acc_opt = tf.train.GradientDescentOptimizer(-1.0)
                    self._accumulate_gradients = acc_opt.apply_gradients(zip(self._grads, self._acc_gradients))
                    self._acc_loss = acc_opt.apply_gradients([(loss, self._loss)])
                    # Apply the accumulated gradients with the real optimizer.
                    self._update = self.opt.apply_gradients(
                        zip(map(lambda v: v.value(), self._acc_gradients), train_params),
                        global_step=self.global_step)
                    # Running self._reset re-zeros accumulators and loss.
                    self._reset = map(lambda param: param.initializer, self._acc_gradients)
                    self._reset.append(self._loss.initializer)
                else:
                    # Online training: mean loss per positive example.
                    self._loss = loss / math_ops.cast(num_pos, dtypes.float32)
                    in_params = self._input_params()
                    if not in_params:
                        self._grads = tf.gradients(self._loss, train_params, self.training_weight)
                    else:
                        # Also compute gradients w.r.t. model inputs so they
                        # can be passed to an upstream composition model.
                        self._grads = tf.gradients(self._loss, train_params + in_params, self.training_weight)
                        self._input_grads = self._grads[len(train_params):]
                    if len(train_params) > 0:
                        self._update = self.opt.apply_gradients(
                            zip(self._grads[:len(train_params)], train_params),
                            global_step=self.global_step)
                if l2_lambda > 0.0:
                    l2 = tf.reduce_sum(array_ops.pack([tf.nn.l2_loss(t) for t in train_params]))
                    l2_loss = l2_lambda * l2
                    if is_batch_training:
                        # Fold the L2 gradients/loss into the same accumulators.
                        l2_grads = tf.gradients(l2_loss, train_params)
                        self._l2_accumulate_gradients = acc_opt.apply_gradients(zip(l2_grads, self._acc_gradients))
                        self._l2_acc_loss = acc_opt.apply_gradients([(l2_loss, self._loss)])
                    else:
                        # Separate plain-SGD step for the L2 penalty only.
                        self._l2_update = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(l2_loss, var_list=train_params)
    self.saver = tf.train.Saver(tf.all_variables())