def TrainFn(fnsim, embeddings, leftop, rightop, loss=loss.hinge,
            loss_margin=1.0, op='', method='SGD', decay=0.999, epsilon=1e-6,
            max_learning_rate=None,
            weight_L1_param_regularizer=None, weight_L2_param_regularizer=None,
            weight_contractive_regularizer_left=None,
            weight_contractive_regularizer_right=None):
    """
    This function returns a Theano function performing one training iteration,
    contrasting pairs of positive and negative triplets. Triplet members are
    given as sparse matrices; for each positive triplet there is one negative
    triplet.

    :param fnsim: similarity function (on Theano variables).
    :param embeddings: an embeddings instance.
    :param leftop: class for the 'left' operator.
    :param rightop: class for the 'right' operator.
    """
    embedding, relationl, relationr = parse_embeddings(embeddings)

    # Inputs
    inpr, inpl, inpo = S.csr_matrix('inpr'), S.csr_matrix('inpl'), S.csr_matrix('inpo')
    inpln, inprn, inpon = S.csr_matrix('inpln'), S.csr_matrix('inprn'), S.csr_matrix('inpon')

    # Learning rates for parameters and embeddings
    rate_params = T.scalar('rate_params')
    rate_embeddings = T.scalar('rate_embeddings')

    # E: D x N, inp: N x B -> <E, inp>: D x B -> <E, inp>.T: B x D
    # Positive triplet functions
    lhs = S.dot(embedding.E, inpl).T
    rhs = S.dot(embedding.E, inpr).T
    rell = S.dot(relationl.E, inpo).T
    relr = S.dot(relationr.E, inpo).T

    # Negative triplet functions
    lhsn = S.dot(embedding.E, inpln).T
    rhsn = S.dot(embedding.E, inprn).T
    relln = S.dot(relationl.E, inpon).T
    relrn = S.dot(relationr.E, inpon).T

    # Similarity function, applied to the outputs of leftop and rightop
    lop, rop = leftop(lhs, rell), rightop(rhs, relr)
    lopn, ropn = leftop(lhsn, relln), rightop(rhsn, relrn)

    simi = fnsim(lop, rop)
    simin = fnsim(lopn, ropn)

    # Only pass the margin if the loss function accepts one
    supported_loss_args = inspect.getargspec(loss)[0]
    loss_args = {} if 'margin' not in supported_loss_args else {'margin': loss_margin}

    cost, out = loss(simi, simin, **loss_args)

    # <EXPERIMENTAL_CODE>
    # Should I also plug examples from corrupted triples?
    if weight_contractive_regularizer_left is not None:
        cost = cost + (weight_contractive_regularizer_left * R.contractive_regularizer(lop, lhs))
    if weight_contractive_regularizer_right is not None:
        cost = cost + (weight_contractive_regularizer_right * R.contractive_regularizer(rop, rhs))

    for rel_param in set([relationl.E, relationr.E]):
        if weight_L1_param_regularizer is not None:
            cost = cost + (weight_L1_param_regularizer * R.L1_regularizer(rel_param))
        if weight_L2_param_regularizer is not None:
            cost = cost + (weight_L2_param_regularizer * R.L2_regularizer(rel_param))
    # </EXPERIMENTAL_CODE>

    params = leftop.params + rightop.params + (fnsim.params if hasattr(fnsim, 'params') else [])
    params = list(set(params))

    embeds = [embedding.E] + ([relationr.E, relationl.E] if (type(embeddings) == list) else [])
    embeds = list(set(embeds))

    # The compiled function updates the shared variables according to `updates`.
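
    # The blocks below only allocate the shared state each update rule needs;
    # the actual update expressions are delegated to the U.* helpers further
    # down. Assuming those helpers implement the standard rules, the state is
    # used (schematically) as follows:
    #   MOMENTUM: velocity v,     v <- decay * v - rate * grad,   param <- param + v
    #   ADAGRAD:  accumulator G,  G <- G + grad^2,                param <- param - rate * grad / sqrt(G + epsilon)
    #   ADADELTA: accumulators of squared gradients and of squared updates
    #   RMSPROP:  accumulator G,  G <- decay * G + (1 - decay) * grad^2,
    #                             param <- param - rate * grad / sqrt(G + epsilon)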
    updates = collections.OrderedDict()

    if (method == 'SGD'):
        pass  # no additional state is needed

    elif (method == 'MOMENTUM'):
        param_previous_update_map = collections.OrderedDict()
        for param in params + embeds:
            # Allocate the previous updates
            previous_update_value = numpy.zeros(param.get_value().shape,
                                                dtype=theano.config.floatX)
            param_previous_update = theano.shared(value=previous_update_value,
                                                  name='su_' + param.name)
            param_previous_update_map[param] = param_previous_update

    elif (method == 'ADAGRAD'):
        param_squared_gradients_map = collections.OrderedDict()
        for param in params + embeds:
            # Allocate the sums of squared gradients
            squared_gradients_value = numpy.zeros(param.get_value().shape,
                                                  dtype=theano.config.floatX)
            param_squared_gradients = theano.shared(value=squared_gradients_value,
                                                    name='sg_' + param.name)
            param_squared_gradients_map[param] = param_squared_gradients

    elif (method == 'ADADELTA'):
        param_squared_gradients_map = collections.OrderedDict()
        param_squared_updates_map = collections.OrderedDict()
        for param in params + embeds:
            # Allocate the sums of squared gradients
            squared_gradients_value = numpy.zeros(param.get_value().shape,
                                                  dtype=theano.config.floatX)
            param_squared_gradients = theano.shared(value=squared_gradients_value,
                                                    name='sg_' + param.name)
            param_squared_gradients_map[param] = param_squared_gradients
            # Allocate the sums of squared updates
            squared_updates_value = numpy.zeros(param.get_value().shape,
                                                dtype=theano.config.floatX)
            param_squared_updates = theano.shared(value=squared_updates_value,
                                                  name='su_' + param.name)
            param_squared_updates_map[param] = param_squared_updates

    elif (method == 'RMSPROP'):
        param_squared_gradients_map = collections.OrderedDict()
        for param in params + embeds:
            # Allocate the sums of squared gradients
            squared_gradients_value = numpy.zeros(param.get_value().shape,
                                                  dtype=theano.config.floatX)
            param_squared_gradients = theano.shared(value=squared_gradients_value,
                                                    name='sg_' + param.name)
            param_squared_gradients_map[param] = param_squared_gradients

    else:
        raise ValueError('Unknown method: %s' % (method))

    # Parameter gradients
    gradientsparams = T.grad(cost, params)
    # Embeddings gradients
    gradientsembeds = T.grad(cost, embeds)

    # Learning rates. In TransE and similar models, the rate for the predicate
    # embeddings (which do not get normalized) is rate_params, not rate_embeddings.
    rates_params = [rate_params for i in range(len(params))]
    rates_embeddings = ([rate_embeddings, rate_params, rate_params]
                        if len(embeds) > 1 else [rate_embeddings])
    # rates_embeddings = [rate_embeddings for i in range(len(embeds))]

    for param, gradient, rate in zip(params + embeds,
                                     gradientsparams + gradientsembeds,
                                     rates_params + rates_embeddings):
        if (method == 'SGD'):  # SGD
            U.sgd(param, rate, gradient, updates)
        elif (method == 'MOMENTUM'):  # SGD+MOMENTUM
            param_previous_update = param_previous_update_map[param]
            U.momentum(param, rate, decay, gradient, updates,
                       param_previous_update)
        elif (method == 'ADAGRAD'):  # ADAGRAD
            param_squared_gradients = param_squared_gradients_map[param]
            U.adagrad(param, rate, epsilon, gradient, updates,
                      param_squared_gradients)
        elif (method == 'ADADELTA'):  # ADADELTA
            param_squared_gradients = param_squared_gradients_map[param]
            param_squared_updates = param_squared_updates_map[param]
            U.adadelta(param, rate, decay, epsilon, gradient, updates,
                       param_squared_gradients, param_squared_updates)
        elif (method == 'RMSPROP'):  # RMSPROP
            param_squared_gradients = param_squared_gradients_map[param]
            U.rmsprop(param, rate, decay, max_learning_rate, epsilon, gradient,
                      updates, param_squared_gradients)
        else:
            raise ValueError('Unknown method: %s' % (method))

    """
    Theano function inputs.
    :input rate_embeddings: learning/decay rate for the embeddings.
    :input rate_params: learning/decay rate for the parameters.

    :input inpl: sparse csr matrix representing the indexes of the positive
                 triplet 'left' member, shape=(N [Embeddings], #examples).
    :input inpr: sparse csr matrix representing the indexes of the positive
                 triplet 'right' member, shape=(N [Embeddings], #examples).
    :input inpo: sparse csr matrix representing the indexes of the positive
                 triplet relation member, shape=(N [Embeddings], #examples).

    :input inpln: sparse csr matrix representing the indexes of the negative
                  triplet 'left' member, shape=(N [Embeddings], #examples).
    :input inprn: sparse csr matrix representing the indexes of the negative
                  triplet 'right' member, shape=(N [Embeddings], #examples).
    :input inpon: sparse csr matrix representing the indexes of the negative
                  triplet relation member, shape=(N [Embeddings], #examples).

    Theano function output.
    :output mean(cost): average cost.
    :output mean(out): ratio of examples for which the margin is violated,
                       i.e. for which an update occurs.
    """
    return theano.function(
        [rate_embeddings, rate_params, inpl, inpr, inpo, inpln, inprn, inpon],
        [T.mean(cost), T.mean(out)],
        updates=updates, on_unused_input='ignore')
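
# Usage sketch (illustrative only): how a training loop might drive the
# function compiled by TrainFn. Here `fnsim`, `embeddings`, `leftop`, `rightop`
# and the mini-batch csr matrices (posl, posr, poso, negl, negr, nego) are
# placeholders for objects built elsewhere in this codebase, not defined here:
#
#   trainfn = TrainFn(fnsim, embeddings, leftop, rightop,
#                     loss=loss.hinge, loss_margin=1.0, method='ADAGRAD')
#   for epoch in range(n_epochs):
#       mean_cost, violation_ratio = trainfn(lr_embeddings, lr_params,
#                                            posl, posr, poso,
#                                            negl, negr, nego)
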
def TrainFn1Member(fnsim, embeddings, leftop, rightop, rel=True,
                   loss=loss.hinge, loss_margin=1.0, op=None, method='SGD',
                   decay=0.999, epsilon=1e-6, max_learning_rate=None,
                   weight_L1_param_regularizer=None,
                   weight_L2_param_regularizer=None,
                   weight_contractive_regularizer_left=None,
                   weight_contractive_regularizer_right=None):
    """
    This function returns a Theano function performing one training iteration,
    contrasting each positive triplet with negative triplets obtained by
    corrupting its 'left' member, its 'right' member and, if rel is True, its
    relation member.

    :param fnsim: similarity function (on Theano variables).
    :param embeddings: an embeddings instance.
    :param leftop: class for the 'left' operator.
    :param rightop: class for the 'right' operator.
    :param rel: whether to also contrast against a corrupted relation member.
    """
    embedding, relationl, relationr = parse_embeddings(embeddings)

    # Inputs
    inpr, inpl, inpo = S.csr_matrix('inpr'), S.csr_matrix('inpl'), S.csr_matrix('inpo')
    inpln, inprn = S.csr_matrix('inpln'), S.csr_matrix('inprn')

    # Learning rates for parameters and embeddings
    rate_params = T.scalar('rate_params')
    rate_embeddings = T.scalar('rate_embeddings')

    # Graph
    lhs = S.dot(embedding.E, inpl).T
    rhs = S.dot(embedding.E, inpr).T
    rell = S.dot(relationl.E, inpo).T
    relr = S.dot(relationr.E, inpo).T

    lhsn = S.dot(embedding.E, inpln).T
    rhsn = S.dot(embedding.E, inprn).T

    # Positive triplet similarity
    lop, rop = leftop(lhs, rell), rightop(rhs, relr)
    simi = fnsim(lop, rop)
    # Negative 'left' member
    similn = fnsim(leftop(lhsn, rell), rightop(rhs, relr))
    # Negative 'right' member
    simirn = fnsim(leftop(lhs, rell), rightop(rhsn, relr))

    costl, outl = loss(simi, similn, margin=loss_margin)
    costr, outr = loss(simi, simirn, margin=loss_margin)

    cost, out = costl + costr, T.concatenate([outl, outr])

    # List of inputs of the function
    list_in = [rate_embeddings, rate_params, inpl, inpr, inpo, inpln, inprn]

    if rel:
        # If rel is True, we also consider a negative relation member
        inpon = S.csr_matrix('inpon')
        relln = S.dot(relationl.E, inpon).T
        relrn = S.dot(relationr.E, inpon).T
        simion = fnsim(leftop(lhs, relln), rightop(rhs, relrn))
        costo, outo = loss(simi, simion, margin=loss_margin)
        cost += costo
        out = T.concatenate([out, outo])
        list_in += [inpon]

    # <EXPERIMENTAL_CODE>
    # Should I also plug examples from corrupted triples?
    if weight_contractive_regularizer_left is not None:
        cost = cost + (weight_contractive_regularizer_left * R.contractive_regularizer(lop, lhs))
    if weight_contractive_regularizer_right is not None:
        cost = cost + (weight_contractive_regularizer_right * R.contractive_regularizer(rop, rhs))

    for rel_param in set([relationl.E, relationr.E]):
        if weight_L1_param_regularizer is not None:
            cost = cost + (weight_L1_param_regularizer * R.L1_regularizer(rel_param))
        if weight_L2_param_regularizer is not None:
            cost = cost + (weight_L2_param_regularizer * R.L2_regularizer(rel_param))
    # </EXPERIMENTAL_CODE>

    params = leftop.params + rightop.params + (fnsim.params if hasattr(fnsim, 'params') else [])

    embeds = [embedding.E] + ([relationr.E, relationl.E] if (type(embeddings) == list) else [])

    # The compiled function updates the shared variables according to `updates`.
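
    # The total cost whose gradients drive the updates below sums one ranking
    # term per corruption type; schematically, assuming `loss` implements the
    # standard margin criterion, each example i contributes:
    #   max(0, margin - simi_i + similn_i)    (corrupted 'left' member)
    # + max(0, margin - simi_i + simirn_i)    (corrupted 'right' member)
    # + max(0, margin - simi_i + simion_i)    (corrupted relation, only if rel)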
    updates = collections.OrderedDict()

    if (method == 'SGD'):
        pass  # no additional state is needed

    elif (method == 'MOMENTUM'):
        param_previous_update_map = collections.OrderedDict()
        for param in params + embeds:
            # Allocate the previous updates
            previous_update_value = numpy.zeros(param.get_value().shape,
                                                dtype=theano.config.floatX)
            param_previous_update = theano.shared(value=previous_update_value,
                                                  name='su_' + param.name)
            param_previous_update_map[param] = param_previous_update

    elif (method == 'ADAGRAD'):
        param_squared_gradients_map = collections.OrderedDict()
        for param in params + embeds:
            # Allocate the sums of squared gradients
            squared_gradients_value = numpy.zeros(param.get_value().shape,
                                                  dtype=theano.config.floatX)
            param_squared_gradients = theano.shared(value=squared_gradients_value,
                                                    name='sg_' + param.name)
            param_squared_gradients_map[param] = param_squared_gradients

    elif (method == 'ADADELTA'):
        param_squared_gradients_map = collections.OrderedDict()
        param_squared_updates_map = collections.OrderedDict()
        for param in params + embeds:
            # Allocate the sums of squared gradients
            squared_gradients_value = numpy.zeros(param.get_value().shape,
                                                  dtype=theano.config.floatX)
            param_squared_gradients = theano.shared(value=squared_gradients_value,
                                                    name='sg_' + param.name)
            param_squared_gradients_map[param] = param_squared_gradients
            # Allocate the sums of squared updates
            squared_updates_value = numpy.zeros(param.get_value().shape,
                                                dtype=theano.config.floatX)
            param_squared_updates = theano.shared(value=squared_updates_value,
                                                  name='su_' + param.name)
            param_squared_updates_map[param] = param_squared_updates

    elif (method == 'RMSPROP'):
        param_squared_gradients_map = collections.OrderedDict()
        for param in params + embeds:
            # Allocate the sums of squared gradients
            squared_gradients_value = numpy.zeros(param.get_value().shape,
                                                  dtype=theano.config.floatX)
            param_squared_gradients = theano.shared(value=squared_gradients_value,
                                                    name='sg_' + param.name)
            param_squared_gradients_map[param] = param_squared_gradients

    else:
        raise ValueError('Unknown method: %s' % (method))

    # Parameter gradients
    gradientsparams = T.grad(cost, params)
    # Embeddings gradients
    gradientsembeds = T.grad(cost, embeds)

    # Learning rates. In TransE and similar models, the rate for the predicate
    # embeddings (which do not get normalized) is rate_params, not rate_embeddings.
    rates_params = [rate_params for i in range(len(params))]
    rates_embeddings = ([rate_embeddings, rate_params, rate_params]
                        if len(embeds) > 1 else [rate_embeddings])
    # rates_embeddings = [rate_embeddings for i in range(len(embeds))]

    for param, gradient, rate in zip(params + embeds,
                                     gradientsparams + gradientsembeds,
                                     rates_params + rates_embeddings):
        if (method == 'SGD'):  # SGD
            U.sgd(param, rate, gradient, updates)
        elif (method == 'MOMENTUM'):  # SGD+MOMENTUM
            param_previous_update = param_previous_update_map[param]
            U.momentum(param, rate, decay, gradient, updates,
                       param_previous_update)
        elif (method == 'ADAGRAD'):  # ADAGRAD
            param_squared_gradients = param_squared_gradients_map[param]
            U.adagrad(param, rate, epsilon, gradient, updates,
                      param_squared_gradients)
        elif (method == 'ADADELTA'):  # ADADELTA
            param_squared_gradients = param_squared_gradients_map[param]
            param_squared_updates = param_squared_updates_map[param]
            U.adadelta(param, rate, decay, epsilon, gradient, updates,
                       param_squared_gradients, param_squared_updates)
        elif (method == 'RMSPROP'):  # RMSPROP
            param_squared_gradients = param_squared_gradients_map[param]
            U.rmsprop(param, rate, decay, max_learning_rate, epsilon, gradient,
                      updates, param_squared_gradients)
        else:
            raise ValueError('Unknown method: %s' % (method))

    return theano.function(list_in, [T.mean(cost), T.mean(out)],
                           updates=updates, on_unused_input='ignore')
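

# Illustrative sketch only: a margin-based ranking criterion compatible with
# the `loss(simi, simin, margin=...)` interface used above, returning the
# summed hinge cost together with a vector marking margin violations. This is
# an assumption about that interface, not the actual definition of loss.hinge.
def _example_margin_loss(simi, simin, margin=1.0):
    # out > 0 exactly when the negative triplet scores within `margin`
    # of the positive one, i.e. when the margin is violated.
    out = margin - simi + simin
    violations = out > 0
    # Only violated pairs contribute to the cost.
    cost = T.sum(out * violations)
    return cost, violations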