def learnSubmodularMixture(training_data, submod_shells, loss_fun, params=None, loss_supermodular=False):
    '''
    Learns mixture weights of submodular functions.
    This code implements algorithm 1 of [1]

    :param training_data: list of training examples. Each example S[t] provides
               S[t].Y:      indices of possible set elements
               S[t].y_gt:   indices selected in the ground truth solution
               S[t].budget: the budget for this example
    :param submod_shells: list of submodular shell functions.
               They need to be of the format submodular_function = shell_function(S[t])
    :param loss_fun: a (submodular) loss
    :param params: SGD parameters. A default SGDparams() is created when None.
               Attributes read here: learn_lambda, max_iter, use_ada_grad,
               norm_objective_scores, use_l1_projection, momentum, nu
    :param loss_supermodular: True, if the loss is supermodular.
               Then, [5] is used for loss-augmented inference
    :return: (averaged learnt weights, list of weights per iteration).
               NOTE: when at most one function is instantiated there is nothing
               to learn and the plain integer 1 is returned instead of a tuple.
    :raises IOError: if training_data is empty
    '''
    # Fail fast on empty input, before any other work is done
    if len(training_data) == 0:
        raise IOError('No training examples given')

    if params is None:
        params = SGDparams()
    logger.info('%s' % params)

    # Work on a shallow copy so that shuffling does not reorder the caller's list
    training_examples = training_data[:]

    # Initialize the weights
    function_list, names = utils.instaciateFunctions(submod_shells, training_examples[0])
    # Builtin float: the np.float alias was removed from NumPy (1.24)
    w_0 = np.ones(len(function_list), float)

    learn_lambda = params.learn_lambda
    T = len(training_examples) * params.max_iter
    if learn_lambda is None:
        # Set the regularizer according to the theorem from
        # "Learning Mixtures of Submodular Shells" - Lin & Bilmes 2012.
        # Assume:
        #   - w_i, f_i are all upperbounded by 1
        #   - loss l <= B for some B
        #   - ||g_t|| <= G, for some G
        # then, we use learning rate nu = 2 / (lambda * t) with
        M = len(submod_shells)
        G = 1.0
        learn_lambda = G / M * np.sqrt(2 + (1 + np.log(T)) / float(T))

    # fudge factor as in
    # http://xcorr.net/2014/01/23/adagrad-eliminating-learning-rates-in-stochastic-gradient-descent/
    fudge_factor = 1e-6  # for numerical stability
    logger.debug('Training using %d samples' % T)

    if len(function_list) <= 1:
        logger.info('Just 1 function. No work for me here :-)\n')
        return 1

    # Start training
    logger.info('regularizer lambda: %.3f' % learn_lambda)
    it = 0
    w = []
    exitTraining = False
    g_t_old = np.zeros(len(function_list))
    if params.use_ada_grad:
        historical_grad = np.zeros(len(function_list))

    while not exitTraining:
        start_time = time.time()

        # Every iterate starts from the previous one (w_0 for the first).
        # The arrays are never modified in place before being re-bound below.
        w.append(w_0 if it == 0 else w[it - 1])

        t = it % len(training_examples)

        # Before each pass over the data: shuffle training examples
        if t == 0:
            logger.debug('Suffle training examples')
            random.shuffle(training_examples)

        if it % 50 == 0:
            logger.info('Example %d of %d' % (it, T))
        logger.debug('%s (budget: %d)' % (training_examples[t], training_examples[t].budget))
        logger.debug(training_examples[t].y_gt)

        # Instanciate the shells to submodular functions
        function_list, names = utils.instaciateFunctions(submod_shells, training_examples[t])

        # Approximate loss augmented inference
        # (this is equivalent to a greedy submodular optimization)
        if loss_supermodular:
            y_t, score = submodular_supermodular_maximization(
                training_examples[t], w[it], function_list,
                training_examples[t].budget, loss_fun)
        else:
            y_t, score, online_bound = leskovec_maximize(
                training_examples[t], w[it], function_list,
                training_examples[t].budget, loss_fun)
            # NOTE(review): the budget check is kept on the greedy branch only;
            # confirm whether it was meant to cover both inference methods.
            assert len(y_t) == training_examples[t].budget

        # Subgradient
        score_t = utils.evalSubFun(function_list, y_t, False)
        score_gt = utils.evalSubFun(function_list, list(training_examples[t].y_gt), True)
        if params.norm_objective_scores:
            score_t /= score_t.sum()
            score_gt /= score_gt.sum()

        if params.use_l1_projection:
            g_t = score_t - score_gt
        else:
            # Lin et al. use an l2 regularized formulation, and have thus a different gradient
            g_t = learn_lambda * w[it] + (score_t - score_gt)

        g_t = (1 - params.momentum) * g_t + params.momentum * g_t_old

        if params.use_ada_grad:  # See [6,7]
            # NOTE(review): g_t_old is only refreshed on the AdaGrad path, so the
            # momentum term stays zero otherwise - confirm this is intended.
            g_t_old = g_t
            historical_grad += g_t**2
            g_t = g_t / (fudge_factor + np.sqrt(historical_grad))

        logger.debug('Gradient:')
        logger.debug(g_t)

        # Update weights; nu may be unset (1/t schedule), a callable or a constant
        if params.nu is None:
            nu = 2.0 / float(learn_lambda * (it + 1))
        elif callable(params.nu):
            nu = params.nu(it, T)
        else:
            nu = params.nu

        if it % 10 == 0:
            logger.info('Nu: %.3f; Gradient: %s; Grad magnitue (abs): %.4f'
                        % (nu, ', '.join(map(str, g_t)), nu * np.sum(np.abs(g_t))))
        w[it] = w[it] - nu * g_t

        # Project to feasible set
        if params.use_l1_projection:
            # We want to keep the euclidean distance between the initial and
            # the projected weight minimal
            if params.use_ada_grad:  # See [7]
                obj = lambda w_t: (np.multiply(w_t - w[it], w_t - w[it])
                                   / (fudge_factor + historical_grad)).sum()
            else:
                obj = lambda w_t: np.inner(w_t - w[it], w_t - w[it])

            # Bounds such that every weight is >= 0
            bnds = tuple((0, None) for _ in range(len(function_list)))
            # The l1-ball inequality
            cons = ({'type': 'ineq', 'fun': lambda x: 1 - np.abs(x).sum()},)

            # Optimize for the best projection into the l-1 ball,
            # starting the search from the previous iterate
            x_start = w_0 if it == 0 else w[it - 1]
            res = scipy.optimize.minimize(obj, x_start, constraints=cons, bounds=bnds)

            if res.success:
                assert not (res.x < -10**-5).any()
                w[it] = res.x
                # Note: We need to re-normalize the weights to sum to one,
                # in order to give each SGD step the same weight
                if np.sum(w[it]) > 0:
                    w[it] = w[it] / np.sum(w[it])
            else:
                logger.warning('Iteration %d: l1: Failed to find constraint solution on w' % it)
                w[it][w[it] < 0] = 0
                if w[it].sum() > 0:
                    w[it] = w[it] / w[it].sum()
        else:
            # projection of [1]: update the weights according to [1] algorithm 1
            w[it][w[it] < 0] = 0
            if w[it].sum() > 0:
                w[it] = w[it] / np.sum(np.abs(w[it]))

        if it % 10 == 0:
            logger.info('w[it]:\t%s' % ', '.join(map(str, w[it])))

        it = it + 1
        logger.debug(it)
        if it >= len(training_examples) * params.max_iter:
            logger.warning('Break without convergence\n')
            exitTraining = True

        logger.debug("--- %.1f seconds ---" % (time.time() - start_time))

    # Return the averaged weights (See [1] algorithm 1)
    w_res = np.asarray(w).mean(axis=0)
    w_res /= np.abs(w_res).sum()
    logger.info('----------------------------\n')
    logger.info('Weights:\n')
    for w_idx, weight in enumerate(w_res):
        logger.info(' %20s: %2.3f%%' % (names[w_idx], round(10000 * weight) / 100))
    logger.info('----------------------------\n')
    return w_res, w
def learnSubmodularMixture(training_data, submod_shells, loss_fun, params=None, loss_supermodular=False):
    '''
    Learns mixture weights of submodular functions.
    This code implements algorithm 1 of [1]

    :param training_data: list of training examples. Each example S[t] provides
               S[t].Y:      indices of possible set elements
               S[t].y_gt:   indices selected in the ground truth solution
               S[t].budget: the budget for this example
    :param submod_shells: list of submodular shell functions.
               They need to be of the format submodular_function = shell_function(S[t])
    :param loss_fun: a (submodular) loss
    :param params: SGD parameters. A default SGDparams() is created when None.
               Attributes read here: learn_lambda, max_iter, use_ada_grad,
               norm_objective_scores, use_l1_projection, momentum, nu
    :param loss_supermodular: True, if the loss is supermodular.
               Then, [5] is used for loss-augmented inference
    :return: (averaged learnt weights, list of weights per iteration).
               NOTE: when at most one function is instantiated there is nothing
               to learn and the plain integer 1 is returned instead of a tuple.
    :raises IOError: if training_data is empty
    '''
    # Fail fast on empty input, before any other work is done
    if len(training_data) == 0:
        raise IOError('No training examples given')

    if params is None:
        params = SGDparams()
    logger.info('%s' % params)

    # Work on a shallow copy so that shuffling does not reorder the caller's list
    training_examples = training_data[:]

    # Initialize the weights
    function_list, names = utils.instaciateFunctions(submod_shells, training_examples[0])
    # Builtin float: the np.float alias was removed from NumPy (1.24)
    w_0 = np.ones(len(function_list), float)

    learn_lambda = params.learn_lambda
    T = len(training_examples) * params.max_iter
    if learn_lambda is None:
        # Set the regularizer according to the theorem from
        # "Learning Mixtures of Submodular Shells" - Lin & Bilmes 2012.
        # Assume:
        #   - w_i, f_i are all upperbounded by 1
        #   - loss l <= B for some B
        #   - ||g_t|| <= G, for some G
        # then, we use learning rate nu = 2 / (lambda * t) with
        M = len(submod_shells)
        G = 1.0
        learn_lambda = G / M * np.sqrt(2 + (1 + np.log(T)) / float(T))

    # fudge factor as in
    # http://xcorr.net/2014/01/23/adagrad-eliminating-learning-rates-in-stochastic-gradient-descent/
    fudge_factor = 1e-6  # for numerical stability
    logger.debug('Training using %d samples' % T)

    if len(function_list) <= 1:
        logger.info('Just 1 function. No work for me here :-)\n')
        return 1

    # Start training
    logger.info('regularizer lambda: %.3f' % learn_lambda)
    it = 0
    w = []
    exitTraining = False
    g_t_old = np.zeros(len(function_list))
    if params.use_ada_grad:
        historical_grad = np.zeros(len(function_list))

    while not exitTraining:
        start_time = time.time()

        # Every iterate starts from the previous one (w_0 for the first).
        # The arrays are never modified in place before being re-bound below.
        w.append(w_0 if it == 0 else w[it - 1])

        t = it % len(training_examples)

        # Before each pass over the data: shuffle training examples
        if t == 0:
            logger.debug('Suffle training examples')
            random.shuffle(training_examples)

        if it % 50 == 0:
            logger.info('Example %d of %d' % (it, T))
        logger.debug('%s (budget: %d)' % (training_examples[t], training_examples[t].budget))
        logger.debug(training_examples[t].y_gt)

        # Instanciate the shells to submodular functions
        function_list, names = utils.instaciateFunctions(submod_shells, training_examples[t])

        # Approximate loss augmented inference
        # (this is equivalent to a greedy submodular optimization)
        if loss_supermodular:
            y_t, score = submodular_supermodular_maximization(
                training_examples[t], w[it], function_list,
                training_examples[t].budget, loss_fun)
        else:
            y_t, score, online_bound = leskovec_maximize(
                training_examples[t], w[it], function_list,
                training_examples[t].budget, loss_fun)
            # NOTE(review): the budget check is kept on the greedy branch only;
            # confirm whether it was meant to cover both inference methods.
            assert len(y_t) == training_examples[t].budget

        # Subgradient
        score_t = utils.evalSubFun(function_list, y_t, False)
        score_gt = utils.evalSubFun(function_list, list(training_examples[t].y_gt), True)
        if params.norm_objective_scores:
            score_t /= score_t.sum()
            score_gt /= score_gt.sum()

        if params.use_l1_projection:
            g_t = score_t - score_gt
        else:
            # Lin et al. use an l2 regularized formulation, and have thus a different gradient
            g_t = learn_lambda * w[it] + (score_t - score_gt)

        g_t = (1 - params.momentum) * g_t + params.momentum * g_t_old

        if params.use_ada_grad:  # See [6,7]
            # NOTE(review): g_t_old is only refreshed on the AdaGrad path, so the
            # momentum term stays zero otherwise - confirm this is intended.
            g_t_old = g_t
            historical_grad += g_t**2
            g_t = g_t / (fudge_factor + np.sqrt(historical_grad))

        logger.debug('Gradient:')
        logger.debug(g_t)

        # Update weights; nu may be unset (1/t schedule), a callable or a constant
        if params.nu is None:
            nu = 2.0 / float(learn_lambda * (it + 1))
        elif callable(params.nu):
            nu = params.nu(it, T)
        else:
            nu = params.nu

        if it % 10 == 0:
            logger.info('Nu: %.3f; Gradient: %s; Grad magnitue (abs): %.4f'
                        % (nu, ', '.join(map(str, g_t)), nu * np.sum(np.abs(g_t))))
        w[it] = w[it] - nu * g_t

        # Project to feasible set
        if params.use_l1_projection:
            # We want to keep the euclidean distance between the initial and
            # the projected weight minimal
            if params.use_ada_grad:  # See [7]
                obj = lambda w_t: (np.multiply(w_t - w[it], w_t - w[it])
                                   / (fudge_factor + historical_grad)).sum()
            else:
                obj = lambda w_t: np.inner(w_t - w[it], w_t - w[it])

            # Bounds such that every weight is >= 0
            bnds = tuple((0, None) for _ in range(len(function_list)))
            # The l1-ball inequality
            cons = ({'type': 'ineq', 'fun': lambda x: 1 - np.abs(x).sum()},)

            # Optimize for the best projection into the l-1 ball,
            # starting the search from the previous iterate
            x_start = w_0 if it == 0 else w[it - 1]
            res = scipy.optimize.minimize(obj, x_start, constraints=cons, bounds=bnds)

            if res.success:
                assert not (res.x < -10**-5).any()
                w[it] = res.x
                # Note: We need to re-normalize the weights to sum to one,
                # in order to give each SGD step the same weight
                if np.sum(w[it]) > 0:
                    w[it] = w[it] / np.sum(w[it])
            else:
                logger.warning('Iteration %d: l1: Failed to find constraint solution on w' % it)
                w[it][w[it] < 0] = 0
                if w[it].sum() > 0:
                    w[it] = w[it] / w[it].sum()
        else:
            # projection of [1]: update the weights according to [1] algorithm 1
            w[it][w[it] < 0] = 0
            if w[it].sum() > 0:
                w[it] = w[it] / np.sum(np.abs(w[it]))

        if it % 10 == 0:
            logger.info('w[it]:\t%s' % ', '.join(map(str, w[it])))

        it = it + 1
        logger.debug(it)
        if it >= len(training_examples) * params.max_iter:
            logger.warning('Break without convergence\n')
            exitTraining = True

        logger.debug("--- %.1f seconds ---" % (time.time() - start_time))

    # Return the averaged weights (See [1] algorithm 1)
    w_res = np.asarray(w).mean(axis=0)
    w_res /= np.abs(w_res).sum()
    logger.info('----------------------------\n')
    logger.info('Weights:\n')
    for w_idx, weight in enumerate(w_res):
        logger.info(' %20s: %2.3f%%' % (names[w_idx], round(10000 * weight) / 100))
    logger.info('----------------------------\n')
    return w_res, w
def lazy_greedy_maximize(S, w, submod_fun, budget, loss_fun=None, useCost=False, randomize=True):
    '''
    Implements the submodular maximization algorithm of [4]

    :param S: data object containing information needed in the objective functions.
              Read here: S.Y (the candidate elements) and S.getCosts()
    :param w: weights of the objectives
    :param submod_fun: submodular functions
    :param budget: budget
    :param loss_fun: optional loss function (for learning); utils.zero_loss when None
    :param useCost: boolean. Take into account the costs per element or not
    :param randomize: randomize marginals before getting the maximum.
              This results in selecting a random element among the top scoring ones,
              rather than taking the one with the lowest index.
    :return: (y, score, minoux_bound): selected indices y, the score of the solution
              and the upper bound computed in the first iteration (see [4]).
              An empty S.Y yields ([], 0.0, 0.0).
    '''
    num_elements = len(S.Y)
    if num_elements == 0:
        # Nothing to select; the code below indexes the first candidate
        # unconditionally and would fail on empty arrays.
        return [], 0.0, 0.0

    sel_indices = []
    # Only used to label the assertion message ('type' would shadow the builtin)
    mode = 'CB' if useCost else 'UC'

    # Init arrays to keep track of marginal benefits.
    # np.inf: the np.Inf alias was removed in NumPy 2.0
    marginal_benefits = np.ones(num_elements, np.float32) * np.inf
    mb_indices = np.arange(num_elements)
    # Per element state: 0 = marginal gain is stale, 1 = up to date, -1 = already selected
    isUpToDate = np.zeros(num_elements)
    costs = S.getCosts()

    currCost = 0.0
    currScore = 0.0
    i = 0

    if loss_fun is None:
        # FIXME: this is not actually a zero loss, but just a loss that is the same for all elements
        # This is a hack to ensure that, in case all weights w are zero, a non empty set is selected
        # i.e., just a random subset of size S.budget
        loss_fun = utils.zero_loss

    # Select as long as we are within budget and have elements to select
    while True:
        # Find the highest scoring element: refresh the top ranked candidate
        # until the head of the ranking carries an up-to-date marginal gain
        while isUpToDate[mb_indices[0]] == 0:
            top = mb_indices[0]
            cand = list(sel_indices)
            cand.append(top)
            t_marg = (np.dot(w, utils.evalSubFun(submod_fun, cand, False, w))
                      + loss_fun(S, cand)) - currScore
            if useCost:
                # Cost-benefit variant: gain per unit of cost
                t_marg = t_marg / float(costs[top])

            if not skipAssertions:
                # A marginal gain must never grow as the selection grows
                assert marginal_benefits[top] - t_marg >= -10**-5, (
                    '%s: Non-submodular objective at element %d!: Now: %.3f; Before: %.3f'
                    % (mode, top, t_marg, marginal_benefits[top]))

            marginal_benefits[top] = t_marg
            isUpToDate[top] = True

            # Re-rank the candidates by decreasing marginal gain
            if randomize:
                # Permute first so that ties are broken randomly
                idx1 = np.random.permutation(len(marginal_benefits))
                idx2 = (-marginal_benefits[idx1]).argsort(axis=0)
                mb_indices = idx1[idx2]
            else:
                mb_indices = (-marginal_benefits).argsort(axis=0)

            if not skipAssertions:
                # NOTE(review): this inspects the element with the highest index,
                # not the lowest ranked one - confirm that this is intended.
                assert marginal_benefits[-1] > -10**-5, 'Non monotonic objective'

        # Compute upper bound (see [4])
        if i == 0:
            best_sel_indices = np.where(costs[mb_indices].cumsum() <= budget)[0]
            minoux_bound = marginal_benefits[mb_indices][best_sel_indices].sum()

        # Select the highest scoring element
        best = mb_indices[0]
        if marginal_benefits[best] > 0.0:
            logger.debug('Select element %d (gain %.3f)' % (best, marginal_benefits[best]))
            sel_indices.append(best)
            if useCost:
                # The stored gain is per unit cost; undo the normalization
                currScore = currScore + marginal_benefits[best] * float(costs[best])
            else:
                currScore = currScore + marginal_benefits[best]
            currCost = currCost + costs[best]

            # Zero the gain of the selected element and flag it with -1
            # (so that it is not becoming a candidate again).
            # Set all others to not up to date (so that the marginal gain will be recomputed)
            marginal_benefits[best] = 0
            isUpToDate[isUpToDate == 1] = 0
            isUpToDate[best] = -1
            mb_indices = (-marginal_benefits).argsort()
        else:
            logger.debug(' If the best element is zero, we are done ')
            logger.debug(sel_indices)
            return sel_indices, currScore, minoux_bound

        # Check if we still have budget to select something:
        # elements that no longer fit are zeroed and marked as up to date
        for elIdx in range(num_elements):
            if costs[elIdx] + currCost > budget:
                marginal_benefits[elIdx] = 0
                isUpToDate[elIdx] = 1

        if marginal_benefits.max() == 0:
            logger.debug('no elements left to select. Done')
            logger.debug('Selected %d elements with a cost of %.1f (max: %.1f)'
                         % (len(sel_indices), currCost, budget))
            logger.debug(sel_indices)
            return sel_indices, currScore, minoux_bound

        # Increase iteration number
        i += 1
def lazy_greedy_maximize(S, w, submod_fun, budget, loss_fun=None, useCost=False, randomize=True):
    '''
    Implements the submodular maximization algorithm of [4]

    :param S: data object containing information needed in the objective functions.
              Read here: S.Y (the candidate elements) and S.getCosts()
    :param w: weights of the objectives
    :param submod_fun: submodular functions
    :param budget: budget
    :param loss_fun: optional loss function (for learning); utils.zero_loss when None
    :param useCost: boolean. Take into account the costs per element or not
    :param randomize: randomize marginals before getting the maximum.
              This results in selecting a random element among the top scoring ones,
              rather than taking the one with the lowest index.
    :return: (y, score, minoux_bound): selected indices y, the score of the solution
              and the online bound computed in the first iteration (see [4]).
              An empty S.Y yields ([], 0.0, 0.0).
    '''
    num_elements = len(S.Y)
    if num_elements == 0:
        # Nothing to select; the code below indexes the first candidate
        # unconditionally and would fail on empty arrays.
        return [], 0.0, 0.0

    sel_indices = []
    # Only used to label the assertion message ('type' would shadow the builtin)
    mode = 'CB' if useCost else 'UC'

    # Init arrays to keep track of marginal benefits.
    # np.inf: the np.Inf alias was removed in NumPy 2.0
    marginal_benefits = np.ones(num_elements, np.float32) * np.inf
    mb_indices = np.arange(num_elements)
    # Per element state: 0 = marginal gain is stale, 1 = up to date, -1 = already selected
    isUpToDate = np.zeros(num_elements)
    costs = S.getCosts()

    currCost = 0.0
    currScore = 0.0
    i = 0

    if loss_fun is None:
        # FIXME: this is not actually a zero loss, but just a loss that is the same for all elements
        # This is a hack to ensure that, in case all weights w are zero, a non empty set is selected
        # i.e., just a random subset of size S.budget
        loss_fun = utils.zero_loss

    # Select as long as we are within budget and have elements to select
    while True:
        # Find the highest scoring element: refresh the top ranked candidate
        # until the head of the ranking carries an up-to-date marginal gain
        while isUpToDate[mb_indices[0]] == 0:
            top = mb_indices[0]
            cand = list(sel_indices)
            cand.append(top)
            t_marg = (np.dot(w, utils.evalSubFun(submod_fun, cand, False, w))
                      + loss_fun(S, cand)) - currScore
            if useCost:
                # Cost-benefit variant: gain per unit of cost
                t_marg = t_marg / float(costs[top])

            if not skipAssertions:
                # A marginal gain must never grow as the selection grows.
                # Allow the looser of an absolute (1e-5) and a relative (1e-8)
                # slack: the stored gains are float32, so a tighter bound would
                # trip on rounding alone. (np.max(a, b) must not be used to
                # compare two scalars - its second argument is the axis.)
                tolerance = min(-10**-5, -10**-8 * abs(t_marg))
                assert marginal_benefits[top] - t_marg >= tolerance, (
                    '%s: Non-submodular objective at element %d!: Now: %.3f; Before: %.3f'
                    % (mode, top, t_marg, marginal_benefits[top]))

            marginal_benefits[top] = t_marg
            isUpToDate[top] = True

            # Re-rank the candidates by decreasing marginal gain
            if randomize:
                # Permute first so that ties are broken randomly
                idx1 = np.random.permutation(len(marginal_benefits))
                idx2 = (-marginal_benefits[idx1]).argsort(axis=0)
                mb_indices = idx1[idx2]
            else:
                mb_indices = (-marginal_benefits).argsort(axis=0)

            if not skipAssertions:
                # NOTE(review): this inspects the element with the highest index,
                # not the lowest ranked one - confirm that this is intended.
                assert marginal_benefits[-1] > -10**-5, 'Non monotonic objective'

        # Compute online bound (see [4])
        if i == 0:
            best_sel_indices = np.where(costs[mb_indices].cumsum() <= budget)[0]
            minoux_bound = marginal_benefits[mb_indices][best_sel_indices].sum()

        # Select the highest scoring element
        best = mb_indices[0]
        if marginal_benefits[best] > 0.0:
            logger.debug('Select element %d (gain %.3f)' % (best, marginal_benefits[best]))
            sel_indices.append(best)
            if useCost:
                # The stored gain is per unit cost; undo the normalization
                currScore = currScore + marginal_benefits[best] * float(costs[best])
            else:
                currScore = currScore + marginal_benefits[best]
            currCost = currCost + costs[best]

            # Zero the gain of the selected element and flag it with -1
            # (so that it is not becoming a candidate again).
            # Set all others to not up to date (so that the marginal gain will be recomputed)
            marginal_benefits[best] = 0
            isUpToDate[isUpToDate == 1] = 0
            isUpToDate[best] = -1
            mb_indices = (-marginal_benefits).argsort()
        else:
            logger.debug(' If the best element is zero, we are done ')
            logger.debug(sel_indices)
            return sel_indices, currScore, minoux_bound

        # Check if we still have budget to select something:
        # elements that no longer fit are zeroed and marked as up to date
        for elIdx in range(num_elements):
            if costs[elIdx] + currCost > budget:
                marginal_benefits[elIdx] = 0
                isUpToDate[elIdx] = 1

        if marginal_benefits.max() == 0:
            logger.debug('no elements left to select. Done')
            logger.debug('Selected %d elements with a cost of %.1f (max: %.1f)'
                         % (len(sel_indices), currCost, budget))
            logger.debug(sel_indices)
            return sel_indices, currScore, minoux_bound

        # Increase iteration number
        i += 1