def grad_h(self, w, i=None, j=None):
    '''Gradient at w. If i is None, returns the full gradient; if i is not
    None but j is, returns the gradient at the i-th machine; otherwise,
    returns the gradient of the j-th sample at the i-th machine.'''

    if w.ndim == 1:
        if type(j) is int:
            j = [j]

        if i is None and j is None:  # Return the full gradient
            return self.forward_backward(self.X_train, self.Y_train, w)[0]
        elif i is not None and j is None:  # Return the local gradient at machine i
            return self.forward_backward(self.X[i], self.Y[i], w)[0]
        elif i is None and j is not None:  # Return the stochastic gradient over samples j
            return self.forward_backward(self.X_train[j], self.Y_train[j], w)[0]
        else:  # Return the stochastic gradient of samples j at machine i
            return self.forward_backward(self.X[i][j], self.Y[i][j], w)[0]

    elif w.ndim == 2:
        if i is None and j is None:  # Return the distributed gradient
            return np.array([
                self.forward_backward(self.X[i], self.Y[i], w[:, i])[0].copy()
                for i in range(self.n_agent)
            ]).T
        elif i is None and j is not None:  # Return the stochastic gradient
            return np.array([
                self.forward_backward(self.X[i][j[i]], self.Y[i][j[i]], w[:, i])[0].copy()
                for i in range(self.n_agent)
            ]).T
        else:
            log.fatal('For distributed gradients j must be None')
    else:
        log.fatal('Parameter dimension should only be 1 or 2')
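# A minimal, self-contained sketch of the stacking used in the ndim == 2 branch
# above: one gradient per agent, collected column-wise. The toy least-squares
# gradient below is only a stand-in for forward_backward, which is defined
# elsewhere in this class and not shown here.
import numpy as np

def _toy_local_grad(X_i, Y_i, w_i):
    # stand-in "local gradient" so the pattern runs on its own
    return X_i.T.dot(X_i.dot(w_i) - Y_i) / len(Y_i)

n_agent, m, dim = 3, 5, 4
rng = np.random.default_rng(0)
X = rng.standard_normal((n_agent, m, dim))
Y = rng.standard_normal((n_agent, m))
W = rng.standard_normal((dim, n_agent))    # one parameter column per agent

# Column k of G is agent k's local gradient evaluated at W[:, k].
G = np.array([_toy_local_grad(X[k], Y[k], W[:, k]) for k in range(n_agent)]).T
assert G.shape == (dim, n_agent)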
def h(self, w, i=None, j=None, split='train'):
    '''Function value at w. If i is None, returns f(x); if i is not None but
    j is, returns the function value at the i-th machine; otherwise, returns
    the function value of the j-th sample at the i-th machine.'''

    if split == 'train':
        X = self.X_train
        Y = self.Y_train
    elif split == 'test':
        if w.ndim > 1 or i is not None or j is not None:
            log.fatal("Function value on test set only applies to one parameter vector")
        X = self.X_test
        Y = self.Y_test

    if i is None:  # Return the function value
        tmp = X.dot(w)
        return -xp.sum((Y - 1) * tmp - xp.log1p(xp.exp(-tmp))) / X.shape[0] \
            + xp.sum(w**2) * self.LAMBDA / 2
    elif j is None:  # Return the function value at machine i
        tmp = self.X[i].dot(w)
        return -xp.sum((self.Y[i] - 1) * tmp - xp.log1p(xp.exp(-tmp))) / self.m \
            + xp.sum(w**2) * self.LAMBDA / 2
    else:  # Return the function value of sample j at machine i
        tmp = self.X[i][j].dot(w)
        return -((self.Y[i][j] - 1) * tmp - xp.log1p(xp.exp(-tmp))) \
            + xp.sum(w**2) * self.LAMBDA / 2
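# Self-contained sanity check (toy data, not part of the class above): the
# vectorized expression -sum((Y - 1) * Xw - log(1 + exp(-Xw))) / n is the
# average sigmoid cross-entropy for labels in {0, 1}, so h equals that plus
# the L2 term LAMBDA / 2 * ||w||^2.
import numpy as np

rng = np.random.default_rng(0)
n, dim, LAMBDA = 20, 5, 0.1
X = rng.standard_normal((n, dim))
Y = rng.integers(0, 2, size=n).astype(float)
w = rng.standard_normal(dim)

t = X.dot(w)
loss_vectorized = -np.sum((Y - 1) * t - np.log1p(np.exp(-t))) / n + np.sum(w**2) * LAMBDA / 2

p = 1 / (1 + np.exp(-t))    # sigmoid probabilities
loss_explicit = -np.mean(Y * np.log(p) + (1 - Y) * np.log(1 - p)) + np.sum(w**2) * LAMBDA / 2

assert np.isclose(loss_vectorized, loss_explicit)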
def generate_graph(self, graph_type='expander', params=None):
    '''Generate a connected connectivity graph according to the given parameters.'''

    if graph_type == 'expander':
        G = nx.paley_graph(self.n_agent).to_undirected()
    elif graph_type == 'grid':
        G = nx.grid_2d_graph(*params)
    elif graph_type == 'cycle':
        G = nx.cycle_graph(self.n_agent)
    elif graph_type == 'path':
        G = nx.path_graph(self.n_agent)
    elif graph_type == 'star':
        G = nx.star_graph(self.n_agent - 1)
    elif graph_type == 'er':
        if params < 2 / (self.n_agent - 1):
            log.fatal("Need higher probability to create a connected E-R graph!")
        G = None
        while G is None or not nx.is_connected(G):
            G = nx.erdos_renyi_graph(self.n_agent, params)
    else:
        log.fatal('Graph type %s not supported' % graph_type)

    self.n_edges = G.number_of_edges()
    self.G = G
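# Standalone example of generating the supported graph types with networkx
# (n_agent = 20 and p = 0.3 are arbitrary choices, not values from the class).
import networkx as nx

n_agent = 20
G_cycle = nx.cycle_graph(n_agent)
G_path = nx.path_graph(n_agent)
G_star = nx.star_graph(n_agent - 1)
G_grid = nx.grid_2d_graph(4, 5)            # 'grid' with params = (4, 5)

# 'er': resample until the random graph is connected, as in the loop above.
p = 0.3                                     # should exceed 2 / (n_agent - 1)
G_er = None
while G_er is None or not nx.is_connected(G_er):
    G_er = nx.erdos_renyi_graph(n_agent, p)

print(G_er.number_of_edges())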
def grad_h(self, w, i=None, j=None):
    '''Gradient of h(x) at w. Depending on the shape of w and the parameters
    i and j, this function behaves differently:

    1. If w is a vector of shape (dim,)
        1.1 If i is None and j is None, returns the full gradient.
        1.2 If i is not None and j is None, returns the gradient at the i-th agent.
        1.3 If i is None and j is not None, returns the gradient of the j-th
            sample(s) over all training data.
        1.4 If i is not None and j is not None, returns the gradient of the
            j-th data sample at the i-th agent.
        Note i, j can be integers, lists or vectors.

    2. If w is a matrix of shape (dim, n_agent)
        2.1 If j is None, returns the gradient of each parameter column at the
            corresponding agent.
        2.2 If j is not None, returns the gradient of the j-th sample of each
            parameter column at the corresponding agent.
        Note j can be a list of lists or vectors.
    '''

    if w.ndim == 1:
        if type(j) is int:
            j = [j]

        if i is None and j is None:  # Return the full gradient
            return self.X_train.T.dot(logit_1d(self.X_train, w) - self.Y_train) / self.m_total \
                + w * self.LAMBDA
        elif i is not None and j is None:  # Return the local gradient at agent i
            return self.X[i].T.dot(logit_1d(self.X[i], w) - self.Y[i]) / self.m \
                + w * self.LAMBDA
        elif i is None and j is not None:  # Return the gradient of samples j over all training data
            return self.X_train[j].T.dot(logit_1d(self.X_train[j], w) - self.Y_train[j]) / len(j) \
                + w * self.LAMBDA
        else:  # Return the gradient of samples j at agent i
            return (logit_1d(self.X[i][j], w) - self.Y[i][j]).dot(self.X[i][j]) / len(j) \
                + w * self.LAMBDA

    elif w.ndim == 2:
        if i is None and j is None:  # Return the distributed gradient
            tmp = logit_2d(self.X, w) - self.Y
            return xp.einsum('ikj,ik->ji', self.X, tmp) / self.m + w * self.LAMBDA
        elif i is None and j is not None:  # Return the stochastic gradient
            res = []
            for i in range(self.n_agent):
                if type(j[i]) is int:
                    samples = [j[i]]
                else:
                    samples = j[i]
                res.append(
                    self.X[i][samples].T.dot(logit_1d(self.X[i][samples], w[:, i]) - self.Y[i][samples]) / len(samples)
                    + w[:, i] * self.LAMBDA
                )
            return xp.array(res).T
        else:
            log.fatal('For distributed gradients j must be None')
    else:
        log.fatal('Parameter dimension should only be 1 or 2')
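# The gradients above call helpers logit_1d / logit_2d that are defined
# elsewhere in the library. Judging from how they are used (predicted
# probabilities minus labels), they compute sigmoid(X . w); the definitions
# below are a sketch under that assumption, not the library's actual code.
import numpy as np

def logit_1d(X, w):
    # X: (m, dim), w: (dim,) -> probabilities of shape (m,)
    return 1 / (1 + np.exp(-X.dot(w)))

def logit_2d(X, W):
    # X: (n_agent, m, dim), W: (dim, n_agent) -> probabilities of shape (n_agent, m),
    # row i evaluated at the i-th parameter column W[:, i]
    return 1 / (1 + np.exp(-np.einsum('ijk,ki->ij', X, W)))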
def split_data(self, X):
    '''Helper function to split data according to the number of training samples per agent.'''
    if self.m * self.n_agent != len(X):
        log.fatal('Data cannot be distributed equally to %d agents' % self.n_agent)

    if X.ndim == 1:
        return X.reshape(self.n_agent, -1)
    else:
        return X.reshape(self.n_agent, self.m, -1)
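# Standalone illustration of the reshape above (3 agents x 4 samples x 2
# features); it assumes the rows are already ordered agent by agent.
import numpy as np

n_agent, m, dim = 3, 4, 2
X = np.arange(n_agent * m * dim, dtype=float).reshape(n_agent * m, dim)
Y = np.arange(n_agent * m, dtype=float)

X_split = X.reshape(n_agent, m, -1)    # (n_agent, m, dim): X_split[i] is agent i's features
Y_split = Y.reshape(n_agent, -1)       # (n_agent, m): Y_split[i] is agent i's labels
assert X_split.shape == (3, 4, 2) and Y_split.shape == (3, 4)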
def grad_h(self, w, i=None, j=None, split='train'):
    '''Gradient of h(x) at w. Depending on the shape of w and the parameters
    i and j, this function behaves differently:

    1. If w is a vector of shape (dim,)
        1.1 If i is None and j is None, returns the full gradient.
        1.2 If i is not None and j is None, returns the gradient at the i-th agent.
        1.3 If i is None and j is not None, returns the gradient of the j-th
            sample(s) over all training data.
        1.4 If i is not None and j is not None, returns the gradient of the
            j-th data sample at the i-th agent.
        Note i, j can be integers, lists or vectors.

    2. If w is a matrix of shape (dim, n_agent)
        2.1 If j is None, returns the gradient of each parameter column at the
            corresponding agent.
        2.2 If j is not None, returns the gradient of the j-th sample of each
            parameter column at the corresponding agent.
        Note j can be a list of lists or vectors.
    '''

    if w.ndim == 1:
        if type(j) is int:
            j = [j]

        if i is None and j is None:  # Return the full gradient
            return self.H.dot(w) - self.X_T_Y
        elif i is not None and j is None:  # Return the local gradient at agent i
            return self.H_list[i].dot(w) - self.X_T_Y_list[i]
        elif i is None and j is not None:  # Return the gradient of samples j over all training data
            return (self.X_train[j].dot(w) - self.Y_train[j]).dot(self.X_train[j]) / len(j)
        else:  # Return the gradient of samples j at agent i
            return (self.X[i][j].dot(w) - self.Y[i][j]).dot(self.X[i][j]) / len(j)

    elif w.ndim == 2:
        if i is None and j is None:  # Return the distributed gradient
            return xp.einsum('ijk,ki->ji', self.H_list, w) - self.X_T_Y_list.T
        elif i is None and j is not None:  # Return the stochastic gradient
            res = []
            for i in range(self.n_agent):
                if type(j[i]) is int:
                    samples = [j[i]]
                else:
                    samples = j[i]
                res.append(
                    (self.X[i][samples].dot(w[:, i]) - self.Y[i][samples]).dot(self.X[i][samples]) / len(samples)
                )
            return xp.array(res).T
        else:
            log.fatal('For distributed gradients j must be None')
    else:
        log.fatal('Parameter dimension should only be 1 or 2')
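# The gradients above rely on precomputed attributes H, X_T_Y, H_list and
# X_T_Y_list. Matching the 1/(2m) least-squares loss in h() below, a plausible
# precomputation is sketched here; the real setup lives in the (unshown)
# constructor, so treat this as an assumption.
import numpy as np

def precompute_gram(X_list, Y_list):
    # X_list: (n_agent, m, dim), Y_list: (n_agent, m)
    n_agent, m, _ = X_list.shape
    H_list = np.array([X_list[i].T.dot(X_list[i]) / m for i in range(n_agent)])
    X_T_Y_list = np.array([X_list[i].T.dot(Y_list[i]) / m for i in range(n_agent)])
    H = H_list.mean(axis=0)            # = X_train.T @ X_train / m_total
    X_T_Y = X_T_Y_list.mean(axis=0)    # = X_train.T @ Y_train / m_total
    return H, X_T_Y, H_list, X_T_Y_list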
def h(self, w, i=None, j=None, split='train'):
    '''Function value of h(x) at w. If i is None, returns h(x); if i is not
    None but j is, returns the function value at the i-th machine; otherwise,
    returns the function value of the j-th sample at the i-th machine.'''

    if i is None and j is None:  # Return the function value
        Z = xp.sqrt(2 * self.m_total)
        return xp.sum((self.Y_train / Z - (self.X_train / Z).dot(w))**2)
    elif i is not None and j is None:  # Return the function value at machine i
        return xp.sum((self.Y[i] - self.X[i].dot(w))**2) / 2 / self.m
    elif i is not None and j is not None:  # Return the function value of sample j at machine i
        return xp.sum((self.Y[i][j] - self.X[i][j].dot(w))**2) / 2
    else:
        log.fatal('When i is None, j must be None')
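# Standalone check of the scaling trick in the full-data branch above:
# dividing X and Y by Z = sqrt(2 * m) before squaring gives ||Y - X w||^2 / (2 m).
import numpy as np

rng = np.random.default_rng(0)
m, dim = 50, 8
X = rng.standard_normal((m, dim))
Y = rng.standard_normal(m)
w = rng.standard_normal(dim)

Z = np.sqrt(2 * m)
assert np.isclose(np.sum((Y / Z - (X / Z).dot(w))**2),
                  np.sum((Y - X.dot(w))**2) / (2 * m))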
def accuracy(self, w, split='train'):
    if w.ndim > 1:
        w = w.mean(axis=1)

    if split == 'train':
        X = self.X_train
        Y = self.Y_train
    elif split == 'test':
        X = self.X_test
        Y = self.Y_test
    else:
        log.fatal('Data split %s is not supported' % split)

    Y_hat = X.dot(w)
    Y_hat[Y_hat > 0] = 1
    Y_hat[Y_hat <= 0] = 0
    return np.mean(Y_hat == Y)
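# Toy illustration of the 0/1 thresholding above (made-up data): scores above
# zero are predicted as class 1, the rest as class 0.
import numpy as np

w = np.array([1.0, -2.0])
X = np.array([[3.0, 1.0], [0.5, 1.0], [1.0, 0.2]])
Y = np.array([1.0, 0.0, 1.0])

Y_hat = X.dot(w)                 # scores: [1.0, -1.5, 0.6]
Y_hat[Y_hat > 0] = 1
Y_hat[Y_hat <= 0] = 0
print(np.mean(Y_hat == Y))       # -> 1.0, all three predictions correct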
def accuracy(self, w, split='test'):
    if w.ndim > 1:
        w = w.mean(axis=1)

    if split == 'train':
        X = self.X_train
        Y = self.Y_train
        labels = self.Y_train_labels
    elif split == 'test':
        X = self.X_test
        Y = self.Y_test
        labels = self.Y_test_labels
    else:
        log.fatal('Data split %s is not supported' % split)

    loss, _, A2 = self.forward(X, Y, w)
    pred = A2.argmax(axis=1)
    return sum(pred == labels) / len(pred), loss
def h(self, w, i=None, j=None, split='train'):
    '''Function value at w. If i is None, returns f(x); if i is not None but
    j is, returns the function value at the i-th machine; otherwise, returns
    the function value of the j-th sample at the i-th machine.'''

    if split == 'train':
        X = self.X_train
        Y = self.Y_train
    elif split == 'test':
        if w.ndim > 1 or i is not None or j is not None:
            log.fatal("Function value on test set only applies to one parameter vector")
        X = self.X_test
        Y = self.Y_test

    if i is None and j is None:  # Return the function value
        return self.forward(X, Y, w)[0]
    elif i is not None and j is None:  # Return the function value at machine i
        return self.forward(self.X[i], self.Y[i], w)[0]
    else:  # Return the function value of sample j at machine i
        if type(j) is int:
            j = [j]
        return self.forward(self.X[i][j], self.Y[i][j], w)[0]
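# Usage sketch for the NN problem interface above (`p` stands for a
# hypothetical instance of this class; the constructor is not part of this
# excerpt, so these calls are illustrative only):
#
#   w = np.random.randn(p.dim)
#   train_loss = p.h(w)                  # full training loss via forward()
#   test_loss = p.h(w, split='test')     # only valid for a single parameter vector
#   local_loss = p.h(w, i=0)             # loss on agent 0's local data
#   batch_loss = p.h(w, i=0, j=[1, 2])   # loss on samples 1 and 2 at agent 0
#   test_acc, test_loss = p.accuracy(w)  # accuracy() defaults to split='test'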