# Imports assumed by the snippets below. invlogit, logit, and cutoff are
# project-level helpers (logistic function, its inverse, and a clipping
# utility) that are expected to be in scope.
import numpy as np
import numpy.random as npran
import numpy.linalg as nplin
import statsmodels.api as sm
import cvxpy as cvx
from scipy.optimize import root
import matplotlib.pyplot as plt


def draw_random_binary(n, A):
    '''If p is the size of the square, lower-triangular A, creates an n by p
    matrix of random binary vectors using Schafer's method: column i is drawn
    from a logistic model on the previously drawn columns, with row i of A
    holding the coefficients and A[i, i] the intercept.'''
    m, p = A.shape
    ones = np.ones((n, 1))
    output = np.empty((n, p))
    for i in np.arange(0, p):
        output[:, i] = npran.binomial(
            1, invlogit(np.dot(np.hstack((output[:, 0:i], ones)), A[i, 0:(i + 1)])))
    return output
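# Minimal usage sketch for draw_random_binary (hypothetical values, not from the
# original source). Assumes the imports above and that invlogit is the standard
# logistic function 1 / (1 + exp(-x)).
A_demo = np.array([[-1.0, 0.0, 0.0],
                   [0.5, -1.0, 0.0],
                   [0.3, 0.2, -1.0]])  # below-diagonal: coefficients on earlier columns; diagonal: intercepts
X_demo = draw_random_binary(5, A_demo)
print(X_demo.shape)  # (5, 3) matrix of 0/1 draws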
def ising_X(p, n, A_base_diag=-1, A_sd=.2):
    """Generate a binary n by p design X from an Ising-style sequential model:
    the dependence matrix A has N(0, A_sd) off-diagonal entries and A_base_diag
    on the diagonal, and each column is drawn by a logistic model on the
    previous columns."""
    A = npran.normal(0, A_sd, (p, p)) + np.diag(A_base_diag * np.ones(p))
    m, p = A.shape
    ones = np.ones((n, 1))
    X = np.empty((n, p))
    for i in np.arange(0, p):
        X[:, i] = npran.binomial(
            1, invlogit(np.dot(np.hstack((X[:, 0:i], ones)), A[i, 0:(i + 1)])))
    return X
def bern_y(X, p1, base_prob=.25, beta_sd=1):
    '''Draw a Bernoulli response y from a logistic model on the first p1
    columns of X, redrawing beta until the augmented matrix [X, y] is
    numerically full rank (smallest singular value above 1e-5).'''
    n, p = X.shape
    X_1 = X[:, :p1]
    v = 0
    while v < 1E-5:
        beta = npran.randn(p1) * beta_sd
        if p1 > 0:
            eta = cutoff(np.dot(X_1, beta) + logit(base_prob))
            y = npran.binomial(1, invlogit(eta), n)
        else:
            y = npran.binomial(1, base_prob, n)
        v = np.min(nplin.svd(np.hstack((X, y[:, np.newaxis])))[1])
    return y
def genXy_bern_X_norm_beta(seed, n, p1, pnull, x_prob=.25, base_prob=.25, beta_sd=1):
    """The X are Bernoulli(x_prob): p1 predictive vars, pnull null vars. The
    beta on the p1 vars is ~ normal(0, beta_sd) and the intercept is
    logit(base_prob)."""
    if seed is not None:
        npran.seed(seed)
    X_1 = npran.binomial(1, x_prob, (n, p1))
    X_null = npran.binomial(1, x_prob, (n, pnull))
    X = np.concatenate((X_1, X_null), axis=1)
    beta = npran.randn(p1) * beta_sd
    if p1 > 0:
        eta = cutoff(np.dot(X_1, beta) + logit(base_prob))
        y = npran.binomial(1, invlogit(eta), n)
    else:
        y = npran.binomial(1, base_prob, n)
    return X, y
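# Usage sketch (hypothetical sizes, not from the original source): generate a
# 100 x 5 Bernoulli design with 2 predictive and 3 null columns plus a binary
# response. Assumes the project's cutoff, logit, and invlogit helpers are in scope.
X_sim, y_sim = genXy_bern_X_norm_beta(seed=0, n=100, p1=2, pnull=3)
print(X_sim.shape, y_sim.shape)  # (100, 5) (100,)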
def genXy_binary_X_norm_beta(seed, n, p1, pnull, base_prob=.25, beta_sd=1, A_base_diag=-1, A_sd=.2):
    '''X is binary from the Ising model, with the coefficients drawn from a
    normal. y is binary, with beta's coefficients also drawn from a normal.'''
    if seed is not None:
        npran.seed(seed)
    p = p1 + pnull
    # use A_sd for the off-diagonal scale (it was hard-coded to .2 before)
    A = npran.normal(0, A_sd, (p, p)) - np.diag(A_base_diag * np.ones(p))
    X = draw_random_binary(n, A)
    X_1 = X[:, :p1]
    X_null = X[:, p1:]
    beta = npran.randn(p1) * beta_sd
    if p1 > 0:
        eta = cutoff(np.dot(X_1, beta) + logit(base_prob))
        y = npran.binomial(1, invlogit(eta), n)
    else:
        y = npran.binomial(1, base_prob, n)
    return X, y
def genXy_given_X_norm_beta(seed, data, n, p1, pnull, base_prob=.25, beta_sd=1):
    '''X is built by resampling n rows and p1 + pnull columns (with replacement)
    from the supplied data matrix. y is binary, with beta's coefficients drawn
    from a normal.'''
    if seed is not None:
        npran.seed(seed)
    p = p1 + pnull
    h, w = data.shape
    rows = npran.choice(h, n)
    X = data[rows, :][:, npran.choice(w, p)]
    X_1 = X[:, :p1]
    X_null = X[:, p1:]
    beta = npran.randn(p1) * beta_sd
    if p1 > 0:
        eta = cutoff(np.dot(X_1, beta) + logit(base_prob))
        y = npran.binomial(1, invlogit(eta), n)
    else:
        y = npran.binomial(1, base_prob, n)
    return X, y
def dinvlogit(x):
    '''Derivative of the inverse logit (logistic) function at each point of the vector x'''
    return invlogit(x) * (1 - invlogit(x))
def _binary_knockoff(self):
    '''This creates the new binary knockoffs, which are random multivariate
    Bernoulli draws that should have, in expectation, the same first two
    moments as X. Only works if X is all binary.'''
    self._derive_crossmoments()

    ####################################################
    # Get the data corresponding to the original x for the simulations
    ####################################################
    A = np.zeros((2 * self.p, 2 * self.p))

    # Simulate fresh x based on the original data
    if self.method == 'fresh_sim':
        # Fit the upper half of A on the actual data, making this easier
        A[1, 1] = logit(self.mu_lrg[0])
        for i in np.arange(0, self.p):
            # inject the parameters from the logit X_i ~ X_1 + ... + X_(i-1) + 1 into the ith row of A
            A[i, 0:(i + 1)] = sm.GLM(
                self.X_orig[:, i],
                np.hstack((self.X_orig[:, 0:i], np.ones((self.n, 1)))),
                family=sm.families.Binomial()).fit().params
        # Then draw the X
        X_fix = draw_random_binary(self.MCsize, A[:self.p, :self.p])
        nMC = self.MCsize

    # just repeat X a bunch of times
    elif self.method == 'bootstrap':
        # Rather than simulate entirely new X at each stage, use a fixed set of X_1 ... X_(i-1).
        # This definitely makes sense for the original X vars (why simulate when we already have them),
        # but possibly less sense for the knockoffs.
        # To get the desired size of Monte Carlo simulation, replicate X until it has at least
        # self.MCsize rows (np.max, rather than np.min, so the replication actually happens).
        repl = np.max((self.MCsize // self.n, 1))
        X_fix = np.repeat(self.X_orig, repl, 0)
        nMC = X_fix.shape[0]

    elif self.method == 'approx':
        X_fix = self.X_orig

    ###################################################
    # Derive the remaining rows of A
    ###################################################
    if self.method == 'approx':
        covinv = np.diag(self.M - np.outer(self.mu_lrg, self.mu_lrg))**-1
        upwt = 1
        X_tmp = np.hstack((self.X_orig, np.ones((self.n, 1)))).T / self.n
        for i in np.arange(self.p, 2 * self.p):
            m = np.append(self.M[i, 0:self.p], self.M[i, i])
            wt = np.diag(np.append(np.append(np.ones(i - self.p), upwt),
                                   np.ones(2 * self.p - i))) * \
                np.diag(np.append(covinv[:self.p], covinv[i]))
            ps = cvx.Variable(self.n)
            objective = cvx.Minimize(cvx.norm(wt * (X_tmp * ps - m), 2))
            constraints = [0 <= ps, ps <= 1]
            prob = cvx.Problem(objective, constraints)
            prob.solve(solver=cvx.SCS, max_iters=100)
            X_fix = np.hstack((X_fix, ps.value))
        for j in range(2):
            # make sure the order of variables is mixed up
            for i in np.arange(self.p, 2 * self.p)[npran.permutation(self.p)]:
                X_tmp = X_fix
                X_tmp[:, i] = np.ones(self.n)
                m = self.M[i, :]
                wt = np.diag(np.append(np.append(np.ones(i - self.p), upwt),
                                       np.ones(3 * self.p - i - 1))) * np.diag(covinv)
                ps = cvx.Variable(self.n)
                objective = cvx.Minimize(cvx.norm(wt * (X_tmp.T / self.n * ps - m), 2))
                constraints = [0 <= ps, ps <= 1]
                prob = cvx.Problem(objective, constraints)
                prob.solve(solver=cvx.SCS, max_iters=100)
                X_fix[:, i] = ps.value
        # draw the actual responses
        X_fix[:, self.p:] = npran.binomial(1, cutoff(X_fix[:, self.p:]))

    if not self.method == 'approx':
        # Largely from 5.1 in Schafer, including notation; the current value and
        # the derivatives are estimated by simulation.
        # Sequence of proportions between 0 and 1 to handle an ill-conditioned Hessian.
        por_seq = np.arange(0, 1, .25)
        self.por = np.empty((1, 0))
        for i in np.arange(self.p, 2 * self.p):
            # Root-finding (Anderson mixing) on the moment conditions.
            # If the Hessian becomes singular, relax the cross-moment requirements, as described
            # in Schafer 5.1, point 2: the problem is relaxed until X_i is independent of all
            # prior vars; as por increases, the covariance drops.
            # a is the row we are adding to A. Initialize with values as if independent of all other vars.
            a = np.append(np.zeros(i), logit(self.mu_lrg[i]))
            X_fix = np.hstack((X_fix, np.ones((nMC, 1))))
            for por in por_seq:
                # m are the cross moments we are trying to fit
                m = (1 - por) * self.M[i, 0:(i + 1)] + \
                    por * self.M[i, i] * np.append(np.diag(self.M)[0:i], 1)
                # Minimize the actual difference vector
                opt = root(self._vector_objective, a, args=(X_fix, m),
                           method='anderson',
                           options={'maxiter': (i * 2 + 150), 'fatol': 1E-5,
                                    'jac_options': {'M': 20}})
                # update a to the most recent estimate, even without convergence
                a = opt.x
                # Stop once the optimum has been reached
                if opt.success:
                    self.por = np.append(self.por, por)
                    if por > 0:
                        print("Variable %d relaxed by tau=%.2f" % (i - self.p + 1, por))
                    break
            if not opt.success:
                a = np.append(np.zeros(i), logit(self.mu_lrg[i]))
                self.por = np.append(self.por, 1)
                print("Variable %d fully relaxed" % (i - self.p + 1))
            # put a into the A matrix, draw X_i for the 'fixed' matrix, update X_fix
            A[i, 0:(i + 1)] = a
            X_fix[:, -1] = npran.binomial(1, invlogit(np.dot(X_fix, a)))
        # hang onto A
        self.A = A

    ##############################################
    # Wrap up and get X_ko
    ##############################################
    # If we freshly simulated x, we need to draw ~x based on x
    if self.method == 'fresh_sim':
        self.X_lrg = np.hstack((self.X_orig, np.empty((self.n, self.p))))
        for i in np.arange(self.p, 2 * self.p):
            # need to make sure the knockoff isn't uniformly 0 or 1
            count = 0
            j = 0
            while count == 0 or count == self.n:
                # first five times we try regenerating
                if j < 5:
                    self.X_lrg[:, i] = npran.binomial(
                        1, invlogit(np.dot(
                            np.hstack((self.X_lrg[:, 0:i], np.ones((self.n, 1)))),
                            A[i, 0:(i + 1)])))
                    if j > 0:
                        print("Knockoff regenerated to avoid constant value")
                # otherwise, just randomly flip a few bits
                else:
                    print("Random noise added to knockoff to avoid constant value")
                    self.X_lrg[:, i] = np.where(npran.binomial(1, .01 * np.ones(self.n)),
                                                1 - self.X_lrg[:, i], self.X_lrg[:, i])
                count = np.sum(self.X_lrg[:, i])
                j += 1
    elif self.method == 'bootstrap':
        # since we've been drawing the X along the way, we can subset X_fix to get X_ko
        self.X_lrg = np.concatenate((self.X_orig, X_fix[0::repl, self.p:]), axis=1)
    elif self.method == 'approx':
        self.X_lrg = X_fix

    # Evaluate how close we are empirically to M
    self.M_distortion = nplin.norm(
        self.M[:, self.p:] - np.dot(self.X_lrg.T, self.X_lrg[:, self.p:]) / self.n) / \
        nplin.norm(self.M[:, self.p:])
    self.emp_ko_corr = np.corrcoef(self.X_lrg, rowvar=0, bias=1)[
        :self.p, self.p:2 * self.p][np.identity(self.p) == 1]
    if np.sum(np.isnan(self.emp_ko_corr)) > 0:
        print("There were %d out of %d variables with missing correlation" %
              (np.sum(np.isnan(self.emp_ko_corr)), self.p))
def _vector_objective(self, a, X_fix, m):
    '''Difference between the Monte Carlo estimate of the cross moments implied
    by coefficient row a, E[invlogit(X_fix a) * X_fix], and the target cross
    moments m; a root of this vector gives the desired row of A.'''
    return np.mean(invlogit(np.dot(X_fix, a))[:, np.newaxis] * X_fix, axis=0) - m
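# A small standalone sketch (hypothetical data, not from the original source) of
# the moment-matching idea behind _vector_objective: find the coefficient row a
# whose implied cross moments E[sigma(X a) * X] match a target vector m, using
# the same scipy.optimize.root call with Anderson mixing as in _binary_knockoff.
def _moment_gap_demo(a, X, m):
    p_hat = 1.0 / (1.0 + np.exp(-np.dot(X, a)))  # standard logistic, standing in for invlogit
    return np.mean(p_hat[:, np.newaxis] * X, axis=0) - m

X_mc = np.hstack((npran.binomial(1, .5, (500, 2)), np.ones((500, 1))))  # two binary columns plus intercept
m_target = 0.4 * np.mean(X_mc, axis=0)  # cross moments of an achievable flat-0.4 model
sol = root(_moment_gap_demo, np.zeros(3), args=(X_mc, m_target), method='anderson')
print(sol.x)  # roughly (0, 0, logit(0.4))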
def plot_dataset(X0_l, X0_h, X, cov_l, cov_d, K, K_noiseless, K_s, f, f_latent,
                 low_fidelity_error_inds=None, trace_vals=None,
                 trace_high_only_vals=None, post_mean_hf_label_regr=None,
                 is_legend_on=False):
    n_l = len(X0_l)
    n_h = len(X0_h)
    fig, ax = plt.subplots(figsize=(14, 4))
    ax.scatter(X0_h, f[n_l:n_l + n_h], s=40, color='black', label='$D_H$', zorder=2)
    if low_fidelity_error_inds is not None:
        low_fidelity_correct_inds = np.setdiff1d(np.arange(n_l), low_fidelity_error_inds)
        ax.scatter(X0_l[low_fidelity_correct_inds], f[low_fidelity_correct_inds],
                   s=20, color='grey', label='$D_L$', marker='o', facecolor='none')
        ax.scatter(X0_l[low_fidelity_error_inds], f[low_fidelity_error_inds],
                   color='coral', marker='x', label='Errors in $D_L$')
    else:
        ax.scatter(X0_l, f[:n_l], s=20, color='grey', label='$D_L$', marker='o', facecolor='none')
    plt.hlines(0.5, 0, 3, linestyle=':', linewidth=1, color='grey', label='Class boundary')
    # posterior mean of the high-fidelity latent function via a Cholesky solve
    L_hf = np.linalg.cholesky(K.eval())
    alpha_hf = np.linalg.solve(L_hf.T, np.linalg.solve(L_hf, f_latent))
    post_mean_hf = invlogit(np.dot(K_s.T.eval(), alpha_hf))
    ax.plot(X, post_mean_hf, color='g', alpha=0.8, label=r'True $\sigma(f_H)$')
    if post_mean_hf_label_regr is not None:
        ax.plot(X, post_mean_hf_label_regr, color='gray', label='Regression', linestyle='--')
    if trace_high_only_vals is not None:
        ax.plot(X, invlogit(np.mean(logit(trace_high_only_vals), axis=0)),
                color='blue', label='$p(c(x_*)=1|D_H, x_*)$', linestyle='--')
    if trace_vals is not None:
        ax.plot(X, invlogit(np.mean(logit(trace_vals), axis=0)),
                color='darkblue', label='$p(c(x_*)=1|D_L, D_H, x_*)$')
    plt.xlabel(r'$\Omega$', fontsize=16)
    plt.ylabel('Class values and predictions', fontsize=14)
    ax.set_xlim(0, 1)
    ax.set_ylim(-0.1, 1.1)
    if is_legend_on:
        ax.legend(bbox_to_anchor=(1, 1), loc=2, fontsize=14)
def _new_obj(self, eta, X, m):
    '''Moment-matching residual X^T invlogit(eta) - m, zero-padded so the
    output has the same length as eta.'''
    return np.append(np.dot(X.T, invlogit(eta)) - m,
                     np.zeros(X.shape[0] - X.shape[1]))