def optimize_logit_for(self, pair):
    """Fit a two-class logistic model for the two labels named in *pair*.

    The rows of ``./data/kaggle/train.csv`` whose first column equals either
    label are selected; the remaining columns are used as features.

    Args:
        pair: Two-element sequence (for example the string ``'38'``).  Each
            element is converted with ``int``.  Rows carrying the first
            label get target 1, rows carrying the second label get target 0.

    Returns:
        The value returned by ``logit.optimizeThetas`` for the selected rows
        (presumably the optimised parameter vector -- confirm against the
        ``logit`` module).
    """
    a, b = int(pair[0]), int(pair[1])

    # Column 0 of the CSV is the label, every other column is a feature.
    data = np.array(pd.read_csv('./data/kaggle/train.csv', header=0)).astype('float64')
    labels_col = data[:, 0]
    data_ab = np.vstack([data[labels_col == a], data[labels_col == b]])

    xt = data_ab[:, 1:]
    labels = data_ab[:, 0].astype('int8')
    # Single-argument form prints the same text on Python 2 and Python 3.
    print('xt shape %s' % (xt.shape,))

    # Prepend a column of ones so the first theta multiplies a constant.
    xt2 = np.column_stack([np.ones((xt.shape[0], 1)), xt])
    # Binary targets: 1 where the row carries label ``a``, 0 otherwise.
    yt2 = (labels == a).astype('int8')

    # Small random starting point, one row per column of xt2.
    ini_thetas = 0.005 * np.random.rand(xt2.shape[1], 1)
    # Forwarded as the fourth positional argument of optimizeThetas.
    L = 1e+5
    return logit.optimizeThetas(ini_thetas, xt2, yt2, L, visual=False)
def test_optimize_thetas_reg():
    """Check ``optimizeThetas`` (started from zeros) against reference values.

    Every returned parameter must lie within 0.01 of its stored reference
    value, in either direction.
    """
    model = Logysterical(tr, None)
    ini_thetas = np.zeros((model.tr["X"].shape[1], 1))
    pp = model.optimizeThetas(ini_thetas)
    print(pp)
    expected = np.array(
        [
            1.273005,
            0.624876,
            1.177376,
            -2.020142,
            -0.912616,
            -1.429907,
            0.125668,
            -0.368551,
            -0.360033,
            -0.171068,
            -1.460894,
            -0.052499,
            -0.618889,
            -0.273745,
            -1.192301,
            -0.240993,
            -0.207934,
            -0.047224,
            -0.278327,
            -0.296602,
            -0.453957,
            -1.045511,
            0.026463,
            -0.294330,
            0.014381,
            -0.328703,
            -0.143796,
            -0.924883,
        ]
    )
    # Flatten so a column-vector result is compared element by element
    # instead of being broadcast against the 1-D reference.
    # Use the absolute difference: a signed difference lets any value far
    # *below* its reference slip through the ``< 0.01`` check.
    diff = np.abs(np.ravel(pp) - expected)
    for e in diff:
        assert e < 0.01
def check_accuracy(self):
    """Print the miss count and accuracy of the two-step classifier.

    Step one scores every row of ``self.xe`` with ``soft.h`` using the
    parameters stored in ``./data/kaggle/optimized_thetas.csv``.  When the
    best score is below 0.99 and the runner-up score is above 0.01, step two
    asks a pairwise logistic model (built lazily by
    ``self.optimize_logit_for`` and cached per label pair) to decide between
    the two candidates.  Predictions are compared with ``self.ye[i, 0]``.

    Returns:
        None.  Results are printed.
    """
    # Cache of pairwise models, keyed by "<smaller label><larger label>".
    # NOTE: the key is a plain concatenation and optimize_logit_for reads
    # pair[0] / pair[1], so this scheme only works for single-digit labels.
    logit_thetas = {}
    soft_thetas = np.array(pd.read_csv('./data/kaggle/optimized_thetas.csv', header=None))
    soft_thetas = soft_thetas.reshape(self.LABS, -1)
    h = soft.h(soft_thetas, self.xe)
    m = h.shape[0]

    # Kept as floats so the printed values and the division below behave the
    # same on Python 2 and Python 3.
    misses = 0.0
    count = 0.0
    for i in range(m):
        true_label = self.ye[i, 0]
        scores = h[i, :]
        # Indices of the highest and second-highest score, in that order.
        ml_1, ml_2 = scores.argsort()[-2:][::-1]
        p1, p2 = scores[ml_1], scores[ml_2]

        prediction = ml_1
        if p1 < 0.99 and p2 > 0.01:
            # The pairwise model is trained with the smaller label as the
            # positive class, so order the candidates before building the key.
            lo, hi = (ml_1, ml_2) if ml_1 <= ml_2 else (ml_2, ml_1)
            # str(int(...)) gives plain digits regardless of how numpy
            # renders its scalar repr.
            s = str(int(lo)) + str(int(hi))
            if s not in logit_thetas:
                count += 1
                logit_thetas[s] = self.optimize_logit_for(s)
            logix = np.hstack([1, self.xe[i, :]])
            p = logit.h(logit_thetas[s], logix)
            prediction = lo if p > 0.5 else hi

        if prediction != true_label:
            misses += 1.0

    print('misses %s' % misses)
    print('logit thetas searched %s' % count)
    acc = 1 - misses / m
    print('accuracy: %s' % acc)
def test_model_submit(self):
    """Predict labels for ``self.x_test`` and write them to a CSV file.

    Step one scores every row with ``soft.h`` using the parameters stored in
    ``./data/kaggle/submit_optimized_thetas.csv``.  When the best score is
    below 0.99 and the runner-up score is above 0.01, a pairwise logistic
    model (built lazily by ``self.optimize_logit_for`` and cached per label
    pair) chooses between the two candidates.

    The result, one ``row_number,label`` line per test row with row numbers
    starting at 1, is saved to ``./data/kaggle/predictions_2steps.csv``.
    The first ten rows are also printed.

    Returns:
        None.
    """
    # Cache of pairwise models, keyed by "<smaller label><larger label>".
    # NOTE: the key is a plain concatenation and optimize_logit_for reads
    # pair[0] / pair[1], so this scheme only works for single-digit labels.
    logit_thetas = {}
    soft_thetas = np.array(pd.read_csv('./data/kaggle/submit_optimized_thetas.csv', header=None))
    soft_thetas = soft_thetas.reshape(self.LABS, -1)
    m = self.x_test.shape[0]
    h = soft.h(soft_thetas, self.x_test)

    predictions = np.zeros((m, 2))
    for i in range(m):
        scores = h[i, :]
        # Indices of the highest and second-highest score, in that order.
        ml_1, ml_2 = scores.argsort()[-2:][::-1]
        p1, p2 = scores[ml_1], scores[ml_2]

        label = ml_1
        if p1 < 0.99 and p2 > 0.01:
            # The pairwise model is trained with the smaller label as the
            # positive class, so order the candidates before building the key.
            lo, hi = (ml_1, ml_2) if ml_1 <= ml_2 else (ml_2, ml_1)
            # str(int(...)) gives plain digits regardless of how numpy
            # renders its scalar repr.
            s = str(int(lo)) + str(int(hi))
            if s not in logit_thetas:
                logit_thetas[s] = self.optimize_logit_for(s)
            logix = np.hstack([1, self.x_test[i, :]])
            p = logit.h(logit_thetas[s], logix)
            label = lo if p > 0.5 else hi

        predictions[i, :] = [i + 1, label]

    print('To submitt add header: ImageId,Label')
    print(predictions[0:10, :])
    np.savetxt('./data/kaggle/predictions_2steps.csv', predictions, fmt='%i,%i')
def test_J_with_lam_and_zero_theta():
    """Check that ``j`` evaluated at an all-zero theta is close to ln 2.

    The reference constant 0.69314718056 is ln 2 rounded to 11 decimals.
    """
    model = Logysterical(tr)
    theta = np.zeros((model.tr["X"].shape[1], 1))
    # Compare the absolute error: a signed difference would accept any cost
    # that is far *below* the reference value.
    assert abs(model.j(theta) - 0.69314718056) < epsilon