def test(X, y, learned_params):
    """Score the learned per-class logistic models on a labelled data set.

    A bias column of ones is prepended to X, every weight vector in
    learned_params is turned into per-instance probabilities with
    Utils.logistic_transformation, and each instance is assigned the class
    whose probability is the largest for that instance.

    X              -- 2-d array, one row per instance, one column per feature.
    y              -- actual class labels, one per row of X.
    learned_params -- dict holding one weight vector per class.
                      NOTE(review): entries are visited in sorted key order
                      and matched positionally against
                      range(min_class_label, max_class_label+1); this assumes
                      the keys sort in class-label order -- confirm.

    returns the tuple (accuracy, f_score_mean, f_score_std, error_rate) as
    computed by the Utils helpers.

    raises ValueError if learned_params is empty.
    """
    N = np.shape(X)[0]  # no of instances
    # appending a column of ones as bias (used in logistic regression
    # weights prediction)
    X = np.append(np.ones((N, 1)), X, 1)

    # Sorted keys make the row order deterministic, so that row k really
    # belongs to output_label[k] regardless of dict iteration order.
    class_prob = [Utils.logistic_transformation(learned_params[w], X)
                  for w in sorted(learned_params)]
    if not class_prob:
        raise ValueError("learned_params is empty")

    # One row per class, one column per instance (whatever vector/column
    # shape the transformation returned).
    prob_matrix = np.asarray(class_prob).reshape(len(class_prob), -1)

    output_label = list(range(min_class_label, max_class_label + 1))
    # argmax works column by column.  The previous lookup searched the whole
    # matrix for the maximum value, so an equal probability belonging to a
    # different instance could select the wrong class.
    predicted_y = [output_label[k] for k in np.argmax(prob_matrix, axis=0)]

    print("predicted y : %s" % (predicted_y,))
    print("Actual y: %s" % (y,))

    actual = np.array(y)
    predicted = np.array(predicted_y)
    accuracy = Utils.calculate_accuracy(actual, predicted)
    print("accuracy for test data : %s" % (accuracy,))
    f_score_mean, f_score_std = Utils.calculate_average_F1score(
        actual, predicted, min_class_label, max_class_label)
    print("Average f score for test data : %s" % (f_score_mean,))
    error_rate = Utils.calculate_error_rate(actual, predicted)
    return (accuracy, f_score_mean, f_score_std, error_rate)
def Estep(x, w, a, b):
    """E-step of the EM loop.

    Evaluates p*a / (p*a + (1-p)*b) in log space, where
    p = Utils.logistic_transformation(w, x).

    x -- design matrix handed to the logistic transformation.
    w -- current weight vector.
    a -- per-instance terms; callers obtain them from Utils.a_calculations.
    b -- per-instance terms; callers obtain them from Utils.b_calculations.

    returns the resulting array (presumably the estimated probability of the
    positive label for each instance -- confirm against the caller).
    """
    prob = Utils.logistic_transformation(w, x)
    log_numerator = np.log(prob) + np.log(a)
    log_denominator = np.log(prob * a + (1 - prob) * b)
    return np.exp(log_numerator - log_denominator)
def train(training_data, set_id):
    """Fit one Crowds_EM model for every expert-group size in no_of_experts.

    For group index ex a list of expert error rates is built (0.90 for the
    expert_bads share of the group, 0.20 for the expert_goods share) and
    handed to Crowds_EM as expert_wrong_percentage.  Fitting is retried
    because run_EM_missing() may raise; the results of the first successful
    fit are stored in EM_result[ex].  A group for which every attempt fails
    gets no entry in EM_result.

    training_data -- passed through unchanged to Crowds_EM.
    set_id        -- passed through unchanged to Utils.initPlot.

    Reads the module-level settings no_of_experts, expert_bads, expert_goods,
    min_class_label, max_class_label, verbose_output and synthetic_data.
    """
    import traceback  # only needed to report failed fits

    EM_result = {}
    Utils.initPlot(len(no_of_experts), set_id)
    for ex, group_size in enumerate(no_of_experts):
        print("For group  %s" % (ex,))

        # generate experts: bad ones first, then good ones
        n_bad = int(group_size * expert_bads)
        n_good = int(group_size * expert_goods)
        expert_wrong_percentage = [0.90] * n_bad + [0.20] * n_good

        crowds_EM = None
        failed = 0
        total_iter = 10
        # Same budget as before: two attempts in each of total_iter rounds.
        max_attempts = 2 * total_iter
        for _ in range(max_attempts):
            try:
                crowds_EM = Crowds_EM(
                    training_data, min_class_label, max_class_label,
                    expert_wrong_percentage,
                    verbose=verbose_output, synthetic=synthetic_data)
                crowds_EM.run_EM_missing()
            except Exception:
                # Best effort by design: a failed fit is reported and retried.
                # print_exc() writes the traceback itself, so its return
                # value (None) must not be printed.
                traceback.print_exc()
                failed += 1
            else:
                EM_result[ex] = crowds_EM.results
                break
        else:
            print("warning: no successful Crowds_EM fit for group %s after %s attempts"
                  % (ex, max_attempts))
        # Number of completely failed rounds, as counted by the original
        # loop; kept because code following this span may refer to it.
        iterations = failed // 2
def logistic_regression(x, y, beta_start=None, verbose=False, CONV_THRESH=1.e-3, MAXIT=500):
    """
    Uses the Newton-Raphson algorithm to calculate maximum likelihood
    estimates of a logistic regression.

    Can handle the multivariate case (more than one predictor).

    x - 2-d array of predictors, one row per predictor and one column per
        observation (npreds = x.shape[0], N = x.shape[1]).  A 1-d x is
        handed over to simple_logistic_regression.
    y - binary outcomes (len(y) = x.shape[-1])
    beta_start - initial beta vector (default zeros(npreds+1, x.dtype))
    verbose - if True, diagnostics are printed for each iteration.
    MAXIT - max number of iterations (default 500)
    CONV_THRESH - convergence threshold on the absolute change of the
        log-likelihood between two consecutive iterations

    returns beta, the npreds+1 logistic regression coefficients (the
    coefficient of the constant term comes first).  For a 1-d x the return
    value is whatever simple_logistic_regression returns.

    raises ValueError if x.shape[-1] and len(y) differ.

    A warning is printed when MAXIT iterations pass without convergence.
    """
    if x.shape[-1] != len(y):
        raise ValueError("x.shape[-1] and y should be the same length!")
    try:
        N, npreds = x.shape[1], x.shape[0]
    except IndexError:
        # single predictor, use simple logistic regression routine.
        return simple_logistic_regression(
            x, y, beta_start=beta_start,
            CONV_THRESH=CONV_THRESH, MAXIT=MAXIT, verbose=verbose)

    if beta_start is None:
        beta_start = np.zeros(npreds + 1, x.dtype)
    # Design matrix: a row of ones (constant term) on top of the predictors.
    X = np.ones((npreds + 1, N), x.dtype)
    X[1:, :] = x
    Xt = np.transpose(X)

    def log_likelihood(coef):
        # logaddexp keeps log(sigmoid(z)) and log(1 - sigmoid(z)) finite
        # for large |z|.
        z = np.dot(coef, X)
        return np.sum(y * -np.logaddexp(0, -z) + (1 - y) * -np.logaddexp(0, z))

    # initial values
    iteration = 0
    diff = 1.
    beta = beta_start
    l = log_likelihood(beta)

    if verbose:
        print('Logistic Regression : ')
        print('iteration beta log-likliehood |log-log_old|')

    while iteration < MAXIT:
        beta_old = beta
        l_old = l
        p = Utils.logistic_transformation(beta.T, X.T).T
        s = np.dot(X, y - p)                              # scoring function
        J_bar = np.dot(X * np.multiply(p, 1. - p), Xt)    # information matrix
        beta = beta_old + invertAdotB(J_bar, s)           # Newton-Raphson step
        l = log_likelihood(beta)
        diff = np.fabs(l - l_old)
        if verbose:
            print("%s %s %s %s" % (iteration + 1, beta, l, diff))
        # ">=" rather than ">": an exact fixed point (l == l_old) is
        # converged too and must not keep the loop running until MAXIT.
        if diff <= CONV_THRESH and l >= l_old:
            break
        iteration += 1

    if iteration == MAXIT and diff > CONV_THRESH:
        print('warning: convergence not achieved with threshold of %s in %s iterations'
              % (CONV_THRESH, MAXIT))
    return beta
def run_EM_missing(self):
    """Run the EM loop once per class label, re-filling missing expert answers.

    For every class label between self.min_class_label and
    self.max_class_label (inclusive) the binary targets and expert answers
    are fetched with self.binary_y_experty.  In the first pass, answers
    equal to -1 are overwritten with one random label per expert, the mean
    of the expert answers becomes the first estimate, and two reference
    classifiers are fitted with NR.logistic_regression ('weights_mv' on the
    mean answers, 'weights_at' on y_observed).  In later passes the
    positions that are -1 in self.experty are refreshed from
    self.learn_experty_missing before the E-step.  Every pass ends with the
    M-step and a log-likelihood based stopping test.

    Stores 'weights', 'alpha' and 'beta' (the latter two rounded to one
    decimal) per class in self.results.  Returns None.
    """
    for class_no in range(self.min_class_label, self.max_class_label + 1):
        y_observed, experty_observed = self.binary_y_experty(class_no)

        # random initializations for this class label
        weights = np.random.random(self.F)
        alpha = np.random.random(self.E)  # expert sensitivity
        beta = np.random.random(self.E)   # expert specificity

        loglik = 0
        em_iter = 0
        while em_iter < self.MAXITER:
            if em_iter == 0:
                # First iteration
                prev_loglik = 0
                for e in range(self.E):
                    answers = experty_observed[e]
                    answers[answers == -1] = randrange(
                        self.min_class_label, self.max_class_label + 1)

                # The leading empty array keeps the float result type that
                # repeated np.append calls on np.array([]) produce.
                stacked = np.concatenate(
                    [np.array([])]
                    + [experty_observed[e] for e in experty_observed],
                    axis=0)
                expertcombined = np.reshape(
                    stacked, (self.E, self.Training_instances))
                y_predicted = np.average(expertcombined, axis=0)
                y_majority_voting = y_predicted.copy()

                predictors = self.Training_x[:, 1:].T
                self.results['weights_mv'][class_no] = NR.logistic_regression(
                    predictors, np.asarray(y_majority_voting).reshape(-1),
                    verbose=False, MAXIT=10000)
                self.results['weights_at'][class_no] = NR.logistic_regression(
                    predictors, np.asarray(y_observed).reshape(-1),
                    verbose=False, MAXIT=10000)
            else:
                prev_loglik = loglik
                experty_learnt = self.learn_experty_missing(
                    alpha, beta, y_observed)
                for e in experty_observed:
                    for m in np.where(self.experty[e] == -1):
                        experty_observed[e][m] = experty_learnt[e][m]

                a = Utils.a_calculations(alpha, experty_observed, self.y_shape)
                b = Utils.b_calculations(beta, experty_observed, self.y_shape)
                # E-step
                y_predicted = np.asarray(
                    EM.Estep(self.Training_x, weights, a, b)).reshape(-1)

            # M-step
            weights, alpha, beta = EM.Mstep(
                self.Training_x, y_predicted, experty_observed)
            a = Utils.a_calculations(alpha, experty_observed, self.y_shape)
            b = Utils.b_calculations(beta, experty_observed, self.y_shape)
            loglik = self.calculate_loglikelihood(y_predicted, weights, a, b)

            diff = np.fabs(loglik - prev_loglik)
            if diff <= self.CONV_THRESH and loglik >= prev_loglik:
                break
            em_iter += 1
            if self.verbose:
                print("EM algorithm : diff: %s log: %s iteration: %s"
                      % (diff, loglik, em_iter))

        self.results['weights'][class_no] = weights
        self.results['alpha'][class_no] = alpha.round(1)
        self.results['beta'][class_no] = beta.round(1)

        # Disabled diagnostics, kept for reference:
        # self.results['loglikelihood'][class_no] = loglik
        # self.results['EM_perf']['f1_Score'][class_no] = Utils.calculate_F1score(y_observed, y_predicted)
        # self.results['MV_perf']['f1_Score'][class_no] = Utils.calculate_F1score(y_observed, y_majority_voting)
        # self.results['EM_perf']['rmse'][class_no] = Utils.calculate_RMSE(y_observed, y_predicted)
        # self.results['MV_perf']['rmse'][class_no] = Utils.calculate_RMSE(y_observed, y_majority_voting)
        # self.results['experty'][class_no] = experty_observed
        # fig = plt.figure()
        # ax = fig.add_subplot(111)
        # ax.set_title('class :' + str(class_no))
        # ax.set_ylim(-1,2)
        # ax.plot(y_observed,'ro-',y_predicted,'-b.')

        if self.verbose:
            print("alphacap :")
            pprint(alpha.round(1))
            print("betacap :")
            pprint(beta.round(1))
            print("weights :")
            pprint(weights)
            # NOTE(review): the assignments that filled the two f1_Score
            # entries are disabled above; unless they are populated
            # elsewhere these lookups may fail -- confirm.
            print("f1_Score of EM approach :")
            print(self.results['EM_perf']['f1_Score'][class_no])
            print("f1_Score of majority voting approach :")
            print(self.results['MV_perf']['f1_Score'][class_no])
            print("y")
            print(y_observed)
            print("y maj")
            print(y_majority_voting.round(2))
            print("y pred")
            print(y_predicted.round(2))
            print("Expert wrong percentage")
            print(self.expert_wrong_percentage)
            print('--' * 30)
def run(self):
    """Run the EM loop once per class label.

    For every class label between self.min_class_label and
    self.max_class_label (inclusive) the binary targets and expert answers
    are fetched with self.binary_y_experty.  The first pass derives two
    baselines from the expert answers -- their mean per instance and the
    most frequent answer per instance -- and fits three reference
    classifiers with NR.logistic_regression ('weights_avg' on the mean,
    'weights_mv' on the most frequent answer, 'weights_at' on y_observed).
    The most frequent answer is also the first estimate handed to the
    M-step.  Later passes run the E-step first.  Every pass ends with the
    M-step and a log-likelihood based stopping test.

    Stores 'weights', 'alpha' and 'beta' (the latter two rounded to one
    decimal) per class in self.results.  Returns None.
    """
    for class_no in range(self.min_class_label, self.max_class_label + 1):
        y_observed, experty_observed = self.binary_y_experty(class_no)

        # random initializations for this class label
        weights = np.random.random(self.F)
        alpha = np.random.random(self.E)  # expert sensitivity
        beta = np.random.random(self.E)   # expert specificity

        loglik = 0
        em_iter = 0
        while em_iter < self.MAXITER:
            if em_iter == 0:
                # First iteration
                prev_loglik = 0
                # The leading empty array keeps the float result type that
                # repeated np.append calls on np.array([]) produce.
                stacked = np.concatenate(
                    [np.array([])]
                    + [experty_observed[e] for e in experty_observed],
                    axis=0)
                expertcombined = np.reshape(
                    stacked, (self.E, self.Training_instances))
                y_average = np.average(expertcombined, axis=0)

                # One row per instance holding every expert's answer; the
                # transpose equals the Fortran-order flatten and reshape.
                mv_expert_combined = expertcombined.T
                y_predicted = np.array(
                    [np.bincount(emv.astype(int)).argmax()
                     for emv in mv_expert_combined],
                    dtype=float)

                # Classifier with MV as input
                predictors = self.Training_x[:, 1:].T
                self.results['weights_avg'][class_no] = NR.logistic_regression(
                    predictors, np.asarray(y_average).reshape(-1),
                    verbose=False, MAXIT=10000)
                self.results['weights_mv'][class_no] = NR.logistic_regression(
                    predictors, np.asarray(y_predicted).reshape(-1),
                    verbose=False, MAXIT=10000)
                self.results['weights_at'][class_no] = NR.logistic_regression(
                    predictors, np.asarray(y_observed).reshape(-1),
                    verbose=False, MAXIT=10000)
            else:
                prev_loglik = loglik
                a = Utils.a_calculations(alpha, experty_observed, self.y_shape)
                b = Utils.b_calculations(beta, experty_observed, self.y_shape)
                # E-step
                y_predicted = np.asarray(
                    EM.Estep(self.Training_x, weights, a, b)).reshape(-1)

            # M-step
            weights, alpha, beta = EM.Mstep(
                self.Training_x, y_predicted, experty_observed)
            a = Utils.a_calculations(alpha, experty_observed, self.y_shape)
            b = Utils.b_calculations(beta, experty_observed, self.y_shape)
            loglik = self.calculate_loglikelihood(y_predicted, weights, a, b)

            diff = np.fabs(loglik - prev_loglik)
            if diff <= self.CONV_THRESH and loglik >= prev_loglik:
                break
            em_iter += 1
            if self.verbose:
                print("EM algorithm : diff: %s log: %s iteration: %s"
                      % (diff, loglik, em_iter))

        self.results['weights'][class_no] = weights
        self.results['alpha'][class_no] = alpha.round(1)
        self.results['beta'][class_no] = beta.round(1)

        if self.verbose:
            print("alphacap :")
            pprint(alpha.round(1))
            print("betacap :")
            pprint(beta.round(1))
            print("weights :")
            pprint(weights)
            # Disabled diagnostics, kept for reference:
            # print "f1_Score of EM approach :"
            # print self.results['EM_perf']['f1_Score'][class_no]
            # print "f1_Score of majority voting approach :"
            # print self.results['MV_perf']['f1_Score'][class_no]
            print("y")
            print(y_observed)
            print("y maj")
            print(y_average.round(2))
            print("y pred")
            print(y_predicted.round(2))
            print("Expert wrong percentage")
            print(self.expert_wrong_percentage)
            print('--' * 30)
print "Final results :\n" pprint(EM_result) #for e in xrange(len(no_of_experts)): # Utils.visualize( EM_result[e], min_class_label, max_class_label, no_of_experts[e] ) return EM_result def test(test_data): print "Test data:" pprint (test_data) #k_fold_cross_validation() train(data, 0) Utils.showPlot() """else: EM_perf = crowds_EM.predict_EM(crowds_EM.x, crowds_EM.y) print "EM perf ", EM_perf EM_acc += EM_perf if EM_perf > EM_highest_performance['accuracy']: EM_highest_performance['accuracy'] = EM_perf EM_highest_performance['results'] = crowds_EM.results MV_acc += crowds_EM.predict_MV(crowds_EM.x, crowds_EM.y) #np.save('X.npy', crowds_EM.x)""" """print "No. of failed iterations : ", failed print "Average EM accuracy after ", iterations," iter : ", EM_acc/(total_iter-failed)