def cluster_experiment():
    """Run repeated clustering experiments on synthetic mixture data and tally
    how often each model-selection criterion (cross-validation by voting /
    averaging / std, plus AIC and BIC) picks each candidate cluster count k.

    Side effects: writes the per-run selections via save_dict_to_csv and the
    stacked per-run mismatch matrices to ./result/distance_dist.csv.
    """
    p = [0.33, 0.33, 0.34]      # mixture proportions fed to sample_generator
    max_k = 5                   # candidate k ranges over 2..max_k
    n_factors = 1
    # Tally arrays: index 0 corresponds to k=2, index max_k-2 to k=max_k.
    res = {'cv': np.zeros(max_k - 1),
           'ca': np.zeros(max_k - 1),
           'cstd': np.zeros(max_k - 1),
           'aic': np.zeros(max_k - 1),
           'bic': np.zeros(max_k - 1)}
    result_list = []
    count = 100
    # Start with ZERO rows: the previous seed row of zeros was never removed
    # and leaked into the saved CSV as a spurious all-zero first row.
    distance_dist = np.zeros((0, max_k - 1), dtype=int)
    for X, Y in sample_generator(2, p, 600):
        print(count)
        result = {'id': count}
        # record the input data
        # np.savetxt("intput_data_%d.csv" % count, X, delimiter=',')

        # Fit MFA models with cross-validation over candidate k.
        mfa_cluster = MFACluster(X, max_k, n_factors, Y, iters=10)
        mfa_cluster.fit()
        for key, method in (('cv', 'voting'), ('ca', 'averaging'), ('cstd', 'std')):
            best = mfa_cluster.best_k(method)
            res[key][best - 2] += 1
            result[key] = best
        distance_dist = np.vstack((distance_dist, mfa_cluster.result_matrix))

        # Fit MFA to the whole dataset, selecting k by AIC and BIC.
        # float('inf') replaces sys.maxint (Python-2-only, removed in py3).
        min_aic = min_bic = float('inf')
        k_aic = k_bic = 0
        for k in range(2, max_k + 1):
            m = MixtureFA(k, n_factors)
            m.fit(X)
            aic = m.aic()   # hoisted: previously recomputed up to 3x per k
            bic = m.bic()
            result['lik_%d' % k] = m.ll[-1]
            result['aic_%d' % k] = aic
            result['bic_%d' % k] = bic
            if aic < min_aic:
                k_aic, min_aic = k, aic
            if bic < min_bic:
                k_bic, min_bic = k, bic
        res['aic'][k_aic - 2] += 1
        res['bic'][k_bic - 2] += 1
        result['aic'] = k_aic
        result['bic'] = k_bic

        # Collect per-run result: iter id, likelihood/aic/bic per k, and the
        # k chosen by each criterion; dumped to CSV after the loop.
        result_list.append(result)
        count += 1
    save_dict_to_csv(result_list)
    np.savetxt(os.path.join('.', 'result', "distance_dist.csv"),
               distance_dist, delimiter=',')
    print(res)
def mfa_experiment(n_mixtures, output=False):
    """Fit a MixtureFA model repeatedly on synthetic samples and report the
    averaged parameter estimates (mu, lambda.dot(lambda.T), phi, pi).

    Parameters
    ----------
    n_mixtures : int
        Number of mixture components for each MixtureFA fit.
    output : bool
        When True, write the across-run parameter means to ./result_seed0.txt.
    """
    p = [0.1, 0.3, 0.6]
    mus, pis, pis_diff, phis, lls, lambdas = [], [], [], [], [], []
    count = 0
    for X, ratio in sample_generator(1, p, 100):
        count += 1
        mfa = MixtureFA(n_mixtures, 1)
        mfa.fit(X)
        # np.savetxt("data.csv", X, delimiter=',')
        mus.append(mfa.mu)
        pis.append(mfa.pi)
        phis.append(mfa.phi)
        pis_diff.append(np.abs(mfa.pi - np.array(ratio)))
        # Loading matrices are stacked row-wise in mfa.lambdas, so vsplit
        # recovers one matrix per component; store each outer product.
        lambdas.append(np.array([l.dot(l.T)
                                 for l in np.vsplit(mfa.lambdas, n_mixtures)]))
        lls.append(mfa.ll[-1])
        if count % 10 == 0:
            # print() calls replace py2 print statements for consistency
            # with the print-function usage already in this function.
            print("finished %d" % count)
            print(mfa.mu)
            print(mfa.pi)
    if output:
        with open(os.path.join('.', 'result_seed0.txt'), 'w') as outfile:
            # One uniform section per parameter: title line, then the mean
            # of the collected estimates across all runs.
            sections = [("Mean of MU", mus),
                        ("Mean of lambda.T.dot(lambda)", lambdas),
                        ("Mean of Phi", phis),
                        ("Mean of Pi", pis)]
            for title, values in sections:
                outfile.write(title + "\n")
                outfile.write(np.array2string(np.mean(np.array(values), axis=0),
                                              separator=',') + '\n')
def fit(self):
    """Run the cross-validated model-selection loop.

    For each of self.n_iters random splits, fit two MixtureFA models (one
    per half) for every candidate k in [2, self.max_k] and record their
    mismatch on the held-out validation set in
    self.result_matrix[iteration, k - 2].
    """
    # Explicit list so _split_data gets a concrete, shuffleable sequence on
    # both py2 and py3 (py3 range is lazy) — confirm _split_data's needs.
    permu_list = list(range(self.n_obs))
    # NOTE: the dead `iteration = 0` pre-assignment was removed; the for
    # loop binds `iteration` itself.
    for iteration in range(self.n_iters):
        (m1_data, m2_data, validation,
         m1_label, m2_label, val_label) = self._split_data(permu_list)
        # TODO: the per-k fits are independent and could run in parallel.
        for k in range(2, self.max_k + 1):
            print("%d's iteration with k=%d" % (iteration, k))
            m1 = MixtureFA(k, self.n_factors)
            m1.fit(m1_data)
            m2 = MixtureFA(k, self.n_factors)
            m2.fit(m2_data)
            self.result_matrix[iteration, k - 2] = self._mismatch(m1, m2, validation)