Exemplo n.º 1
0
def cluster_experiment():
    """Compare criteria for choosing the number of mixture components.

    For every dataset yielded by ``sample_generator(2, p, 600)`` this:

    * fits an ``MFACluster`` and asks it for the best ``k`` under the
      "voting", "averaging" and "std" rules;
    * fits one ``MixtureFA`` per candidate ``k`` in ``2..max_k`` on the whole
      dataset and picks the ``k`` with the smallest AIC and the smallest BIC;
    * tallies how often each criterion selected each ``k`` and keeps a
      per-dataset record (id, selected k per criterion, and the final
      log-likelihood / AIC / BIC for every candidate ``k``).

    Side effects: prints the running dataset id and the final tally, hands the
    per-dataset records to ``save_dict_to_csv`` and writes the stacked
    ``result_matrix`` rows to ``./result/distance_dist.csv`` (the directory is
    created when missing).
    """
    p = [0.33, 0.33, 0.34]
    max_k = 5
    n_factors = 1
    n_candidates = max_k - 1  # candidate k values are 2..max_k

    # Tally vectors: index i counts how often a criterion selected k = i + 2.
    res = {name: np.zeros(n_candidates)
           for name in ('cv', 'ca', 'cstd', 'aic', 'bic')}
    result_list = []

    # NOTE(review): the leading all-zero row is an artefact of the original
    # "vstack onto a seed array" approach; it is kept so the CSV layout does
    # not change for existing consumers -- confirm whether it can be dropped.
    distance_rows = [np.zeros((1, n_candidates), dtype=int)]

    # (result key, name of the MFACluster.best_k selection rule)
    selectors = (('cv', 'voting'), ('ca', 'averaging'), ('cstd', 'std'))

    # Dataset ids start at 100, as in the original counter.
    for count, (X, Y) in enumerate(sample_generator(2, p, 600), 100):
        print(count)
        result = {'id': count}

        # fit mfa model with validation
        mfa_cluster = MFACluster(X, max_k, n_factors, Y, iters=10)
        mfa_cluster.fit()
        for key, rule in selectors:
            # Query once so the tally and the per-dataset record always agree.
            best_k = mfa_cluster.best_k(rule)
            res[key][best_k - 2] += 1
            result[key] = best_k
        distance_rows.append(mfa_cluster.result_matrix)

        # fit mfa model to whole dataset with aic and bic
        min_aic = min_bic = float('inf')
        k_aic = k_bic = 0
        for k in range(2, max_k + 1):
            m = MixtureFA(k, n_factors)
            m.fit(X)
            aic, bic = m.aic(), m.bic()
            result['lik_%d' % k] = m.ll[-1]
            result['aic_%d' % k] = aic
            result['bic_%d' % k] = bic
            if aic < min_aic:
                k_aic = k
                min_aic = aic
            if bic < min_bic:
                k_bic = k
                min_bic = bic

        # k stays 0 when no comparison succeeded (e.g. every score was NaN);
        # skip the tally then, otherwise index -2 would silently credit k=4.
        if k_aic:
            res['aic'][k_aic - 2] += 1
        if k_bic:
            res['bic'][k_bic - 2] += 1
        result['aic'] = k_aic
        result['bic'] = k_bic

        # collect result: id, likelihood_k, aic_k, bic_k, cv, ca, cstd
        result_list.append(result)

    save_dict_to_csv(result_list)

    out_dir = os.path.join('.', 'result')
    if not os.path.isdir(out_dir):
        # Avoid losing the whole run just because the folder is missing.
        os.makedirs(out_dir)
    # Stack once at the end instead of re-copying the array on every dataset.
    distance_dist = np.vstack(distance_rows)
    np.savetxt(os.path.join(out_dir, "distance_dist.csv"), distance_dist, delimiter=',')
    print(res)
Exemplo n.º 2
0
def mfa_experiment(n_mixtures, output=False):
    """Fit a ``MixtureFA`` to repeated samples and summarise the estimates.

    For every dataset yielded by ``sample_generator(1, p, 100)`` a fresh
    ``MixtureFA(n_mixtures, 1)`` is fitted and its ``mu``, ``pi``, ``phi`` and
    the per-component product ``l.dot(l.T)`` of the loading blocks (obtained
    by splitting ``mfa.lambdas`` vertically into ``n_mixtures`` parts) are
    collected. Progress is printed every 10 datasets; ``mu`` and ``pi`` of the
    last fitted model are printed at the end.

    Args:
        n_mixtures: number of mixture components passed to ``MixtureFA``.
        output: when true, write the element-wise means of the collected
            estimates to ``./result_seed0.txt``.

    Returns:
        None. If the generator yields no dataset, nothing is printed or
        written.
    """
    p = [0.1, 0.3, 0.6]
    mus = []
    pis = []
    phis = []
    lambdas = []
    mfa = None
    # The second item of each sample (the mixing ratio) is not needed here.
    for count, (X, _ratio) in enumerate(sample_generator(1, p, 100), 1):
        mfa = MixtureFA(n_mixtures, 1)
        mfa.fit(X)

        mus.append(mfa.mu)
        pis.append(mfa.pi)
        phis.append(mfa.phi)
        lambdas.append(np.array([l.dot(l.T) for l in np.vsplit(mfa.lambdas, n_mixtures)]))
        if count % 10 == 0:
            print("finished %d" % count)

    if mfa is None:
        # No dataset was generated: there is no model to report on.
        return

    print(mfa.mu)
    print(mfa.pi)
    if output:
        # NOTE(review): the lambda label says lambda.T.dot(lambda) while the
        # collected quantity is l.dot(l.T); the label is left untouched so the
        # file format stays the same -- confirm which one is intended.
        sections = (
            ("Mean of MU\n", mus),
            ("Mean of lambda.T.dot(lambda)\n", lambdas),
            ("Mean of Phi\n", phis),
            ("Mean of Pi\n", pis),
        )
        with open(os.path.join('.','result_seed0.txt'), 'w') as outfile:
            for title, samples in sections:
                outfile.write(title)
                outfile.write(np.array2string(np.mean(np.array(samples), axis=0), separator=',') + '\n')
Exemplo n.º 3
0
 def fit(self):
     """Fill ``self.result_matrix`` with model-mismatch scores.

     For each of ``self.n_iters`` iterations the data is split through
     ``self._split_data`` into two training parts and a validation part.
     For every ``k`` in ``2..self.max_k`` one ``MixtureFA(k, self.n_factors)``
     is fitted on each training part, and the value returned by
     ``self._mismatch(m1, m2, validation)`` is stored at
     ``self.result_matrix[iteration, k - 2]``.
     """
     # list(...) keeps this a real list on Python 3 as well, where range()
     # is lazy; on Python 2 it is just a copy of the same list.
     permu_list = list(range(self.n_obs))
     for iteration in range(self.n_iters):
         # The label outputs of the split are not used by this method.
         m1_data, m2_data, validation, m1_label, m2_label, val_label = self._split_data(permu_list)
         # paralle model fit?
         for k in range(2, self.max_k + 1):
             print ("%d's iteration with k=%d" % (iteration, k))
             m1 = MixtureFA(k, self.n_factors)
             m1.fit(m1_data)
             m2 = MixtureFA(k, self.n_factors)
             m2.fit(m2_data)
             # Column k - 2 because the smallest candidate k is 2.
             self.result_matrix[iteration, k - 2] = self._mismatch(m1, m2, validation)