def testem(self):
    """Run EM on a two-component mixture over scalar, multinomial and HMM features.

    Scalar/multinomial data come from ``self.gen`` and sequence data from the
    two generating HMMs ``self.h1`` / ``self.h2``; a differently parameterised
    mixture is then trained on the combined data set.  The test makes no
    assertions: it passes when nothing raises.
    """

    def build_hmm(transitions, emissions, initial):
        # Same positional arguments and evaluation order as a direct getHMM call.
        return mixtureHMM.getHMM(
            mixtureHMM.ghmm.IntegerRange(0, 4),
            mixtureHMM.ghmm.DiscreteDistribution(
                mixtureHMM.ghmm.IntegerRange(0, 4)),
            transitions, emissions, initial)

    # complex DataSet with HMM sequences and scalar data
    scalar_part = self.gen.sampleSet(100)

    # sampling hmm data; the second sample is merged into the first
    sequences = self.h1.hmm.sample(40, 10)
    more_sequences = self.h2.hmm.sample(60, 10)
    sequences.merge(more_sequences)

    data = mixtureHMM.SequenceDataSet()
    data.fromGHMM(scalar_part, [sequences])
    data.internalInit(self.m)

    # starting parameters of the first HMM to be trained
    first_hmm = build_hmm(
        [[0.5, 0.2, 0.3], [0.2, 0.3, 0.5], [0.1, 0.5, 0.4]],
        [[0.2, 0.4, 0.1, 0.3], [0.5, 0.1, 0.2, 0.2], [0.4, 0.3, 0.15, 0.15]],
        [0.3, 0.3, 0.4])

    # starting parameters of the second HMM to be trained
    second_hmm = build_hmm(
        [[0.5, 0.4, 0.1], [0.3, 0.2, 0.5], [0.3, 0.2, 0.5]],
        [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.4, 0.4], [0.2, 0.1, 0.6, 0.1]],
        [0.3, 0.4, 0.3])

    first_normal = mixture.NormalDistribution(-1.5, 1.5)
    second_normal = mixture.NormalDistribution(9.0, 1.2)

    first_mult = mixture.MultinomialDistribution(
        3, 4, [0.1, 0.1, 0.55, 0.25], alphabet=self.DIAG)
    second_mult = mixture.MultinomialDistribution(
        3, 4, [0.4, 0.3, 0.1, 0.2], alphabet=self.DIAG)

    first_comp = mixture.ProductDistribution(
        [first_normal, first_mult, first_hmm])
    second_comp = mixture.ProductDistribution(
        [second_normal, second_mult, second_hmm])

    trained = mixture.MixtureModel(2, [0.7, 0.3], [first_comp, second_comp])
    trained.EM(data, 80, 0.1, silent=1)
def testLymphData():
    """Train a 5-component dependence-tree mixture on 'data/ltree2_2fold.txt'.

    Every component is a ProductDistribution wrapping a single
    DependenceTreeDistribution built from zero-filled parameter lists; the
    model is then initialised from the data via ``modelInitialization`` and
    trained with EM (100, 0.01, silent=1).  Nothing is asserted or returned.
    """
    k = 5   # number of mixture components
    d = 11  # size handed to DependenceTreeDistribution and its parameter lists

    models = []
    for _ in range(k):
        # Three fresh zero lists per component, so no two components (and no
        # two parameters) are handed the same list object.
        tree_dist = mixture.DependenceTreeDistribution(
            d, [0] * d, [0] * d, [0] * d)
        models.append(mixture.ProductDistribution([tree_dist]))

    # uniform mixture weights
    pi = numpy.array([1.0] * k) / k

    train = mixture.MixtureModel(k, pi, models)

    data = mixture.DataSet()
    data.fromFiles(['data/ltree2_2fold.txt'])

    train.modelInitialization(data)
    train.EM(data, 100, 0.01, silent=1)
def getBayesModel(G, p, mixPrior=None):
    """
    Constructs a PWM CSI BayesMixtureModel with randomly drawn parameters.

    @param G: number of components
    @param p: number of positions of the binding site
    @param mixPrior: prior handed to the BayesMixtureModel; when omitted (or
        any falsy value) a default MixtureModelPrior is built from a
        Dirichlet prior over the G weights and one DirichletPrior(4, ...)
        per position

    @return: BayesMixtureModel object
    """
    if not mixPrior:
        weight_prior = mixture.DirichletPrior(G, [1.0] * G)
        position_priors = [
            mixture.DirichletPrior(4, [1.02, 1.02, 1.02, 1.02])
            for _pos in range(p)
        ]
        # The two 0.05 values are arbitrary struct and comp parameters; they
        # should be reset by the user using the structPriorHeuristic method.
        mixPrior = mixture.MixtureModelPrior(
            0.05, 0.05, weight_prior, position_priors)

    nucleotides = mixture.Alphabet(['A', 'C', 'G', 'T'])

    # One product of p random discrete distributions per component; random
    # vectors are drawn component by component, position by position.
    components = [
        mixture.ProductDistribution([
            mixture.DiscreteDistribution(
                4, mixture.random_vector(4), nucleotides)
            for _pos in range(p)
        ])
        for _comp in range(G)
    ]

    weights = mixture.random_vector(G)
    return mixture.BayesMixtureModel(
        G, weights, components, mixPrior, struct=1)
def createDistribution(data, distribution):
    """Create one mixture component sized by ``data.p``.

    For ``distribution == 'normal'`` the component is a product of
    ``data.p`` NormalDistribution(0, 1) objects.  Any other value yields a
    product holding a single LinearGaussianDistribution built from
    ``data.p``, ``data.p`` coefficients drawn with
    ``random.normalvariate(0, 1)`` and ``sigma = [1]``.

    Returns the resulting ``mixture.ProductDistribution``.
    """
    dim = data.p

    if distribution == 'normal':
        normals = [mixture.NormalDistribution(0, 1) for _ in range(dim)]
        return mixture.ProductDistribution(normals)

    # every non-'normal' request falls through to the linear Gaussian case
    coefficients = [random.normalvariate(0, 1) for _ in range(dim)]
    linear_gaussian = mixtureLinearGaussian.LinearGaussianDistribution(
        dim, coefficients, [1])
    return mixture.ProductDistribution([linear_gaussian])
def setUp(self):
    """Build the generating models shared by the tests.

    Sets ``self.DIAG`` (alphabet), ``self.h1`` / ``self.h2`` (HMMs),
    ``self.m`` (mixture over Normal, Multinomial and HMM features) and
    ``self.gen`` (mixture over the Normal and Multinomial features only).
    """

    def build_hmm(transitions, emissions, initial):
        # Same positional arguments and evaluation order as a direct getHMM call.
        return mixtureHMM.getHMM(
            mixtureHMM.ghmm.IntegerRange(0, 4),
            mixtureHMM.ghmm.DiscreteDistribution(
                mixtureHMM.ghmm.IntegerRange(0, 4)),
            transitions, emissions, initial)

    self.DIAG = mixture.Alphabet(['.', '0', '8', '1'])

    # first generating HMM
    self.h1 = build_hmm(
        [[0.3, 0.6, 0.1], [0.0, 0.5, 0.5], [0.4, 0.2, 0.4]],
        [[0.5, 0.2, 0.1, 0.2], [0.5, 0.4, 0.05, 0.05], [0.8, 0.1, 0.05, 0.05]],
        [1.0, 0.0, 0.0])

    # second generating HMM
    # NOTE(review): the second emission row below sums to 1.1, not 1.0 --
    # confirm whether this is intended or relies on normalisation downstream.
    self.h2 = build_hmm(
        [[0.5, 0.4, 0.1], [0.3, 0.2, 0.5], [0.3, 0.2, 0.5]],
        [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.4, 0.5], [0.2, 0.2, 0.3, 0.3]],
        [0.6, 0.4, 0.0])

    normal_a = mixture.NormalDistribution(2.5, 0.5)
    normal_b = mixture.NormalDistribution(6.0, 0.8)

    mult_a = mixture.MultinomialDistribution(
        3, 4, [0.23, 0.26, 0.26, 0.25], alphabet=self.DIAG)
    mult_b = mixture.MultinomialDistribution(
        3, 4, [0.7, 0.1, 0.1, 0.1], alphabet=self.DIAG)

    full_a = mixture.ProductDistribution([normal_a, mult_a, self.h1])
    full_b = mixture.ProductDistribution([normal_b, mult_b, self.h2])

    # the same weight list object is handed to both mixtures
    weights = [0.4, 0.6]
    self.m = mixture.MixtureModel(2, weights, [full_a, full_b])

    # mixture for sampling: reuses the Normal/Multinomial objects, no HMMs
    sampling_a = mixture.ProductDistribution([normal_a, mult_a])
    sampling_b = mixture.ProductDistribution([normal_b, mult_b])
    self.gen = mixture.MixtureModel(2, weights, [sampling_a, sampling_b])
def testsimpleem(self):
    """Run EM on a mixture of two HMM-only components over sampled sequences.

    Sequences are sampled from ``self.h1`` and ``self.h2`` and merged; the
    data set holds no scalar features (``fromGHMM([], ...)``).  The test
    makes no assertions: it passes when nothing raises.
    """

    def hmm_component(transitions, emissions, initial):
        # A ProductDistribution wrapping one freshly built HMM.
        hmm = mixtureHMM.getHMM(
            mixtureHMM.ghmm.IntegerRange(0, 4),
            mixtureHMM.ghmm.DiscreteDistribution(
                mixtureHMM.ghmm.IntegerRange(0, 4)),
            transitions, emissions, initial)
        return mixture.ProductDistribution([hmm])

    # sampling hmm data; the second sample is merged into the first
    sequences = self.h1.hmm.sample(40, 10)
    more_sequences = self.h2.hmm.sample(60, 10)
    sequences.merge(more_sequences)

    data = mixtureHMM.SequenceDataSet()
    data.fromGHMM([], [sequences])

    first_comp = hmm_component(
        [[0.5, 0.2, 0.3], [0.2, 0.3, 0.5], [0.1, 0.5, 0.4]],
        [[0.2, 0.4, 0.1, 0.3], [0.5, 0.1, 0.2, 0.2], [0.4, 0.3, 0.15, 0.15]],
        [0.3, 0.3, 0.4])

    second_comp = hmm_component(
        [[0.5, 0.4, 0.1], [0.3, 0.2, 0.5], [0.3, 0.2, 0.5]],
        [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.4, 0.4], [0.2, 0.1, 0.6, 0.1]],
        [0.3, 0.4, 0.3])

    hmm_mixture = mixture.MixtureModel(
        2, [0.4, 0.6], [first_comp, second_comp])

    data.internalInit(hmm_mixture)
    hmm_mixture.EM(data, 80, 0.1, silent=1)
def testinternalinitcomplexempty(self):
    """Check internalInit on a data set that holds HMM sequences only.

    Initialising against ``self.m`` must raise AssertionError; initialising
    against a mixture whose components wrap only the HMMs must succeed and
    leave ``complexFeature == [1]``, ``p == 1`` and ``suff_p == 1``.
    """
    # sampling hmm data; the second sample is merged into the first
    sequences = self.h1.hmm.sample(40, 10)
    sequences.merge(self.h2.hmm.sample(60, 10))

    data = mixtureHMM.SequenceDataSet()
    data.fromGHMM([], [sequences])

    # self.m is expected to be rejected for this sequence-only data set
    self.assertRaises(AssertionError, data.internalInit, self.m)

    hmm_only = mixture.MixtureModel(
        2,
        [0.4, 0.6],
        [
            mixture.ProductDistribution([self.h1]),
            mixture.ProductDistribution([self.h2]),
        ])
    data.internalInit(hmm_only)

    self.assertEqual(str(data.complexFeature), '[1]')
    self.assertEqual(data.p, 1)
    self.assertEqual(data.suff_p, 1)
def getModel(G, p):
    """
    Constructs a PWM MixtureModel with randomly drawn parameters.

    @param G: number of components
    @param p: number of positions of the binding site

    @return: MixtureModel object
    """
    nucleotides = mixture.Alphabet(['A', 'C', 'G', 'T'])

    # One product of p random discrete distributions per component; random
    # vectors are drawn component by component, position by position.
    components = [
        mixture.ProductDistribution([
            mixture.DiscreteDistribution(
                4, mixture.random_vector(4), nucleotides)
            for _pos in range(p)
        ])
        for _comp in range(G)
    ]

    weights = mixture.random_vector(G)
    return mixture.MixtureModel(G, weights, components)
def testdtree():
    """Sample from a two-component ConditionalGauss mixture and re-train on it.

    A generating mixture is built from two ConditionalGaussDistribution
    components with different tree dictionaries, 1000 samples are drawn
    after ``random.seed(1)`` and printed, and a second ConditionalGauss
    mixture with different starting parameters is initialised from the data
    and trained with EM (100, 0.01, silent=1).  Nothing is asserted or
    returned.
    """
    # Tree dictionaries handed to ConditionalGaussDistribution
    # (presumably node -> parent index with -1 marking the root -- confirm).
    tree = {0: -1, 1: 0, 2: 1}
    tree2 = {0: -1, 1: 0, 2: 0}

    # generating model
    gen_comp1 = mixture.ProductDistribution([
        mixture.ConditionalGaussDistribution(
            3, [0, 1, 0], [0, -0.1, 0.1], [0.5, 0.5, 0.5], tree)
    ])
    gen_comp2 = mixture.ProductDistribution([
        mixture.ConditionalGaussDistribution(
            3, [-1, 0, 1], [0, 0.1, -0.1], [0.5, 0.5, 0.5], tree2)
    ])

    pi = [0.4, 0.6]
    gen = mixture.MixtureModel(2, pi, [gen_comp1, gen_comp2])

    random.seed(1)
    data = gen.sampleDataSet(1000)
    print(data)  # single-argument call form prints identically on Python 2

    # model to be trained, started from different parameters
    train_comp1 = mixture.ProductDistribution([
        mixture.ConditionalGaussDistribution(
            3, [0, 1, 0], [0.0, 0.1, 0.1], [0.1, 0.1, 0.1], tree)
    ])
    train_comp2 = mixture.ProductDistribution([
        mixture.ConditionalGaussDistribution(
            3, [-1, 0, 1], [0.0, 0.1, 0.1], [0.1, 0.1, 0.1], tree2)
    ])

    # the same weight list object is handed to both mixtures
    train = mixture.MixtureModel(2, pi, [train_comp1, train_comp2])
    train.modelInitialization(data)
    train.EM(data, 100, 0.01, silent=1)
def getBackgroundModel(p, dist=None):
    """
    Construct background model

    @param p: number of positions of the binding site
    @param dist: background nucleotide frequencies, uniform is default

    @return: MixtureModel representing the background
    """
    DNA = mixture.Alphabet(['A', 'C', 'G', 'T'])

    # Identity test on purpose: `dist == None` is evaluated element-wise for
    # array-like frequencies and cannot be used as a plain truth value.
    if dist is None:
        phi = [0.25] * 4
    else:
        phi = dist

    # NOTE(review): every position is handed the same phi object -- confirm
    # that DiscreteDistribution copies its parameters.
    dlist = [mixture.DiscreteDistribution(4, phi, DNA) for _pos in range(p)]

    # single-component mixture with weight 1.0
    comps = [mixture.ProductDistribution(dlist)]
    return mixture.MixtureModel(1, [1.0], comps)
d14 = mixture.DiscreteDistribution(4, [0.25, 0.25, 0.25, 0.25]) # initializing atomar distributions for second component n21 = mixture.NormalDistribution(4.0, 0.5) n22 = mixture.NormalDistribution(-6.0, 0.5) d23 = mixture.DiscreteDistribution(4, [0.7, 0.1, 0.1, 0.1]) d24 = mixture.DiscreteDistribution(4, [0.1, 0.1, 0.2, 0.6]) # initializing atomar distributions for second component n31 = mixture.NormalDistribution(2.0, 0.5) n32 = mixture.NormalDistribution(-3.0, 0.5) d33 = mixture.DiscreteDistribution(4, [0.1, 0.1, 0.1, 0.7]) d34 = mixture.DiscreteDistribution(4, [0.6, 0.1, 0.2, 0.1]) # creating component distributions c1 = mixture.ProductDistribution([n11, n12, d13, d14]) c2 = mixture.ProductDistribution([n21, n22, d23, d24]) c3 = mixture.ProductDistribution([n31, n32, d33, d34]) # setting up the mixture prior piPr = mixture.DirichletPrior( 3, [1.0, 1.0, 1.0]) # uniform prior of mixture coefficients # conjugate priors over the atomar distributions - Normal-Gamma for Normal distribution, Dirichlet for the discrete distribution compPrior = [ mixture.NormalGammaPrior(1.5, 0.01, 3.0, 1.0), mixture.NormalGammaPrior(-2.0, 0.01, 3.0, 1.0), mixture.DirichletPrior(4, [1.01, 1.01, 1.01, 1.01]), mixture.DirichletPrior(4, [1.01, 1.01, 1.01, 1.01]) ]
# second HMM (A2, B2, pi2 and the first HMM h1 are defined before this excerpt)
h2 = mixtureHMM.getHMM(
    mixtureHMM.ghmm.IntegerRange(0, 4),
    mixtureHMM.ghmm.DiscreteDistribution(mixtureHMM.ghmm.IntegerRange(0, 4)),
    A2, B2, pi2)

n1 = mixture.NormalDistribution(2.5, 0.5)
n2 = mixture.NormalDistribution(6.0, 0.8)

mult1 = mixture.MultinomialDistribution(3, 4, [0.23, 0.26, 0.26, 0.25],
                                        alphabet=DIAG)
mult2 = mixture.MultinomialDistribution(3, 4, [0.7, 0.1, 0.1, 0.1],
                                        alphabet=DIAG)

# full model: each component combines a Normal, a Multinomial and an HMM
c1 = mixture.ProductDistribution([n1, mult1, h1])
c2 = mixture.ProductDistribution([n2, mult2, h2])

mpi = [0.4, 0.6]
m = mixture.MixtureModel(2, mpi, [c1, c2])

#print m
#print "-->",m.components[0].suff_dataRange

# ----------- constructing complex DataSet ----------------

# mixture for sampling: reuses the n1/mult1/n2/mult2 objects and the mpi
# weights of m, but leaves out the HMM features
gc1 = mixture.ProductDistribution([n1, mult1])
gc2 = mixture.ProductDistribution([n2, mult2])
gen = mixture.MixtureModel(2, mpi, [gc1, gc2])
# First we generate a data set from a three component mixture with a CSI like structure # in the distribution parameters. Then a five component CSI mixture is trained. # The training should recover the true number of components (three), # the CSI structure of the generating model as well as the distribution parameters. # Setting up the generating model. This is a benign case in the # sense that the components are reasonably well separated and we # allow ourselves plenty of training data. # Component distributions n11 = mixture.NormalDistribution(1.0, 0.5) n12 = mixture.NormalDistribution(2.0, 1.5) n13 = mixture.NormalDistribution(3.0, 0.7) d14 = mixture.DiscreteDistribution(4, [0.4, 0.3, 0.1, 0.2]) c1 = mixture.ProductDistribution([n11, n12, n13, d14]) n21 = mixture.NormalDistribution(1.0, 0.5) n22 = mixture.NormalDistribution(-6.0, 0.5) n23 = mixture.NormalDistribution(3.0, 0.7) d24 = mixture.DiscreteDistribution(4, [0.1, 0.1, 0.4, 0.4]) c2 = mixture.ProductDistribution([n21, n22, n23, d24]) n31 = mixture.NormalDistribution(2.0, 0.5) n32 = mixture.NormalDistribution(-3.0, 0.5) n33 = mixture.NormalDistribution(3.0, 0.7) d34 = mixture.DiscreteDistribution(4, [0.4, 0.3, 0.1, 0.2]) c3 = mixture.ProductDistribution([n31, n32, n33, d34])
[dist_spelling, missing_spelling], compFix=[0, 2]) # diagnoses for cormobidit disorders #"ODD" "CONDUCT" "SOC PHO" "SEP ANX" "SPEC PHO" "ENUR NOC" "ENUR DIU" "ENCOPRES" "TOURET" "TIC CRON" "TIC TRAN" comor = [] for j in range(COMOR): p_comor = [0.0] + mixture.random_vector(3) comor_missing = mixture.MultinomialDistribution( 1, 4, [1.0, 0.0, 0.0, 0.0], DIAG) comor_mult = mixture.MultinomialDistribution(1, 4, p_comor, DIAG) comor_mix = mixture.MixtureModel(2, [0.999, 0.001], [comor_mult, comor_missing], compFix=[0, 2]) comor.append(comor_mix) pd_comor = mixture.ProductDistribution(comor) # the drd4 VNTR are represented as a discrete distribution over the observed lengths, # the specific repeat sequence tpyes are not considered at this time p_drd4_vntr_len = [0.0] + mixture.random_vector(10) dist_drd4_vntr_len = mixture.MultinomialDistribution( 1, 11, p_drd4_vntr_len, VNTR) vntr_missing = mixture.MultinomialDistribution(1, 11, [1.0] + [0.0] * 10, VNTR) mix_drd4_vntr_len = mixture.MixtureModel( 2, [0.999, 0.001], [dist_drd4_vntr_len, vntr_missing], compFix=[0, 2]) components.append( mixture.ProductDistribution([ mix_bd, mix_voc, mix_read, mix_math, mix_spelling, pd_comor,
# Write/read round trip: build a BayesMixtureModel whose components have two
# discrete and two Normal features, write it to 'test.bmix', read it back and
# print the re-read model and its prior.
# NOTE(review): statements are reproduced flat; this first append presumably
# belongs to a loop whose header lies before this excerpt -- confirm.
# NOTE(review): confirm that mixture.NormalGammaDistribution and
# mixture.MixturePrior exist in the targeted mixture version.
compPrior.append( mixture.NormalGammaDistribution( 1.0,2.0,3.0,4.0 ) )

mixPrior = mixture.MixturePrior(0.7,0.7,piPrior, compPrior)

DNA = mixture.Alphabet(['A','C','G','T'])
comps = []
for i in range(G):
    dlist = []
    # two discrete features with random parameters over the DNA alphabet
    for j in range(2):
        phi = mixture.random_vector(4)
        dlist.append( mixture.DiscreteDistribution(4,phi,DNA))
    # two Normal features: (1.0, 0.5) and (2.0, 1.5)
    for j in range(2):
        mu = j+1.0
        sigma = j+0.5
        dlist.append( mixture.NormalDistribution(mu,sigma))
    comps.append(mixture.ProductDistribution(dlist))
pi = mixture.random_vector(G)

m = mixture.BayesMixtureModel(G,pi, comps, mixPrior, struct = 1)

# serialise, then deserialise from the same path
mixture.writeMixture(m, 'test.bmix')
m2 = mixture.readMixture('test.bmix')

print m2
print m2.prior
def _draw_separated_candidate(draw, accepted, KL_lower, KL_upper,
                              max_tries=100000):
    """Rejection-sample one distribution that is separated from ``accepted``.

    ``draw`` is called without arguments until it yields a candidate for
    which ``mixture.sym_kl_dist(other, cand)`` is neither above ``KL_upper``
    nor below ``KL_lower`` for every ``other`` in ``accepted``.

    Raises RuntimeError once ``max_tries`` candidates have been rejected.
    """
    tries = 0
    while True:
        cand = draw()
        for other in accepted:
            KL_dist = mixture.sym_kl_dist(other, cand)
            if KL_dist > KL_upper or KL_dist < KL_lower:
                break
        else:
            return cand

        tries += 1
        if tries >= max_tries:
            raise RuntimeError('Failed to find separated parameters !')


def getRandomCSIMixture_conditionalDists(G, p, KL_lower, KL_upper, M=8,
                                         dtypes='discgauss', seed=None,
                                         fullstruct=False,
                                         disc_sampling_dist=None):
    """Build a random BayesMixtureModel with a random CSI structure.

    For every feature a partition of the G components into groups is chosen;
    all components of one group share one distribution object.  The group
    distributions are rejection-sampled so that the symmetric KL distance
    between any two of them lies within [KL_lower, KL_upper].

    @param G: number of components
    @param p: number of features
    @param KL_lower: smallest accepted symmetric KL distance
    @param KL_upper: largest accepted symmetric KL distance
    @param M: size used for the default DirichletPrior sampler and for the
        DirichletPrior of every discrete feature
    @param dtypes: 'disc', 'gauss' or 'discgauss' (type chosen at random per
        feature)
    @param seed: accepted for compatibility; currently ignored
    @param fullstruct: if true, every component forms its own group
    @param disc_sampling_dist: object whose ``sample()`` yields the
        candidates for discrete features; defaults to a uniform
        DirichletPrior(M, [1.0] * M)

    @return: BayesMixtureModel with ``leaders`` and ``groups`` set

    Raises RuntimeError if no separated candidate is found within 100000
    rejections for some group, TypeError for an unknown ``dtypes`` (when
    assertions are disabled; AssertionError otherwise).
    """
    if disc_sampling_dist is None:
        discSamp = mixture.DirichletPrior(M, [1.0] * M)  # uniform sampling
    else:
        discSamp = disc_sampling_dist

    min_sigma = 0.3  # minimal std for Normal
    max_sigma = 5.0  # maximal std for Normal
    min_mu = -25.0   # minimal mean
    max_mu = 25.0    # maximal mean

    assert dtypes in ['disc', 'gauss', 'discgauss']

    # 0 discrete, 1 Normal
    if dtypes == 'disc':
        featureTypes = [0] * p
    elif dtypes == 'gauss':
        featureTypes = [1] * p
    elif dtypes == 'discgauss':
        # discrete or Normal features for now, chosen uniformly
        featureTypes = [random.choice((0, 1)) for i in range(p)]
    else:
        raise TypeError

    def draw_normal():
        # mean is drawn before the standard deviation
        mu = random.uniform(min_mu, max_mu)
        sigma = random.uniform(min_sigma, max_sigma)
        return mixture.NormalDistribution(mu, sigma)

    # generate random CSI structures
    if G < 15:
        P = setPartitions.generate_all_partitions(G)  # XXX too slow for large G

    C = []
    leaders = []
    groups = []
    for j in range(p):
        c_j = {}
        leaders_j = []
        groups_j = {}

        if fullstruct:
            struct_j = [(i, ) for i in range(G)]
        elif G < 15:
            struct_j = random.choice(P)
        else:
            print('WARNING: improper structure sampling !')
            struct_j = setPartitions.get_random_partition(G)

        if featureTypes[j] == 0:
            draw = discSamp.sample
        elif featureTypes[j] == 1:
            draw = draw_normal
        else:
            raise RuntimeError(
                'Unknown feature type %r for feature %d' % (featureTypes[j], j))

        for grp in struct_j:
            # the first member of a group is its leader, the rest follow it
            lg = list(grp)
            lgj = lg.pop(0)
            leaders_j.append(lgj)
            groups_j[lgj] = lg

            # separation is checked against everything placed so far for
            # this feature; the try budget starts afresh for every group
            cand = _draw_separated_candidate(
                draw, list(c_j.values()), KL_lower, KL_upper)

            for cind in grp:
                c_j[cind] = cand

        leaders.append(leaders_j)
        groups.append(groups_j)
        C.append(c_j)

    comps = [
        mixture.ProductDistribution([C[j][i] for j in range(p)])
        for i in range(G)
    ]

    pi = get_random_pi(G, 0.3 / G)

    # create prior
    piprior = mixture.DirichletPrior(G, [2.0] * G)
    cprior = []
    for j in range(p):
        if featureTypes[j] == 0:
            cprior.append(mixture.DirichletPrior(M, [1.02] * M))
        elif featureTypes[j] == 1:
            cprior.append(mixture.NormalGammaPrior(
                0, 0, 0, 0))  # dummy parameters, to be set later
        else:
            raise RuntimeError(
                'Unknown feature type %r for feature %d' % (featureTypes[j], j))

    mprior = mixture.MixtureModelPrior(0.1, 0.1, piprior, cprior)

    m = mixture.BayesMixtureModel(G, pi, comps, mprior, struct=1)
    m.leaders = leaders
    m.groups = groups

    m.identifiable()
    m.updateFreeParams()
    return m
def getRandomMixture(G, p, KL_lower, KL_upper, dtypes='discgauss', M=4,
                     seed=None):
    """Build a random MixtureModel whose components are pairwise separated.

    For every feature, G distributions are rejection-sampled one after the
    other; a candidate is kept only if its symmetric KL distance to every
    distribution already kept for that feature is neither above
    ``KL_upper`` nor below ``KL_lower``.

    @param G: number of components
    @param p: number of features
    @param KL_lower: smallest accepted symmetric KL distance
    @param KL_upper: largest accepted symmetric KL distance
    @param dtypes: 'disc', 'gauss' or 'discgauss' (type chosen at random per
        feature)
    @param M: alphabet size for discrete distributions
    @param seed: accepted for compatibility; currently ignored

    @return: MixtureModel (struct=1) with updated free parameters

    Raises TypeError for an unknown ``dtypes``.  There is no limit on the
    number of rejections, so unsatisfiable KL bounds make this loop forever.
    """
    min_sigma = 0.1  # minimal std for Normal
    max_sigma = 1.0  # maximal std for Normal
    min_mu = -5.0    # minimal mean
    max_mu = 8.0     # maximal mean

    # 0 discrete, 1 Normal
    if dtypes == 'disc':
        featureTypes = [0] * p
    elif dtypes == 'gauss':
        featureTypes = [1] * p
    elif dtypes == 'discgauss':
        # discrete or Normal features for now, chosen uniformly
        featureTypes = [random.choice((0, 1)) for i in range(p)]
    else:
        raise TypeError

    def draw_discrete():
        return mixture.DiscreteDistribution(M, mixture.random_vector(M))

    def draw_normal():
        # mean is drawn before the standard deviation
        mu = random.uniform(min_mu, max_mu)
        sigma = random.uniform(min_sigma, max_sigma)
        return mixture.NormalDistribution(mu, sigma)

    def is_separated(cand, others):
        # stop at the first violation; later distances cannot change the verdict
        for d in others:
            KL_dist = mixture.sym_kl_dist(d, cand)
            if KL_dist > KL_upper or KL_dist < KL_lower:
                return False
        return True

    C = []
    for j in range(p):
        if featureTypes[j] == 0:
            draw = draw_discrete
        elif featureTypes[j] == 1:
            draw = draw_normal
        else:
            raise RuntimeError(
                'Unknown feature type %r for feature %d' % (featureTypes[j], j))

        c_j = []
        for i in range(G):
            cand = draw()
            while not is_separated(cand, c_j):
                cand = draw()
            c_j.append(cand)
        C.append(c_j)

    comps = [
        mixture.ProductDistribution([C[j][i] for j in range(p)])
        for i in range(G)
    ]

    pi = get_random_pi(G, 0.1)

    m = mixture.MixtureModel(G, pi, comps, struct=1)
    m.updateFreeParams()
    return m
def clustering(k, feature_cols, feature_domains, header, table, seeds,
               result_file):
    """Build a seeded k-component mixture for ``table`` and label the rows.

    One component is created per cluster; each component is a product with
    one distribution per table column: a random DiscreteDistribution for
    'cat' columns and a NormalDistribution for 'num' columns whose mean is
    the seed of that column for that component and whose standard deviation
    is ``(max - min) / 2.0 / k``.  No EM training is run here (the training
    call is disabled), so rows are classified with the seeded model as built.

    @param k: number of mixture components
    @param feature_cols: column ids, indexed by table column position
    @param feature_domains: per column id, a mapping whose keys are the
        category values ('cat') or which holds 'min' and 'max' ('num')
    @param header: passed to ``prep.get_col_type``; ``header[col_id][0]`` is
        the key used to look up ``seeds``
    @param table: 2-D array handed to ``DataSet.fromArray``
    @param seeds: per key, a sequence with one mean per component
    @param result_file: path prefix; '<prefix>.stats' receives the cluster
        sizes and '<prefix>.model' the mixture

    @return: the mixture model used for the classification

    Exits the process with status 1 on a column type other than 'cat'/'num'.
    """
    data = mx.DataSet()
    data.fromArray(table)

    weights_norm = [1.0 / k] * k

    components = []
    for i in range(k):
        products = []
        for j in range(table.shape[1]):
            col_type = prep.get_col_type(feature_cols[j], header)
            col_id = feature_cols[j]

            if col_type == 'cat':
                # materialise the keys so Alphabet gets a real list on
                # Python 3 as well
                vals = list(feature_domains[col_id].keys())
                cnt_vals = len(vals)
                rand_dist = np.random.random_sample(cnt_vals)
                dist = mx.DiscreteDistribution(cnt_vals,
                                               rand_dist / sum(rand_dist),
                                               mx.Alphabet(vals))
            elif col_type == 'num':
                min_val = feature_domains[col_id]['min']
                max_val = feature_domains[col_id]['max']
                mean = seeds[header[col_id][0]][i]
                stdev = (max_val - min_val) / 2.0 / k
                dist = mx.NormalDistribution(mean, stdev)
            else:
                # a string argument is written to stderr and still yields
                # exit status 1
                sys.exit('clustering: unsupported column type %r for column %r'
                         % (col_type, col_id))

            products.append(dist)

        components.append(mx.ProductDistribution(products))

    best_model = mx.MixtureModel(k, weights_norm, components)
    print(best_model)  # single-argument call form prints identically on Python 2

    labels = best_model.classify(data, None, None, 1)

    # count cluster sizes on the classified data
    cnt = {}
    for l in labels:
        cnt[l] = cnt.get(l, 0) + 1
    total = sum(cnt.values())

    # one line per cluster: label, size, share in percent
    with open(result_file + '.stats', 'w') as f:
        for l in cnt:
            f.write('%s %d %f%%\n' % (l, cnt[l], cnt[l] * 100.0 / total))

    mx.writeMixture(best_model, result_file + '.model')
    return best_model