def testem(self):
        # complex DataSet with HMM sequences and scalar data
        dat = self.gen.sampleSet(100)

        # sampling hmm data
        seq1 = self.h1.hmm.sample(40, 10)
        seq2 = self.h2.hmm.sample(60, 10)

        seq1.merge(seq2)

        data = mixtureHMM.SequenceDataSet()
        data.fromGHMM(dat, [seq1])
        data.internalInit(self.m)

        tA = [[0.5, 0.2, 0.3], [0.2, 0.3, 0.5], [0.1, 0.5, 0.4]]
        tB = [[0.2, 0.4, 0.1, 0.3], [0.5, 0.1, 0.2, 0.2],
              [0.4, 0.3, 0.15, 0.15]]
        tpi = [0.3, 0.3, 0.4]
        th1 = mixtureHMM.getHMM(
            mixtureHMM.ghmm.IntegerRange(0, 4),
            mixtureHMM.ghmm.DiscreteDistribution(
                mixtureHMM.ghmm.IntegerRange(0, 4)), tA, tB, tpi)

        tA2 = [[0.5, 0.4, 0.1], [0.3, 0.2, 0.5], [0.3, 0.2, 0.5]]
        tB2 = [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.4, 0.4],
               [0.2, 0.1, 0.6, 0.1]]
        tpi2 = [0.3, 0.4, 0.3]
        th2 = mixtureHMM.getHMM(
            mixtureHMM.ghmm.IntegerRange(0, 4),
            mixtureHMM.ghmm.DiscreteDistribution(
                mixtureHMM.ghmm.IntegerRange(0, 4)), tA2, tB2, tpi2)

        tn1 = mixture.NormalDistribution(-1.5, 1.5)
        tn2 = mixture.NormalDistribution(9.0, 1.2)

        tmult1 = mixture.MultinomialDistribution(
            3, 4, [0.1, 0.1, 0.55, 0.25], alphabet=self.DIAG)
        tmult2 = mixture.MultinomialDistribution(
            3, 4, [0.4, 0.3, 0.1, 0.2], alphabet=self.DIAG)

        tc1 = mixture.ProductDistribution([tn1, tmult1, th1])
        tc2 = mixture.ProductDistribution([tn2, tmult2, th2])

        tmpi = [0.7, 0.3]
        tm = mixture.MixtureModel(2, tmpi, [tc1, tc2])

        tm.EM(data, 80, 0.1, silent=1)
Example #2
def testLymphData():

    k = 5
    d = 11

    models = []

    for i in range(k):
        aux1 = [0] * d
        aux2 = [0] * d
        aux3 = [0] * d
        models.append(
            mixture.ProductDistribution(
                [mixture.DependenceTreeDistribution(d, aux1, aux2, aux3)]))

    pi = [1.0] * k
    pi = numpy.array(pi) / k

    train = mixture.MixtureModel(k, pi, models)

    data = mixture.DataSet()
    data.fromFiles(['data/ltree2_2fold.txt'])

    train.modelInitialization(data)

    train.EM(data, 100, 0.01, silent=1)
Example #3
def getBayesModel(G, p, mixPrior=None):
    """
    Constructs a PWM CSI BayesMixtureModel.
    
    @param G: number of components
    @param p: number of positions of the binding site
    @return: BayesMixtureModel object
    """

    if not mixPrior:
        piPrior = mixture.DirichletPrior(G, [1.0] * G)
        compPrior = []
        for i in range(p):
            compPrior.append(
                mixture.DirichletPrior(4, [1.02, 1.02, 1.02, 1.02]))

        # arbitrary values of struct and comp parameters. Values should be
        # reset by user using the structPriorHeuristic method.
        mixPrior = mixture.MixtureModelPrior(0.05, 0.05, piPrior, compPrior)

    DNA = mixture.Alphabet(['A', 'C', 'G', 'T'])
    comps = []
    for i in range(G):
        dlist = []
        for j in range(p):
            phi = mixture.random_vector(4)
            dlist.append(mixture.DiscreteDistribution(4, phi, DNA))
        comps.append(mixture.ProductDistribution(dlist))
    pi = mixture.random_vector(G)
    m = mixture.BayesMixtureModel(G, pi, comps, mixPrior, struct=1)
    return m
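A minimal usage sketch (my addition, not part of the original example): the parameter values are assumptions, and sampling is assumed to work on the Bayes model the same way sampleDataSet is used on plain MixtureModel objects elsewhere on this page. As the comment in getBayesModel notes, the structure and component prior weights should then be reset via structPriorHeuristic.

# usage sketch -- assumed values, not from the original example
m = getBayesModel(3, 8)      # 3 components, binding site of width 8
data = m.sampleDataSet(500)  # sample training data from the model itself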
Example #4
def createDistribution(data, distribution):
    # build a single mixture component matching the dimensionality of the data
    p = data.p
    dist = None
    if distribution == 'normal':
        # product of independent standard Normal distributions, one per feature
        dists = []
        for i in range(data.p):
            dists.append(mixture.NormalDistribution(0, 1))
        dist = mixture.ProductDistribution(dists)
    else:
        # single linear Gaussian distribution with random regression weights
        sigma = [1]
        beta = []
        for i in range(data.p):
            beta.append(random.normalvariate(0, 1))
        dist = mixture.ProductDistribution(
            [mixtureLinearGaussian.LinearGaussianDistribution(p, beta, sigma)])

    return dist
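Since createDistribution returns a single component, a full mixture is assembled by calling it once per component. A hedged sketch, assuming `data` is a mixture.DataSet as in the other examples here:

# sketch: assemble a k-component mixture from createDistribution (assumed usage)
k = 3
comps = [createDistribution(data, 'normal') for i in range(k)]
m = mixture.MixtureModel(k, [1.0 / k] * k, comps)
m.EM(data, 50, 0.1, silent=1)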
Example #5
    def setUp(self):
        # building generating models
        self.DIAG = mixture.Alphabet(['.', '0', '8', '1'])

        A = [[0.3, 0.6, 0.1], [0.0, 0.5, 0.5], [0.4, 0.2, 0.4]]
        B = [[0.5, 0.2, 0.1, 0.2], [0.5, 0.4, 0.05, 0.05],
             [0.8, 0.1, 0.05, 0.05]]
        pi = [1.0, 0.0, 0.0]
        self.h1 = mixtureHMM.getHMM(
            mixtureHMM.ghmm.IntegerRange(0, 4),
            mixtureHMM.ghmm.DiscreteDistribution(
                mixtureHMM.ghmm.IntegerRange(0, 4)), A, B, pi)

        A2 = [[0.5, 0.4, 0.1], [0.3, 0.2, 0.5], [0.3, 0.2, 0.5]]
        B2 = [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.4, 0.4], [0.2, 0.2, 0.3, 0.3]]
        pi2 = [0.6, 0.4, 0.0]
        self.h2 = mixtureHMM.getHMM(
            mixtureHMM.ghmm.IntegerRange(0, 4),
            mixtureHMM.ghmm.DiscreteDistribution(
                mixtureHMM.ghmm.IntegerRange(0, 4)), A2, B2, pi2)

        n1 = mixture.NormalDistribution(2.5, 0.5)
        n2 = mixture.NormalDistribution(6.0, 0.8)

        mult1 = mixture.MultinomialDistribution(
            3, 4, [0.23, 0.26, 0.26, 0.25], alphabet=self.DIAG)
        mult2 = mixture.MultinomialDistribution(
            3, 4, [0.7, 0.1, 0.1, 0.1], alphabet=self.DIAG)

        c1 = mixture.ProductDistribution([n1, mult1, self.h1])
        c2 = mixture.ProductDistribution([n2, mult2, self.h2])

        mpi = [0.4, 0.6]
        self.m = mixture.MixtureModel(2, mpi, [c1, c2])

        # mixture for sampling
        gc1 = mixture.ProductDistribution([n1, mult1])
        gc2 = mixture.ProductDistribution([n2, mult2])
        self.gen = mixture.MixtureModel(2, mpi, [gc1, gc2])
Example #6
    def testsimpleem(self):

        # sampling hmm data
        seq1 = self.h1.hmm.sample(40, 10)
        seq2 = self.h2.hmm.sample(60, 10)
        seq1.merge(seq2)

        data = mixtureHMM.SequenceDataSet()
        data.fromGHMM([], [seq1])

        tA = [[0.5, 0.2, 0.3], [0.2, 0.3, 0.5], [0.1, 0.5, 0.4]]
        tB = [[0.2, 0.4, 0.1, 0.3], [0.5, 0.1, 0.2, 0.2],
              [0.4, 0.3, 0.15, 0.15]]
        tpi = [0.3, 0.3, 0.4]
        th1 = mixture.ProductDistribution([
            mixtureHMM.getHMM(
                mixtureHMM.ghmm.IntegerRange(0, 4),
                mixtureHMM.ghmm.DiscreteDistribution(
                    mixtureHMM.ghmm.IntegerRange(0, 4)), tA, tB, tpi)
        ])

        tA2 = [[0.5, 0.4, 0.1], [0.3, 0.2, 0.5], [0.3, 0.2, 0.5]]
        tB2 = [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.4, 0.4],
               [0.2, 0.1, 0.6, 0.1]]
        tpi2 = [0.3, 0.4, 0.3]
        th2 = mixture.ProductDistribution([
            mixtureHMM.getHMM(
                mixtureHMM.ghmm.IntegerRange(0, 4),
                mixtureHMM.ghmm.DiscreteDistribution(
                    mixtureHMM.ghmm.IntegerRange(0, 4)), tA2, tB2, tpi2)
        ])

        mpi = [0.4, 0.6]
        hm = mixture.MixtureModel(2, mpi, [th1, th2])

        data.internalInit(hm)

        hm.EM(data, 80, 0.1, silent=1)
Example #7
    def testinternalinitcomplexempty(self):
        # complex DataSet with HMM sequences only

        # sampling hmm data
        seq1 = self.h1.hmm.sample(40, 10)
        seq2 = self.h2.hmm.sample(60, 10)
        seq1.merge(seq2)

        data = mixtureHMM.SequenceDataSet()
        data.fromGHMM([], [seq1])

        self.assertRaises(AssertionError, data.internalInit, self.m)

        c1 = mixture.ProductDistribution([self.h1])
        c2 = mixture.ProductDistribution([self.h2])

        mpi = [0.4, 0.6]
        hm = mixture.MixtureModel(2, mpi, [c1, c2])

        data.internalInit(hm)

        self.assertEqual(str(data.complexFeature), '[1]')
        self.assertEqual(data.p, 1)
        self.assertEqual(data.suff_p, 1)
Example #8
def getModel(G, p):
    """
    Constructs a PWM MixtureModel.
    
    @param G: number of components
    @param p: number of positions of the binding site
    @return: MixtureModel object
    """
    DNA = mixture.Alphabet(['A', 'C', 'G', 'T'])
    comps = []
    for i in range(G):
        dlist = []
        for j in range(p):
            phi = mixture.random_vector(4)
            dlist.append(mixture.DiscreteDistribution(4, phi, DNA))
        comps.append(mixture.ProductDistribution(dlist))
    pi = mixture.random_vector(G)
    m = mixture.MixtureModel(G, pi, comps)
    return m
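Usage sketch (my addition; the values are assumptions, and the generate/train split mirrors testLymphData above):

# usage sketch -- assumed values, not from the original example
gen = getModel(3, 8)                 # generating PWM mixture
data = gen.sampleDataSet(1000)       # sample training data
train = getModel(3, 8)               # fresh random model of the same shape
train.modelInitialization(data)
train.EM(data, 100, 0.01, silent=1)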
Example #9
def testdtree():

    tree = {}
    tree[0] = -1
    tree[1] = 0
    tree[2] = 1

    n1 = mixture.ProductDistribution([
        mixture.ConditionalGaussDistribution(3, [0, 1, 0], [0, -0.1, 0.1],
                                             [0.5, 0.5, 0.5], tree)
    ])
    tree2 = {}
    tree2[0] = -1
    tree2[1] = 0
    tree2[2] = 0
    n2 = mixture.ProductDistribution([
        mixture.ConditionalGaussDistribution(3, [-1, 0, 1], [0, 0.1, -0.1],
                                             [0.5, 0.5, 0.5], tree2)
    ])

    pi = [0.4, 0.6]
    gen = mixture.MixtureModel(2, pi, [n1, n2])

    random.seed(1)
    data = gen.sampleDataSet(1000)

    print data

    # note: these DependenceTree components are immediately overwritten below
    n1 = mixture.ProductDistribution([
        mixture.DependenceTreeDistribution(3, [0.1, 1.1, 0.1], [0, 0, 0],
                                           [1.0, 1.0, 1.0])
    ])
    n2 = mixture.ProductDistribution([
        mixture.DependenceTreeDistribution(3, [-1, 0, -0.1], [0, 0, 0],
                                           [1.0, 1.0, 1.0])
    ])

    n1 = mixture.ProductDistribution([
        mixture.ConditionalGaussDistribution(3, [0, 1, 0], [0.0, 0.1, 0.1],
                                             [0.1, 0.1, 0.1], tree)
    ])
    n2 = mixture.ProductDistribution([
        mixture.ConditionalGaussDistribution(3, [-1, 0, 1], [0.0, 0.1, 0.1],
                                             [0.1, 0.1, 0.1], tree2)
    ])

    train = mixture.MixtureModel(2, pi, [n1, n2])
    train.modelInitialization(data)
    train.EM(data, 100, 0.01, silent=1)
Example #10
def getBackgroundModel(p, dist=None):
    """
    Construct background model
    
    @param p: number of positions of the binding site
    @param dist: background nucleotide frequencies, uniform is default
    
    @return: MixtureModel representing the background
    """
    DNA = mixture.Alphabet(['A', 'C', 'G', 'T'])
    dlist = []

    if dist is None:
        phi = [0.25] * 4
    else:
        phi = dist

    for j in range(p):
        dlist.append(mixture.DiscreteDistribution(4, phi, DNA))
    comps = [mixture.ProductDistribution(dlist)]

    m = mixture.MixtureModel(1, [1.0], comps)
    return m
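A short sketch of the two ways to call getBackgroundModel; the skewed frequencies are illustrative only and follow the A, C, G, T order of the Alphabet:

# usage sketch -- illustrative frequencies, not from the original example
bg_uniform = getBackgroundModel(8)                            # uniform background
bg_skewed = getBackgroundModel(8, dist=[0.2, 0.3, 0.3, 0.2])  # A, C, G, T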
Example #11
d14 = mixture.DiscreteDistribution(4, [0.25, 0.25, 0.25, 0.25])

# initializing atomic distributions for the second component
n21 = mixture.NormalDistribution(4.0, 0.5)
n22 = mixture.NormalDistribution(-6.0, 0.5)
d23 = mixture.DiscreteDistribution(4, [0.7, 0.1, 0.1, 0.1])
d24 = mixture.DiscreteDistribution(4, [0.1, 0.1, 0.2, 0.6])

# initializing atomic distributions for the third component
n31 = mixture.NormalDistribution(2.0, 0.5)
n32 = mixture.NormalDistribution(-3.0, 0.5)
d33 = mixture.DiscreteDistribution(4, [0.1, 0.1, 0.1, 0.7])
d34 = mixture.DiscreteDistribution(4, [0.6, 0.1, 0.2, 0.1])

# creating component distributions
c1 = mixture.ProductDistribution([n11, n12, d13, d14])
c2 = mixture.ProductDistribution([n21, n22, d23, d24])
c3 = mixture.ProductDistribution([n31, n32, d33, d34])

# setting up the mixture prior
piPr = mixture.DirichletPrior(
    3, [1.0, 1.0, 1.0])  # uniform prior of mixture coefficients

# conjugate priors over the atomic distributions: Normal-Gamma for the Normal
# distributions, Dirichlet for the discrete distributions
compPrior = [
    mixture.NormalGammaPrior(1.5, 0.01, 3.0, 1.0),
    mixture.NormalGammaPrior(-2.0, 0.01, 3.0, 1.0),
    mixture.DirichletPrior(4, [1.01, 1.01, 1.01, 1.01]),
    mixture.DirichletPrior(4, [1.01, 1.01, 1.01, 1.01])
]
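The snippet stops after the priors. A plausible continuation, modeled on the MixtureModelPrior / BayesMixtureModel construction used in the other examples on this page (the 0.05 structure and component prior weights are assumptions):

# assumed continuation -- the 0.05/0.05 weights are chosen for illustration only
mixPrior = mixture.MixtureModelPrior(0.05, 0.05, piPr, compPrior)
pi = mixture.random_vector(3)
m = mixture.BayesMixtureModel(3, pi, [c1, c2, c3], mixPrior, struct=1)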
Example #12
h2 = mixtureHMM.getHMM(
    mixtureHMM.ghmm.IntegerRange(0, 4),
    mixtureHMM.ghmm.DiscreteDistribution(mixtureHMM.ghmm.IntegerRange(0, 4)),
    A2, B2, pi2)

n1 = mixture.NormalDistribution(2.5, 0.5)
n2 = mixture.NormalDistribution(6.0, 0.8)

mult1 = mixture.MultinomialDistribution(3, 4, [0.23, 0.26, 0.26, 0.25],
                                        alphabet=DIAG)
mult2 = mixture.MultinomialDistribution(3, 4, [0.7, 0.1, 0.1, 0.1],
                                        alphabet=DIAG)

c1 = mixture.ProductDistribution([n1, mult1, h1])
c2 = mixture.ProductDistribution([n2, mult2, h2])

mpi = [0.4, 0.6]
m = mixture.MixtureModel(2, mpi, [c1, c2])

#print m
#print "-->",m.components[0].suff_dataRange

# ----------- constructing complex DataSet ----------------

# mixture for sampling
gc1 = mixture.ProductDistribution([n1, mult1])
gc2 = mixture.ProductDistribution([n2, mult2])
gen = mixture.MixtureModel(2, mpi, [gc1, gc2])
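From here the complex DataSet is assembled exactly as in the testem example at the top of this page; a sketch reusing only calls shown there (h1 and h2 are the HMMs from the truncated opening of this snippet):

# sketch of the complex-DataSet construction (same calls as in testem above)
dat = gen.sampleSet(100)        # scalar/discrete columns from gen
seq1 = h1.hmm.sample(40, 10)    # HMM sequences from the two components
seq2 = h2.hmm.sample(60, 10)
seq1.merge(seq2)

data = mixtureHMM.SequenceDataSet()
data.fromGHMM(dat, [seq1])
data.internalInit(m)
m.EM(data, 80, 0.1, silent=1)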
Example #13
# First we generate a data set from a three-component mixture with a CSI-like
# structure in the distribution parameters. Then a five-component CSI mixture
# is trained. The training should recover the true number of components
# (three), the CSI structure of the generating model, and the distribution
# parameters.

# Setting up the generating model. This is a benign case in the
# sense that the components are reasonably well separated and we
# allow ourselves plenty of training data.

# Component distributions
n11 = mixture.NormalDistribution(1.0, 0.5)
n12 = mixture.NormalDistribution(2.0, 1.5)
n13 = mixture.NormalDistribution(3.0, 0.7)
d14 = mixture.DiscreteDistribution(4, [0.4, 0.3, 0.1, 0.2])

c1 = mixture.ProductDistribution([n11, n12, n13, d14])

n21 = mixture.NormalDistribution(1.0, 0.5)
n22 = mixture.NormalDistribution(-6.0, 0.5)
n23 = mixture.NormalDistribution(3.0, 0.7)
d24 = mixture.DiscreteDistribution(4, [0.1, 0.1, 0.4, 0.4])

c2 = mixture.ProductDistribution([n21, n22, n23, d24])

n31 = mixture.NormalDistribution(2.0, 0.5)
n32 = mixture.NormalDistribution(-3.0, 0.5)
n33 = mixture.NormalDistribution(3.0, 0.7)
d34 = mixture.DiscreteDistribution(4, [0.4, 0.3, 0.1, 0.2])

c3 = mixture.ProductDistribution([n31, n32, n33, d34])
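The sampling step announced in the opening comment would follow the usual pattern of these examples; the mixture weights and sample size below are assumptions:

# assumed continuation -- weights and sample size for illustration only
pi = [0.4, 0.3, 0.3]
gen = mixture.MixtureModel(3, pi, [c1, c2, c3])
data = gen.sampleDataSet(800)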
Example #14
                                        [dist_spelling, missing_spelling],
                                        compFix=[0, 2])

    # diagnoses for comorbid disorders
    #"ODD"	"CONDUCT"	"SOC PHO"	"SEP ANX"	"SPEC PHO"	"ENUR NOC"	"ENUR DIU"	"ENCOPRES"	"TOURET"	"TIC CRON"	"TIC TRAN"
    comor = []
    for j in range(COMOR):
        p_comor = [0.0] + mixture.random_vector(3)
        comor_missing = mixture.MultinomialDistribution(
            1, 4, [1.0, 0.0, 0.0, 0.0], DIAG)
        comor_mult = mixture.MultinomialDistribution(1, 4, p_comor, DIAG)
        comor_mix = mixture.MixtureModel(2, [0.999, 0.001],
                                         [comor_mult, comor_missing],
                                         compFix=[0, 2])
        comor.append(comor_mix)
    pd_comor = mixture.ProductDistribution(comor)

    # the drd4 VNTRs are represented as a discrete distribution over the
    # observed lengths; the specific repeat sequence types are not considered
    # at this time
    p_drd4_vntr_len = [0.0] + mixture.random_vector(10)

    dist_drd4_vntr_len = mixture.MultinomialDistribution(
        1, 11, p_drd4_vntr_len, VNTR)
    vntr_missing = mixture.MultinomialDistribution(1, 11, [1.0] + [0.0] * 10,
                                                   VNTR)
    mix_drd4_vntr_len = mixture.MixtureModel(
        2, [0.999, 0.001], [dist_drd4_vntr_len, vntr_missing], compFix=[0, 2])

    components.append(
        mixture.ProductDistribution([
            mix_bd, mix_voc, mix_read, mix_math, mix_spelling, pd_comor,
Example #15
    compPrior.append(mixture.NormalGammaDistribution(1.0, 2.0, 3.0, 4.0))

mixPrior = mixture.MixturePrior(0.7, 0.7, piPrior, compPrior)

DNA = mixture.Alphabet(['A', 'C', 'G', 'T'])
comps = []
for i in range(G):
    dlist = []
    for j in range(2):
        phi = mixture.random_vector(4)
        dlist.append(mixture.DiscreteDistribution(4, phi, DNA))
    for j in range(2):
        mu = j + 1.0
        sigma = j + 0.5
        dlist.append(mixture.NormalDistribution(mu, sigma))

    comps.append(mixture.ProductDistribution(dlist))
pi = mixture.random_vector(G)

m = mixture.BayesMixtureModel(G, pi, comps, mixPrior, struct=1)

mixture.writeMixture(m, 'test.bmix')


m2 = mixture.readMixture('test.bmix')


print m2
print m2.prior
Example #16
def getRandomCSIMixture_conditionalDists(G,
                                         p,
                                         KL_lower,
                                         KL_upper,
                                         M=8,
                                         dtypes='discgauss',
                                         seed=None,
                                         fullstruct=False,
                                         disc_sampling_dist=None):

    #    if seed:
    #        random.seed(seed)
    #        mixture._C_mixextend.set_gsl_rng_seed(seed)
    #        #print '*** seed=',seed
    #
    #    else: # XXX debug
    #        seed = random.randint(1,9999999)
    #        mixture._C_mixextend.set_gsl_rng_seed(seed)
    #        random.seed(seed)
    #        #print '*** seed=',seed

    if disc_sampling_dist is None:
        discSamp = mixture.DirichletPrior(M, [1.0] * M)  # uniform sampling
    else:
        discSamp = disc_sampling_dist

    min_sigma = 0.3  # minimal std for Normal
    max_sigma = 5.0  # maximal std for Normal
    min_mu = -25.0  # minimal mean
    max_mu = 25.0  # maximal mean

    assert dtypes in ['disc', 'gauss', 'discgauss']

    if dtypes == 'disc':
        featureTypes = [0] * p
    elif dtypes == 'gauss':
        featureTypes = [1] * p
    elif dtypes == 'discgauss':
        # discrete or Normal features for now, chosen uniformly
        # 0 discrete, 1 Normal
        featureTypes = [random.choice((0, 1)) for i in range(p)]
    else:
        raise TypeError

    #print featureTypes

    # generate random CSI structures

    if G < 15:
        P = setPartitions.generate_all_partitions(
            G)  # XXX too slow for large G
    #print P

    C = []

    leaders = []
    groups = []
    for j in range(p):
        c_j = {}

        leaders_j = []
        groups_j = {}

        if fullstruct:
            struct_j = [(i, ) for i in range(G)]

        elif G < 15:
            struct_j = random.choice(P)
        else:
            print 'WARNING: improper structure sampling !'
            struct_j = setPartitions.get_random_partition(G)

        #print '\nstruct',j,struct_j

        for i, grp in enumerate(struct_j):

            lg = list(grp)

            #print lg

            lgj = lg.pop(0)

            #print lgj

            leaders_j.append(lgj)
            groups_j[lgj] = lg

            max_tries = 100000
            tries = 0

            if featureTypes[j] == 0:
                acc = 0

                while acc == 0:
                    cand = discSamp.sample()

                    #print 'Cand:', cand

                    acc = 1
                    for d in c_j:
                        KL_dist = mixture.sym_kl_dist(c_j[d], cand)

                        #print c_j[d],cand, KL_dist

                        if KL_dist > KL_upper or KL_dist < KL_lower:
                            acc = 0
                            tries += 1
                            break

                    if tries >= max_tries:
                        raise RuntimeError, 'Failed to find separated parameters !'

                for cind in grp:
                    c_j[cind] = cand

            elif featureTypes[j] == 1:
                acc = 0
                while acc == 0:
                    mu = random.uniform(min_mu, max_mu)
                    sigma = random.uniform(min_sigma, max_sigma)
                    cand = mixture.NormalDistribution(mu, sigma)
                    acc = 1

                    for d in c_j:
                        KL_dist = mixture.sym_kl_dist(c_j[d], cand)
                        if KL_dist > KL_upper or KL_dist < KL_lower:
                            acc = 0
                            tries += 1
                            break

                    if tries >= max_tries:
                        raise RuntimeError

                #    print '.',
                #print

                for cind in grp:
                    c_j[cind] = cand

            else:
                raise RuntimeError

        leaders.append(leaders_j)
        groups.append(groups_j)

        C.append(c_j)

    comps = []
    for i in range(G):
        comps.append(mixture.ProductDistribution([C[j][i] for j in range(p)]))

    pi = get_random_pi(G, 0.3 / G)
    #print '** pi =',pi

    # create prior
    piprior = mixture.DirichletPrior(G, [2.0] * G)

    cprior = []
    for j in range(p):
        if featureTypes[j] == 0:
            cprior.append(mixture.DirichletPrior(M, [1.02] * M))

        elif featureTypes[j] == 1:
            cprior.append(mixture.NormalGammaPrior(
                0, 0, 0, 0))  # dummy parameters, to be set later

        else:
            raise RuntimeError

    mprior = mixture.MixtureModelPrior(0.1, 0.1, piprior, cprior)

    m = mixture.BayesMixtureModel(G, pi, comps, mprior, struct=1)
    m.leaders = leaders
    m.groups = groups

    m.identifiable()
    m.updateFreeParams()
    #print m

    return m
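Usage sketch (my addition; the KL bounds and sizes are assumed values, get_random_pi is supplied by the surrounding module, and sampling is assumed to be inherited from MixtureModel):

# usage sketch -- KL bounds and sizes are assumed values
m = getRandomCSIMixture_conditionalDists(3, 5, 0.1, 10.0, M=8, dtypes='disc')
data = m.sampleDataSet(400)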
Example #17
def getRandomMixture(G,
                     p,
                     KL_lower,
                     KL_upper,
                     dtypes='discgauss',
                     M=4,
                     seed=None):

    #    if seed:
    #        random.seed(seed)
    #        mixture._C_mixextend.set_gsl_rng_seed(seed)
    #        #print '*** seed=',seed
    #
    #    else: # XXX debug
    #        seed = random.randint(1,9000000)
    #        mixture._C_mixextend.set_gsl_rng_seed(seed)
    #        random.seed(seed)
    #        #print '*** seed=',seed

    #M = 4  # Alphabet size for discrete distributions

    min_sigma = 0.1  # minimal std for Normal
    max_sigma = 1.0  # maximal std for Normal
    min_mu = -5.0  # minimal mean
    max_mu = 8.0  # maximal mean

    if dtypes == 'disc':
        featureTypes = [0] * p
    elif dtypes == 'gauss':
        featureTypes = [1] * p
    elif dtypes == 'discgauss':
        # discrete or Normal features for now, chosen uniformly
        # 0 discrete, 1 Normal
        featureTypes = [random.choice((0, 1)) for i in range(p)]
    else:
        raise TypeError

    #print featureTypes

    C = []
    for j in range(p):
        c_j = []
        for i in range(G):
            #print i,j
            if featureTypes[j] == 0:
                acc = 0
                while acc == 0:
                    cand = mixture.DiscreteDistribution(
                        M, mixture.random_vector(M))

                    #print 'cand:',cand

                    acc = 1

                    for d in c_j:
                        KL_dist = mixture.sym_kl_dist(d, cand)
                        if KL_dist > KL_upper or KL_dist < KL_lower:
                            #print '  *', cand, 'rejected:', d , KL_dist
                            acc = 0
                            break

                c_j.append(cand)
            elif featureTypes[j] == 1:
                acc = 0
                while acc == 0:
                    mu = random.uniform(min_mu, max_mu)
                    sigma = random.uniform(min_sigma, max_sigma)

                    cand = mixture.NormalDistribution(mu, sigma)

                    #print 'cand:',cand

                    acc = 1

                    for d in c_j:
                        KL_dist = mixture.sym_kl_dist(d, cand)
                        if KL_dist > KL_upper or KL_dist < KL_lower:
                            #print '  *', cand, 'rejected:', d , KL_dist
                            acc = 0

                c_j.append(cand)

            else:
                raise RuntimeError

        C.append(c_j)


#    print '\n'
#    for cc in C:
#        print cc

    comps = []
    for i in range(G):
        comps.append(mixture.ProductDistribution([C[j][i] for j in range(p)]))

    pi = get_random_pi(G, 0.1)

    m = mixture.MixtureModel(G, pi, comps, struct=1)
    m.updateFreeParams()

    return m
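Usage sketch (assumed values, same caveats as above):

# usage sketch -- assumed values, not from the original example
gen = getRandomMixture(3, 4, 0.5, 12.0, dtypes='discgauss', M=4)
data = gen.sampleDataSet(500)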
Example #18
def clustering(k, feature_cols, feature_domains, header, table, seeds,
               result_file):
    best_loglike = None
    best_model = None
    # giant random seeding loop (currently a single restart)

    data = mx.DataSet()
    data.fromArray(table)
    for r in range(1):
        #  weights = np.random.random_sample(k)
        #  weights_norm = weights / sum(weights)
        weights_norm = [1.0 / k] * k
        components = []
        for i in range(k):
            products = []
            for j in range(table.shape[1]):
                col_type = prep.get_col_type(feature_cols[j], header)
                col_id = feature_cols[j]

                if col_type == 'cat':
                    vals = feature_domains[col_id].keys()
                    cnt_vals = len(vals)
                    rand_dist = np.random.random_sample(cnt_vals)

                    dist = mx.DiscreteDistribution(cnt_vals,
                                                   rand_dist / sum(rand_dist),
                                                   mx.Alphabet(vals))

                elif col_type == 'num':
                    min_val = feature_domains[col_id]['min']
                    max_val = feature_domains[col_id]['max']
                    #  mean = random.uniform(min_val, max_val)
                    mean = seeds[header[col_id][0]][i]
                    stdev = (max_val - min_val) / 2.0 / k

                    dist = mx.NormalDistribution(mean, stdev)

                else:
                    sys.exit(1)
                products.append(dist)

            comp = mx.ProductDistribution(products)
            components.append(comp)

        mix_table = mx.MixtureModel(k, weights_norm, components)
        print mix_table

        loglike = mix_table.randMaxEM(data, 1, 50, 50)
        if not best_loglike or loglike > best_loglike:
            best_loglike = loglike
            best_model = copy.copy(mix_table)


#data.internalInit(mix)
# mix_table.modelInitialization(data)
#  print best_loglike
#  print best_model

    labels = best_model.classify(data, None, None, 1)

    ## output clustering results

    # count cluster sizes on sampled data
    f = open(result_file + '.stats', 'w')
    cnt = {}
    for l in labels:
        cnt[l] = 1 if l not in cnt else cnt[l] + 1

    for l in cnt:
        f.write('%s %d %f%%\n' %
                (l, cnt[l], cnt[l] * 100.0 / sum(cnt.values())))
    f.close()

    mx.writeMixture(best_model, result_file + '.model')
    return best_model