# reuse_gmpy.py (forked from Hongyuy/rulelib)
# ##CODE:
import gmpy
from numpy import *
import os,time,json,traceback,sys
from scipy.special import gammaln
from scipy.stats import poisson,beta
import cPickle as Pickle
from collections import defaultdict,Counter
from fim import fpgrowth #this is PyFIM, available from http://www.borgelt.net/pyfim.html
#from matplotlib import pyplot as plt #Uncomment to use the plot_chains function
def topscript():
fname = 'titanic'
#Prior hyperparameters
lbda = 3. #prior hyperparameter for expected list length (excluding null rule)
eta = 1. #prior hyperparameter for expected list average width (excluding null rule)
alpha = array([1.,1.]) #prior hyperparameter for multinomial pseudocounts
#rule mining parameters
maxlhs = 2 #maximum cardinality of an itemset
minsupport = 10 #minimum support (%) of an itemset
#mcmc parameters
numiters = 50000 # Uncomment plot_chains in run_bdl_multichain_serial to visually check mixing and convergence
thinning = 1 #The thinning rate
burnin = numiters//2 #the number of initial samples to drop as burn-in
nchains = 3 #number of MCMC chains. These are simulated in serial, and then merged after checking for convergence.
#End parameters
#Now we load data and do MCMC
permsdic = defaultdict(default_permsdic) #We will store here the MCMC results
Xtrain,Ytrain,nruleslen,lhs_len,itemsets = get_freqitemsets(fname+'_train',minsupport,maxlhs) #Do frequent itemset mining from the training data
Xtest,Ytest,Ylabels_test = get_testdata(fname+'_test',itemsets) #Load the test data
print 'Data loaded!'
#Do Simulated Annealing
res = {}
t1 = time.clock()
permsdic,res['perms'], best = bayesdl_simulated_annealing(numiters,thinning,alpha,lbda,eta,Xtrain,Ytrain,nruleslen,lhs_len,maxlhs,permsdic,burnin,None,None)
print 'Elapsed CPU time',time.clock()-t1
print "here is the best", best
d_star = best[0][:best[0].index(0)+1]
print "here is the d_star", d_star
xx = list(permsdic.viewvalues())
yy = []
for x in xx:
yy.append(x[0])
print 'Number of distinct antecedent lists evaluated:', len(yy)
print 'Histogram of their log posteriors:', histogram(yy)
#res,Rhat = run_bdl_multichain_serial(numiters,thinning,alpha,lbda,eta,Xtrain,Ytrain,nruleslen,lhs_len,maxlhs,permsdic,burnin,nchains,[None]*nchains)
#Merge the chains
#permsdic = merge_chains(res)
###The point estimate, BRL-point
#d_star = get_point_estimate(permsdic,lhs_len,Xtrain,Ytrain,alpha,nruleslen,maxlhs,lbda,eta) #get the point estimate
if d_star:
#Compute the rule consequent
theta, ci_theta = get_rule_rhs(Xtrain,Ytrain,d_star,alpha,True)
#Print out the point estimate rule
print 'antecedent risk (credible interval for risk)'
for i,j in enumerate(d_star):
print itemsets[j],theta[i],ci_theta[i]
#Evaluate on the test data
preds_d_star = preds_d_t(Xtest,Ytest,d_star,theta) #Use d_star to make predictions on the test data
accur_d_star = preds_to_acc(preds_d_star,Ylabels_test)#Accuracy of the point estimate
print 'accuracy of point estimate',accur_d_star
###The full posterior, BRL-post
#preds_fullpost = preds_full_posterior(Xtest,Ytest,Xtrain,Ytrain,permsdic,alpha)
#accur_fullpost = preds_to_acc(preds_fullpost,Ylabels_test) #Accuracy of the full posterior
#print 'accuracy of full posterior',accur_fullpost
return
#return permsdic, d_star, itemsets, theta, ci_theta, preds_d_star, accur_d_star, preds_fullpost, accur_fullpost
###############BRL
#For producing the defaultdict used for storing MCMC results
def default_permsdic():
return [0.,0.]
#Resets the number of MCMC samples stored (value[1]) while maintaining the log-posterior value (so it doesn't need to be re-computed in future chains).
def reset_permsdic(permsdic):
for perm in permsdic:
permsdic[perm][1] = 0.
return permsdic
#Run mcmc for each of the chains, IN SERIAL!
def run_bdl_multichain_serial(numiters,thinning,alpha,lbda,eta,X,Y,nruleslen,lhs_len,maxlhs,permsdic,burnin,nchains,d_inits):
#Run each chain in serial.
res = {}
for n in range(nchains):
res[n] = {}
t1 = time.clock()
print 'Starting chain',n
permsdic,res[n]['perms'] = bayesdl_mcmc(numiters,thinning,alpha,lbda,eta,X,Y,nruleslen,lhs_len,maxlhs,permsdic,burnin,None,d_inits[n])
print 'Elapsed CPU time',time.clock()-t1
#Store the permsdic results
res[n]['permsdic'] = {perm:list(vals) for perm,vals in permsdic.iteritems() if vals[1]>0}
#Reset the permsdic
permsdic = reset_permsdic(permsdic)
#Continue with the next chain
#Check convergence
Rhat = gelmanrubin(res)
print 'Rhat for convergence:',Rhat
##plot?
#plot_chains(res)
return res,Rhat
#Check convergence with GR diagnostic
def gelmanrubin(res):
n = 0 #number of samples per chain - to be computed
m = len(res) #number of chains
phi_bar_j = {}
for chain in res:
phi_bar_j[chain] = 0.
for val in res[chain]['permsdic'].itervalues():
phi_bar_j[chain] += val[1]*val[0] #numsamples*log posterior
n += val[1]
#And normalize
n = n//m #Number of samples per chain (assuming all m chains have same number of samples)
#Normalize, and compute phi_bar
phi_bar = 0.
for chain in phi_bar_j:
phi_bar_j[chain] = phi_bar_j[chain]/float(n) #normalize
phi_bar += phi_bar_j[chain]
phi_bar = phi_bar/float(m) #phi_bar = average of phi_bar_j
#Now B
B = 0.
for chain in phi_bar_j:
B += (phi_bar_j[chain] - phi_bar)**2
B = B*(n/float(m-1))
#Now W.
W = 0.
for chain in res:
s2_j = 0.
for val in res[chain]['permsdic'].itervalues():
s2_j += val[1]*(val[0] -phi_bar_j[chain])**2
s2_j = (1./float(n-1))*s2_j
W += s2_j
W = W*(1./float(m))
#Next varhat
varhat = ((n-1)/float(n))*W + (1./float(n))*B
#And finally,
try:
Rhat = sqrt(varhat/float(W))
except RuntimeWarning:
print 'RuntimeWarning computing Rhat, W='+str(W)+', B='+str(B)
Rhat = 0.
return Rhat
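#Worked example of the diagnostic above (toy numbers, not real chains): with m = 2 chains,
#chain 0 holding samples with log posteriors {-10.: 2 copies, -12.: 2 copies} and chain 1 holding
#{-11.: 4 copies}, each chain has n = 4 samples and a chain mean of -11., so B = 0,
#W = (1./2)*(4./3 + 0.) = 2/3, varhat = (3./4)*(2./3) = 1/2, and Rhat = sqrt((1./2)/(2./3)) ~ 0.87.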
#Plot the logposterior values for the samples in the chains.
def plot_chains(res):
for chain in res:
plt.plot([res[chain]['permsdic'][a][0] for a in res[chain]['perms']])
plt.show()
return
#Merge chains into a single collection of posterior samples
def merge_chains(res):
permsdic = defaultdict(default_permsdic)
for n in res:
for perm,vals in res[n]['permsdic'].iteritems():
permsdic[perm][0] = vals[0]
permsdic[perm][1] += vals[1]
return permsdic
#Get a point estimate with length and width similar to the posterior average, with highest likelihood
def get_point_estimate(permsdic,lhs_len,X,Y,alpha,nruleslen,maxlhs,lbda,eta):
#Figure out the posterior expected list length and average rule size
listlens = []
rulesizes = []
for perm in permsdic:
d_t = Pickle.loads(perm)
listlens.extend([len(d_t)] * int(permsdic[perm][1]))
rulesizes.extend([lhs_len[j] for j in d_t[:-1]] * int(permsdic[perm][1]))
#Now compute average
avglistlen = average(listlens)
print 'Posterior average length:',avglistlen
try:
avgrulesize = average(rulesizes)
print 'Posterior average width:',avgrulesize
#Prepare the intervals
minlen = int(floor(avglistlen))
maxlen = int(ceil(avglistlen))
minrulesize = int(floor(avgrulesize))
maxrulesize = int(ceil(avgrulesize))
#Run through all perms again
likelihds = []
d_ts = []
beta_Z,logalpha_pmf,logbeta_pmf = prior_calculations(lbda,len(X),eta,maxlhs) #get the constants needed to compute the prior
for perm in permsdic:
if permsdic[perm][1]>0:
d_t = Pickle.loads(perm) #this is the antecedent list
#Check the list length
if len(d_t) >= minlen and len(d_t) <= maxlen:
#Check the rule size
rulesize = average([lhs_len[j] for j in d_t[:-1]])
if rulesize >= minrulesize and rulesize <= maxrulesize:
d_ts.append(d_t)
#Compute the likelihood
R_t = d_t.index(0)
N_t, unused_t = compute_rule_usage(d_t,R_t,X,Y)
likelihds.append(fn_logposterior(d_t,R_t,N_t,alpha,logalpha_pmf,logbeta_pmf,maxlhs,beta_Z,nruleslen,lhs_len))
likelihds = array(likelihds)
d_star = d_ts[likelihds.argmax()]
except RuntimeWarning:
#This can happen if all perms are identically [0], or if no soln is found within the len and width bounds (probably the chains didn't converge)
print 'No suitable point estimate found'
d_star = None
return d_star
#################COMPUTING RESULTS
#Compute the posterior consequent distributions
def get_rule_rhs(Xtrain,Ytrain,d_t,alpha,intervals):
N_t, unused_t = compute_rule_usage(d_t,d_t.index(0),Xtrain,Ytrain)
theta = []
ci_theta = []
for i,j in enumerate(d_t):
#theta ~ Dirichlet(N[j,:] + alpha)
#E[theta] = (N[j,:] + alpha)/float(sum(N[j,:] + alpha))
#NOTE this result is only for binary classification
#theta = p(y=1)
theta.append((N_t[i,1] + alpha[1])/float(sum(N_t[i,:] + alpha)))
#And now the 95% interval, for Beta(N[j,1] + alpha[1], N[j,0] + alpha[0])
if intervals:
ci_theta.append(beta.interval(0.95,N_t[i,1] + alpha[1],N_t[i,0] + alpha[0]))
return theta,ci_theta
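#Worked example for get_rule_rhs (hypothetical counts, not from the titanic data): if antecedent i
#captures N_t[i] = [3., 7.] training points (3 with label 0, 7 with label 1) and alpha = [1., 1.],
#then theta[i] = (7 + 1)/(3 + 7 + 1 + 1) = 8/12 ~ 0.67, and the 95% credible interval is
#beta.interval(0.95, 8., 4.), roughly (0.4, 0.9).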
#Get predictions from the list d_t
def preds_d_t(X,Y,d_t,theta):
#this is binary only. The score is the Prob of 1.
unused = set(range(Y.shape[0]))
preds = -1*ones(Y.shape[0])
for i,j in enumerate(d_t):
usedj = unused.intersection(X[j]) #these are the observations in X that make it to rule j
preds[list(usedj)] = theta[i]
unused = unused.difference(set(usedj))
if preds.min() < 0:
raise Exception #this means some observation wasn't given a prediction - shouldn't happen
return preds
#Make predictions using the full posterior
def preds_full_posterior(X,Y,Xtrain,Ytrain,permsdic,alpha):
#this is binary only. The score is the Prob of 1.
preds = zeros(Y.shape[0])
postcount = 0. #total number of posterior samples
for perm,vals in permsdic.iteritems():
#We will compute probabilities for this antecedent list d.
d_t = Pickle.loads(perm)
permcount = float(vals[1]) #number of copies of this perm in the posterior
postcount += float(vals[1])
#We will get the posterior E[theta]'s for this list
theta,jnk = get_rule_rhs(Xtrain,Ytrain,d_t,alpha,False)
#And assign observations a score
unused = set(range(Y.shape[0]))
for i,j in enumerate(d_t):
usedj = unused.intersection(X[j]) #these are the observations in X that make it to rule j
preds[list(usedj)] += theta[i]*permcount
unused = unused.difference(set(usedj))
if unused:
raise Exception #all observations should have been given predictions
#Done with this list, move on to the next one.
#Done with all lists. Normalize.
preds /= float(postcount)
return preds
#Compute accuracy
def preds_to_acc(y_score,y_true):
thr = 0.5 #we take label = 1 if y_score >= 0.5
accur = 0.
for i,prob in enumerate(y_score):
if prob >= thr and y_true[i] == 1:
accur+=1
elif prob < thr and y_true[i] == 0:
accur+=1
accur = accur/float(len(y_score))
return accur
##############MCMC core
#The Metropolis-Hastings algorithm
def bayesdl_mcmc(numiters,thinning,alpha,lbda,eta,X,Y,nruleslen,lhs_len,maxlhs,permsdic,burnin,rseed,d_init):
#initialize
perms = []
if rseed:
random.seed(rseed)
#Do some pre-computation for the prior
beta_Z,logalpha_pmf,logbeta_pmf = prior_calculations(lbda,len(X),eta,maxlhs)
if d_init: #If we want to begin our chain at a specific place (e.g. to continue a chain)
d_t = Pickle.loads(d_init)
d_t.extend([i for i in range(len(X)) if i not in d_t])
R_t = d_t.index(0)
N_t, unused_t = compute_rule_usage(d_t,R_t,X,Y)
else:
d_t,R_t,N_t, unused_t = initialize_d(X,Y,lbda,eta,lhs_len,maxlhs,nruleslen) #Otherwise sample the initial value from the prior
#Add to dictionary which will store the sampling results
a_t = Pickle.dumps(d_t[:R_t+1]) #The antecedent list in string form
if a_t not in permsdic:
permsdic[a_t][0] = fn_logposterior(d_t,R_t,N_t,alpha,logalpha_pmf,logbeta_pmf,maxlhs,beta_Z,nruleslen,lhs_len) #Compute its logposterior
if burnin == 0:
permsdic[a_t][1] += 1 #store the initialization sample
#iterate!
for itr in range(numiters):
#Sample from proposal distribution
d_star,Jratio,R_star,step,newItemPosition = proposal(d_t,R_t,X,Y,alpha)
#Compute the new posterior value, if necessary
a_star = Pickle.dumps(d_star[:R_star+1])
added = 0
if a_star not in permsdic:
added = 1
N_star, unused_star = compute_rule_update(d_star,R_star, d_t, R_t, N_t, unused_t, newItemPosition, X,Y)
permsdic[a_star][0] = fn_logposterior(d_star,R_star,N_star,alpha,logalpha_pmf,logbeta_pmf,maxlhs,beta_Z,nruleslen,lhs_len)
#Compute the metropolis acceptance probability
q = permsdic[a_star][0] - permsdic[a_t][0] + Jratio
u = log(random.random())
if u < q:
#then we accept the move
if added==0:
N_t, unused_t = compute_rule_update(d_star,R_star, d_t, R_t, N_t, unused_t, newItemPosition, X,Y)
else:
N_t = N_star
unused_t = unused_star
d_t = list(d_star)
R_t = int(R_star)
a_t = str(a_star)
#else: pass
if itr > burnin and itr % thinning == 0:
##store
permsdic[a_t][1] += 1
perms.append(a_t)
return permsdic,perms
#Samples a list from the prior
def initialize_d(X,Y,lbda,eta,lhs_len,maxlhs,nruleslen):
m = Inf
while m>=len(X):
m = poisson.rvs(lbda) #sample the length of the list from Poisson(lbda), truncated at len(X)
#prepare the list
d_t = []
empty_rulelens = [r for r in range(1,maxlhs+1) if r not in nruleslen]
used_rules = []
for i in range(m):
#Sample a rule size.
r = 0
while r==0 or r > maxlhs or r in empty_rulelens:
r = poisson.rvs(eta) #Sample the rule size from Poisson(eta), truncated at 0 and maxlhs and not using empty rule lens
#Now sample a rule of that size uniformly at random
rule_cands = [j for j,lhslen in enumerate(lhs_len) if lhslen == r and j not in used_rules]
random.shuffle(rule_cands)
j = rule_cands[0]
#And add it in
d_t.append(j)
used_rules.append(j)
assert lhs_len[j] == r
if len(rule_cands) == 1:
empty_rulelens.append(r)
#Done adding rules. We have added m rules. Finish up.
d_t.append(0) #all done
d_t.extend([i for i in range(len(X)) if i not in d_t])
R_t = d_t.index(0)
assert R_t == m
#Figure out what rules are used to classify what points
N_t, unused_t = compute_rule_usage(d_t,R_t,X,Y)
return d_t,R_t,N_t, unused_t
#Propose a new d_star
def proposal(d_t,R_t,X,Y,alpha):
d_star = list(d_t)
R_star = int(R_t)
move_probs_default = array([0.3333333333,0.3333333333,0.3333333333]) #We begin with these as the move probabilities, but will renormalize as needed if certain moves are unavailable.
#We have 3 moves: move, add, cut. Define the pdf for the probabilities of the moves, in that order:
if R_t == 0:
#List is empty. We must add.
move_probs = array([0.,1.,0.])
#This is an add transition. The probability of the reverse cut move is the prob of a list of len 1 having a cut (other option for list of len 1 is an add).
Jratios = array([0.,move_probs_default[2]/float(move_probs_default[1] + move_probs_default[2]),0.])
elif R_t == 1:
#List has one rule on it. We cannot move, must add or cut.
move_probs = array(move_probs_default) #copy
move_probs[0] = 0. #drop move move.
move_probs = move_probs/sum(move_probs) #renormalize
#If add, probability of the reverse cut is the default cut probability
#If cut, probability of the reverse add is 1.
inv_move_probs = array([0.,move_probs_default[2],1.])
Jratios = zeros_like(move_probs)
Jratios[1:] = inv_move_probs[1:]/move_probs[1:] #array elementwise division
elif R_t == len(d_t) - 1:
#List has all rules on it. We cannot add, must move or cut.
move_probs = array(move_probs_default) #copy
move_probs[1] = 0. #drop add move.
move_probs = move_probs/sum(move_probs) #renormalize
#If move, probability of reverse move is move_probs[0], so Jratio = 1.
#if cut, probability of reverse add is move_probs_default
Jratios = array([1., 0., move_probs_default[1]/move_probs[2]])
elif R_t == len(d_t) - 2:
#List has all rules but 1 on it.
#Move probabilities are the default, but the inverse are a little different.
move_probs = array(move_probs_default)
#If move, probability of reverse move is still default, so Jratio = 1.
#if cut, probability of reverse add is move_probs_default[1],
#if add, probability of reverse cut is,
Jratios = array([1., move_probs_default[2]/float(move_probs_default[0]+move_probs_default[2])/float(move_probs_default[1]), move_probs_default[1]/float(move_probs_default[2])])
else:
move_probs = array(move_probs_default)
Jratios = array([1.,move_probs[2]/float(move_probs[1]),move_probs[1]/float(move_probs[2])])
u = random.random()
#First we will find the indices for the insertion-deletion. indx1 is the item to be moved, indx2 is the new location
if u < sum(move_probs[:1]):
#This is an on-list move.
step = 'move'
[indx1,indx2] = random.permutation(range(len(d_t[:R_t])))[:2] #value error if there are no on list entries
#print 'move',indx1,indx2
Jratio = Jratios[0] #ratio of move/move probabilities is 1.
newItemPosition = min(indx1, indx2)
elif u < sum(move_probs[:2]):
#this is an add
step = 'add'
indx1 = R_t+1+random.randint(0,len(d_t[R_t+1:])) #this will throw ValueError if there are no off list entries
indx2 = random.randint(0,len(d_t[:R_t+1])) #this one will always work
#print 'add',indx1,indx2
#the probability of going from d_star back to d_t is the probability of the corresponding cut.
#p(d*->d|cut) = 1/|d*| = 1/(|d|+1) = 1./float(R_t+1)
#p(d->d*|add) = 1/((|a|-|d|)(|d|+1)) = 1./(float(len(d_t)-1-R_t)*float(R_t+1))
Jratio = Jratios[1]*float(len(d_t)-1-R_t)
R_star+=1
newItemPosition = indx2
elif u < sum(move_probs[:3]):
#this is a cut
step = 'cut'
indx1 = random.randint(0,len(d_t[:R_t])) #this will throw ValueError if there are no on list entries
indx2 = R_t+random.randint(0,len(d_t[R_t:])) #this one will always work
#print 'cut',indx1,indx2
#the probability of going from d_star back to d_t is the probability of the corresponding add.
#p(d*->d|add) = 1/((|a|-|d*|)(|d*|+1)) = 1/((|a|-|d|+1)(|d|))
#p(d->d*|cut) = 1/|d|
#Jratio =
Jratio = Jratios[2]*(1./float(len(d_t)-1-R_t+1))
R_star -=1
newItemPosition = indx1
else:
raise Exception
#Now do the insertion-deletion
d_star.insert(indx2,d_star.pop(indx1))
return d_star,log(Jratio),R_star,step, newItemPosition
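#Worked example of the proposal ratio (hypothetical sizes): with len(d_t) = 10 antecedents in total
#and R_t = 3 on the list, an 'add' picks one of the 10-1-3 = 6 off-list antecedents and one of the
#R_t+1 = 4 insertion positions, so p(d->d*|add) = 1/(6*4), while the reverse cut has
#p(d*->d|cut) = 1/4. With the default move probabilities Jratios[1] = 1, giving Jratio = 6,
#i.e. Jratios[1]*float(len(d_t)-1-R_t) as computed above.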
#Compute the normalization constants for the prior on rule cardinality
def prior_calculations(lbda,maxlen,eta,maxlhs):
#First normalization constants for beta
beta_Z = poisson.cdf(maxlhs,eta) - poisson.pmf(0,eta)
#Then the actual un-normalized pmfs
logalpha_pmf = {}
for i in range(maxlen+1):
try:
logalpha_pmf[i] = poisson.logpmf(i,lbda)
except RuntimeWarning:
logalpha_pmf[i] = -inf
logbeta_pmf = {}
for i in range(1,maxlhs+1):
logbeta_pmf[i] = poisson.logpmf(i,eta)
return beta_Z,logalpha_pmf,logbeta_pmf
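#Example of the truncation constant (illustrative values): with eta = 1. and maxlhs = 2,
#beta_Z = poisson.cdf(2,1.) - poisson.pmf(0,1.) ~ 0.920 - 0.368 = 0.552, i.e. the Poisson(eta)
#mass on rule widths 1..maxlhs, used to normalize the truncated width prior in fn_logprior.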
#Compute log posterior
def fn_logposterior(d_t,R_t,N_t,alpha,logalpha_pmf,logbeta_pmf,maxlhs,beta_Z,nruleslen,lhs_len):
logliklihood = fn_logliklihood(d_t,N_t,R_t,alpha)
logprior = fn_logprior(d_t,R_t,logalpha_pmf,logbeta_pmf,maxlhs,beta_Z,nruleslen,lhs_len)
return logliklihood + logprior
#Compute log likelihood
def fn_logliklihood(d_t,N_t,R_t,alpha):
gammaln_Nt_jk = gammaln(N_t+alpha)
gammaln_Nt_j = gammaln(sum(N_t+alpha,1))
logliklihood = sum(gammaln_Nt_jk) - sum(gammaln_Nt_j)
return logliklihood
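#Note: this is sum_j [ sum_k lnGamma(N_jk + alpha_k) - lnGamma(sum_k (N_jk + alpha_k)) ], the log
#Dirichlet-multinomial marginal likelihood with the per-rule lnGamma(alpha) normalizer dropped
#(that normalizer is exactly 0 for the default alpha = [1.,1.]).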
#Compute log prior
def fn_logprior(d_t,R_t,logalpha_pmf,logbeta_pmf,maxlhs,beta_Z,nruleslen,lhs_len):
#The prior will be _proportional_ to this -> we drop the normalization for alpha
#beta_Z is the normalization for beta, except the terms that need to be dropped due to running out of rules.
#log p(d_star) = log \alpha(m|lbda) + sum_{i=1...m} log beta(l_i | eta) + log gamma(r_i | l_i)
#The length of the list (m) is R_t
#Get logalpha (length of list) (overloaded notation in this code, unrelated to the prior hyperparameter alpha)
logprior = 0.
logalpha = logalpha_pmf[R_t] #this is proportional to logalpha - we have dropped the normalization for truncating based on total number of rules
logprior += logalpha
empty_rulelens = []
nlens = zeros(maxlhs+1)
for i in range(R_t):
l_i = lhs_len[d_t[i]]
logbeta = logbeta_pmf[l_i] - log(beta_Z - sum([logbeta_pmf[l_j] for l_j in empty_rulelens])) #The correction for exhausted rule lengths
#Finally loggamma
loggamma = -log(nruleslen[l_i] - nlens[l_i])
#And now check if we have exhausted all rules of a certain size
nlens[l_i] += 1
if nlens[l_i] == nruleslen[l_i]:
empty_rulelens.append(l_i)
elif nlens[l_i] > nruleslen[l_i]:
raise Exception
#Add 'em in
logprior += logbeta
logprior += loggamma
#All done
return logprior
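#Incremental version of compute_rule_usage below: a proposal only changes the list from
#newItemPosition (the earliest position touched by the move/add/cut) onwards, so the capture counts
#and 'unused' bit vectors for the rules above that position are copied from the previous state and
#only the tail is recomputed with gmpy popcounts - presumably the "reuse" this file is named after.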
def compute_rule_update(d_star,R_star, d_old, R_old, N_old, unused_old, newItemPosition, X,Y):
global trainingSize
N_star = zeros((R_star+1,2))
# print "newItemPosition is ", newItemPosition, R_star, len(N_old), len(N_star)
N_star[:newItemPosition] = N_old[:newItemPosition]
remaining_unused = unused_old[:newItemPosition+1]
i = min(newItemPosition, len(unused_old)-1)
while remaining_unused[i]:
j = d_star[i]
usedj = remaining_unused[i] & X[j]
remaining_unused.append(remaining_unused[i] - usedj)
N_star[i,0] = gmpy.popcount(Y[0] & usedj)
N_star[i,1] = gmpy.popcount(Y[1] & usedj)
i+=1
if int(sum(N_star)) != trainingSize:
raise Exception #bug check
return N_star, remaining_unused
#Compute which rules are being used to classify data points with what labels
def compute_rule_usage(d_star,R_star,X,Y):
global trainingSize
#N_star = zeros((R_star+1,Y.shape[1]))
#remaining_unused = 2**(Y.shape[0]) - 1
N_star = zeros((R_star+1, 2))
remaining_unused = [(1<<trainingSize) - 1]
i = 0
#print X
while remaining_unused[i]:
j = d_star[i]
usedj = remaining_unused[i] & X[j]
remaining_unused.append(remaining_unused[i] - usedj)
N_star[i,0] = gmpy.popcount(Y[0] & usedj)
N_star[i,1] = gmpy.popcount(Y[1] & usedj)
i+=1
if int(sum(N_star)) != trainingSize:
print "############"
print "not equal!!!", int(sum(N_star)), trainingSize, v1train, v2train, v3train
raise Exception #bug check
return N_star, remaining_unused
####Data loading
#Frequent itemset mining
def get_freqitemsets(fname,minsupport,maxlhs):
#minsupport is an integer percentage (e.g. 10 for 10%)
#maxlhs is the maximum size of the lhs
#first load the data
data,Ydata = load_data(fname)
#Now find frequent itemsets
#Mine separately for each class
data_pos = [x for i,x in enumerate(data) if Ydata[i,0]==0]
data_neg = [x for i,x in enumerate(data) if Ydata[i,0]==1]
assert len(data_pos)+len(data_neg) == len(data)
Y = [0,0]
Y[0] = sum([1<<i for i,x in enumerate(data) if Ydata[i,0]==1])
Y[1] = sum([1<<i for i,x in enumerate(data) if Ydata[i,1]==1])
itemsets = [r[0] for r in fpgrowth(data_pos,supp=minsupport,zmax=maxlhs)]
itemsets.extend([r[0] for r in fpgrowth(data_neg,supp=minsupport,zmax=maxlhs)])
itemsets = list(set(itemsets))
print len(itemsets),'rules mined'
#Now form the data-vs.-lhs set
#X[j] is the bit vector of data points that contain itemset j (that is, satisfy rule j)
X = [ 0 for j in range(len(itemsets)+1)]
global trainingSize
trainingSize = len(data)
X[0] = (1<<trainingSize) - 1 #the default rule satisfies all data, so all bits are 1's
for (j,lhs) in enumerate(itemsets):
X[j+1] = sum([1<<i for (i,xi) in enumerate(data) if set(lhs).issubset(xi)])
#now form lhs_len
lhs_len = [0]
for lhs in itemsets:
lhs_len.append(len(lhs))
nruleslen = Counter(lhs_len)
lhs_len = array(lhs_len)
itemsets_all = ['null']
itemsets_all.extend(itemsets)
return X,Y,nruleslen,lhs_len,itemsets_all
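#Representation example (hypothetical 4-point training set): if observations 0 and 2 satisfy
#itemset j, then X[j+1] = (1<<0) + (1<<2) = 0b0101 = 5. With Y[1] the bit vector of label-1
#observations, gmpy.popcount(X[j+1] & Y[1]) counts how many of those observations have label 1;
#compute_rule_usage does the same after first intersecting with the still-unclassified observations.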
#Load the test data, and determine which antecedents are satisfied by each test observation
def get_testdata(fname,itemsets):
#And now the test data.
#first load the data
data,Y = load_data(fname)
#Now form the data-vs.-lhs set
#X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
X = [set() for j in range(len(itemsets))]
X[0] = set(range(len(data))) #the default rule satisfies all data
for (j,lhs) in enumerate(itemsets):
if j>0:
X[j] = set([i for (i,xi) in enumerate(data) if set(lhs).issubset(xi)])
Ylabels = [list(i).index(1) for i in Y]
return X,Y,Ylabels
#Read in the .tab file
def load_data(fname):
#Load data
with open(fname+'.tab','r') as fin:
A = fin.readlines()
data = []
for ln in A:
data.append(ln.split())
#Now load Y
Y = loadtxt(fname+'.Y')
if len(Y.shape)==1:
Y = array([Y])
return data,Y
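#Expected input format (a sketch inferred from this loader, not a formal specification): fname.tab
#holds one whitespace-separated list of categorical items per observation, e.g. a line like
#'age_<25 sex_male class_3rd' (item names here are made up), and fname.Y holds the matching labels
#for loadtxt, one-hot encoded with one column per class (two columns in the binary case used here).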
#The Simulated Annealing algorithm
def bayesdl_simulated_annealing(numiters,thinning,alpha,lbda,eta,X,Y,nruleslen,lhs_len,maxlhs,permsdic,burnin,rseed,d_init):
#initialize
perms = []
if rseed:
random.seed(rseed)
#Do some pre-computation for the prior
beta_Z,logalpha_pmf,logbeta_pmf = prior_calculations(lbda,len(X),eta,maxlhs)
if d_init: #If we want to begin our chain at a specific place (e.g. to continue a chain)
d_t = Pickle.loads(d_init)
d_t.extend([i for i in range(len(X)) if i not in d_t])
R_t = d_t.index(0)
N_t, unused_t = compute_rule_usage(d_t,R_t,X,Y)
else:
d_t,R_t,N_t, unused_t = initialize_d(X,Y,lbda,eta,lhs_len,maxlhs,nruleslen) #Otherwise sample the initial value from the prior
#Add to dictionary which will store the sampling results
a_t = Pickle.dumps(d_t[:R_t+1]) #The antecedent list in string form
if a_t not in permsdic:
permsdic[a_t][0] = fn_logposterior(d_t,R_t,N_t,alpha,logalpha_pmf,logbeta_pmf,maxlhs,beta_Z,nruleslen,lhs_len) #Compute its logposterior
if burnin == 0:
permsdic[a_t][1] += 1 #store the initialization sample
#set time schedule
t = [1]
T = []
for i in range(30):
t.append(t[i]+exp(0.25*(i+1)))
for j in range(int(floor(t[i]-1e-6))+1, int(ceil(t[i+1]+1e-6))):
T.append(1.0/(i+1))
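#The schedule above (a fixed choice in this script, not tuned per dataset) anneals the temperature
#through 1, 1/2, ..., 1/30; the number of outer steps spent at temperature 1./(i+1) grows roughly
#like exp(0.25*(i+1)), and each outer step below runs numiterationsPerStep proposals.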
#iterate!
best = [list(d_t), permsdic[a_t][0]] #track the best state seen so far as [rule permutation, log posterior]
makeit = 0
numiterationsPerStep = 100
for k, tk in enumerate(T):
for iter in range(numiterationsPerStep):
#print k, tk
#Sample from proposal distribution
d_star,Jratio,R_star,step,newItemPosition = proposal(d_t,R_t,X,Y,alpha)
#Compute the new posterior value, if necessary
a_star = Pickle.dumps(d_star[:R_star+1])
added = 0
if a_star not in permsdic:
added = 1
N_star, unused_star = compute_rule_update(d_star,R_star, d_t, R_t, N_t, unused_t, newItemPosition, X,Y)
permsdic[a_star][0] = fn_logposterior(d_star,R_star,N_star,alpha,logalpha_pmf,logbeta_pmf,maxlhs,beta_Z,nruleslen,lhs_len)
#Compute the metropolis acceptance probability
#q = exp( (permsdic[a_star][0] - permsdic[a_t][0])/tk)
#if q > 0.1:
# print "difference q is ", q
u = random.random()
#if (q>=1) or (u<q):
if (permsdic[a_star][0] - permsdic[a_t][0]>0) or (log(u) < (permsdic[a_star][0] - permsdic[a_t][0])/tk):
makeit += 1
#then we accept the move
if added==0:
N_t, unused_t = compute_rule_update(d_star,R_star, d_t, R_t, N_t, unused_t, newItemPosition, X,Y)
else:
N_t = N_star
unused_t = unused_star
d_t = list(d_star)
R_t = int(R_star)
a_t = str(a_star)
if (permsdic[a_t][0] > best[1]):
best = [d_t, permsdic[a_t][0]]
#else: pass
#if itr > burnin and itr % thinning == 0:
##store
permsdic[a_t][1] += 1
perms.append(a_t)
print "The best BRL_point is [list of indices of rules, log_posterior]:\n", best
print "# of accepted proposals = ", makeit
print "############### len of permsdic, len of perms", len(permsdic), len(perms)
return permsdic,perms, best
#############END!!
if __name__ == '__main__':
topscript()