/
all_pairs_BIC_score.py
182 lines (152 loc) · 7.03 KB
/
all_pairs_BIC_score.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
from cluster_mrtemplate import ClusterMRJob
import cluster_tools as tools
import numpy as np
import os
import cPickle as pickle
from stat import *
from gmm_specializer.gmm import compute_distance_BIC
from mrjob.protocol import PickleProtocol as protocol
import time
import sys
import mrjob.util as util
class AllPairsBicScoreMRJob(ClusterMRJob):
def job_runner_kwargs(self):
config = super(AllPairsBicScoreMRJob, self).job_runner_kwargs()
config['jobconf']['mapred.line.input.format.linespermap'] = 16
#config['upload_files'] += ["iter_gmm_list"]
config['upload_files'] += ['gmm.tgz']
return config
def mapper(self, key, value):
"""
Each mapper computes the BIC score for a GMM pair
"""
overall = t = time.time()
index1, index2 = key
didx1, didx2, em_iters = value
t = time.time()
# X = tools.binary_read('self_X')
# d1 = tools.get_data_from_indices(X, didx1)
# d2 = tools.get_data_from_indices(X, didx2)
# sys.stderr.write("get_data_from_indices: {0}\n".format(time.time()-t))
d1 = tools.get_data_from_file_from_indices('self_X', didx1)
d2 = tools.get_data_from_file_from_indices('self_X', didx2)
sys.stderr.write("get_data_from_file_from_indices: {0}\n".format(time.time()-t))
data = np.concatenate((d1, d2))
t = time.time()
util.unarchive('gmm.tgz', 'gmm')
g1 = pickle.load(open('gmm/'+str(index1), 'r'))
g2 = pickle.load(open('gmm/'+str(index2), 'r'))
sys.stderr.write("read iter_gmm_list: {0}\n".format(time.time()-t))
new_gmm = g1
score = 0
t = time.time()
try:
new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
except:
raise
#data_to_yield = (score, new_gmm, g1, g2, index1, index2)
data_to_yield = (score, index1, index2)
sys.stderr.write("compute_distance_BIC: {0}\n".format(time.time()-t))
sys.stderr.write("total BIC time: {0}\n".format(time.time()-overall))
yield 1, data_to_yield
def reducer(self, key, values):
"""
Finds the GMM pair with the highest BIC score
"""
best_score = 0.0
merged_tuple_indices = (0, 0)
for score, index1, index2 in values:
if score > best_score:
best_score = score
merged_tuple_indices = (index1, index2)
result = (merged_tuple_indices, best_score)
yield 1, result
def steps(self):
return [self.mr(mapper=self.mapper, reducer=self.reducer)]
class AllPairsBicScore(object):
def __init__(self):
self.pure_python = True
def all_pairs_BIC_using_mapreduce(self, iteration_bic_list, em_iters, X, gmm_list):
"""
Computes the BIC score for all pairs by using MapReduce and returns
the pair with the best score
"""
print "Map-Reduce execution"
# iter_gmm_list = map(lambda(gidx, didx): gmm_list[gidx], iteration_bic_list)
# pickle.dump(iter_gmm_list, open('iter_gmm_list', 'w'))
# os.chmod("iter_gmm_list", S_IRUSR | S_IWUSR | S_IXUSR | \
# S_IRGRP | S_IXGRP | \
# S_IROTH | S_IXOTH )
from subprocess import call
call(["mkdir", "-p", "gmm"])
for i in range (0, len(iteration_bic_list)):
gidx, didx = iteration_bic_list[i]
pickle.dump(gmm_list[gidx], open('gmm/'+str(i), 'w'))
os.chmod("iter_gmm_list", S_IRUSR | S_IWUSR | S_IXUSR | \
S_IRGRP | S_IXGRP | \
S_IROTH | S_IXOTH )
import mrjob.util as util
util.tar_and_gzip('gmm', 'gmm.tgz')
input = []
l = len(iteration_bic_list)
for gmm1idx in range(l):
for gmm2idx in range(gmm1idx+1, l):
gidx1, didx1 = iteration_bic_list[gmm1idx]
gidx2, didx2 = iteration_bic_list[gmm2idx]
an_item = protocol().write((gmm1idx,gmm2idx),(didx1, didx2, em_iters))
input.append(an_item+"\n")
mr_args = ['-v', '-r', 'hadoop','--input-protocol', 'pickle','--output-protocol','pickle','--protocol','pickle']
job = AllPairsBicScoreMRJob(args=mr_args).sandbox(stdin=input)
runner = job.make_runner()
runner.run()
kv_pairs = map(job.parse_output_line, runner.stream_output())
assert len(kv_pairs) == 1
merged_tuple_indices, best_score = kv_pairs[0][1]
# Re-merge the GMM pair with the highest score *here*, otherwise the next
# segment_majority_vote will crash (issue with data ownership). If we don't
# find a different workaround, we can simplify more the mapper and the reducer.
# Essentially, we can avoid moving from mappers to the reducer the GMM pairs and
# merged GMMs. Instead, we can move just indices and scores.
# However, this re-merging is serialized...
ind1, ind2 = merged_tuple_indices
gidx1, idx1 = iteration_bic_list[ind1]
gidx2, idx2 = iteration_bic_list[ind2]
d1 = tools.get_data_from_indices(X, idx1)
d2 = tools.get_data_from_indices(X, idx2)
data = np.concatenate((d1,d2))
g1 = gmm_list[gidx1]
g2 = gmm_list[gidx2]
new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
return new_gmm, (g1, g2), merged_tuple_indices, best_score
def all_pairs_BIC_serial(self, iter_bic_list, em_iters, X, gmm_list):
"""
Computes the BIC score for all pairs in a "serial" way and returns
the pair with the best score
"""
#print "Serial execution"
l = len(iter_bic_list)
best_merged_gmm = None
best_BIC_score = 0.0
merged_tuple = None
merged_tuple_indices = None
for gmm1idx in range(l):
for gmm2idx in range(gmm1idx+1, l):
score = 0.0
gidx1, idx1 = iter_bic_list[gmm1idx]
gidx2, idx2 = iter_bic_list[gmm2idx]
d1 = tools.get_data_from_indices(X, idx1)
d2 = tools.get_data_from_indices(X, idx2)
data = np.concatenate((d1, d2))
g1 = gmm_list[gidx1]
g2 = gmm_list[gidx2]
new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
if score > best_BIC_score:
best_merged_gmm = new_gmm
merged_tuple = (g1, g2)
merged_tuple_indices = (gmm1idx, gmm2idx)
best_BIC_score = score
return best_merged_gmm, merged_tuple, merged_tuple_indices, best_BIC_score
# this appears to be necessary because this script will be called as __main__ on
# every worker node.
if __name__ == '__main__':
AllPairsBicScoreMRJob().run()