def worker_reduce(self, args):
    """Fold the keys or the values of one locally stored partition.

    ``args`` is decoded with ``util.pls`` into
    ``(rdd_id, hash_num, func, initializer, target)``.  ``func`` is turned
    back into a callable with ``util.decode_function`` and is used as the
    two-argument reducer; ``initializer`` seeds the fold.

    ``target`` selects what is reduced from ``self.data[(rdd_id, hash_num)]``:
    ``'keys'`` or ``'values'``.  Any other ``target`` yields ``None`` without
    touching ``self.data``.

    Raises ``KeyError`` if the partition is not stored on this worker.
    """
    rdd_id, hash_num, func, initializer, target = util.pls(args)
    func = util.decode_function(func)

    # Unknown targets fall through to an implicit None, exactly as before.
    if target not in ('keys', 'values'):
        return None

    partition = self.data[(rdd_id, hash_num)]
    if target == 'keys':
        items = partition.keys()
    else:
        items = partition.values()
    return reduce(func, items, initializer)
def run_task(self, pickled_args):
    """Compute one partition of an RDD on this worker and store the result.

    ``pickled_args`` is decoded with ``util.pls`` into the tuple
    ``(rdd_id, hash_num, rdd_type, action, data_src, parents, hash_func,
    peers)``:

    * ``rdd_id``, ``hash_num`` -- identify the partition being produced.
    * ``rdd_type`` -- pickled object that is compared against the ``rdd.*``
      types to pick the input-gathering strategy; it also supplies
      ``unserialize_action``.
    * ``action`` -- serialized callable that produces the partition.
    * ``parents`` -- ids of the parent RDDs whose partitions are the input.
    * ``data_src`` -- worker URIs, indexed like ``parents``, to fetch a
      parent partition from when it is not stored locally.
    * ``hash_func`` -- encoded function mapping a key to a partition number.
    * ``peers`` -- worker URIs queried by the ``PartitionByRDD`` strategy.

    Returns the string ``"OK"`` on success.  If an input partition cannot
    be obtained (``socket.timeout``, or ``KeyError`` for the join and
    single-parent strategies), returns the URI of the worker that was being
    queried instead.
    """
    (rdd_id, hash_num, rdd_type, action,
     data_src, parents, hash_func, peers) = util.pls(pickled_args)
    # NOTE(security): pickle.loads executes arbitrary code when fed
    # attacker-controlled bytes; this is only safe if every RPC caller is
    # trusted.
    rdd_type = pickle.loads(rdd_type)
    action = rdd_type.unserialize_action(action)
    hash_func = util.decode_function(hash_func)

    if rdd_type == rdd.JoinRDD:
        working_data = [{}, {}]
        for index in (0, 1):
            parent_uid = parents[index]
            assignment = data_src[index]
            try:
                working_data[index] = self._fetch_partition(
                    (parent_uid, hash_num), assignment)
            except (socket.timeout, KeyError):
                return assignment
        # The join itself runs while the lock is held; its inputs may be the
        # very dicts stored in self.data (see _fetch_partition).
        with self.lock:
            self.data[(rdd_id, hash_num)] = action(working_data[0],
                                                   working_data[1])
        return "OK"

    if rdd_type == rdd.PartitionByRDD:
        # Gather this partition's share from every peer and every parent,
        # merging the values of equal keys into one list.
        working_data = collections.defaultdict(list)
        for peer in peers:
            if peer != self.uri:
                proxy = xmlrpclib.ServerProxy(peer, transport=self.transport)
            else:
                # Our own data: skip the network and query ourselves.
                proxy = self
            for parent_uid in parents:
                key = (parent_uid, hash_num)
                try:
                    queried_data = self.query_remote(key, proxy, {})
                except socket.timeout:
                    return peer
                try:
                    for k, v in queried_data.items():
                        if isinstance(v, list):
                            working_data[k].extend(v)
                        else:
                            working_data[k].append(v)
                except ValueError:
                    print("%s %s" % (key, queried_data))
                    # Bare raise keeps the original traceback.
                    raise
    elif len(parents) > 0:
        # Number of parents should be 1.
        parent_uid = parents[0]
        assignment = data_src[0]
        try:
            working_data = self._fetch_partition((parent_uid, hash_num),
                                                 assignment)
        except (socket.timeout, KeyError):
            return assignment
    else:
        working_data = {}

    output = action(working_data, hash_num)

    if (rdd_type == rdd.IntermediateFlatMapRDD
            or rdd_type == rdd.IntermediateMapRDD):
        # Split output into partial partitions: each key goes to the
        # partition chosen by hash_func.  Each value v should be a list.
        for k, v in output.items():
            key = (rdd_id, hash_func(k))
            with self.lock:
                bucket = self.data[key]
                if k in bucket:
                    bucket[k].extend(v)
                else:
                    bucket[k] = v
    else:
        with self.lock:
            self.data[(rdd_id, hash_num)] = output
    return "OK"


def _fetch_partition(self, key, source):
    """Return partition ``key``, preferring the local copy over ``source``.

    The membership test and the read happen under a single acquisition of
    ``self.lock`` so the partition cannot disappear between the two.  A
    local hit returns the dict stored in ``self.data`` itself, not a copy.

    When the partition is not local it is requested from the worker at URI
    ``source`` through ``self.query_remote``; whatever that call raises
    (the callers handle ``socket.timeout`` and ``KeyError``) propagates.
    """
    with self.lock:
        if key in self.data:
            return self.data[key]
    proxy = xmlrpclib.ServerProxy(source, transport=self.transport)
    return self.query_remote(key, proxy)