def run_time_sync_master(group): pts_group = group + '-time_sync-v1' # the time source in the example is python time.time you can change this. # replace with an implementation that give your custom time in floating sec. clock_service = Clock_Sync_Master(time) # This example is a clock service only, not a clock follower. # Therefore the rank is designed to always trump all others. rank = 1000 discovery = Pyre('pupil-helper-service') discovery.join(pts_group) discovery.start() logger.info('Joining "{}" group with rank {}'.format(pts_group, rank)) def announce_clock_service_info(): discovery.shout(pts_group, [repr(rank).encode(), repr(clock_service.port).encode()]) try: for event in discovery.events(): if event.type == 'JOIN' and event.group == pts_group: logger.info('"{}" joined "{}" group. Announcing service.'.format(event.peer_name, pts_group)) announce_clock_service_info() except KeyboardInterrupt: pass finally: logger.info('Leaving "{}" group'.format(pts_group)) discovery.leave(pts_group) discovery.stop() clock_service.stop()
def run_time_sync_master(group): pts_group = group + '-time_sync-v1' # the time source in the example is python time.time you can change this. # replace with an implementation that give your custom time in floating sec. clock_service = Clock_Sync_Master(time) # This example is a clock service only, not a clock follower. # Therefore the rank is designed to always trump all others. rank = 1000 discovery = Pyre('pupil-helper-service') discovery.join(pts_group) discovery.start() logger.info('Joining "{}" group with rank {}'.format(pts_group, rank)) def announce_clock_service_info(): discovery.shout( pts_group, [repr(rank).encode(), repr(clock_service.port).encode()]) try: for event in discovery.events(): if event.type == 'JOIN' and event.group == pts_group: logger.info( '"{}" joined "{}" group. Announcing service.'.format( event.peer_name, pts_group)) announce_clock_service_info() except KeyboardInterrupt: pass finally: logger.info('Leaving "{}" group'.format(pts_group)) discovery.leave(pts_group) discovery.stop() clock_service.stop()
def run_time_sync_follower(time_fn, jump_fn, slew_fn, group): """Main follower logic""" # Start Pyre node and find clock services in `pts_group` pts_group = group + '-time_sync-v1' discovery = Pyre('pupil-helper-follower') discovery.join(pts_group) discovery.start() logger.info('Joining "{}" group'.format(pts_group)) # The leaderboard keeps track of all clock services # and is used to determine the clock master leaderboard = [] follower_service = None def update_leaderboard(uuid, name, rank, port): """Add or update an existing clock service on the leaderboard""" for cs in leaderboard: if cs.uuid == uuid: if (cs.rank != rank) or (cs.port != port): remove_from_leaderboard(cs.uuid) break else: # no changes. Just leave as is return # clock service was not encountered before or has changed adding it to leaderboard cs = Clock_Service(uuid, name, rank, port) heappush(leaderboard, cs) logger.debug('<{}> added'.format(cs)) def remove_from_leaderboard(uuid): """Remove an existing clock service from the leaderboard""" for cs in leaderboard: if cs.uuid == uuid: leaderboard.remove(cs) logger.debug('<{}> removed'.format(cs)) break def evaluate_leaderboard(follower_service): """ Starts/changes/stops the time follower service according to who the current clock master is. """ if not leaderboard: logger.debug("nobody on the leader board.") if follower_service is not None: follower_service.terminate() return None current_leader = leaderboard[0] leader_ep = discovery.peer_address(current_leader.uuid) leader_addr = urlparse(leader_ep).netloc.split(':')[0] logger.info('Following <{}>'.format(current_leader)) if follower_service is None: # make new follower follower_service = Clock_Sync_Follower(leader_addr, port=current_leader.port, interval=10, time_fn=time_fn, jump_fn=jump_fn, slew_fn=slew_fn) else: # update follower_service follower_service.host = leader_addr follower_service.port = current_leader.port return follower_service try: # wait for the next Pyre event for event in discovery.events(): if event.type == 'SHOUT': # clock service announcement # ill-formatted messages will be dropped try: update_leaderboard(event.peer_uuid, event.peer_name, float(event.msg[0]), int(event.msg[1])) except Exception as e: logger.debug('Garbage raised `{}` -- dropping.'.format(e)) follower_service = evaluate_leaderboard(follower_service) elif ((event.type == 'LEAVE' and event.group == pts_group) or event.type == 'EXIT'): remove_from_leaderboard(event.peer_uuid) follower_service = evaluate_leaderboard(follower_service) except KeyboardInterrupt: pass finally: discovery.leave(pts_group) discovery.stop() if follower_service is not None: follower_service.terminate()
class Agent: """ A class object that represents each app in the network""" def __init__(self, name, ctx, group_name, cpu_clock_rate, experiment_name): self.lock = threading.Lock() self.cpu_clock_rate = cpu_clock_rate self.cpu_load = random.random() self.group_name = group_name self.routing_table = None self.name = name + str(os.getpid()) self.tasks = Queue(-1) self.results = Queue(-1) self.exp_name = experiment_name self.task_duration_no_context = random.random() # compute duration using cpu load, etc self.task_duration_with_context = random.random() #self.weights = 'rnn-model-attention-weights.h5' #self.model = rnn_model() # self.model._make_predict_function() # self.model.load_weights(self.weights) self.agent = Pyre( name=self.name, ctx=ctx or zmq.Context.instance()) try: self.agent.join(group_name) self.agent.start() except Exception as err: logger.error(f'>>> Cant start node: {err}', exc_info=True) def routing_table_setter(self, table): self.lock.acquire() try: # create an ascending round robin routing principle self.routing_table = cycle( sorted(table.items(), key=lambda x: x[1])) finally: self.lock.release() def add_task(self): """populates the task queue with new data for inference""" logger.debug(f'>>> {threading.current_thread().name} started') self.data = cycle(load_data(self.exp_name, 0)) count = 0 while count < 100: task_dict = dict.fromkeys( ['input', 'target', 'task-type', 'task-uuid', 'task-owner-name', 'result', 'duration'], 0) try: input_data, target_data = next(self.data) task_dict['input'] = input_data task_dict['target'] = target_data task_dict['task-type'] = 1 task_dict['task-uuid'] = self.agent.uuid() task_dict['task-owner-name'] = self.agent.name() task_dict['duration'] = time.time() self.tasks.put(task_dict) count += 1 except Exception as err: logger.error(f'>>> Exception type: {err}', exc_info=True) self.agent.leave(self.group_name) self.agent.stop() # Vary the frequency of input tasks time.sleep(random.randint(1, 8)) def vary_cpu_load(self): logger.debug( f'>>> {threading.current_thread().name} thread started') while True: try: self.lock.acquire() self.cpu_load = random.random() self.lock.release() self.compute_duration_with_context() except Exception as err: logger.error(f'>>> Exception: {err}', exc_info=True) time.sleep(random.randint(10, 40)) def compute_duration_with_context(self): try: self.lock.acquire() cpu_load = self.cpu_load task_duration_no_context = self.task_duration_no_context self.task_duration_with_context = ( 1 / task_duration_no_context) / (cpu_load * self.cpu_clock_rate) self.lock.release() except Exception as identifier: logger.error(f'>>> Exception: {identifier}') def compute_local(self, task): """argument is task""" try: task = task task_data = task['input'] target = task['target'] uuid = task['task-uuid'] #predictions = self.model.predict(task_data, verbose=0) #predictions = predictions.flatten() # flatten the target average = mean(task_data.flatten()) # window = 5 # errors = self.regression_error(predictions, target, window) # mu, variance = np.mean(errors), np.var(errors) # probabilities = self.chebyshev_probability(mu, variance, errors) task['task-type'] = task['task-type'] + 1 if uuid == self.agent.uuid(): # put results in our queue if its our uuid self.results.put(average) self.lock.acquire() self.task_duration_no_context = time.time() - task['duration'] self.lock.release() self.compute_duration_with_context() else: task['result'] = average data_byte = pickle.dumps(task, -1) self.agent.whisper(uuid, data_byte) logger.error( f'>>> Results sent back to task owner peer: {task["task-owner-name"]}') except Exception as identifier: logger.error(f'>>> Exception type: {identifier}', exc_info=True) self.agent.leave(self.group_name) self.agent.stop() # clean up if there are issues. def check_results(self): logger.error(f'>>> {threading.current_thread().name} thread started') while True: try: if not self.results.empty(): result = self.results.get() if result <= 0.25: logger.error( f'>>> Critical anomaly detected: {result}') elif result > 0.25 and result < 0.5: logger.error( f'>>> Severe anomaly detected: {result}') elif result > 0.5 and result < 0.75: logger.error( f'>>> Serious anomaly detected: {result}') else: logger.error(f'>>> Mild anomaly detected: {result}') except Exception as err: logger.error(f'>>> Exception: {err}', exc_info=True) self.agent.leave(self.group_name) self.agent.stop() def outbox(self, task, peer_uuid): try: task = pickle.dumps(task, -1) self.agent.whisper(peer_uuid, task) except Exception as identifier: logger.error(f'>>> Exception: {identifier}',exc_info=True) self.agent.leave(self.group_name) self.agent.stop() def num_of_peers(self, table): seen = [] for peer in table: if peer[0] in seen: return len(seen) else: seen.append(peer[0]) def handle_task(self): # decide if to compute locally or offload logger.error(f'>>> {threading.current_thread().name} thread started') while True: try: if not self.tasks.empty(): task = self.tasks.get() self.lock.acquire() local_duration = self.task_duration_with_context table = self.routing_table if table: peer = next(table) # peer = (uuid, latency) if peer[1] < local_duration: self.outbox(task, peer[0]) logger.debug(f'>>> Task offloaded') else: num_of_peers = self.num_of_peers(table) peer = self.search_table( table, num_of_peers, local_duration) if peer: self.lock.release() self.outbox(task, peer[0]) logger.debug(f'>>> Task offloaded') else: self.compute_local(task) logger.debug(f'>>> Task computed locally') else: self.compute_local(task) logger.debug(f'>>> Task computed locally') except Exception as identifier: logger.error( f'>>> Exception type : {identifier}', exc_info=True) self.agent.leave(self.group_name) self.agent.stop() # stop if there are issues time.sleep(random.randint(0, 3)) def search_table(self, table, num_of_peers, local_dur): for id in range(num_of_peers): peer = next(table) if peer[1] < local_dur: return peer else: return None def inbox(self): logger.error(f'>>> {threading.current_thread().name} thread started') try: events = self.agent.events() # works like charm while True: if events: event = next(events) logger.error(f'>>> MSG TYPE: {event.type}') logger.error(f'>>> Sender Agent Name: {event.peer_name}') if event.type == 'WHISPER': msg = pickle.loads(event.msg[0]) if msg['task-type'] == 2: result = msg['result'] self.results.put(result) elif msg['task-type'] == 1: # peer sent us a task to execute self.tasks.put(msg) elif event.type == 'SHOUT': # message from the Access Point AP msg = pickle.loads(event.msg[0]) if msg['msg-type'] == 'REQUEST': msg['uuid'] = self.agent.uuid() self.lock.acquire() msg['processing-time'] = self.task_duration_with_context self.lock.release() msg_b = pickle.dumps(msg, -1) self.agent.whisper(event.peer_uuid, msg_b) elif msg['msg-type'] == 'UPDATE': table = msg['table'] own_uuid = self.agent.uuid() if own_uuid in table.keys(): # remove our own UUID to avoid offloading to ourselves del table[own_uuid] self.routing_table_setter(table) except Exception as identifier: logger.error(f'>>> Exception type: {identifier}', exc_info=True) self.agent.leave(self.group_name) self.agent.stop() # leave the cluster if you have issues # compute the chebyshev probability def chebyshev_probability(self, average, varianse, error_val): probability = [] for val in error_val: if val - average >= 1: prob = varianse / ((val - average)**2) probability.append(prob) return probability def regression_error(self, outcome, truth, window): n_data = len(truth) count = 0 errors = [] while count + window <= n_data: error = [abs(y_pred - y_truth) for y_pred, y_truth in zip( outcome[count:count + window], truth[count:count + window])] errors.append(np.mean(error)) count += window return errors def run(self): # start the threads here t1 = threading.Thread(target=self.add_task, name='add task') t2 = threading.Thread(target=self.vary_cpu_load, name='vary cpu load') t3 = threading.Thread(target=self.check_results, name='check results') t4 = threading.Thread(target=self.handle_task, name='handle task') t5 = threading.Thread(target=self.inbox, name='inbox') threads = [t1, t2, t3, t4, t5] try: for thread in threads: thread.start() except Exception as err: logger.error(f'>>> Exception: {err}', exc_info=True) self.agent.leave(self.group_name) self.agent.stop()