# Imports assumed by the snippets in this section.
import math
import time

import numpy as np
import torch
from distributed import (Client, LocalCluster, Pub, Sub, get_client,
                         get_worker)
from sklearn import linear_model


def _init_dask(self):
    if self._cluster is not None:
        print('WARNING: reinitializing dask')
        self._cluster.close()
    if self._client is not None:
        self._client.close()
        del self._dask_futures
    if self.pub_out is not None:
        del self.pub_out
    if self.sub_in is not None:
        del self.sub_in
    self._cluster = LocalCluster(
        n_workers=self.num_processes,
        # silence_logs=0,
        memory_limit=None)
    self._client = Client(self._cluster)
    # always define publishers first, then subscribers
    pub_out = [
        Pub('env{}_input'.format(env_idx))
        for env_idx in range(self.num_processes)
    ]
    self._dask_futures = self._client_map()
    sub_in = Sub('observations')
    self.pub_out = pub_out
    self.sub_in = sub_in
    # wait until all the peers are created
    time.sleep(5)

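# A minimal, runnable sketch of the handshake _init_dask relies on (the name
# `echo_worker` is illustrative, not from the source). It shows why the
# publishers are declared before the subscribers: a put() is only delivered
# to peers that are already subscribed, so the driver creates its Pubs first,
# starts the workers, declares its Sub last, and sleeps before publishing.
def echo_worker(env_idx):
    sub = Sub('env{}_input'.format(env_idx))  # worker-side command channel
    pub = Pub('observations')                 # worker-side output channel
    msg = sub.get(timeout=30)                 # block until a command arrives
    pub.put((env_idx, msg))                   # echo it back to the driver


if __name__ == '__main__':
    cluster = LocalCluster(n_workers=2)
    client = Client(cluster)
    pubs = [Pub('env{}_input'.format(i)) for i in range(2)]  # publishers first
    futures = client.map(echo_worker, range(2))  # keep a reference to futures
    sub = Sub('observations')                    # then the subscriber
    time.sleep(5)                                # let all peers connect
    for pub in pubs:
        pub.put('reset')
    for _ in range(2):
        print(sub.get(timeout=30))
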
def init_workers(env_name, num_processes):
    cluster = LocalCluster(n_workers=num_processes)
    client = Client(cluster)
    # publishers are created before the matching subscribers in run_env
    pubs_reset = [
        Pub('env{}_reset'.format(seed)) for seed in range(num_processes)
    ]
    # keep references to the futures, otherwise dask may release the
    # still-running run_env tasks once they are garbage collected
    futures = client.map(run_env, [env_name] * num_processes,
                         range(num_processes))
    sub_obs = Sub('observations')
    # sleep while the pub/sub channels are initialized
    time.sleep(5)
    return client, pubs_reset, sub_obs, futures

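# Hedged usage sketch for init_workers; the environment id is a placeholder
# (the project's envs come from `mime`), and the sleep inside init_workers
# is what makes the immediate put() below safe.
client, pubs_reset, sub_obs, futures = init_workers('SomeMimeEnv-v0',
                                                    num_processes=4)
for pub in pubs_reset:
    pub.put(0)                                 # ask every env for episode 0
for _ in range(4):
    frames, scalars, seed = sub_obs.get(timeout=60)
    print('received observation from env', seed)
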
def worker2():
    pub_try = Pub('lets')
    sub_try = Sub('receive')
    pub_try.put("I can send...")
    try:
        # a timeout is required here: without one, get() blocks forever and
        # the TimeoutError branch below can never trigger
        s = sub_try.get(timeout=5)
        print("Let's see what I get:", s)
    except TimeoutError:
        print("I CAN'T")
    return

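# worker2 only succeeds if some peer subscribes to 'lets' and publishes on
# 'receive' (the channel pair mirrors the commented-out sub_try/pub_try lines
# in coordinator below). A hedged sketch of that counterpart:
def worker2_peer():
    sub = Sub('lets')
    pub = Pub('receive')
    msg = sub.get(timeout=10)     # receives "I can send..."
    pub.put('ack: ' + msg)        # lets worker2's get() return

# e.g.: client.submit(worker2_peer); client.submit(worker2)
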
def __init__(self, args, obs_running_stats=None):
    self.parse_args(args)
    torch.set_num_threads(1)
    # create variables
    self.env = self.create_env(args['seed'])
    self.pub_out = Pub('observations')
    self.sub_in = Sub('env{}_input'.format(int(self.env_idx)))
    self.step_counter = 0
    self.step_counter_after_new_action = 0
    self.reset_env(reset_mime=False)
    # start the environment loop
    self.env_loop()

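# env_loop itself is not part of this section; the sketch below is purely an
# assumption about its shape, derived from the channels declared in __init__
# (actions arrive on 'env{i}_input', observations fan in on 'observations'):
def env_loop(self):
    for action in self.sub_in:             # block on this env's input channel
        obs, reward, done, info = self.env.step(action)
        self.step_counter += 1
        self.pub_out.put((self.env_idx, obs, reward, done))
        if done:
            self.reset_env(reset_mime=False)
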
def run_env(env_name, seed):
    try:
        import gym
        import mime
        env = gym.make(env_name)
        env.seed(seed)
        pub_obs = Pub('observations')
        sub_reset = Sub('env{}_reset'.format(int(seed)))
        print('seed {} ready'.format(seed))
        # block forever, producing one observation per reset counter
        for counter in sub_reset:
            frames, scalars = get_frames_scalars(env, counter)
            pub_obs.put((frames, scalars, seed))
    except Exception as e:
        print('Exception: {}'.format(e))

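# Note that `for counter in sub_reset` never terminates on its own. If the
# driver ever needs to shut the loop down cleanly, one hedged pattern (the
# None sentinel is an assumption, not part of the original protocol) is:
#
#     for counter in sub_reset:
#         if counter is None:        # sentinel published by the driver
#             break
#         frames, scalars = get_frames_scalars(env, counter)
#         pub_obs.put((frames, scalars, seed))
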
def coordinator(clf, e, n_minibatch, total_workers):
    pub_results = Pub('results')
    pub_init = Pub('Initialize')
    pub_th = Pub('Theta')
    pub_endr = Pub('EndRound')
    pub_endsub = Pub('EndSubRound')
    # pub_ask_state = Pub('AskState')
    sub_incr = Sub('Increment')
    sub_f = Sub('Fs')
    sub_x = Sub('Xs')
    # sub_try = Sub('lets')
    # pub_try = Pub('receive')

    # get increments from the workers
    def get_incr():
        try:
            incr = sub_incr.get(timeout=5)
            print("Coo received increments...", incr)
            if incr < 0:
                # a negative increment is the workers' flag that their
                # chunks have run out
                print("Coo received notice that chunks ended...")
            return incr
        except TimeoutError:
            return 0

    # get the fi's from all workers
    def get_fi(n_workers):
        fis = []
        print("try to get fis, workers:", n_workers)
        for i in range(n_workers):
            try:
                fi = sub_f.get(timeout=5)
                print("Coo received", i + 1, "fi")
                fis.append(fi)
            except TimeoutError:
                print("Fis: lost worker(s), num =", len(fis))
                break
        return fis

    # get the xi's from all workers
    def get_xi(n_workers):
        drifts = []
        print("try to get xi, workers:", n_workers)
        for i in range(n_workers):
            try:
                xi = sub_x.get(timeout=6)
                print("Coo received", i + 1, "xi")
                drifts.append(xi)
            except TimeoutError:
                print("Lost worker(s)")
                break
        print("Num of workers", len(drifts))
        return drifts

    def check_subscribers(pub, n_workers):
        print("Check...")
        if n_workers == 0:
            print("No workers left")
            return "end"
        while len(pub.subscribers) < n_workers:
            # not all workers have subscribed yet, so wait
            time.sleep(0.01)
        print("OK Check")
        return "ok"

    # ____________________ Start coordinator ____________________
    E = None
    th = 0
    fis = 0
    drifts = 0
    sum_xi = 0
    incr = 0
    e_y = 0.01
    workers = []
    time_stamp = 0
    n_rounds = 0
    print("Coo started...")
    client = get_client()
    for i in range(len(total_workers) - 1):
        workers.append(
            client.submit(worker_f, i, clf, n_minibatch, e,
                          workers=total_workers[i + 1]))
    time.sleep(1)
    flag = True  # becomes False to finish the future once the chunks are out
    start_time = time.time()
    while flag:
        n_subs = 0
        workers_status = [w.status for w in workers]
        k = workers_status.count('pending')
        print("NUMBER OF WORKERS...", k)
        if E is None:  # E has no estimate yet, so run a warm-up round
            pub_init.put(None)
            print("Warmup... Coo sent E=None...")
            drifts = get_xi(k)  # get the local drifts (Xi's)
            print("Coo received xi's... workers =", k)
            sum_xi = add_x(drifts)
            e1 = sum_xi[0] / len(drifts)
            e2 = sum_xi[1] / len(drifts)
            E = [e1, e2]
            pub_init.put(E)
            print("Coo sent E")
        else:
            pub_init.put(E)
            print("Coo sent E")
        n_rounds += 1
        y = k * f([[0], 0], E, e)
        barrier = e_y * k * f([[0], 0], E, e)
        # start of the round...
        print("START ROUND:", n_rounds, " workers ", k)
        while y <= barrier:
            th = -y / (2 * k)
            pub_th.put(th)  # send theta
            print("Coo sent theta")
            n_subs += 1
            print("START SUBROUND:", n_subs, " workers ", k)
            c = 0
            fis = []
            # start of the subround...
            while c < k:
                incr = get_incr()  # get the increments
                if incr < 0:  # a worker signalled that its chunks are out
                    incr = 0
                    workers_status = [w.status for w in workers]
                    k = workers_status.count('pending')
                    if k == 0:
                        flag = False
                c = c + incr
            # subround ended...
            pub_endsub.put(0)  # let the workers know the subround ended
            print("Coo sent end of subround... num_workers", k)
            workers_status = [w.status for w in workers]
            k = workers_status.count('pending')
            fis = get_fi(k)  # get the f(Xi)'s from the workers
            if len(fis) == 0:
                pub_endr.put(0)
                break
            print("Coo received fi's, workers =", k)
            y = add_f(fis)
            print("y", y)
            if not flag:  # chunks are out, so end the future
                print("Coo sent end of subround...")
                break
        # round ended...
        pub_endr.put(0)  # let the workers know the round ended
        print("Coo sent end of round... num_workers", k)
        drifts = get_xi(len(fis))  # get the local drifts (Xi's)
        print("len of drifts", len(drifts))
        print("Coo received xi's, workers =", k)
        if len(drifts) == 0:
            break
        sum_xi = add_x(drifts)
        e1 = E[0] + (sum_xi[0] / len(drifts))
        e2 = E[1] + (sum_xi[1] / len(drifts))
        E = [e1, e2]
        time_stamp = time.time() - start_time
        pub_results.put([E, n_subs, k, time_stamp])
        if not flag:
            break
    print("Coo ended...")
    return E, n_rounds, n_subs, k, time_stamp

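# coordinator and worker_f call f, add_x and add_f, which are not shown in
# this section. The stand-ins below are assumptions that merely match how the
# helpers are invoked ([coef, intercept] pairs in, a pair or scalar out); the
# real monitoring function f comes from the project's safe-zone condition.
def add_x(drifts):
    # element-wise sum of [coef, intercept] drift pairs
    coef = np.sum([d[0] for d in drifts], axis=0)
    intercept = sum(d[1] for d in drifts)
    return [coef, intercept]


def add_f(fis):
    # sum of the workers' local f(Xi) values
    return sum(fis)


def f(X, E, e):
    # illustrative safe-zone function: how far the drift X is from the
    # e-ball around the estimate E (negative while inside the safe zone)
    drift = np.linalg.norm(np.append(X[0], X[1]))
    estimate = np.linalg.norm(np.append(E[0], E[1]))
    return drift - e * estimate
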
def main(client, w, new, dataset_params, e, chunks, n_minibatch):
    print("-------------------------------------------------------------\n")
    print("Start with num of chunks:", chunks, ", e:", e)
    E = None
    Acc = []
    worker = []
    n_rounds = 0
    time_stamps = []
    total_time = []
    total_acc = []
    sub_results = Sub('results')
    # make a dataset and save the training X and y;
    # pass the sample count and feature count
    if new == 'yes':
        create_dataset(dataset_params, chunks, w)
    X_test = np.load("np_arrays/X_test.npy")
    y_test = np.load("np_arrays/y_test.npy")
    size = dataset_params["n_samples"] - len(X_test)
    print("Minibatch size:", size / (chunks * n_minibatch))
    clf = linear_model.SGDClassifier(shuffle=False)
    clf_results = [clf] * (len(w) - 1)
    start_time = time.time()
    start_rounds = 0
    for p in range(1):
        s_run_time = time.time()
        random_assign(len(w) - 1, chunks)
        # for i in range(2):
        #     worker.append(client.submit(worker_f, i, clf_results[i],
        #                                 n_minibatch, e, workers=w[i + 1]))
        # TAG changed the call of random_assign + added a for loop +
        # return the clf and feed it again + print after finishing
        coo = client.submit(coordinator, clf, e, n_minibatch, w, workers=w[0])
        print("In progress...")
        # acc = pred(E, clf, X_test, y_test)
        # Acc.append(acc)
        while coo.status == 'pending':
            try:
                results = sub_results.get(timeout=0.01)
                E = results[0]
                time_stamps.append(results[3])
                print(results[1:])
                acc = pred(E, clf, X_test, y_test)
                Acc.append(acc)
            except TimeoutError:
                continue
        # return
        # t_run_time = time.time()
        # start_rounds = n_rounds
        # print("End of chunks...")
        # status_l = [w.status for w in worker]
        # clf_results = [x.result() for x in worker]
        # print("Coordinator:", coo.status, "...\nWorkers:", status_l)
        # if check_coo(coo) == "ok":
        #     print("Finished", p, "pass with no error...\n\n")
        # del coo
        # for f in worker: del f
        # worker = []
        # total_time.append(t_run_time - s_run_time)
        # total_acc.append(Acc[-1])
        # name1 = "np_arrays/total_time" + str(len(w) - 1)
        # name2 = "np_arrays/total_acc" + str(len(w) - 1)
        # time.sleep(5)
        # np.save(name1, total_time)
        # np.save(name2, total_acc)
    return Acc, len(Acc), time_stamps

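# Hedged driver sketch for main; the scheduler address, the worker names in
# w (w[0] hosts the coordinator, w[1:] host the workers) and any
# dataset_params keys other than 'n_samples' are placeholders.
if __name__ == '__main__':
    client = Client('tcp://127.0.0.1:8786')    # assumes a running scheduler
    w = ['w0', 'w1', 'w2']                     # dask worker names/addresses
    dataset_params = {'n_samples': 10000, 'n_features': 20}
    Acc, n_points, time_stamps = main(client, w, 'yes', dataset_params,
                                      e=0.5, chunks=10, n_minibatch=4)
    print('final accuracy:', Acc[-1] if Acc else None)
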
def worker_f(name, clf, parts, e):
    sub_init = Sub('Initialize')
    sub_th = Sub('Theta')
    sub_endr = Sub('EndRound')
    sub_endsub = Sub('EndSubRound')
    pub_incr = Pub('Increment')
    pub_f = Pub('Fs')
    pub_x = Pub('Xs')

    # get the initial E value from the coordinator
    def get_init():
        w_id = get_worker().name
        try:
            print(w_id, "waits to receive E...")
            init = sub_init.get(timeout=20)
            print(w_id, "received E")
            return init
        except TimeoutError:
            print(w_id, "error: E not received")
            return False

    # get theta from the coordinator
    def get_th():
        w_id = get_worker().name
        try:
            print(w_id, "waits to receive th...")
            th = sub_th.get(timeout=1)
            print(w_id, "received theta")
            return th
        except TimeoutError:
            print(w_id, "theta acknowledgment not received")
            return None

    # get the acknowledgment to continue or stop the rounds
    def get_endr():
        try:
            endr = sub_endr.get(timeout=1)
            print(w_id, "end of round received")
            return endr
        except TimeoutError:
            return None

    # get the acknowledgment to continue or stop the subrounds
    def get_endsub():
        try:
            endsub = sub_endsub.get(timeout=1)
            print(w_id, "end of subround received")
            return endsub
        except TimeoutError:
            return None

    # ____ Start of worker ____
    th = 0
    w_id = get_worker().name  # get the worker id
    print("worker", w_id, "started...")
    flag = True
    E = [[0], 0]
    Si = [0, 0]
    S_prev = [0, 0]
    Xi = [[0], 0]
    count_chunks = 0
    minibatches = 0
    # TAG chunks assigned; load the first one
    # get the array with the chunk names assigned to this worker
    X_chunk_array, y_chunk_array = load_chunks(name)
    X_chunk, y_chunk = load_np(X_chunk_array, y_chunk_array, count_chunks)
    count_chunks += 1
    while flag:  # while this flag stays true there are chunks left
        E = get_init()  # get E from the coordinator
        if E is False:
            pub_incr.put(-1)
            return clf
        if E is None:  # warm-up: compute Xi and send it back to update E
            # TODO make it prettier
            print(w_id, "Warmup...")
            temp = get_minibatch(X_chunk, y_chunk, minibatches, parts)
            # get_newSi(count_chunks, f_name)
            if temp is None:
                minibatches = 0
                load = load_np(X_chunk_array, y_chunk_array, count_chunks)
                if load is None:
                    print(w_id, "end of chunks")
                    flag = False
                    pub_incr.put(-1)
                    break
                X_chunk, y_chunk = load
                count_chunks += 1
                temp = get_minibatch(X_chunk, y_chunk, minibatches, parts)
            minibatches += 1
            X, y = temp
            clf.partial_fit(X, y, np.unique([0, 1]))
            Si = [clf.coef_[0], clf.intercept_[0]]
            Xi = [clf.coef_[0], clf.intercept_[0]]
            while len(pub_x.subscribers) != 1:
                time.sleep(0.01)
            pub_x.put(Xi)
            print(w_id, "sent Xi")
            E = get_init()  # get the updated E from the coordinator
            if E is False:
                pub_incr.put(-1)
                break
        print(w_id, "start of round")
        clf.coef_[0] = E[0]
        clf.intercept_[0] = E[1]
        S_prev[0] = np.array(list(E[0]))
        S_prev[1] = E[1]
        Xi = [[0], 0]
        # beginning of round...
        # FIXME do not send a message every time & check rounds and subrounds
        while get_endr() is None:
            ci = 0
            # Xi = [[0], 0]
            th = get_th()  # get theta
            if th is None:
                print(w_id, "theta not received")
                continue
            print(w_id, "received start of subround")
            # beginning of subround...
            while get_endsub() is None:
                zi = f(Xi, E, e)
                temp = get_minibatch(X_chunk, y_chunk, minibatches, parts)
                while temp is None:
                    load = load_np(X_chunk_array, y_chunk_array, count_chunks)
                    if load is None:
                        print(w_id, "end of chunks")
                        flag = False
                        break
                    X_chunk, y_chunk = load
                    count_chunks += 1
                    minibatches = 0
                    temp = get_minibatch(X_chunk, y_chunk, minibatches, parts)
                if not flag:
                    break
                minibatches += 1
                X, y = temp
                clf.partial_fit(X, y, np.unique([0, 1]))
                Si[0] = clf.coef_[0]
                Si[1] = clf.intercept_[0]
                Xi = [Si[0] - S_prev[0], Si[1] - S_prev[1]]
                c_th = 0
                if th != 0:  # avoid division by zero when th == 0
                    c_th = (f(Xi, E, e) - zi) / th
                ci_new = max(ci, math.floor(c_th))
                if ci != ci_new:
                    # we detected a difference, so send it to the coordinator
                    incr = ci_new - ci
                    pub_incr.put(incr)
                    ci = ci_new
                    print(w_id, "sent...", incr)
            # end of subround...
            while len(pub_f.subscribers) != 1:
                time.sleep(0.01)
            pub_f.put(f(Xi, E, e))
            print(w_id, "sent fi")
            print(w_id, "end of subround")
            if not flag:
                break
        if all(v == 0 for v in Xi[0]):
            print(w_id, "ZERO XI")
        else:
            pub_x.put(Xi)  # send Xi
            print(w_id, "sent Xi")
        Xi = [[0], 0]
        if not flag:
            break
    # pub_incr.put(-1)
    print(w_id, "ended...")
    return clf

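# worker_f also depends on load_chunks, load_np and get_minibatch, which are
# defined elsewhere. A stand-in for get_minibatch that matches its call sites
# (returns None once the current chunk is exhausted; the equal-slice policy
# is an assumption):
def get_minibatch(X_chunk, y_chunk, i, parts):
    if i >= parts:
        return None                    # chunk exhausted, caller loads the next
    size = len(X_chunk) // parts
    return (X_chunk[i * size:(i + 1) * size],
            y_chunk[i * size:(i + 1) * size])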