def forkProcessFunction(matrix1, matrix2, result, n):
    """Multiply two matrices by fanning row slices out to worker processes.

    The ``n`` rows are cut into ``NUM_OF_PROCESS`` contiguous slices of
    ``int(n / NUM_OF_PROCESS)`` rows; the last slice additionally takes the
    ``n % NUM_OF_PROCESS`` leftover rows.  One ``Process`` running
    ``multiplyParallelMatrix(start, stop, matrix1, matrix2, result)`` is
    started per slice and all of them are joined before returning.

    Returns the difference in ``ru_utime`` reported by
    ``resource_usage(RUSAGE_SELF)`` before the workers are created and after
    they have all been joined.
    """
    global NUM_OF_PROCESS

    workers = []
    rows_per_worker = int(n / NUM_OF_PROCESS)
    # Rows that do not divide evenly among the workers.
    leftover = n % NUM_OF_PROCESS

    usage_before = resource_usage(RUSAGE_SELF)

    for index in range(NUM_OF_PROCESS):
        first_row = rows_per_worker * index
        stop_row = rows_per_worker * (index + 1)
        # Only the final worker absorbs the leftover rows (a no-op when the
        # division is exact).
        if leftover != 0 and index == NUM_OF_PROCESS - 1:
            stop_row = stop_row + leftover
        worker = Process(
            target=multiplyParallelMatrix,
            args=(int(first_row), int(stop_row), matrix1, matrix2, result),
        )
        workers.append(worker)
        worker.start()

    for worker in workers:
        worker.join()

    usage_after = resource_usage(RUSAGE_SELF)
    return usage_after.ru_utime - usage_before.ru_utime
def wrapper(*args, **kwargs):
    """Run the enclosed ``func`` and print its real/sys/user timings.

    ``func`` comes from the enclosing scope.  Its return value is discarded;
    this wrapper always returns ``True``.
    """
    wall_before = timestamp()
    usage_before = resource_usage(RUSAGE_SELF)

    func(*args, **kwargs)

    usage_after = resource_usage(RUSAGE_SELF)
    wall_after = timestamp()

    timings = {
        'real': wall_after - wall_before,
        'sys': usage_after.ru_stime - usage_before.ru_stime,
        'user': usage_after.ru_utime - usage_before.ru_utime,
    }
    # The outer key is a runtime string ("elapsed time") and is printed as-is.
    print({'消耗时间': timings})
    return True
def train_with_sparsebm(
    dataset_file,
    graph,
    nb_row_clusters,
    nb_column_clusters,
    row_clusters_index,
    column_clusters_index,
    use_gpu=False,
    gpu_index=None,
):
    """Fit an ``LBM`` model on ``graph``, time the fit and pickle the results.

    The result file lives in ``results_folder`` and is named after the base
    name of ``dataset_file`` with a ``_sp.pkl`` suffix (``_sp_gpu.pkl`` when
    ``use_gpu`` is true).  If that file is already among the ``*.pkl`` files
    in ``results_folder`` the run is skipped.

    Returns:
        ``None`` when the result file already exists, otherwise the dict of
        results that was written to disk (model sizes, ICL, CARI score and
        the real/sys/user timings of ``model.fit``).
    """
    results_files_already_done = glob.glob(results_folder + "*.pkl")

    dataset_name = dataset_file.split("/")[-1].split(".")[0]
    suffix = "_sp_gpu.pkl" if use_gpu else "_sp.pkl"
    save_f = results_folder + dataset_name + suffix

    if save_f in results_files_already_done:
        print("Already Done")
        return None

    model = LBM(
        nb_row_clusters,
        nb_column_clusters,
        n_init=100,
        n_iter_early_stop=10,
        n_init_total_run=1,
        max_iter=5000,
        verbosity=1,
        use_gpu=use_gpu,
        gpu_index=gpu_index,
    )

    # Only the fit itself is inside the timed window.
    start_time, start_resources = timestamp(), resource_usage(RUSAGE_SELF)
    model.fit(graph)
    end_resources, end_time = resource_usage(RUSAGE_SELF), timestamp()

    co_ari = CARI(
        row_clusters_index,
        column_clusters_index,
        model.row_labels,
        model.column_labels,
    )
    icl = model.get_ICL()

    results = {
        "lib": "sparsebm",
        "gpu": use_gpu,
        "n1": graph.shape[0],
        "n2": graph.shape[1],
        "nq": nb_row_clusters,
        "nl": nb_column_clusters,
        "dataset_file": dataset_file,
        "icl": icl,
        "cari": co_ari,
        "real": end_time - start_time,
        "sys": end_resources.ru_stime - start_resources.ru_stime,
        "user": end_resources.ru_utime - start_resources.ru_utime,
    }
    print(f'SparseBM tt time {results["user"]+results["sys"]}')

    # Context manager so the handle is flushed and closed even if dump raises.
    with open(save_f, "wb") as out_file:
        pickle.dump(results, out_file)
    return results
def PowerModuloT(a, b, c):
    """Call ``PowerModulo(a, b, c)`` and flag the call if it was slow.

    The user CPU time of the call is measured with
    ``resource_usage(RUSAGE_SELF)``.  When it exceeds 0.001 s a message is
    printed and the module-level ``slow`` flag is set to ``True``.

    Returns whatever ``PowerModulo`` returned.
    """
    global slow
    r1 = resource_usage(RUSAGE_SELF)
    ret = PowerModulo(a, b, c)
    r2 = resource_usage(RUSAGE_SELF)
    t = r2.ru_utime - r1.ru_utime
    if t > 0.001:
        # Parenthesised single-argument form: valid on Python 2 and Python 3
        # and prints the same text on both.
        print("PowerModulo(%d,%d,%d) is slow! %.4fs" % (a, b, c, t))
        slow = True
    return ret
def wrappedMethod(*args, **kwargs):
    """Call the enclosed ``function``, print its timings, return its result.

    ``function`` comes from the enclosing scope.  Wall-clock time plus the
    system and user CPU time of the call are printed after a header line
    naming the function.
    """
    from time import time as timestamp
    from resource import getrusage as resource_usage, RUSAGE_SELF

    wall_before = timestamp()
    usage_before = resource_usage(RUSAGE_SELF)

    outcome = function(*args, **kwargs)

    usage_after = resource_usage(RUSAGE_SELF)
    wall_after = timestamp()

    results = {
        'real': wall_after - wall_before,
        'sys': usage_after.ru_stime - usage_before.ru_stime,
        'user': usage_after.ru_utime - usage_before.ru_utime,
    }
    print("Execution time for {0}".format(function.__name__))
    print(results)
    return outcome
def multiplyMatrix(matrix1, matrix2, result):
    """Accumulate ``matrix1 x matrix2`` into ``result`` and time it.

    The matrix dimension is taken from the module-level ``n`` (it is not a
    parameter).  ``result`` is updated in place with ``+=``, so it is
    expected to hold the starting values of the accumulation.

    Returns the user CPU time (``ru_utime`` delta of ``RUSAGE_SELF``) spent
    in the triple loop.
    """
    usage_before = resource_usage(RUSAGE_SELF)

    for row in range(n):            # rows of matrix1
        for col in range(n):        # columns of matrix2
            for inner in range(n):  # shared dimension
                result[row][col] += matrix1[row][inner] * matrix2[inner][col]

    usage_after = resource_usage(RUSAGE_SELF)
    return usage_after.ru_utime - usage_before.ru_utime
def unix_time(function, args=tuple(), kwargs=None):
    '''Return `real`, `sys` and `user` elapsed time, like UNIX's command `time`

    You can calculate the amount of CPU time used by your function/callable
    by summing `user` and `sys`. `real` is just like the wall clock.

    Note that `sys` and `user`'s resolutions are limited by the resolution
    of the operating system's software clock (check `man 7 time` for more
    details).

    Parameters:
        function: the callable to time; its return value is discarded.
        args: positional arguments passed to `function`.
        kwargs: keyword arguments passed to `function`; `None` (the default)
            means no keyword arguments.

    Returns:
        A dict with the float entries `real`, `sys` and `user`, in seconds.
    '''
    # None sentinel instead of a shared mutable ``{}`` default.
    if kwargs is None:
        kwargs = {}
    start_time, start_resources = timestamp(), resource_usage(RUSAGE_SELF)
    function(*args, **kwargs)
    end_resources, end_time = resource_usage(RUSAGE_SELF), timestamp()

    return {'real': end_time - start_time,
            'sys': end_resources.ru_stime - start_resources.ru_stime,
            'user': end_resources.ru_utime - start_resources.ru_utime}
def show_progress(epoch, feed_dict_train, feed_dict_validate, val_loss):
    """Log and print accuracy and elapsed-time figures for one epoch.

    Training and validation accuracy are evaluated through the module-level
    ``session`` / ``accuracy``.  Elapsed wall-clock, user and system times
    are measured against the module-level ``start_time`` and
    ``start_resources``.  The resulting line is appended to the file named
    by ``trial_name`` and also printed.

    Parameters:
        epoch: zero-based epoch index (reported as ``epoch + 1``).
        feed_dict_train: feed dict used to evaluate training accuracy.
        feed_dict_validate: feed dict used to evaluate validation accuracy.
        val_loss: validation loss to report.
    """
    acc = session.run(accuracy, feed_dict=feed_dict_train)
    val_acc = session.run(accuracy, feed_dict=feed_dict_validate)
    msg = "Training Epoch {0} --- Training Accuracy: {1:>6.1%}, Validation Accuracy: {2:>6.1%}, Validation Loss: {3:.3f}, Real Time:{4:.4f}, CPU Time:{5:.4f}, SYS Time:{6:.4f}"

    end_resources = resource_usage(RUSAGE_SELF)
    # Format once so the log file and the console show the same timings.
    line = msg.format(
        epoch + 1,
        acc,
        val_acc,
        val_loss,
        time() - start_time,
        end_resources.ru_utime - start_resources.ru_utime,
        end_resources.ru_stime - start_resources.ru_stime,
    )

    # Context manager closes the log even if the write fails.
    with open(trial_name, "a") as log:
        log.write(line)
        log.write("\n")
    print(line)
def threadFunction():
    """Time the threaded matrix multiplication with 2, 3 and 4 threads.

    For each thread count the module-level ``n`` rows are cut into
    contiguous slices of ``int(n / count)`` rows, with the last slice also
    taking the ``n % count`` leftover rows.  One ``Thread`` running
    ``multiplyParallelMatrix(start, stop)`` is started per slice and all are
    joined.

    Returns a list with one ``ru_utime`` delta (``RUSAGE_SELF``) per thread
    count, in the order 2, 3, 4.
    """
    global n
    cpu_times = []

    for thread_count in range(2, 5):
        workers = []
        rows_per_thread = int(n / thread_count)
        # Rows that do not divide evenly among the threads.
        leftover = n % thread_count

        usage_before = resource_usage(RUSAGE_SELF)

        for index in range(thread_count):
            first_row = rows_per_thread * index
            stop_row = rows_per_thread * (index + 1)
            # Only the final thread absorbs the leftover rows.
            if leftover != 0 and index == thread_count - 1:
                stop_row = stop_row + leftover
            worker = Thread(
                target=multiplyParallelMatrix,
                args=(int(first_row), int(stop_row)),
            )
            workers.append(worker)
            worker.start()

        for worker in workers:
            worker.join()

        usage_after = resource_usage(RUSAGE_SELF)
        cpu_times.append(usage_after.ru_utime - usage_before.ru_utime)

    return cpu_times
def run(func, *args, **kwds):
    """Call ``func(*args, **kwds)`` and measure how long it took.

    Returns:
        A 5-tuple ``(value, total_time, process_time, sys_self, user_self)``:
        the return value of ``func``, the ``time.perf_counter`` delta, the
        ``time.process_time`` delta, and the ``ru_stime`` / ``ru_utime``
        deltas of ``RUSAGE_SELF`` around the call.
    """
    # The child-process usage snapshots were never read, so they are not
    # taken: they only added syscalls inside the self-usage window.
    start_self = resource_usage(RUSAGE_SELF)
    start1 = time.perf_counter()
    start2 = time.process_time()

    value = func(*args, **kwds)

    end2 = time.process_time()
    end1 = time.perf_counter()
    end_self = resource_usage(RUSAGE_SELF)

    total_time = end1 - start1
    process_time = end2 - start2
    sys_self = end_self.ru_stime - start_self.ru_stime
    user_self = end_self.ru_utime - start_self.ru_utime
    return value, total_time, process_time, sys_self, user_self
def main(self,rank):
    """Run one rank of the training / evaluation / forward job.

    Reads the run configuration with ``read_conf()``, selects a CUDA device,
    loads the feature chunks with ``load_chunk``, builds the network chosen
    by ``options.NN_type`` and then either

    * runs a single pass over the data when ``do_forward`` is set, or
    * runs ``num_epoch`` (24) epochs, each preceded (from the second epoch
      on) by a pass over the dev set.

    CPU-time counters (``ru_utime`` deltas of ``RUSAGE_SELF``) are kept for
    the individual phases and appended to a ``.info`` file next to
    ``out_file`` at the end.

    Parameters:
        rank: rank of this process; compared after ``int(rank)``.

    NOTE(review): this block was reconstructed from whitespace-collapsed
    text, so the nesting of a few statements is a best guess -- confirm
    against the original file before relying on it.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3"
    options=read_conf()
    # Mode switches arrive as "0"/"1" style values.
    do_training=bool(int(options.do_training))
    do_eval=bool(int(options.do_eval))
    do_forward=bool(int(options.do_forward))
    if do_forward:
        torch.cuda.set_device(0)
        device = "cuda:{}".format(0)
    else:
        # Device index is the distributed rank minus one.
        torch.cuda.set_device(dist.get_rank()-1)
        device = "cuda:{}".format(dist.get_rank()-1)
    PS = Parameter_Server()
    if int(rank)==0 and do_training:
        PS.ps_server(rank)
    # Positional command-line arguments; argv[2] is not read here.
    port = sys.argv[1]
    world_size = sys.argv[3]
    ip_add = sys.argv[4]
    fea_scp=options.fea_scp
    fea_opts=options.fea_opts
    lab_folder=options.lab_folder
    lab_opts=options.lab_opts
    # Dev-set locations are hard-coded rather than read from the options.
    dev_fea_scp="/home/slave3/kaldi/egs/timit/s5/pytorch-kaldi/exp/mfcc_shu/dev_split.000"
    dev_fea_opts="apply-cmvn --utt2spk=ark:$KALDI_ROOT/egs/timit/s5/data/dev/utt2spk ark:$PYTORCH_EXP/mfcc_shu/dev_cmvn_speaker.ark ark:- ark:- | add-deltas --delta-order=2 ark:- ark:- |"
    dev_lab_folder='/home/slave3/kaldi/egs/timit/s5/exp/dnn4_pretrain-dbn_dnn_ali_dev'
    dev_lab_opts='ali-to-pdf'
    out_file=options.out_file
    count_file=options.count_file
    pt_file=options.pt_file
    left=int(options.cw_left)
    right=int(options.cw_right)
    seed=int(options.seed)
    use_cuda=bool(int(options.use_cuda))
    multi_gpu=bool(int(options.multi_gpu))
    NN_type=options.NN_type
    batch_size=int(options.batch_size)
    lr=float(options.lr)
    save_gpumem=int(options.save_gpumem)
    opt=options.optimizer
    # Pick the network class; ``rnn`` records whether it is a recurrent model.
    # No branch handles any other NN_type, so ``ann``/``rnn`` stay unbound then.
    if NN_type=='RNN':
        from neural_nets import RNN as ann
        rnn=1
    if NN_type=='LSTM':
        from neural_nets import LSTM as ann
        rnn=1
    if NN_type=='GRU':
        from neural_nets import GRU as ann
        rnn=1
    if NN_type=='MLP':
        from neural_nets import MLP as ann
        rnn=0
    # The network is built with these fixed sizes; the values derived from
    # the data further down are written to ``options`` after construction.
    options.input_dim=429
    options.num_classes=1944
    net = ann(options)
    if use_cuda:
        net.cuda(device=device)
    # Timing accumulators: <x>_time is the last measurement, sum_<x>_time the
    # running total, st_/end_ hold the rusage snapshots.
    update_time=0
    sum_update_time=0
    st_update_time=0
    end_update_time=0
    shu_time=0
    sum_shu_time=0
    st_shu_time=0
    end_shu_time=0
    model_time=0
    sum_model_time=0
    st_model_time=0
    end_model_time=0
    load_time=0
    sum_load_time=0
    st_load_time=0
    end_load_time=0
    val_time=0
    sum_val_time=0
    st_val_time=0
    end_val_time=0
    epoch_time=0
    sum_epoch_time=0
    st_epoch_time=0
    end_epoch_time=0
    data_time=0
    st_data_time=0
    end_data_time=0
    train_time=0
    st_train_time=0
    end_train_time=0
    # Only the rusage snapshot is kept; the wall-clock timestamp is discarded.
    _, st_train_time= timestamp(), resource_usage(RUSAGE_SELF)
    torch.manual_seed(seed)
    random.seed(seed)
    print("[INFO] Batch size: ",batch_size)
    if rnn or do_eval or do_forward:
        seed=-1
    _, st_data_time= timestamp(), resource_usage(RUSAGE_SELF)
    if do_forward == 1:
        dev_data_name=[0]
    if do_forward == 0:
        [dev_data_name,dev_data_set_ori,dev_data_end_index]=load_chunk(dev_fea_scp,dev_fea_opts,dev_lab_folder,dev_lab_opts,left,right,-1)
    [data_name,data_set_ori,data_end_index]=load_chunk(fea_scp,fea_opts,lab_folder,lab_opts,left,right,seed)
    # world_size - 1 is used as the number of data partitions throughout.
    data_len = int(len(data_set_ori)/(int(world_size)-1))
    if do_training:
        # Each rank 1..4 keeps one contiguous slice; the last rank takes the
        # remainder.  Only 1 to 4 partitions are handled.
        if int(world_size)-1==1:
            print("Partition data 1")
        elif int(world_size)-1==2:
            print("partition data 2")
            if int(rank)==1:
                data_set_ori = data_set_ori[0:data_len]
            elif int(rank)==2:
                data_set_ori = data_set_ori[data_len:]
        elif int(world_size)-1==3:
            print("partition data 3")
            if int(rank)==1:
                data_set_ori = data_set_ori[0:data_len]
            elif int(rank)==2:
                data_set_ori = data_set_ori[data_len:data_len*2]
            elif int(rank)==3:
                data_set_ori = data_set_ori[data_len*2:]
        elif int(world_size)-1==4:
            print("partition data 4")
            if int(rank)==1:
                data_set_ori = data_set_ori[0:data_len]
            elif int(rank)==2:
                data_set_ori = data_set_ori[data_len:data_len*2]
            elif int(rank)==3:
                data_set_ori = data_set_ori[data_len*2:data_len*3]
            elif int(rank)==4:
                data_set_ori = data_set_ori[data_len*3:]
    data_len = len(data_set_ori)
    end_data_time,_ = resource_usage(RUSAGE_SELF), timestamp()
    data_time = end_data_time.ru_utime - st_data_time.ru_utime
    print("data generate time: ", data_time)
    print(np.shape(data_set_ori))
    # With save_gpumem set the tensors stay on the CPU until batches are cut.
    if not(save_gpumem):
        data_set=torch.from_numpy(data_set_ori).float().cuda(device=device)
    else:
        data_set=torch.from_numpy(data_set_ori).float()
    if do_forward ==0:
        if not(save_gpumem):
            dev_data_set=torch.from_numpy(dev_data_set_ori).float().cuda(device=device)
        else:
            dev_data_set=torch.from_numpy(dev_data_set_ori).float()
    # The last column of data_set is used as the label column.
    N_fea=data_set.shape[1]-1
    options.input_dim=N_fea
    N_out=int(data_set[:,N_fea].max()-data_set[:,N_fea].min()+1)
    options.num_classes=N_out
    if multi_gpu:
        net = nn.DataParallel(net)
    # optimizer_worker is always None at the test below, so SGD is always
    # chosen and the RMSprop branch is unreachable.
    optimizer_worker=None
    if optimizer_worker is None:
        optimizer_worker = optim.SGD(net.parameters(), lr=lr)
    else:
        optimizer_worker = optim.RMSprop(net.parameters(), lr=lr,alpha=0.95, eps=1e-8)
    if do_forward:
        if pt_file!='none':
            checkpoint_load = torch.load(pt_file)
            net.load_state_dict(checkpoint_load['model_par'])
            optimizer_worker.load_state_dict(checkpoint_load['optimizer_par'])
            optimizer_worker.param_groups[0]['lr']=lr
    dev_N_snt=len(dev_data_name)
    N_snt=len(data_name)
    if do_training:
        print("do training")
        net.train()
        test_flag=0
        if do_training:
            N_batches=int((N_snt/batch_size)/(int(world_size)-1))
        else:
            N_batches=int(N_snt/batch_size)
        if rnn==0:
            # Non-recurrent models batch over rows of data_set instead.
            N_ex_tr=data_set.shape[0]
            N_batches=int(N_ex_tr/batch_size)
    if do_eval:
        N_batches=N_snt
        net.eval()
        test_flag=1
        batch_size=1
        if do_forward:
            post_file=kaldi_io.open_or_fd(out_file,'wb')
            counts = load_counts(count_file)
    beg_batch=0
    end_batch=beg_batch+batch_size
    dev_beg_batch=0
    dev_end_batch=dev_beg_batch+1
    snt_index=0
    beg_snt=0
    dev_beg_snt=0
    loss_sum=0
    err_sum=0
    dev_loss_sum=0
    dev_err_sum=0
    temp_err=0
    dev_err_sum_tot=0
    dev_N_batches=0
    num_epoch=24
    main_class = MAIN_CLASS()
    if do_forward:
        # Single pass over the data.
        for i in range(N_batches):
            if do_training :
                if rnn==1:
                    # Zero-padded batch: each sentence is placed at a random
                    # offset inside a max_len-long window.
                    max_len=data_end_index[snt_index+batch_size-1]-data_end_index[snt_index+batch_size-2]
                    inp= Variable(torch.zeros(max_len,batch_size,N_fea)).contiguous()
                    lab= Variable(torch.zeros(max_len,batch_size)).contiguous().long()
                    for k in range(batch_size):
                        snt_len=data_end_index[snt_index]-beg_snt
                        N_zeros=max_len-snt_len
                        N_zeros_left=random.randint(0,N_zeros)
                        inp[N_zeros_left:N_zeros_left+snt_len,k,:]=data_set[beg_snt:beg_snt+snt_len,0:N_fea]
                        lab[N_zeros_left:N_zeros_left+snt_len,k]=data_set[beg_snt:beg_snt+snt_len,-1]
                        beg_snt=data_end_index[snt_index]
                        snt_index=snt_index+1
                else:
                    inp= Variable(data_set[beg_batch:end_batch,0:N_fea]).contiguous().cuda(device=device)
                    lab= Variable(data_set[beg_batch:end_batch,N_fea]).contiguous().long().cuda(device=device)
            if do_eval:
                # One sentence per step, delimited by data_end_index.
                end_snt=data_end_index[i]
                inp= Variable(data_set[beg_snt:end_snt,0:N_fea],volatile=True).contiguous().cuda(device=device)
                lab= Variable(data_set[beg_snt:end_snt,N_fea],volatile=True).contiguous().long().cuda(device=device)
                if rnn==1:
                    inp=inp.view(inp.shape[0],1,inp.shape[1])
                    lab=lab.view(lab.shape[0],1)
                beg_snt=data_end_index[i]
            [loss,err,pout] = net(inp,lab,test_flag,rank)
            if multi_gpu:
                loss=loss.mean()
                err=err.mean()
            if do_forward:
                if rnn==1:
                    pout=pout.view(pout.shape[0]*pout.shape[1],pout.shape[2])
                if int(rank)==0:
                    # Network output minus the log of the normalised counts.
                    kaldi_io.write_mat(post_file, pout.data.cpu().numpy()-np.log(counts/np.sum(counts)), data_name[i])
            if do_training:
                # NOTE(review): ``optimizer`` is not assigned anywhere in this
                # function (only ``optimizer_worker`` is) -- confirm it exists
                # at module level or that this path is never taken.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            loss_sum=loss_sum+loss.data
            err_sum=err_sum+err.data
            beg_batch=end_batch
            end_batch=beg_batch+batch_size
    else:
        m=0
        for e in range(num_epoch):
            # NOTE(review): the label says "Batch size" but ``m`` is a counter
            # incremented in the batch loop below.
            print("Batch size: ",m)
            _, st_epoch_time= timestamp(), resource_usage(RUSAGE_SELF)
            if e>0:
                # Validation pass over the dev set, skipped for the first epoch.
                dev_N_batches=dev_N_snt
                if e>1:
                    # Remember the previous dev error for the comparison below.
                    temp_err=dev_err_sum_tot
                net.eval()
                test_flag=1
                dev_batch_size=1
                dev_beg_batch=0
                dev_end_batch=dev_beg_batch+1
                dev_loss_sum=0
                dev_err_sum=0
                dev_beg_snt=0
                _, st_val_time= timestamp(), resource_usage(RUSAGE_SELF)
                for j in range(dev_N_batches):
                    end_snt=dev_data_end_index[j]
                    dev_inp= Variable(dev_data_set[dev_beg_snt:end_snt,0:N_fea],volatile=True).contiguous().cuda(device=device)
                    dev_lab= Variable(dev_data_set[dev_beg_snt:end_snt,N_fea],volatile=True).contiguous().long().cuda(device=device)
                    if rnn==1:
                        # NOTE(review): this reshapes ``inp``/``lab`` while the
                        # tensors fed to the net are ``dev_inp``/``dev_lab`` --
                        # looks unintended, confirm.
                        inp=inp.view(inp.shape[0],1,inp.shape[1])
                        lab=lab.view(lab.shape[0],1)
                    dev_beg_snt=dev_data_end_index[j]
                    [dev_loss,dev_err,dev_pout] = net(dev_inp,dev_lab,test_flag,rank)
                    dev_loss_sum=dev_loss_sum+dev_loss.data
                    dev_err_sum=dev_err_sum+dev_err.data
                    dev_beg_batch=dev_end_batch
                    dev_end_batch=dev_beg_batch+dev_batch_size
                end_val_time,_ = resource_usage(RUSAGE_SELF), timestamp()
                val_time = end_val_time.ru_utime - st_val_time.ru_utime
                sum_val_time=sum_val_time+val_time
                print('[INFO] EPOCH: %d, In Worker: %d, val_Err: %0.3f, val_loss: %0.3f, val_time: %0.3f' % ((e+1), int(rank),dev_err_sum/dev_N_batches, dev_loss_sum/dev_N_batches, sum_val_time))
                dev_err_sum_tot=dev_err_sum/dev_N_batches
                if e>1:
                    # Halve lr when the relative dev-error improvement is small.
                    # NOTE(review): only the local ``lr`` changes; no visible
                    # code writes it back into optimizer_worker -- confirm.
                    threshold = (temp_err-dev_err_sum_tot)/dev_err_sum_tot
                    if threshold<0.0005:
                        lr = lr * 0.5
            net.train()
            beg_batch=0
            end_batch=beg_batch+batch_size
            beg_snt=0
            _, st_shu_time= timestamp(), resource_usage(RUSAGE_SELF)
            # Reshuffle the rows in place and rebuild the tensor every epoch.
            np.random.shuffle(data_set_ori)
            if not(save_gpumem):
                data_set=torch.from_numpy(data_set_ori).float().cuda(device=device)
            else:
                data_set=torch.from_numpy(data_set_ori).float()
            N_fea=data_set.shape[1]-1
            options.input_dim=N_fea
            N_out=int(data_set[:,N_fea].max()-data_set[:,N_fea].min()+1)
            options.num_classes=N_out
            end_shu_time,_ = resource_usage(RUSAGE_SELF), timestamp()
            shu_time = end_shu_time.ru_utime - st_shu_time.ru_utime
            sum_shu_time=sum_shu_time+shu_time
            loss_sum=0
            err_sum=0
            for i in range(N_batches):
                # Nothing runs between these two snapshots, so load_time only
                # measures the cost of taking them.
                _, st_load_time= timestamp(), resource_usage(RUSAGE_SELF)
                end_load_time,_ = resource_usage(RUSAGE_SELF), timestamp()
                load_time = end_load_time.ru_utime - st_load_time.ru_utime
                if do_training :
                    if rnn==1:
                        max_len=data_end_index[snt_index+batch_size-1]-data_end_index[snt_index+batch_size-2]
                        inp= Variable(torch.zeros(max_len,batch_size,N_fea)).contiguous()
                        lab= Variable(torch.zeros(max_len,batch_size)).contiguous().long()
                        for k in range(batch_size):
                            snt_len=data_end_index[snt_index]-beg_snt
                            N_zeros=max_len-snt_len
                            N_zeros_left=random.randint(0,N_zeros)
                            inp[N_zeros_left:N_zeros_left+snt_len,k,:]=data_set[beg_snt:beg_snt+snt_len,0:N_fea]
                            lab[N_zeros_left:N_zeros_left+snt_len,k]=data_set[beg_snt:beg_snt+snt_len,-1]
                            beg_snt=data_end_index[snt_index]
                            snt_index=snt_index+1
                    else:
                        inp= Variable(data_set[beg_batch:end_batch,0:N_fea]).contiguous().cuda(device=device)
                        lab= Variable(data_set[beg_batch:end_batch,N_fea]).contiguous().long().cuda(device=device)
                if do_eval:
                    end_snt=data_end_index[i]
                    inp= Variable(data_set[beg_snt:end_snt,0:N_fea],volatile=True).contiguous().cuda(device=device)
                    lab= Variable(data_set[beg_snt:end_snt,N_fea],volatile=True).contiguous().long().cuda(device=device)
                    if rnn==1:
                        inp=inp.view(inp.shape[0],1,inp.shape[1])
                        lab=lab.view(lab.shape[0],1)
                    beg_snt=data_end_index[i]
                [loss,err,pout] = net(inp,lab,test_flag,rank)
                if multi_gpu:
                    loss=loss.mean()
                    err=err.mean()
                if do_forward:
                    if rnn==1:
                        pout=pout.view(pout.shape[0]*pout.shape[1],pout.shape[2])
                    if int(rank)==1:
                        kaldi_io.write_mat(post_file, pout.data.cpu().numpy()-np.log(counts/np.sum(counts)), data_name[i])
                if do_training:
                    optimizer_worker.zero_grad()
                    loss.backward()
                    _,st_update_time = timestamp(), resource_usage(RUSAGE_SELF)
                    # No optimizer step is taken here; presumably
                    # ensure_shared_params applies the update -- TODO confirm.
                    main_class.ensure_shared_params(net,rank)
                    end_update_time,_ = resource_usage(RUSAGE_SELF), timestamp()
                    update_time = end_update_time.ru_utime-st_update_time.ru_utime
                    cc=0
                    # As with load_time, nothing runs inside this window.
                    _,st_model_time = timestamp(), resource_usage(RUSAGE_SELF)
                    end_model_time,_ = resource_usage(RUSAGE_SELF), timestamp()
                    model_time = end_model_time.ru_utime-st_model_time.ru_utime
                    b=0
                    sum_update_time=sum_update_time + update_time
                    sum_load_time=sum_load_time+load_time
                    sum_model_time= sum_model_time+model_time
                loss_sum=loss_sum+loss.data
                err_sum=err_sum+err.data
                # Progress line every 100 batches (never for batch 0).
                if i%100==0:
                    if i!=0:
                        print('[INFO] EPOCH: %d, Batch: %d, In Worker: %d, Err: %0.3f, loss: %0.3f, update_time: %0.3f, load_time: %0.3f' % ((e+1),i, int(rank),err_sum/i, loss_sum/i,sum_update_time,sum_load_time))
                beg_batch=end_batch
                end_batch=beg_batch+batch_size
                m=m+1
            end_epoch_time,_ = resource_usage(RUSAGE_SELF), timestamp()
            epoch_time = end_epoch_time.ru_utime - st_epoch_time.ru_utime
            sum_epoch_time= sum_epoch_time+epoch_time
            if do_training:
                # Checkpoint after every epoch, overwriting the same file.
                checkpoint={'model_par': net.state_dict(), 'optimizer_par' : optimizer_worker.state_dict()}
                torch.save(checkpoint,options.out_file)
    loss_tot=loss_sum/(N_batches)
    err_tot=err_sum/(N_batches)
    end_train_time,_ = resource_usage(RUSAGE_SELF), timestamp()
    train_time = end_train_time.ru_utime - st_train_time.ru_utime
    if do_training:
        checkpoint={'model_par': net.state_dict(), 'optimizer_par' : optimizer_worker.state_dict()}
        torch.save(checkpoint,options.out_file)
    # Append the summary to "<out_file with .pkl replaced by .info>".
    info_file=out_file.replace(".pkl",".info")
    with open(info_file, "a") as inf:
        inf.write("model_in=%s\n" %(pt_file))
        inf.write("fea_in=%s\n" %(fea_scp))
        inf.write("loss=%f\n" %(loss_tot))
        inf.write("err=%f\n" %(err_tot))
        inf.write("all_time=%f\n" %(train_time))
        inf.write("shu_time=%f\n" %(sum_shu_time))
        inf.write("model load time=%f\n" %(sum_load_time))
        inf.write("gradient send time=%f\n" %(sum_update_time))
        inf.write("val data calculate time=%f\n" %(sum_val_time))
        inf.write("data generate time=%f\n" %(data_time))
        inf.write("model update time=%f\n" %(sum_model_time))
        inf.write("epoch time=%f\n" %((sum_epoch_time-sum_load_time-sum_update_time-sum_model_time-sum_val_time)/num_epoch))
        inf.write("training time=%f\n" %(train_time-sum_load_time-sum_update_time-sum_val_time-data_time-sum_model_time-sum_shu_time))
        # Redundant inside the ``with`` block, which closes the file anyway.
        inf.close()
    if do_forward:
        post_file.close()
from argparse import ArgumentParser
from subprocess import run
from resource import getrusage as resource_usage, RUSAGE_CHILDREN
from time import time as timestamp

# Command-line interface: <program> <input> <output> <nThreads>.
parser = ArgumentParser(description="Profile pzip execution time")
for arg_name, arg_type, arg_help in (
    ('program', str, "program to profile"),
    ('input', str, "name of input file"),
    ('output', str, "name of output file"),
    ('nThreads', int, "Number of threads"),
):
    parser.add_argument(arg_name, type=arg_type, help=arg_help)
res = parser.parse_args()

cmd_list = [res.program, res.input, res.output, str(res.nThreads)]

# Child-process usage is sampled around the run, so the CPU figures below
# cover the profiled program rather than this script.
start_time = timestamp()
start_resources = resource_usage(RUSAGE_CHILDREN)
run_result = run(cmd_list)
end_resources = resource_usage(RUSAGE_CHILDREN)
end_time = timestamp()

real = end_time - start_time
sys = end_resources.ru_stime - start_resources.ru_stime
user = end_resources.ru_utime - start_resources.ru_utime

# CPU time per wall-clock second, divided by the number of threads.
result = ((user + sys) / real) / res.nThreads

print(f"WALL_TIME: {real:.5f} seconds")
print(f"CPU_TIME_SYS: {sys:.5f} seconds")
print(f"CPU_TIME_USER: {user:.5f} seconds")
print(f"N_THREADS: {res.nThreads}")
print(f"PARALLEL_EFFICIENCY (PE): {result:.5f}")
def output_layer(sh_list, sh_test, sh_c_list, shm_list, train_loader, test_loader, model, loss_function, rank, split, batch_size, batch_num, test_batch_num, test_num, epoch_num, lamda, lr, cv):
    """Run the last stage of the split model: loss, backward pass, evaluation.

    Each training iteration receives an activation from ``feed_q2`` and a
    target from ``send_target``, runs one of ``split`` model replicas
    forward, computes the loss, steps that replica's optimizer and hands the
    gradient w.r.t. the received activation back through ``send_grad`` /
    ``grad_q2``.  After every group of ``lamda`` iterations the replicas are
    averaged into ``model`` and reset to that average.  After each epoch the
    averaged model is evaluated on activations received from ``feed_test``.

    NOTE(review): reconstructed from whitespace-collapsed text; the nesting
    of the averaging block (inside the ``step`` loop) is a best guess.
    ``train_loader`` is only turned into an unused iterator and ``cv`` is
    only mentioned in commented-out code.
    """
    feed_q2 = sh_list[rank - 1]
    grad_q2 = sh_list[rank + split - 2]
    send_grad = sh_c_list[rank + split - 2]
    feed_test = sh_test[rank - 1]
    send_target = shm_list[0]
    # CUDA events used to time each epoch on the device.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    models = []
    outputs = []
    inputs = []
    optim = []
    loss_tot = 0
    time_tot = 0
    cuda_time = 0
    #test_num = test_set_labels.size(0)
    #num_of_models = 1#2*n + 1
    # n == (split - 1) - rank
    n = -1 * (rank - (split - 1))
    #num_of_models = 2*split - 1
    num_of_models = split
    #delay = n *(2)# + 1
    #delay = 2* split -(rank+1) #- 1
    delay = n
    #model.reset_parameters()
    # One deep copy of the model, with its own SGD optimizer, per replica.
    for i in range(num_of_models):
        models.append(copy.deepcopy(model))
        outputs.append(0)
        inputs.append(0)
        optim.append(
            torch.optim.SGD(models[i].parameters(), lr=lr, momentum=0.9, weight_decay=0.0005, nesterov=True))
        #optim.append(torch.optim.Adam(models[i].parameters(),lr=1e-4))
        #optim.append(torch.optim.SGD(models[i].parameters(),lr=lr))
    for i in models:
        i.cuda(rank)
    model.cuda(rank)
    #labels = data_set[:,-mnist_data.NUM_LABELS:].cuda(rank) ######### if have a error, cuda(rank) add
    # Number of lamda-sized groups per epoch, rounded up.
    steps = int(batch_num / lamda)
    if batch_num % lamda != 0:
        steps += 1
    lamda_back = lamda
    t = 0
    for epoch in range(epoch_num):
        #with torch.autograd.profiler.profile() as prof:
        s_t_u = resource_usage(RUSAGE_SELF)
        s_t = timestamp()
        start.record()
        loss_sum = 0
        model.train()
        t = 0
        t1 = 0
        t2 = 0
        t3 = 0
        t4 = 0
        t5 = 0
        t6 = 0
        t7 = 0
        t8 = 0
        # td1..td4 accumulate the per-phase wall-clock time printed below.
        td1 = 0
        td2 = 0
        td3 = 0
        td4 = 0
        td5 = 0
        td6 = 0
        train_data = train_loader.__iter__()
        #for time in range(1,(batch_num + 2*split - (rank+1) -1 + 1 )):
        for step in range(1, steps + 1):
            #for time in range(step , step+lamda + 2*split -(rank+1)- 1 + 1 ):
            #off = (step-1)*lamda
            #cv.acquire()
            #cv.wait()
            #cv.notify_all()
            #cv.release()
            #cv.sync(rank)
            lamda = lamda_back
            if step == steps:
                # The last group only covers the batches that are left.
                lamda = batch_num - (step - 1) * lamda
            #print(rank,'steps',steps,'step',step,'lamda',lamda)
            #print('sync',step,steps)
            for time in range(1, lamda + 1):
                #if time >= (rank +) : # t >= k ; k = 3
                t1 = timestamp()
                # recv output
                offset = t * batch_size
                #data,target = next(train_data)
                #offset = (time - 1) * batch_size
                x = feed_q2.recv()
                x = x.cuda(rank, non_blocking=True)
                #print('recv',t,x)
                # label gpu load
                #data,target = next(train_data)
                #target = target.cuda(rank).long()
                target = send_target.recv()
                target = target.cuda(rank, non_blocking=True).long()
                #target = Variable(labels[offset:offset + batch_size,:]).long()
                #target = Variable(labels[offset:offset + batch_size,:]).cuda(rank)
                t2 = timestamp()
                # Replica index cycles with ``time``; a multiple of
                # num_of_models yields -1, i.e. the last replica.
                model_idx = ( time % num_of_models ) - 1 ########################################### model idx correct
                #model_idx = 0
                # requires_grad so the gradient w.r.t. the input can be sent back.
                input_feat = Variable(x, requires_grad=True)
                output = models[model_idx].forward(input_feat)
                #print('rank',rank,time,target)
                #print(target.size())
                loss = loss_function(output, target)
                t3 = timestamp()
                #loss = loss_function(output,torch.max(target,1)[1])
                optimizer = optim[model_idx]
                optimizer.zero_grad()
                loss.backward()
                #a = list(models[model_idx].parameters())[0].clone()
                optimizer.step()
                t4 = timestamp()
                #b = list(models[model_idx].parameters())[0].clone()
                #print(torch.equal(a.data,b.data))
                #grad = input_feat.grad.data.to('cpu')
                # Copy the input gradient into the shared buffer between the
                # wait/signal calls on grad_q2.
                grad_q2.send_wait()
                send_grad.copy_(input_feat.grad.data)
                grad_q2.async_send_signal()
                t5 = timestamp()
                loss_sum = loss_sum + loss.data
                t += 1
                td1 += t2 - t1
                td2 += t3 - t2
                td3 += t4 - t3
                td4 += t5 - t4
                #print(time)
            # Average the replicas into ``model`` (after init_zero()), then
            # copy the average back into every replica.
            model.init_zero()
            with torch.cuda.device(rank):
                for i in range(num_of_models):
                    j = models[i].parameters()
                    for k in model.parameters():
                        #k = 0
                        l = j.__next__()
                        k.requires_grad_(False)
                        k.copy_(k.data + l.data / num_of_models)
                for i in range(num_of_models):
                    j = model.parameters()
                    for k in models[i].parameters():
                        l = j.__next__()
                        k.requires_grad_(False)
                        k.copy_(l.data)
                        k.requires_grad_(True)
        loss_tot = loss_sum / batch_num
        e_t_u = resource_usage(RUSAGE_SELF)
        e_t = timestamp()
        # NOTE(review): this is the ru_stime (system time) delta although the
        # message below labels it "user time" -- confirm which was intended.
        u_t = e_t_u.ru_stime - s_t_u.ru_stime
        # ``t`` is reused here for the epoch's wall-clock duration.
        t = e_t - s_t
        end.record()
        torch.cuda.synchronize()
        cuda_time = cuda_time + start.elapsed_time(end)
        print(
            'node3 user time = %f time = %f cuda time = %f cuda tot time = %f loss_tot = %f'
            % (u_t, t, start.elapsed_time(end), cuda_time, loss_tot))
        #print('node3 user time = %f time = %f loss_tot = %f' % ( u_t , t,loss_tot))
        #print(prof)
        time_tot = time_tot + t
        print('rank =', rank, 'recv output =', td1)
        print('rank =', rank, 'forward =', td2)
        print('rank =', rank, 'backward =', td3)
        print('rank =', rank, 'send grad =', td4)
        model.eval()
        total = 0
        correct = 0
        dev_loss_tot = 0
        # Only the targets come from test_loader; the inputs are the
        # activations received from feed_test.
        for data, target in test_loader:
            #for i in range(test_batch_num) :
            # ``i`` is left over from earlier loops; ``offset`` is not used.
            offset = i * batch_size #####################################
            #print('rank',rank,i,target)
            x = feed_test.recv()
            #print(x)
            x = x.cuda(rank)
            target = target.cuda(rank)
            #target = Variable(test_set_labels[offset:offset+batch_size,:]).long()
            #target = Variable(test_set_labels[offset:offset+batch_size,:])
            output = model.forward(x)
            _, pred = torch.max(output.data, 1)
            #dev_loss = loss_function(output,torch.max(target,1)[1])
            dev_loss = loss_function(output, target)
            dev_loss_tot += dev_loss.item()
            #print('rank',rank,i,pred)
            #print(target,pred)
            #total += target.size(0)
            #print(total)
            #correct += (pred == torch.max(target,1)[1]).sum()
            correct += (pred == target).sum()
            #print('correct',correct)
            #i += 1
        print('epoch=', epoch, 'tot_time =', time_tot, 'accuracy =', (100 * correct / test_num), 'test_loss', dev_loss_tot / test_batch_num)
        # Learning rate is divided by 10 after epochs 150 and 225.
        if epoch == 150 or epoch == 225:
            lr = lr * 0.1
            for i in optim:
                for j in i.param_groups:
                    j['lr'] = lr
def trace():
    """Print the process's ``ru_maxrss / 1024`` and its user CPU time."""
    # Two separate snapshots, taken in the same order as the printed fields.
    memory = resource_usage(RUSAGE_SELF).ru_maxrss/1024
    cpu_time = str(resource_usage(RUSAGE_SELF).ru_utime)
    print("Memory:", memory, "CPUTime:", cpu_time)
def input_layer(sh_list, sh_test, sh_c_list, shm_list, train_loader, test_loader, model, rank, split, batch_size, batch_num, test_batch_num, epoch_num, lamda, lr, cv):
    """Run the first stage of the split model: feed data, apply gradients.

    Each training iteration draws a batch from ``train_loader``, sends the
    target through ``send_target``, runs one of ``split`` model replicas
    forward and publishes the activation through ``send_output`` /
    ``feed_q1``.  Gradients arriving on ``grad_q1`` are applied ``delay``
    iterations later to the replica that produced the matching output.
    After every group of iterations the replicas are averaged into ``model``
    and reset to that average.  After each epoch the averaged model's
    outputs on ``test_loader`` are sent through ``feed_test``.

    NOTE(review): reconstructed from whitespace-collapsed text; the nesting
    of the timing accumulation and of the averaging block is a best guess.
    ``cv`` is only mentioned in commented-out code.
    """
    update = 0
    feed_q1 = sh_list[rank]
    grad_q1 = sh_list[rank + split - 1]
    #split = 3
    send_output = sh_c_list[rank]
    feed_test = sh_test[rank]
    send_target = shm_list[0]
    models = []
    outputs = []
    inputs = []
    optim = []
    # n == (split - 1) - rank; used as the forward/backward delay.
    n = -1 * (rank - (split - 1))
    #num_of_models = 2*split - 1
    #num_of_models = n + 1
    num_of_models = split
    #delay = n *(2)# + 1
    delay = n
    #model.reset_parameters()
    # One deep copy of the model, with its own SGD optimizer, per replica.
    for i in range(num_of_models):
        models.append(copy.deepcopy(model))
        outputs.append(0)
        inputs.append(0)
        optim.append(
            torch.optim.SGD(models[i].parameters(), lr=lr, momentum=0.9, weight_decay=0.0005, nesterov=True))
        #optim.append(torch.optim.Adam(models[i].parameters(),lr=1e-4))
        #optim.append(torch.optim.SGD(models[i].parameters(),lr=lr))
    for i in models:
        i.cuda(rank)
    model.cuda(rank)
    #data = data_set[:,:-mnist_data.NUM_LABELS]
    time_tot = 0
    steps = int(batch_num / lamda)
    # NOTE(review): this compares the remainder with 1, whereas output_layer
    # and hidden_layer in this file compare it with 0; the stages would then
    # disagree on ``steps`` for some batch_num/lamda -- confirm.
    if batch_num % lamda != 1:
        steps += 1
    lamda_back = lamda
    for epoch in range(epoch_num):
        #with torch.autograd.profiler.profile() as prof:
        s_t_u = resource_usage(RUSAGE_SELF)
        s_t = timestamp()
        model.train()
        for i in models:
            i.train()
        train_data = train_loader.__iter__()
        t = 0
        t1 = 0
        t2 = 0
        t3 = 0
        t4 = 0
        t5 = 0
        t6 = 0
        t7 = 0
        t8 = 0
        # td1..td5 accumulate the per-phase wall-clock time printed below.
        td1 = 0
        td2 = 0
        td3 = 0
        td4 = 0
        td5 = 0
        td6 = 0
        #for time in range(1,(batch_num + 2 * split - (rank + 1) - 1 + 1)):
        for step in range(1, steps + 1):
            #off = (step-1)*lamda
            #cv.acquire()
            #cv.wait()
            #cv.notify_all()
            #cv.release()
            #cv.sync(rank)
            lamda = lamda_back
            if step == steps:
                # The last group only covers the batches that are left.
                lamda = batch_num - (step - 1) * lamda
            #print('step',step,'lamda',lamda)
            # ``delay`` extra iterations drain the gradients still in flight.
            for time in range(1, lamda + delay + 1):
                #if time <= off + lamda :
                if time <= lamda:
                    # Forward phase: only for the first ``lamda`` iterations.
                    #offset = (time-1) * batch_size
                    t1 = timestamp()
                    #offset = t * batch_size
                    data, target = next(train_data)
                    send_target.send(target)
                    #print('rank',rank,time,target)
                    #x = x.view(-1,784)
                    #input_feat = Variable(data,requires_grad=True).to("cuda:0")
                    data = data.cuda(rank, non_blocking=True)
                    #input_feat = Variable(data[offset:offset+batch_size,:],requires_grad=True).cuda(rank)
                    t2 = timestamp()
                    #print(input_feat)
                    #print(input_feat.size())
                    # Replica index cycles with ``time``; a multiple of
                    # num_of_models yields -1, i.e. the last replica.
                    model_idx = (time % num_of_models) - 1
                    #output = models[model_idx].forward(input_feat)
                    output = models[model_idx].forward(data)
                    #inputs[model_idx] = input_feat
                    # Kept so the delayed backward pass can use it.
                    outputs[model_idx] = output
                    t3 = timestamp()
                    #print(output.size())
                    #output_send = output.to("cpu")
                    # Copy the activation into the shared buffer between the
                    # wait/signal calls on feed_q1.
                    feed_q1.send_wait()
                    send_output.copy_(output.data)
                    #print('send',t,send_output)
                    feed_q1.async_send_signal()
                    #feed_q1.send(output.data.to("cpu"))
                    t += 1
                    t4 = timestamp()
                if time > delay: # t-(2K-k-1)
                    # Backward phase: starts once ``delay`` iterations have passed.
                    #if time >= 1+ delay : # t-(2K-k-1)
                    t5 = timestamp()
                    pg = grad_q1.recv()
                    pg = pg.cuda(rank)
                    t6 = timestamp()
                    # Replica that produced the output ``delay`` iterations ago.
                    output_idx = ((time - delay) % num_of_models) - 1
                    optimizer = optim[output_idx]
                    optimizer.zero_grad()
                    output = outputs[output_idx]
                    output.backward(pg)
                    #a = list(models[output_idx].parameters())[0].clone()
                    optimizer.step()
                    t7 = timestamp()
                    #b = list(models[output_idx].parameters())[0].clone()
                    #print(torch.equal(a.data,b.data))
                td1 += t2 - t1
                td2 += t3 - t2
                td3 += t4 - t3
                td4 += t6 - t5
                td5 += t7 - t6
                #print(time)
            #feed_q1.init()
            #grad_q1.init()
            # Average the replicas into ``model`` (after init_zero()), then
            # copy the average back into every replica.
            model.init_zero()
            with torch.cuda.device(rank):
                for i in range(num_of_models):
                    j = models[i].parameters()
                    for k in model.parameters():
                        #k = 0
                        l = j.__next__()
                        k.requires_grad_(False)
                        k.copy_(k.data + l.data / num_of_models)
                for i in range(num_of_models):
                    j = model.parameters()
                    for k in models[i].parameters():
                        l = j.__next__()
                        k.requires_grad_(False)
                        k.copy_(l.data)
                        k.requires_grad_(True)
            #print('average_done worker 1')
        e_t_u = resource_usage(RUSAGE_SELF)
        e_t = timestamp()
        # ru_stime delta; only referenced by the commented-out print below.
        u_t = e_t_u.ru_stime - s_t_u.ru_stime
        # ``t`` is reused here for the epoch's wall-clock duration.
        t = e_t - s_t
        time_tot = time_tot + t
        #print('node1 user time = %f time = %f time_tot = %f' % ( u_t , t, time_tot))
        #print(prof)
        # NOTE(review): td1 spans the batch fetch, target send and GPU copy,
        # although the label says "recv output".
        print('rank =', rank, 'recv output =', td1)
        print('rank =', rank, 'forward =', td2)
        print('rank =', rank, 'send output', td3)
        print('rank =', rank, 'recv grad =', td4)
        print('rank =', rank, 'backward =', td5)
        model.eval()
        for i in models:
            i.eval()
        # Evaluation: forward every test batch and send the CPU copy of the
        # output through feed_test.
        for data, target in test_loader:
            #for i in range(test_batch_num):
            #print(data,target)
            #print('rank',rank,target)
            #offset = i * batch_size
            #x = Variable(test_set[offset:offset+batch_size,:])
            x = Variable(data).cuda(rank)
            #x = x.view(-1,784)
            #x = x.to("cuda:0")
            output = model.forward(x)
            #output = output.to("cpu")
            #print(output.size())
            feed_test.send(output.data.to('cpu'))
            #i += 1
        # Learning rate is divided by 10 after epochs 150 and 225.
        if epoch == 150 or epoch == 225:
            lr = lr * 0.1
            for i in optim:
                for j in i.param_groups:
                    j['lr'] = lr
def hidden_layer(sh_list, sh_test, sh_c_list, model, rank, split, batch_num, test_batch_num, epoch_num, lamda, lr, cv):
    """Run the train/eval loop of one intermediate stage of a model pipeline.

    Per training iteration the stage receives an input on ``feed_q1``, runs it
    through one of several deep copies of ``model``, copies the result into
    ``send_output`` and signals ``feed_q2``.  ``delay`` iterations later it
    receives a gradient on ``grad_q2``, back-propagates it through the stored
    output, steps that copy's optimizer, copies the input gradient into
    ``send_grad`` and signals ``grad_q1``.  The first ``num_of_models`` copies
    are then averaged into ``model`` and overwritten with that average.  At the
    end of every epoch ``test_batch_num`` inputs are received from
    ``feed_test``, run through ``model`` and sent on ``feed_test2``.

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation was lost; in particular confirm that the averaging runs once
    per ``step`` rather than once per epoch.

    Args:
        sh_list: indexable of channel objects; this function uses their
            ``recv``, ``send_wait`` and ``async_send_signal`` methods.
        sh_test: indexable of channel objects used for the evaluation pass
            (``recv`` / ``send``).
        sh_c_list: indexable of buffers written with ``copy_``.
        model: module that is deep-copied, moved to GPU ``rank`` and used as
            the averaged reference model; must provide ``init_zero()``.
        rank: position of this stage; also used as the CUDA device index.
        split: number of pipeline stages (presumably -- TODO confirm).
        batch_num: number of training batches per epoch.
        test_batch_num: number of evaluation batches per epoch.
        epoch_num: number of epochs.
        lamda: number of batches per step.
        lr: initial learning rate; multiplied by 0.1 after epochs 150 and 225.
        cv: unused (only referenced by commented-out code).
    """
    feed_q1 = sh_list[rank - 1]
    grad_q1 = sh_list[rank + split - 2]
    send_output = sh_c_list[2 * rank]
    send_grad = sh_c_list[2 * rank - 1]
    feed_test = sh_test[rank - 1]
    # NOTE(review): feed_q2, grad_q2 and feed_test2 are only bound when
    # split > 2, yet they are used unconditionally below.
    if split > 2:
        feed_q2 = sh_list[rank]
        grad_q2 = sh_list[rank + split - 1]
        feed_test2 = sh_test[rank]
    models = []
    outputs = []
    inputs = []
    optim = []
    # Number of stages after this one; used as the forward/backward lag.
    n = -1 * (rank - (split - 1))
    num_of_models = split
    delay = n
    # NOTE(review): 2*split-1 copies are created but num_of_models is split.
    # The index expression "(time % num_of_models) - 1" yields -1 .. split-2,
    # so -1 selects the LAST copy (index 2*split-2), which the averaging loops
    # over range(num_of_models) never touch -- confirm this is intended.
    for i in range(2 * split - 1):
        models.append(copy.deepcopy(model))
        outputs.append(0)
        inputs.append(0)
        optim.append(
            torch.optim.SGD(models[i].parameters(), lr=lr, momentum=0.9, weight_decay=0.0005, nesterov=True))
    for i in models:
        i.cuda(rank)
    model.cuda(rank)
    time_tot = 0
    # Number of steps per epoch: ceil(batch_num / lamda).
    steps = int(batch_num / lamda)
    if batch_num % lamda != 0:
        steps += 1
    lamda_back = lamda
    t = 0
    for epoch in range(epoch_num):
        s_t_u = resource_usage(RUSAGE_SELF)
        s_t = timestamp()
        model.train()
        for i in models:
            i.train()
        # Per-epoch timing probes (t1..t8) and accumulated durations (td1..td6).
        t = 0
        t1 = 0
        t2 = 0
        t3 = 0
        t4 = 0
        t5 = 0
        t6 = 0
        t7 = 0
        t8 = 0
        td1 = 0
        td2 = 0
        td3 = 0
        td4 = 0
        td5 = 0
        td6 = 0
        for step in range(1, steps + 1):
            lamda = lamda_back
            # The last step only handles the batches that are left over.
            if step == steps:
                lamda = batch_num - (step - 1) * lamda
            for time in range(1, lamda + delay + 1):
                # Forward phase: only for the first `lamda` iterations.
                if time <= lamda:
                    t1 = timestamp()
                    x = feed_q1.recv()
                    x = x.cuda(rank, non_blocking=True)
                    t2 = timestamp()
                    input_feat = Variable(x, requires_grad=True)
                    model_idx = (time % num_of_models) - 1
                    output = models[model_idx].forward(input_feat)
                    # Keep input and output so the later backward pass can
                    # reach this iteration's graph.
                    inputs[model_idx] = input_feat
                    outputs[model_idx] = output
                    t3 = timestamp()
                    feed_q2.send_wait()
                    send_output.copy_(output.data)
                    feed_q2.async_send_signal()
                    t += 1
                    t4 = timestamp()
                # Backward phase: starts `delay` iterations after the forward.
                if time > delay:
                    t5 = timestamp()
                    pg = grad_q2.recv()
                    pg = pg.cuda(rank)
                    t6 = timestamp()
                    output_idx = ((time - delay) % num_of_models) - 1
                    optimizer = optim[output_idx]
                    optimizer.zero_grad()
                    output = outputs[output_idx]
                    output.backward(pg)
                    optimizer.step()
                    t7 = timestamp()
                    # Pass the gradient w.r.t. this stage's input on.
                    grad_q1.send_wait()
                    send_grad.copy_(inputs[output_idx].grad.data)
                    grad_q1.async_send_signal()
                    t8 = timestamp()
                td1 += t2 - t1
                td2 += t3 - t2
                td3 += t4 - t3
                td4 += t6 - t5
                td5 += t7 - t6
                td6 += t8 - t7
            # Average the first num_of_models copies into `model`, then
            # overwrite each of those copies with the average.
            model.init_zero()
            with torch.cuda.device(rank):
                for i in range(num_of_models):
                    j = models[i].parameters()
                    for k in model.parameters():
                        l = j.__next__()
                        k.requires_grad_(False)
                        k.copy_(k.data + l.data / num_of_models)
                for i in range(num_of_models):
                    j = model.parameters()
                    for k in models[i].parameters():
                        l = j.__next__()
                        k.requires_grad_(False)
                        k.copy_(l.data)
                        k.requires_grad_(True)
        e_t_u = resource_usage(RUSAGE_SELF)
        e_t = timestamp()
        # NOTE(review): this is the difference of ru_stime (system CPU time);
        # the value is not used afterwards.
        u_t = e_t_u.ru_stime - s_t_u.ru_stime
        # `t` is reused here: it now holds the epoch's wall-clock duration.
        t = e_t - s_t
        time_tot = time_tot + t
        print('rank =', rank, 'recv output =', td1)
        print('rank =', rank, 'forward =', td2)
        print('rank =', rank, 'send output', td3)
        print('rank =', rank, 'recv grad =', td4)
        print('rank =', rank, 'backward =', td5)
        print('rank =', rank, 'send grad =', td6)
        # Evaluation pass with the averaged model.
        model.eval()
        for i in models:
            i.eval()
        for i in range(test_batch_num):
            x = feed_test.recv()
            x = x.cuda(rank)
            output = model.forward(x)
            feed_test2.send(output.data.to('cpu'))
        # Step-wise learning-rate decay, applied to every copy's optimizer.
        if epoch == 150 or epoch == 225:
            lr = lr * 0.1
            for i in optim:
                for j in i.param_groups:
                    j['lr'] = lr
def classify(domain, features_filter=None, keep_features=False):
    """Fetch the features of ``domain`` and classify them with LightGBM.

    Args:
        domain: domain name handed to ``process``.
        features_filter: optional filter forwarded to ``process``.  When it is
            given, the ``app.featuresfs`` columns and the ``app.lightgbmfs``
            model are used instead of ``app.features`` / ``app.lightgbm``.
        keep_features: keep the raw ``'features'`` entry in the result.

    Returns:
        The dict produced by ``process`` extended with a ``'class'`` entry
        (``'code'`` and ``'label'``) and further ``'times'`` entries
        (``'setup'``, ``'features'/'postprocess'``, ``'classification'``,
        ``'total'``), each holding wall-clock and user-CPU seconds.  If
        ``process`` failed, its error dict (``'domain'``, ``'exception'``) is
        returned unchanged.
    """
    t0_time, t0_resources = timestamp(), resource_usage(RUSAGE_SELF)
    use_filter = features_filter is not None
    columns = app.featuresfs.columns if use_filter else app.features.columns
    evaluation_features = pd.DataFrame(columns=columns)
    t1_resources, t1_time = resource_usage(RUSAGE_SELF), timestamp()
    setup_times = {
        'wall': {'total': t1_time - t0_time},
        'user': {'total': t1_resources.ru_utime - t0_resources.ru_utime},
    }

    res = process(domain, features_filter)
    if 'exception' in res:
        # process() reports failures as a dict without 'times'/'features';
        # hand that back instead of failing with an unrelated KeyError.
        return res
    res['times']['setup'] = setup_times

    t1_resources, t1_time = resource_usage(RUSAGE_SELF), timestamp()
    # DataFrame.append was removed in pandas 2.0; concat a one-row frame.
    evaluation_features = pd.concat(
        [evaluation_features, pd.DataFrame([res['features']])],
        ignore_index=True,
    ).astype('float')
    if not keep_features:
        del res['features']
    t2_resources, t2_time = resource_usage(RUSAGE_SELF), timestamp()
    res['times']['features']['postprocess'] = {
        'wall': {'total': t2_time - t1_time},
        'user': {'total': t2_resources.ru_utime - t1_resources.ru_utime},
    }

    classifier = app.lightgbmfs if use_filter else app.lightgbm
    res['class'] = {'code': classifier.predict(evaluation_features)[0]}
    t3_resources, t3_time = resource_usage(RUSAGE_SELF), timestamp()
    res['times']['classification'] = {
        'wall': {'total': t3_time - t2_time},
        'user': {'total': t3_resources.ru_utime - t2_resources.ru_utime},
    }

    # Look the label up with the raw prediction before converting to int.
    res['class']['label'] = app.category_map[res['class']['code']]
    res['class']['code'] = int(res['class']['code'])
    t4_resources, t4_time = resource_usage(RUSAGE_SELF), timestamp()
    res['times']['total'] = {
        'wall': {'total': t4_time - t0_time},
        'user': {'total': t4_resources.ru_utime - t0_resources.ru_utime},
    }
    return res
def process(domain, features_filter=None,
            endpoint="http://155.54.210.169:8080/DGA/domain/features"):
    """Request the features of ``domain`` from the feature service.

    Args:
        domain: fully qualified domain name, sent as ``"fqdn"``.
        features_filter: optional dict merged into the request payload; when
            given, the request goes to ``endpoint + "/filtered"``.
        endpoint: base URL of the feature service.

    Returns:
        On success a dict with ``'domain'``, ``'status_code'``, ``'features'``
        (all response entries merged, without ``'class'`` and ``'domain'``)
        and ``'times'``.  ``times['features']`` holds ``'request'``,
        ``'cleaning'`` and ``'server'`` timings; the server values come from
        the ``wall-time-ms`` / ``cpu-time-ms`` response headers converted to
        seconds.  On any failure ``{'domain': domain, 'exception': ex}``.
    """
    headers = {'Content-Type': 'application/json'}
    times = {'features': {}}
    try:
        t0_time, t0_resources = timestamp(), resource_usage(RUSAGE_SELF)
        data = {"fqdn": domain}
        url = endpoint
        if features_filter is not None:
            data.update(features_filter)
            url += "/filtered"
        # NOTE(review): no timeout is set, so a stalled server blocks forever.
        r = requests.post(url, data=json.dumps(data), headers=headers)
        t1_resources, t1_time = resource_usage(RUSAGE_SELF), timestamp()
        times['features']['request'] = {
            'wall': {'total': t1_time - t0_time},
            'user': {'total': t1_resources.ru_utime - t0_resources.ru_utime},
        }

        # Flatten the list of dicts in the response; later entries win.
        features = {}
        for elem in r.json():
            features.update(elem)
        # These two entries are not features; drop them if present.
        features.pop('class', None)
        features.pop('domain', None)
        t2_resources, t2_time = resource_usage(RUSAGE_SELF), timestamp()
        times['features']['cleaning'] = {
            'wall': {'total': t2_time - t1_time},
            'user': {'total': t2_resources.ru_utime - t1_resources.ru_utime},
        }
        times['features']['server'] = {
            'wall': {'total': float(r.headers['wall-time-ms']) / 1000},
            'user': {'total': float(r.headers['cpu-time-ms']) / 1000},
        }
        return {
            'domain': domain,
            'status_code': r.status_code,
            'features': features,
            'times': times,
        }
    except Exception as ex:
        # Deliberate best effort: report the failure to the caller as data.
        return {'domain': domain, 'exception': ex}
#       [3, 4]]
# y1 = [[5, 6, 1],
#       [1, 3, 2]]
#
# print(multi(x1, y1))

# Matrix sizes to benchmark (the larger set is kept for reference).
# sizes = [250, 500, 1000, 1500, 2000]
sizes = [25, 50]

# One slot per size for wall-clock, user-CPU and system-CPU seconds.
total, user, sys = ([None] * len(sizes) for _ in range(3))

count = 0
for size in sizes:
    # Two random size x size matrices as nested lists.
    m1 = [[random() for _ in range(size)] for _ in range(size)]
    m2 = [[random() for _ in range(size)] for _ in range(size)]

    start_time = timestamp()
    start_resources = resource_usage(RUSAGE_SELF)
    m3 = multi(m1, m2)
    end_time = timestamp()
    end_resources = resource_usage(RUSAGE_SELF)

    user[count] = end_resources.ru_utime - start_resources.ru_utime
    sys[count] = end_resources.ru_stime - start_resources.ru_stime
    total[count] = end_time - start_time
    count += 1

for series in (user, sys, total):
    print(series)

# Plot the three timing series against the matrix size.
fig, ax = plt.subplots()
ax.set_prop_cycle(color=['red', 'green', 'blue'])
for series in (user, sys, total):
    plt.plot(sizes, series)
def input_layer(sh_list, sh_test, sh_c_list, shm_list, train_loader, test_loader, model, rank, split, batch_size,
                batch_num, test_batch_num, epoch_num, lamda, lr, gamma, cv):
    """Run the train/eval loop of the first stage of a model pipeline.

    Per training iteration the stage takes a batch from ``train_loader``, pads
    it to ``batch_size`` by repeating its own leading samples, sends the
    targets through ``send_target``, runs the forward pass, copies the output
    into ``send_output`` and signals ``feed_q1``.  Once ``time > delay`` it
    also receives a gradient from ``grad_q1``, back-propagates it through a
    stored output and steps the optimizer.  After every epoch the test set is
    run through ``model`` and each output is sent on ``feed_test``.  When all
    epochs are done ``feed_q1.terminate.value`` is set to 1.

    NOTE(review): the nesting was reconstructed from a source whose
    indentation was lost -- confirm against the original layout.

    Args:
        sh_list: indexable of channel objects (``recv``, ``send_wait``,
            ``async_send_signal``, ``terminate``).
        sh_test: indexable of channel objects used for evaluation (``send``).
        sh_c_list: indexable of buffers written with ``copy_``.
        shm_list: indexable whose first element receives the batch targets.
        train_loader: iterable of ``(data, target)`` training batches.
        test_loader: iterable of ``(data, target)`` evaluation batches.
        model: module trained by this stage; moved to GPU ``rank``.
        rank: position of this stage; also used as the CUDA device index.
        split: number of pipeline stages (presumably -- TODO confirm).
        batch_size: size every training batch is padded to.
        batch_num: number of training batches consumed per epoch.
        test_batch_num: unused.
        epoch_num: number of epochs.
        lamda: number of batches per step.
        lr: initial learning rate.
        gamma: factor applied to the learning rate after epochs 400 and 500.
        cv: unused (only referenced by commented-out code in the original).
    """
    feed_q1 = sh_list[rank]
    grad_q1 = sh_list[rank + split - 1]
    send_output = sh_c_list[rank]
    feed_test = sh_test[rank]
    send_target = shm_list[0]

    # Number of stages after this one: both the forward/backward lag and the
    # number of forward outputs that have to be remembered.
    delay = split - 1 - rank
    num_of_models = delay
    outputs = [0] * num_of_models

    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=0.0005, nesterov=True)
    model.cuda(rank)

    time_tot = 0
    steps = int(batch_num / lamda)
    # NOTE(review): compares against 1, not 0.  All batch_num batches are
    # still consumed (the last step takes the remainder), but the step
    # boundaries differ from a plain ceil division -- confirm the intent.
    if batch_num % lamda != 1:
        steps += 1
    lamda_back = lamda

    for epoch in range(epoch_num):
        s_t = timestamp()
        model.train()
        train_data = iter(train_loader)
        # Timing probes and accumulated per-phase durations for this epoch.
        t1 = t2 = t3 = t4 = t5 = t6 = t7 = 0
        td1 = td2 = td3 = td4 = td5 = 0

        for step in range(1, steps + 1):
            lamda = lamda_back
            # The last step only handles the batches that are left over.
            if step == steps:
                lamda = batch_num - (step - 1) * lamda

            for time in range(1, lamda + 1):
                if time <= lamda:
                    t1 = timestamp()
                    data, target = next(train_data)
                    # Pad a short batch by repeating its own leading samples.
                    while len(data) != batch_size:
                        missing = batch_size - len(data)
                        inputs_copy_len = missing if missing <= len(data) else len(data)
                        data = torch.cat([data, data[0:inputs_copy_len]], 0)
                        target = torch.cat([target, target[0:inputs_copy_len]], 0)
                    send_target.send(target)
                    data = data.cuda(rank, non_blocking=True)
                    t2 = timestamp()

                    model_idx = (time % num_of_models) - 1
                    output = model.forward(data)
                    # Remember the output for the delayed backward pass.
                    outputs[model_idx] = output
                    t3 = timestamp()

                    feed_q1.send_wait()
                    send_output.copy_(output.data)
                    feed_q1.async_send_signal()
                    t4 = timestamp()

                if time > delay:
                    t5 = timestamp()
                    pg = grad_q1.recv()
                    pg = pg.cuda(rank)
                    t6 = timestamp()

                    output_idx = ((time - delay) % num_of_models) - 1
                    optimizer.zero_grad()
                    outputs[output_idx].backward(pg)
                    optimizer.step()
                    t7 = timestamp()

                td1 += t2 - t1
                td2 += t3 - t2
                td3 += t4 - t3
                td4 += t6 - t5
                td5 += t7 - t6

        time_tot = time_tot + (timestamp() - s_t)
        print('rank =', rank, 'recv output =', td1)
        print('rank =', rank, 'forward =', td2)
        print('rank =', rank, 'send output', td3)
        print('rank =', rank, 'recv grad =', td4)
        print('rank =', rank, 'backward =', td5)

        # Evaluation pass: forward every test batch and ship the output.
        model.eval()
        for data, target in test_loader:
            x = Variable(data).cuda(rank)
            output = model.forward(x)
            feed_test.send(output.data.to('cpu'))

        if epoch == 400 or epoch == 500:
            lr = lr * gamma
            # Apply the decayed rate to the optimizer that is actually used;
            # the original looped over an always-empty list, so the decay
            # never took effect.
            for group in optimizer.param_groups:
                group['lr'] = lr

    feed_q1.terminate.value = 1
def train_with_blockcluster(
    dataset_file,
    graph,
    nb_row_clusters,
    nb_column_clusters,
    row_clusters_index,
    column_clusters_index,
):
    """Co-cluster ``graph`` with the R package blockcluster and store timings.

    Nothing is computed when the result file
    ``results_folder + <dataset basename> + "_bc.pkl"`` already exists.

    Args:
        dataset_file: path of the dataset; its basename names the result file.
        graph: sparse matrix providing ``todense()`` and ``shape``.
        nb_row_clusters: number of row clusters to fit.
        nb_column_clusters: number of column clusters to fit.
        row_clusters_index: reference row labels passed to ``CARI``.
        column_clusters_index: reference column labels passed to ``CARI``.

    Returns:
        ``None`` if the result file already exists, otherwise the result dict
        that was pickled.  Its ``real``, ``sys`` and ``user`` entries are
        elapsed times like those of the UNIX ``time`` command: ``real`` is the
        wall clock and ``user + sys`` is the CPU time used.
    """
    save_f = results_folder + dataset_file.split("/")[-1].split(".")[0] + "_bc.pkl"
    if save_f in glob.glob(results_folder + "*.pkl"):
        print("Already Done")
        return None
    print("BlockCluster :")

    # Convert sparse matrix to R matrix.
    B = graph.todense()
    nr, nc = B.shape
    Br = ro.r.matrix(B, nrow=nr, ncol=nc)

    # initmethod : Method to initialize model parameters. The valid values
    #     are "cemInitStep", "emInitStep" and "randomInit".
    # nbiterationsxem : Number of EM iterations used during xem step.
    #     Default value is 50.
    # nbinitmax : Maximal number initialization to try. Default value is 100.
    # nbinititerations : Number of Global iterations used in initialization
    #     step. Default value is 10.
    # initepsilon : Tolerance value used while initialization.
    #     Default value is 1e-2.
    # nbxem : Number of xem steps. Default value is 5.
    strategy = blockcluster.coclusterStrategy(
        initmethod="randomInit",
        nbinitmax=100,
        nbinititerations=10,
        nbiterationsXEM=5000,
        nbiterationsxem=10,
        initepsilon=1e-2,
        epsilonxem=1e-4,
        epsilonXEM=1e-10,
        stopcriteria="Likelihood",
        nbtry=1,
        nbxem=100,
    )

    start_time, start_resources = timestamp(), resource_usage(RUSAGE_SELF)
    results = blockcluster.cocluster(
        Br,
        "binary",
        nbcocluster=robjects.IntVector([nb_row_clusters, nb_column_clusters]),
        nbCore=1,
        strategy=strategy,
    )
    end_resources, end_time = resource_usage(RUSAGE_SELF), timestamp()
    print(end_time - start_time)

    rowclass = np.array(results.slots["rowclass"])
    colclass = np.array(results.slots["colclass"])
    icl = results.slots["ICLvalue"][0]
    co_ari = CARI(row_clusters_index, column_clusters_index, rowclass, colclass)

    results = {
        "lib": "blockcluster",
        "n1": graph.shape[0],
        "n2": graph.shape[1],
        "nq": nb_row_clusters,
        "nl": nb_column_clusters,
        "dataset_file": dataset_file,
        "icl": icl,
        "cari": co_ari,
        "real": end_time - start_time,
        "sys": end_resources.ru_stime - start_resources.ru_stime,
        "user": end_resources.ru_utime - start_resources.ru_utime,
    }
    print(f'BlockCluster tt time {results["user"]+results["sys"]}')
    # Context manager so the file is flushed and closed deterministically.
    with open(save_f, "wb") as out:
        pickle.dump(results, out)
    return results
def train_with_blockmodels(
    dataset_file,
    graph,
    nb_row_clusters,
    nb_column_clusters,
    row_clusters_index,
    column_clusters_index,
):
    """Co-cluster ``graph`` with the R package blockmodels and store timings.

    100 random initialisations are drawn with ``LBM._init_LBM_random``, each is
    scored with one ``blockmodels.dispatcher`` call (last argument ``False``),
    and the best-scoring one is used for the final call (last argument
    ``True``).  Nothing is computed when the result file
    ``results_folder + <dataset basename> + "_bm.pkl"`` already exists.

    Args:
        dataset_file: path of the dataset; its basename names the result file.
        graph: sparse matrix providing ``todense()``, ``shape`` and ``nnz``.
        nb_row_clusters: number of row clusters to fit.
        nb_column_clusters: number of column clusters to fit.
        row_clusters_index: reference row labels passed to ``CARI``.
        column_clusters_index: reference column labels passed to ``CARI``.

    Returns:
        ``None`` if the result file already exists, otherwise the result dict
        that was pickled (``real`` is wall-clock time, ``user`` and ``sys``
        are CPU times; initialisation scoring is included in the timing).
    """
    save_f = results_folder + dataset_file.split("/")[-1].split(".")[0] + "_bm.pkl"
    if save_f in glob.glob(results_folder + "*.pkl"):
        print("Already Done")
        return None
    print("blockmodels :")

    # Convert sparse matrix to R matrix.
    n1, n2 = graph.shape
    B = graph.todense()
    nr, nc = B.shape
    Br = ro.r.matrix(B, nrow=nr, ncol=nc)
    network = robjects.ListVector({"adjacency": Br})

    # A minimally fitted LBM is only used as a source of random
    # initialisations.
    model = LBM(
        nb_row_clusters,
        nb_column_clusters,
        n_init=1,
        n_iter_early_stop=1,
        n_init_total_run=1,
        max_iter=1,
        verbosity=0,
    )
    model.fit(graph)

    init_list = []
    for _ in range(100):
        init_values = model._init_LBM_random(
            n1, n2, nb_row_clusters, nb_column_clusters, graph.nnz)
        tau_1_init, tau_2_init = init_values[2], init_values[3]
        nr, nc = tau_1_init.shape
        t1_init = ro.r.matrix(tau_1_init, nrow=nr, ncol=nc)
        nr, nc = tau_2_init.shape
        t2_init = ro.r.matrix(tau_2_init, nrow=nr, ncol=nc)
        init_list.append(robjects.ListVector({"Z1": t1_init, "Z2": t2_init}))

    start_time, start_resources = timestamp(), resource_usage(RUSAGE_SELF)
    best_icl = -np.inf
    best_init = None
    for i, init in enumerate(init_list):
        print(f"Init {i}/{len(init_list)}", end="\r")
        results = blockmodels.dispatcher("LBM", init, "bernoulli", network, False)
        icl_or_ll = results[2][0]
        if icl_or_ll > best_icl:
            # Track the running maximum; the original never updated best_icl,
            # so every initialisation "won" and the last one was always used.
            best_icl = icl_or_ll
            best_init = init

    print("\n Start training best")
    results = blockmodels.dispatcher("LBM", best_init, "bernoulli", network, True)
    print("End training best")
    end_resources, end_time = resource_usage(RUSAGE_SELF), timestamp()

    icl = results[2][0]
    res_tau_1 = np.array(results[0][0])
    res_tau_2 = np.array(results[0][2])
    co_ari = CARI(
        row_clusters_index,
        column_clusters_index,
        res_tau_1.argmax(1),
        res_tau_2.argmax(1),
    )

    results = {
        "lib": "blockmodels",
        "n1": graph.shape[0],
        "n2": graph.shape[1],
        "nq": nb_row_clusters,
        "nl": nb_column_clusters,
        "dataset_file": dataset_file,
        "icl": icl,
        "cari": co_ari,
        "real": end_time - start_time,
        "sys": end_resources.ru_stime - start_resources.ru_stime,
        "user": end_resources.ru_utime - start_resources.ru_utime,
    }
    print(f'Blockmodels tt time {results["user"]+results["sys"]}')
    # Context manager so the file is flushed and closed deterministically.
    with open(save_f, "wb") as out:
        pickle.dump(results, out)
    return results
def trueit_queue(q):
    """Put ``True`` on ``q``."""
    q.put(True)


def trueit_pipe(conn):
    """Send ``True`` over ``conn`` and then close it."""
    conn.send(True)
    conn.close()


if __name__ == '__main__':
    # argv[1] is the repetition count; every further argument selects a test.
    reps = int(sys.argv[1])
    if reps != 0:
        for test in sys.argv[2:]:
            # Wall-clock and resource-usage snapshot taken before the test.
            start_time, start_resources = timestamp(), resource_usage(RUSAGE_SELF)
            if test == '1':
                # Spawn the external `true` command through the shell.
                print('os.system(\'true\') =')
                for i in xrange(0, reps):
                    os.system('true')
            elif test == '2':
                # Plain in-process function call.
                print('python local call, \'True\' =')
                for i in xrange(0, reps):
                    trueit_local()
            elif test == '3':
                # Re-run this script with reps == 0, which skips all tests in
                # the child (see the `reps != 0` guard above).
                print('os.system, python executable =')
                for i in xrange(0, reps):
                    command = sys.argv[0] + " 0 2"
                    os.system(command)
            # NOTE(review): no end-of-test timing is visible in this chunk;
            # the driver presumably continues beyond it -- confirm.
""" Code originally lifted from http://cv-tricks.com/tensorflow-tutorial/training-convolutional-neural-network-for-image-classification/ """ #Adding Seed so that random initialization is consistent from numpy.random import seed seed(1) from tensorflow import set_random_seed set_random_seed(3) from time import time import datetime st = datetime.datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S') batch_size = 32 start_time = time() start_resources = resource_usage(RUSAGE_SELF) trial_name = "pc_no_unidentified_rs_3"+st+".txt" #Prepare input data classes = ['Asterionella','Aulocoseira','Colonial Cyanobacteria','Cryptomonas','Detritus','Dolichospermum','Filamentous cyanobacteria','Romeria','Staurastrum'] #classes = ['Snowella', 'Staurastrum'] num_classes = len(classes) # 20% of the data will automatically be used for validation validation_size = 0.20 img_size = 256 num_channels = 3 os.chdir('..') train_path=os.getcwd() train_path += '/extracted_images/' # We shall load all the training and validation images and labels into memory using openCV and use that during training #data = dataset.read_train_sets(train_path, img_size, classes, validation_size=validation_size)
def _unix_runtime(self, function, args=tuple(), kwargs={}): start_time, start_resources = timestamp(), resource_usage(RUSAGE_SELF) function(*args, **kwargs) end_resources, end_time = resource_usage(RUSAGE_SELF), timestamp()