Python resource_usage 示例：resource.getrusage（常以别名 resource_usage 导入）的 Python 示例

示例#1

0

显示文件

文件： matrixProcessConcurrent.py 项目： m3lissaeg/hpc

def forkProcessFunction(matrix1, matrix2, result, n):
    """Multiply two matrices using NUM_OF_PROCESS worker processes.

    The ``n`` rows are split into equal consecutive ranges, one per worker;
    any remainder rows are added to the last worker's range. Every worker
    runs ``multiplyParallelMatrix(first_row, stop_row, matrix1, matrix2,
    result)``.

    Returns:
        The difference in ``ru_utime`` between the RUSAGE_SELF samples taken
        before the workers are started and after all of them were joined.
    """
    rows_per_worker = int(n/NUM_OF_PROCESS)
    # Rows left over when n is not a multiple of the worker count.
    leftover = n % NUM_OF_PROCESS

    # NOTE(review): if resource_usage is resource.getrusage, RUSAGE_SELF
    # covers the calling process only, so CPU time spent inside the child
    # processes would not be part of the returned value - confirm intent.
    usage_before = resource_usage(RUSAGE_SELF)

    workers = []
    for idx in range(NUM_OF_PROCESS):
        first_row = int(rows_per_worker * idx)
        stop_row = rows_per_worker * (idx + 1)
        # Only the last worker picks up the remainder rows.
        if leftover != 0 and idx == NUM_OF_PROCESS - 1:
            stop_row = stop_row + leftover

        worker = Process(
            target=multiplyParallelMatrix,
            args=(first_row, int(stop_row), matrix1, matrix2, result),
        )
        workers.append(worker)
        worker.start()

    # Wait for every worker before taking the closing sample.
    for worker in workers:
        worker.join()

    usage_after = resource_usage(RUSAGE_SELF)
    return usage_after.ru_utime - usage_before.ru_utime

示例#2

0

显示文件

文件： __init__.py 项目： teeerrytan/ForecastingQA

 def wrapper(*args, **kwargs):
     """Call ``func`` with the given arguments and print how long it took.

     Prints a one-entry dict whose value holds the wall-clock ('real'),
     system CPU ('sys') and user CPU ('user') deltas measured around the call.

     NOTE(review): the value returned by ``func`` is discarded and ``True`` is
     always returned - confirm that callers do not need the wrapped result.
     """
     wall_before = timestamp()
     usage_before = resource_usage(RUSAGE_SELF)

     func(*args, **kwargs)

     # Closing samples are taken in the reverse order of the opening ones.
     usage_after = resource_usage(RUSAGE_SELF)
     wall_after = timestamp()

     timings = {'real': wall_after - wall_before,
                'sys': usage_after.ru_stime - usage_before.ru_stime,
                'user': usage_after.ru_utime - usage_before.ru_utime}
     print({'消耗时间': timings})
     return True

示例#3

0

显示文件

文件： benchmark_libraries.py 项目： gfrisch/sparsebm

def train_with_sparsebm(
    dataset_file,
    graph,
    nb_row_clusters,
    nb_column_clusters,
    row_clusters_index,
    column_clusters_index,
    use_gpu=False,
    gpu_index=None,
):
    """Fit an LBM on ``graph``, time the fit and pickle a benchmark record.

    The record is written to ``results_folder`` under the dataset's base name
    with the suffix ``_sp.pkl`` (or ``_sp_gpu.pkl`` when ``use_gpu`` is
    truthy). If that file is already listed by ``glob`` in the results
    folder, nothing is run.

    Args:
        dataset_file: Path of the dataset; only its base name (up to the
            first ".") is used to build the result file name. It is also
            stored verbatim in the record.
        graph: Data passed to ``model.fit``; ``graph.shape[0]`` and
            ``graph.shape[1]`` are stored as "n1" and "n2".
        nb_row_clusters: Number of row clusters given to ``LBM``.
        nb_column_clusters: Number of column clusters given to ``LBM``.
        row_clusters_index: Reference row labels passed to ``CARI``.
        column_clusters_index: Reference column labels passed to ``CARI``.
        use_gpu: Forwarded to ``LBM`` and selects the result file suffix.
        gpu_index: Forwarded to ``LBM``.

    Returns:
        The result dict that was pickled, or ``None`` when the result file
        already existed.
    """
    suffix = "_sp_gpu.pkl" if use_gpu else "_sp.pkl"
    save_f = results_folder + dataset_file.split("/")[-1].split(".")[0] + suffix
    if save_f in glob.glob(results_folder + "*.pkl"):
        print("Already Done")
        return None

    model = LBM(
        nb_row_clusters,
        nb_column_clusters,
        n_init=100,
        n_iter_early_stop=10,
        n_init_total_run=1,
        max_iter=5000,
        verbosity=1,
        use_gpu=use_gpu,
        gpu_index=gpu_index,
    )

    # Only the fit itself is timed; scoring below is outside the window.
    start_time, start_resources = timestamp(), resource_usage(RUSAGE_SELF)
    model.fit(graph)
    end_resources, end_time = resource_usage(RUSAGE_SELF), timestamp()

    co_ari = CARI(
        row_clusters_index,
        column_clusters_index,
        model.row_labels,
        model.column_labels,
    )
    icl = model.get_ICL()
    results = {
        "lib": "sparsebm",
        "gpu": use_gpu,
        "n1": graph.shape[0],
        "n2": graph.shape[1],
        "nq": nb_row_clusters,
        "nl": nb_column_clusters,
        "dataset_file": dataset_file,
        "icl": icl,
        "cari": co_ari,
        "real": end_time - start_time,
        "sys": end_resources.ru_stime - start_resources.ru_stime,
        "user": end_resources.ru_utime - start_resources.ru_utime,
    }
    print(f'SparseBM tt time {results["user"]+results["sys"]}')
    # Context manager so the handle is flushed and closed even if dump fails.
    with open(save_f, "wb") as out_file:
        pickle.dump(results, out_file)
    return results

示例#4

0

显示文件

文件： test_modn.py 项目： taraspiotr/python_task0

def PowerModuloT(a,b,c):
  """Call PowerModulo(a, b, c) and flag the run as slow if it was expensive.

  The call is bracketed by two RUSAGE_SELF samples. When the ``ru_utime``
  difference exceeds 0.001 a message is printed and the module-level ``slow``
  flag is set to True.

  Returns:
      Whatever ``PowerModulo(a, b, c)`` returned.
  """
  global slow
  r1 = resource_usage(RUSAGE_SELF)
  ret = PowerModulo(a,b,c)
  r2 = resource_usage(RUSAGE_SELF)
  t = r2.ru_utime - r1.ru_utime
  if t > 0.001:
    # Single parenthesised argument: valid on Python 3 and prints the same
    # text on Python 2, where the statement form was the only one accepted.
    print("PowerModulo(%d,%d,%d) is slow! %.4fs" % (a,b,c,t))
    slow = True
  return ret

示例#5

0

显示文件

 def wrappedMethod(*args, **kwargs):
     """Run ``function`` with the given arguments, print its timings, return its result.

     Two lines are printed after the call: a header naming the wrapped
     function and a dict with the 'real', 'sys' and 'user' time deltas.
     """
     from time import time as timestamp
     from resource import getrusage as resource_usage, RUSAGE_SELF

     wall_before = timestamp()
     usage_before = resource_usage(RUSAGE_SELF)

     outcome = function(*args, **kwargs)

     # Closing samples are taken in the reverse order of the opening ones.
     usage_after = resource_usage(RUSAGE_SELF)
     wall_after = timestamp()

     results = {}
     results['real'] = wall_after - wall_before
     results['sys'] = usage_after.ru_stime - usage_before.ru_stime
     results['user'] = usage_after.ru_utime - usage_before.ru_utime

     print("Execution time for {0}".format(function.__name__))
     print(results)
     return outcome

示例#6

0

显示文件

文件： matrixProcess.py 项目： m3lissaeg/hpc

def multiplyMatrix(matrix1, matrix2, result):
    """Accumulate the product of two n x n matrices into ``result`` in place.

    ``n`` is read from module scope. For every cell the products
    ``matrix1[i][k] * matrix2[k][j]`` are added onto the value already stored
    in ``result[i][j]``, so ``result`` is expected to be pre-initialised by
    the caller.

    Returns:
        The ``ru_utime`` difference between the RUSAGE_SELF samples taken
        before and after the triple loop.
    """
    usage_before = resource_usage(RUSAGE_SELF)

    for row in range(n):
        left_row = matrix1[row]
        out_row = result[row]
        for col in range(n):
            # Walk the shared dimension, adding each partial product in place.
            for inner in range(n):
                out_row[col] += left_row[inner] * matrix2[inner][col]

    usage_after = resource_usage(RUSAGE_SELF)
    return usage_after.ru_utime - usage_before.ru_utime

示例#7

0

显示文件

文件： unix_time.py 项目： xc611/chiron

def unix_time(function, args=tuple(), kwargs=None):
    """Return `real`, `sys` and `user` elapsed time, like UNIX's command `time`.

    You can calculate the amount of CPU time used by your function/callable
    by summing `user` and `sys`. `real` is just like the wall clock.

    Note that the resolutions of `sys` and `user` are limited by the
    resolution of the operating system's software clock (check `man 7 time`
    for more details).

    Args:
        function: Callable to time; its return value is discarded.
        args: Positional arguments passed to ``function``.
        kwargs: Keyword arguments passed to ``function``; ``None`` (the
            default) means no keyword arguments.

    Returns:
        A dict with the keys 'real', 'sys' and 'user', each holding the
        difference between the samples taken after and before the call.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if kwargs is None:
        kwargs = {}

    start_time, start_resources = timestamp(), resource_usage(RUSAGE_SELF)
    function(*args, **kwargs)
    end_resources, end_time = resource_usage(RUSAGE_SELF), timestamp()

    return {'real': end_time - start_time,
            'sys': end_resources.ru_stime - start_resources.ru_stime,
            'user': end_resources.ru_utime - start_resources.ru_utime}

示例#8

0

显示文件

文件： train.py 项目： szaman19/Phytoplankton_Classifier

def show_progress(epoch, feed_dict_train, feed_dict_validate, val_loss):
    """Evaluate accuracy on both feeds, then log and print one progress line.

    The line reports the 1-based epoch, training and validation accuracy, the
    validation loss, and the wall-clock, user-CPU and system-CPU time elapsed
    since the module-level ``start_time`` / ``start_resources`` samples. It
    is appended to the file named by ``trial_name`` and printed to stdout.

    Args:
        epoch: Zero-based epoch index; reported as ``epoch + 1``.
        feed_dict_train: Feed dict used to evaluate training accuracy.
        feed_dict_validate: Feed dict used to evaluate validation accuracy.
        val_loss: Validation loss to report.
    """
    acc = session.run(accuracy, feed_dict=feed_dict_train)
    val_acc = session.run(accuracy, feed_dict=feed_dict_validate)
    msg = "Training Epoch {0} --- Training Accuracy: {1:>6.1%}, Validation Accuracy: {2:>6.1%},  Validation Loss: {3:.3f}, Real Time:{4:.4f}, CPU Time:{5:.4f}, SYS Time:{6:.4f}"
    end_resources = resource_usage(RUSAGE_SELF)
    # Format once so the log file and the console show exactly the same
    # numbers (the elapsed wall time used to be sampled twice).
    line = msg.format(
        epoch + 1,
        acc,
        val_acc,
        val_loss,
        time() - start_time,
        end_resources.ru_utime - start_resources.ru_utime,
        end_resources.ru_stime - start_resources.ru_stime,
    )
    # The context manager closes the log even if a write fails.
    with open(trial_name, "a") as log:
        log.write(line)
        log.write("\n")
    print(line)

示例#9

0

显示文件

def threadFunction():
    """Time the threaded matrix multiplication with 2, 3 and 4 threads.

    For each thread count the ``n`` rows (module-level ``n``) are split into
    equal consecutive ranges, with any remainder rows added to the last
    thread's range, and every thread runs
    ``multiplyParallelMatrix(first_row, stop_row)``.

    Returns:
        A list with one entry per thread count (in the order 2, 3, 4), each
        the ``ru_utime`` difference between the RUSAGE_SELF samples taken
        before the threads were started and after all were joined.
    """
    cpu_times = []
    for thread_count in range(2, 5):
        rows_per_thread = int(n/thread_count)
        # Rows left over when n is not a multiple of the thread count.
        leftover = n % thread_count

        usage_before = resource_usage(RUSAGE_SELF)

        workers = []
        for idx in range(thread_count):
            stop_row = rows_per_thread * (idx + 1)
            # Only the last thread picks up the remainder rows.
            if leftover != 0 and idx == thread_count - 1:
                stop_row = stop_row + leftover

            worker = Thread(
                target=multiplyParallelMatrix,
                args=(int(rows_per_thread * idx), int(stop_row)),
            )
            workers.append(worker)
            worker.start()

        # Wait for every thread before taking the closing sample.
        for worker in workers:
            worker.join()

        usage_after = resource_usage(RUSAGE_SELF)
        cpu_times.append(usage_after.ru_utime - usage_before.ru_utime)

    return cpu_times

示例#10

0

显示文件

文件： timer.py 项目： 4d105s25p3/ibrowser2

def run(func, *args, **kwds):
    """Call ``func(*args, **kwds)`` and report several timings for the call.

    Three measurements are nested around the call, outermost first: the
    RUSAGE_SELF samples, ``time.perf_counter`` and ``time.process_time``.

    Args:
        func: Callable to run.
        *args: Positional arguments forwarded to ``func``.
        **kwds: Keyword arguments forwarded to ``func``.

    Returns:
        A 5-tuple ``(value, total_time, process_time, sys_self, user_self)``:
        the result of ``func``, the ``perf_counter`` delta, the
        ``process_time`` delta, and the ``ru_stime`` / ``ru_utime`` deltas of
        the RUSAGE_SELF samples.
    """
    # The RUSAGE_CHILDREN samples that used to be taken here were never read;
    # dropping them keeps extra calls out of the self-usage window.
    start_self = resource_usage(RUSAGE_SELF)
    start1 = time.perf_counter()
    start2 = time.process_time()

    value = func(*args, **kwds)

    end2 = time.process_time()
    end1 = time.perf_counter()
    end_self = resource_usage(RUSAGE_SELF)

    total_time = end1 - start1
    process_time = end2 - start2

    sys_self = end_self.ru_stime - start_self.ru_stime
    user_self = end_self.ru_utime - start_self.ru_utime

    return value, total_time, process_time, sys_self, user_self

示例#11

0

显示文件

文件： run_nn_MPI.py 项目： gunnyeong0813/ASGD_TIMIT

  def main(self,rank):
      """Run the training, evaluation or forward job for one rank.

      Reads the job options with read_conf(), selects a CUDA device, loads
      the feature/label chunks with load_chunk(), builds the network selected
      by options.NN_type and then either
        * runs a single pass over the batches when do_forward is set, or
        * runs a 24-epoch loop with a validation pass before every epoch
          except the first.
      Finally a checkpoint is saved (when training) and loss, error and timing
      figures are appended to an info file whose name is options.out_file
      with ".pkl" replaced by ".info".

      Args:
          rank: Rank of this process. It is compared as int(rank) against
              0..4 to pick the parameter-server role and the data partition,
              and it is passed through to the network and to
              MAIN_CLASS.ensure_shared_params.

      The method also reads sys.argv[1], sys.argv[3] and sys.argv[4].
      All timers below keep only the ru_utime field of the rusage samples;
      the timestamp() values taken next to them are discarded into `_`.
      """
      os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3"
      options=read_conf()

      # The three mode switches are stored as "0"/"1"-style values in the options.
      do_training=bool(int(options.do_training))
      do_eval=bool(int(options.do_eval))
      do_forward=bool(int(options.do_forward))
      if do_forward:
        torch.cuda.set_device(0)
        device = "cuda:{}".format(0)
      else:
        # NOTE(review): for rank 0 this evaluates to device index -1 -
        # confirm rank 0 never uses `device` on this path.
        torch.cuda.set_device(dist.get_rank()-1)
        device = "cuda:{}".format(dist.get_rank()-1)
      PS = Parameter_Server()
      if int(rank)==0 and do_training:
        PS.ps_server(rank)
      # port and ip_add are not used again in this method.
      port = sys.argv[1]
      world_size = sys.argv[3]
      ip_add = sys.argv[4]


      fea_scp=options.fea_scp
      fea_opts=options.fea_opts
      lab_folder=options.lab_folder
      lab_opts=options.lab_opts
      
      # Validation (dev) set locations are hard-coded rather than read from the options.
      dev_fea_scp="/home/slave3/kaldi/egs/timit/s5/pytorch-kaldi/exp/mfcc_shu/dev_split.000"
      dev_fea_opts="apply-cmvn --utt2spk=ark:$KALDI_ROOT/egs/timit/s5/data/dev/utt2spk  ark:$PYTORCH_EXP/mfcc_shu/dev_cmvn_speaker.ark ark:- ark:- | add-deltas --delta-order=2 ark:- ark:- |"
      dev_lab_folder='/home/slave3/kaldi/egs/timit/s5/exp/dnn4_pretrain-dbn_dnn_ali_dev'
      dev_lab_opts='ali-to-pdf'
      
      
      
      out_file=options.out_file
      

      count_file=options.count_file
      pt_file=options.pt_file

      left=int(options.cw_left)
      right=int(options.cw_right)
      seed=int(options.seed)
      use_cuda=bool(int(options.use_cuda))
      multi_gpu=bool(int(options.multi_gpu))
      NN_type=options.NN_type
      

      batch_size=int(options.batch_size)
      lr=float(options.lr)
      save_gpumem=int(options.save_gpumem)
      # NOTE(review): `opt` is read here but not used anywhere below.
      opt=options.optimizer
      # Pick the network class; `rnn` records whether it is a recurrent model.
      # NOTE(review): if NN_type is none of the four values below, `ann` and
      # `rnn` stay undefined and the next use raises NameError.
      if NN_type=='RNN':
         from neural_nets import RNN as ann
         rnn=1
      
      if NN_type=='LSTM':
         from neural_nets import LSTM as ann
         rnn=1
         
      if NN_type=='GRU':
        from neural_nets import GRU as ann
        rnn=1
      if NN_type=='MLP':
         from neural_nets import MLP as ann
         rnn=0

      # The network is built with these fixed sizes; options.input_dim and
      # options.num_classes are overwritten further down from the loaded
      # data, but only after `net` has already been constructed.
      options.input_dim=429
      options.num_classes=1944

      net = ann(options)
      if use_cuda:
            net.cuda(device=device)
      # Timer bookkeeping: for each phase a last value, a running sum and the
      # start/end rusage samples.
      update_time=0
      sum_update_time=0
      st_update_time=0
      end_update_time=0
      
      
      shu_time=0
      sum_shu_time=0
      st_shu_time=0
      end_shu_time=0
      
      model_time=0
      sum_model_time=0
      st_model_time=0
      end_model_time=0
      
      load_time=0
      sum_load_time=0
      st_load_time=0
      end_load_time=0
      
      val_time=0
      sum_val_time=0
      st_val_time=0
      end_val_time=0
      
      epoch_time=0
      sum_epoch_time=0
      st_epoch_time=0
      end_epoch_time=0  
      
      data_time=0

      st_data_time=0
      end_data_time=0 
      
      
      train_time=0

      st_train_time=0
      end_train_time=0 
      # Start of the overall timer.
      _, st_train_time= timestamp(), resource_usage(RUSAGE_SELF)   

      torch.manual_seed(seed)
      random.seed(seed)
      print("[INFO] Batch size: ",batch_size)
      # NOTE(review): -1 is passed to load_chunk as the seed for recurrent
      # models and for eval/forward runs; presumably it disables shuffling -
      # confirm against load_chunk.
      if rnn or do_eval or do_forward:
         seed=-1
      # Start of the data-loading timer.
      _, st_data_time= timestamp(), resource_usage(RUSAGE_SELF)   
      if do_forward == 1:
        dev_data_name=[0]
      if do_forward == 0:
        [dev_data_name,dev_data_set_ori,dev_data_end_index]=load_chunk(dev_fea_scp,dev_fea_opts,dev_lab_folder,dev_lab_opts,left,right,-1)   

      [data_name,data_set_ori,data_end_index]=load_chunk(fea_scp,fea_opts,lab_folder,lab_opts,left,right,seed)

      # Rows per worker, with world_size-1 workers.
      # NOTE(review): raises ZeroDivisionError when world_size is 1.
      data_len = int(len(data_set_ori)/(int(world_size)-1))
      if do_training:
        # Each rank from 1 up to the worker count keeps its own contiguous
        # slice; only 1 to 4 workers are handled, and any other rank (or a
        # larger worker count) keeps the full data set.
        if int(world_size)-1==1:
          print("Partition data 1")
        elif int(world_size)-1==2:
          print("partition data 2")
          if int(rank)==1:
            data_set_ori = data_set_ori[0:data_len]
          elif int(rank)==2:
            data_set_ori = data_set_ori[data_len:]
        elif int(world_size)-1==3:
          print("partition data 3")
          if int(rank)==1:
            data_set_ori = data_set_ori[0:data_len]
          elif int(rank)==2:
            data_set_ori = data_set_ori[data_len:data_len*2]
          elif int(rank)==3:
            data_set_ori = data_set_ori[data_len*2:]
        elif int(world_size)-1==4:
          print("partition data 4")
          if int(rank)==1:
            data_set_ori = data_set_ori[0:data_len]
          elif int(rank)==2:
            data_set_ori = data_set_ori[data_len:data_len*2]
          elif int(rank)==3:
            data_set_ori = data_set_ori[data_len*2:data_len*3]
          elif int(rank)==4:
            data_set_ori = data_set_ori[data_len*3:]
        data_len = len(data_set_ori)

      end_data_time,_  = resource_usage(RUSAGE_SELF), timestamp()
      data_time = end_data_time.ru_utime - st_data_time.ru_utime
      print("data generate time: ", data_time)


      print(np.shape(data_set_ori))

      # Keep the data on the GPU unless save_gpumem is set.
      if not(save_gpumem):
         data_set=torch.from_numpy(data_set_ori).float().cuda(device=device)
      else:
         data_set=torch.from_numpy(data_set_ori).float()   
      if do_forward ==0:  
        if not(save_gpumem):
           dev_data_set=torch.from_numpy(dev_data_set_ori).float().cuda(device=device)
        else:
           dev_data_set=torch.from_numpy(dev_data_set_ori).float()  

      # The last column of data_set holds the label; the rest are features.
      N_fea=data_set.shape[1]-1
      options.input_dim=N_fea
      N_out=int(data_set[:,N_fea].max()-data_set[:,N_fea].min()+1) 
      options.num_classes=N_out
      

      if multi_gpu:
       net = nn.DataParallel(net)
       
       

      
      # NOTE(review): optimizer_worker is None at the check below, so SGD is
      # always chosen and the RMSprop branch is unreachable.
      optimizer_worker=None       

      if optimizer_worker is None:
              optimizer_worker = optim.SGD(net.parameters(), lr=lr)
      else:
        optimizer_worker = optim.RMSprop(net.parameters(), lr=lr,alpha=0.95, eps=1e-8) 
      # A saved checkpoint is only restored in forward mode.
      if do_forward:     
        if pt_file!='none':
          checkpoint_load = torch.load(pt_file)
          net.load_state_dict(checkpoint_load['model_par'])
          optimizer_worker.load_state_dict(checkpoint_load['optimizer_par'])
          optimizer_worker.param_groups[0]['lr']=lr

      dev_N_snt=len(dev_data_name)
      N_snt=len(data_name)
      
      
      if do_training:
        print("do training")
        net.train()
        test_flag=0   

        # NOTE(review): this inner check repeats the enclosing condition, so
        # its else branch is unreachable.
        if do_training:
          N_batches=int((N_snt/batch_size)/(int(world_size)-1))
        else:
          N_batches=int(N_snt/batch_size) 
 
        # Non-recurrent models batch over rows instead of over sentences.
        if rnn==0:
         N_ex_tr=data_set.shape[0]
         N_batches=int(N_ex_tr/batch_size)
         
      if do_eval:
       N_batches=N_snt
       net.eval()
       test_flag=1
       batch_size=1
       
       # NOTE(review): post_file and counts are only created when do_eval and
       # do_forward are both set, but they are used further down under
       # do_forward alone.
       if do_forward:
        post_file=kaldi_io.open_or_fd(out_file,'wb')
        counts = load_counts(count_file)
        

      beg_batch=0
      end_batch=beg_batch+batch_size   
      
      dev_beg_batch=0
      dev_end_batch=dev_beg_batch+1
      
      
      snt_index=0
      beg_snt=0 
      dev_beg_snt=0
      loss_sum=0
      err_sum=0
      dev_loss_sum=0
      dev_err_sum=0
      temp_err=0
      dev_err_sum_tot=0
      dev_N_batches=0

      num_epoch=24
      main_class = MAIN_CLASS()
      if do_forward:
        # Forward mode: a single pass over the batches.
        # NOTE(review): N_batches, test_flag, inp and lab are only assigned
        # under do_training or do_eval; with neither set this loop raises
        # NameError.
        for i in range(N_batches):
           if do_training :
            
            if rnn==1:
             # max_len is the length of the last sentence of the batch (the
             # difference between its end index and the previous one).
             max_len=data_end_index[snt_index+batch_size-1]-data_end_index[snt_index+batch_size-2]
           
             inp= Variable(torch.zeros(max_len,batch_size,N_fea)).contiguous()
             lab= Variable(torch.zeros(max_len,batch_size)).contiguous().long()
             
           
             # Copy each sentence into the zero tensor at a random offset so
             # that the remaining positions stay zero.
             for k in range(batch_size):
              snt_len=data_end_index[snt_index]-beg_snt
              N_zeros=max_len-snt_len
              N_zeros_left=random.randint(0,N_zeros)
              inp[N_zeros_left:N_zeros_left+snt_len,k,:]=data_set[beg_snt:beg_snt+snt_len,0:N_fea] 
              lab[N_zeros_left:N_zeros_left+snt_len,k]=data_set[beg_snt:beg_snt+snt_len,-1]
              
              beg_snt=data_end_index[snt_index]
              snt_index=snt_index+1
           
            else: 

             inp= Variable(data_set[beg_batch:end_batch,0:N_fea]).contiguous().cuda(device=device)
             lab= Variable(data_set[beg_batch:end_batch,N_fea]).contiguous().long().cuda(device=device)
             
            
           if do_eval:
              # One sentence per batch: rows beg_snt up to data_end_index[i].
              end_snt=data_end_index[i]
              inp= Variable(data_set[beg_snt:end_snt,0:N_fea],volatile=True).contiguous().cuda(device=device)
              lab= Variable(data_set[beg_snt:end_snt,N_fea],volatile=True).contiguous().long().cuda(device=device)
              if rnn==1:
                inp=inp.view(inp.shape[0],1,inp.shape[1])
                lab=lab.view(lab.shape[0],1)
              beg_snt=data_end_index[i]
            
           
           [loss,err,pout] = net(inp,lab,test_flag,rank)
           
           if multi_gpu:
             loss=loss.mean()
             err=err.mean()
        
           if do_forward:
            if rnn==1:
               pout=pout.view(pout.shape[0]*pout.shape[1],pout.shape[2]) 
            # Only rank 0 writes the outputs here, after subtracting the log
            # of the normalised counts.
            if int(rank)==0:
              kaldi_io.write_mat(post_file, pout.data.cpu().numpy()-np.log(counts/np.sum(counts)), data_name[i])
            
           if do_training:

            # NOTE(review): `optimizer` is not defined in this method (only
            # `optimizer_worker` is); unless it exists at module level this
            # raises NameError.
            optimizer.zero_grad()  
          

            loss.backward()


            optimizer.step()

           
           loss_sum=loss_sum+loss.data
           err_sum=err_sum+err.data

           beg_batch=end_batch
           end_batch=beg_batch+batch_size

      else:

       # Epoch loop; m counts the batches processed across all epochs.
       m=0 
       for e in range(num_epoch):
        # NOTE(review): the label says "Batch size" but the value printed is
        # the running batch counter m.
        print("Batch size: ",m)
        _, st_epoch_time= timestamp(), resource_usage(RUSAGE_SELF)
        # Validation, learning-rate check and reshuffling run before every
        # epoch except the first.
        if e>0:
          
          dev_N_batches=dev_N_snt
          # Remember the previous validation error for the comparison below.
          if e>1:
              temp_err=dev_err_sum_tot

          net.eval()
          test_flag=1
          dev_batch_size=1
          dev_beg_batch=0
          dev_end_batch=dev_beg_batch+1
          dev_loss_sum=0
          dev_err_sum=0
          dev_beg_snt=0
          _, st_val_time= timestamp(), resource_usage(RUSAGE_SELF)
          
          
          for j in range(dev_N_batches):
               
                end_snt=dev_data_end_index[j]
                dev_inp= Variable(dev_data_set[dev_beg_snt:end_snt,0:N_fea],volatile=True).contiguous().cuda(device=device)
                dev_lab= Variable(dev_data_set[dev_beg_snt:end_snt,N_fea],volatile=True).contiguous().long().cuda(device=device)
                # NOTE(review): this reshapes inp/lab (left over from the
                # training loop), not dev_inp/dev_lab - looks like a
                # copy-paste slip; confirm.
                if rnn==1:
                  inp=inp.view(inp.shape[0],1,inp.shape[1])
                  lab=lab.view(lab.shape[0],1)
                dev_beg_snt=dev_data_end_index[j]

                [dev_loss,dev_err,dev_pout] = net(dev_inp,dev_lab,test_flag,rank)

                dev_loss_sum=dev_loss_sum+dev_loss.data
                dev_err_sum=dev_err_sum+dev_err.data
                         
                dev_beg_batch=dev_end_batch
             
                dev_end_batch=dev_beg_batch+dev_batch_size
                
          end_val_time,_  = resource_usage(RUSAGE_SELF), timestamp()
          val_time = end_val_time.ru_utime - st_val_time.ru_utime
          sum_val_time=sum_val_time+val_time
          # The val_time figure printed here is the cumulative sum_val_time.
          print('[INFO] EPOCH: %d, In Worker: %d, val_Err: %0.3f, val_loss: %0.3f, val_time: %0.3f' % ((e+1), int(rank),dev_err_sum/dev_N_batches, dev_loss_sum/dev_N_batches, sum_val_time))
          dev_err_sum_tot=dev_err_sum/dev_N_batches   
          # Halve lr when the relative improvement of the validation error
          # falls below 0.0005.
          # NOTE(review): only the local `lr` changes; nothing visible in
          # this method writes it back into optimizer_worker.
          if e>1:
              threshold = (temp_err-dev_err_sum_tot)/dev_err_sum_tot

              if threshold<0.0005:
                lr = lr * 0.5
          
          net.train()

          beg_batch=0
          end_batch=beg_batch+batch_size
          
          beg_snt=0
          # NOTE(review): snt_index is not reset here, unlike beg_snt and
          # beg_batch - confirm this is intended for recurrent models.

          _, st_shu_time= timestamp(), resource_usage(RUSAGE_SELF)
          
          # Shuffle the data in place and rebuild the tensor from it.
          np.random.shuffle(data_set_ori)
          
          if not(save_gpumem):
             data_set=torch.from_numpy(data_set_ori).float().cuda(device=device)
          else:
             data_set=torch.from_numpy(data_set_ori).float()  

          N_fea=data_set.shape[1]-1
          options.input_dim=N_fea
          N_out=int(data_set[:,N_fea].max()-data_set[:,N_fea].min()+1) 
          options.num_classes=N_out
          end_shu_time,_  = resource_usage(RUSAGE_SELF), timestamp()
          shu_time = end_shu_time.ru_utime - st_shu_time.ru_utime
          sum_shu_time=sum_shu_time+shu_time
          loss_sum=0
          err_sum=0

        for i in range(N_batches):

           # NOTE(review): no work happens between these two samples, so
           # load_time only measures the sampling overhead.
           _, st_load_time= timestamp(), resource_usage(RUSAGE_SELF)

           end_load_time,_  = resource_usage(RUSAGE_SELF), timestamp()
           load_time = end_load_time.ru_utime - st_load_time.ru_utime
           if do_training :
            
            if rnn==1:
             max_len=data_end_index[snt_index+batch_size-1]-data_end_index[snt_index+batch_size-2]
           
             inp= Variable(torch.zeros(max_len,batch_size,N_fea)).contiguous()
             lab= Variable(torch.zeros(max_len,batch_size)).contiguous().long()
           
           
             # Same random-offset zero padding as in the forward branch.
             for k in range(batch_size):
              snt_len=data_end_index[snt_index]-beg_snt
              N_zeros=max_len-snt_len

              N_zeros_left=random.randint(0,N_zeros)

              inp[N_zeros_left:N_zeros_left+snt_len,k,:]=data_set[beg_snt:beg_snt+snt_len,0:N_fea] 
              lab[N_zeros_left:N_zeros_left+snt_len,k]=data_set[beg_snt:beg_snt+snt_len,-1]
              
              beg_snt=data_end_index[snt_index]
              snt_index=snt_index+1
           
           
            else:

             inp= Variable(data_set[beg_batch:end_batch,0:N_fea]).contiguous().cuda(device=device)
             lab= Variable(data_set[beg_batch:end_batch,N_fea]).contiguous().long().cuda(device=device)
            
            
           if do_eval:
              end_snt=data_end_index[i]
              inp= Variable(data_set[beg_snt:end_snt,0:N_fea],volatile=True).contiguous().cuda(device=device)
              lab= Variable(data_set[beg_snt:end_snt,N_fea],volatile=True).contiguous().long().cuda(device=device)
              if rnn==1:
                inp=inp.view(inp.shape[0],1,inp.shape[1])
                lab=lab.view(lab.shape[0],1)
              beg_snt=data_end_index[i]
              

           [loss,err,pout] = net(inp,lab,test_flag,rank)

           if multi_gpu:
             loss=loss.mean()
             err=err.mean()
            
           # NOTE(review): this block sits in the branch taken when
           # do_forward is false, so it cannot run; it also checks rank 1
           # where the forward branch checks rank 0.
           if do_forward:
            if rnn==1:
               pout=pout.view(pout.shape[0]*pout.shape[1],pout.shape[2]) 
            if int(rank)==1:
              kaldi_io.write_mat(post_file, pout.data.cpu().numpy()-np.log(counts/np.sum(counts)), data_name[i])
            
           if do_training:

            optimizer_worker.zero_grad()  
          

            loss.backward()

            # No optimizer_worker.step() follows the backward pass here;
            # presumably the parameter update is applied inside
            # ensure_shared_params - confirm.
            _,st_update_time = timestamp(), resource_usage(RUSAGE_SELF)
            
            main_class.ensure_shared_params(net,rank)
            end_update_time,_  = resource_usage(RUSAGE_SELF), timestamp()
            update_time = end_update_time.ru_utime-st_update_time.ru_utime
            
            
            cc=0
            # NOTE(review): as with load_time, nothing runs between these two
            # samples.
            _,st_model_time = timestamp(), resource_usage(RUSAGE_SELF)

            end_model_time,_  = resource_usage(RUSAGE_SELF), timestamp()
            model_time = end_model_time.ru_utime-st_model_time.ru_utime

            b=0
             

           # When do_training is false, update_time and model_time keep the
           # values they had before.
           sum_update_time=sum_update_time + update_time
           sum_load_time=sum_load_time+load_time
           sum_model_time= sum_model_time+model_time
           loss_sum=loss_sum+loss.data
           err_sum=err_sum+err.data

           # Progress report every 100 batches (skipping batch 0).
           # NOTE(review): the sums cover i+1 batches but are divided by i.
           if i%100==0:
             
             if i!=0:

               print('[INFO] EPOCH: %d, Batch: %d, In Worker: %d, Err: %0.3f, loss: %0.3f, update_time: %0.3f, load_time: %0.3f' % ((e+1),i, int(rank),err_sum/i, loss_sum/i,sum_update_time,sum_load_time))           

           beg_batch=end_batch
           end_batch=beg_batch+batch_size

           m=m+1
        end_epoch_time,_  = resource_usage(RUSAGE_SELF), timestamp()
        epoch_time = end_epoch_time.ru_utime - st_epoch_time.ru_utime
        sum_epoch_time= sum_epoch_time+epoch_time

        # A checkpoint is written to options.out_file after every epoch.
        if do_training:
            checkpoint={'model_par': net.state_dict(),
                    'optimizer_par' : optimizer_worker.state_dict()}
            torch.save(checkpoint,options.out_file)    

      # Averages over the batches of the last pass (the sums are reset at the
      # start of every epoch after the first).
      loss_tot=loss_sum/(N_batches)
      err_tot=err_sum/(N_batches)
      end_train_time,_  = resource_usage(RUSAGE_SELF), timestamp() 
      train_time = end_train_time.ru_utime - st_train_time.ru_utime

      if do_training:
        checkpoint={'model_par': net.state_dict(),
                    'optimizer_par' : optimizer_worker.state_dict()}
        torch.save(checkpoint,options.out_file)

      # If out_file does not contain ".pkl", info_file equals out_file, the
      # same path the checkpoint was saved to.
      info_file=out_file.replace(".pkl",".info")

      with open(info_file, "a") as inf:
           inf.write("model_in=%s\n" %(pt_file))
           inf.write("fea_in=%s\n" %(fea_scp))
           inf.write("loss=%f\n" %(loss_tot))
           inf.write("err=%f\n" %(err_tot))
           inf.write("all_time=%f\n" %(train_time))
           inf.write("shu_time=%f\n" %(sum_shu_time))
           inf.write("model load time=%f\n" %(sum_load_time))
           inf.write("gradient send time=%f\n" %(sum_update_time))
           inf.write("val data calculate time=%f\n" %(sum_val_time))
           inf.write("data generate time=%f\n" %(data_time))
           inf.write("model update time=%f\n" %(sum_model_time))
           inf.write("epoch time=%f\n" %((sum_epoch_time-sum_load_time-sum_update_time-sum_model_time-sum_val_time)/num_epoch))
           inf.write("training time=%f\n" %(train_time-sum_load_time-sum_update_time-sum_val_time-data_time-sum_model_time-sum_shu_time))
           
      # NOTE(review): redundant - the with block above has already closed inf.
      inf.close()
      
      if do_forward:
          post_file.close()

示例#12

0

显示文件

from argparse import ArgumentParser
from subprocess import run
from resource import getrusage as resource_usage, RUSAGE_CHILDREN
from time import time as timestamp

# Command line: the program to profile followed by the three arguments that
# are forwarded to it (input file, output file and thread count).
parser = ArgumentParser(description="Profile pzip execution time")
for arg_name, arg_type, arg_help in (
    ('program', str, "program to profile"),
    ('input', str, "name of input file"),
    ('output', str, "name of output file"),
    ('nThreads', int, "Number of threads"),
):
    parser.add_argument(arg_name, type=arg_type, help=arg_help)
res = parser.parse_args()

cmd_list = [res.program, res.input, res.output, str(res.nThreads)]

# Sample wall clock and RUSAGE_CHILDREN usage around the child run; the
# closing samples are taken in the reverse order of the opening ones.
start_time = timestamp()
start_resources = resource_usage(RUSAGE_CHILDREN)
run_result = run(cmd_list)
end_resources = resource_usage(RUSAGE_CHILDREN)
end_time = timestamp()

real = end_time - start_time
sys = end_resources.ru_stime - start_resources.ru_stime
user = end_resources.ru_utime - start_resources.ru_utime
# CPU time per wall-clock second, divided by the thread count.
cpu_per_wall_second = (user + sys) / real
result = cpu_per_wall_second / res.nThreads

for report_line in (
    f"WALL_TIME: {real:.5f} seconds",
    f"CPU_TIME_SYS: {sys:.5f}  seconds",
    f"CPU_TIME_USER: {user:.5f} seconds",
    f"N_THREADS: {res.nThreads}",
    f"PARALLEL_EFFICIENCY (PE): {result:.5f}",
):
    print(report_line)

示例#13

0

显示文件

文件： bp_pip_ver3.5_20190504_sendasync_dataloader.py 项目： hwlee11/PipelinedASGD

def output_layer(sh_list, sh_test, sh_c_list, shm_list, train_loader,
                 test_loader, model, loss_function, rank, split, batch_size,
                 batch_num, test_batch_num, test_num, epoch_num, lamda, lr,
                 cv):
    """Train and evaluate the last pipeline stage on CUDA device ``rank``.

    For every training batch the stage receives an activation tensor from
    ``sh_list[rank - 1]`` and a target tensor from ``shm_list[0]``, runs one
    of ``split`` replicas of ``model`` forward, applies ``loss_function``,
    back-propagates, steps that replica's SGD optimizer and publishes the
    gradient with respect to the received activation through
    ``sh_c_list[rank + split - 2]`` / ``sh_list[rank + split - 2]``.  After
    each group of at most ``lamda`` batches the replicas' parameters are
    accumulated into ``model`` and copied back to every replica.  At the end
    of each epoch timing and loss figures are printed and ``model`` is
    evaluated on activations received from ``sh_test[rank - 1]``.

    Args:
        sh_list, sh_test, shm_list: indexable collections of channel objects;
            only ``recv``, ``send_wait`` and ``async_send_signal`` are used
            here (project types whose semantics are not visible here).
        sh_c_list: indexable collection of tensors; the selected one is
            overwritten with the activation gradient via ``copy_``.
        train_loader: an iterator is created from it once per epoch, but its
            batches are not consumed by this stage.
        test_loader: iterated for its ``target`` tensors during evaluation.
        model: module that provides ``init_zero()`` in addition to
            ``parameters()``, ``forward()``, ``train()``, ``eval()``.
        loss_function: callable ``(output, target) -> loss``.
        rank: CUDA device index used by this stage.
        split: number of model replicas kept by this stage.
        batch_size: not used by this stage.
        batch_num: number of training batches per epoch.
        test_batch_num: divisor of the reported test loss.
        test_num: divisor of the reported accuracy.
        epoch_num: number of epochs to run.
        lamda: number of batches between two averaging rounds.
        lr: initial learning rate; multiplied by 0.1 at the end of the
            epochs with index 150 and 225.
        cv: not used by this stage.
    """
    feed_q2 = sh_list[rank - 1]
    grad_q2 = sh_list[rank + split - 2]
    send_grad = sh_c_list[rank + split - 2]
    feed_test = sh_test[rank - 1]
    send_target = shm_list[0]

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    models = []
    optim = []
    loss_tot = 0
    time_tot = 0
    cuda_time = 0
    num_of_models = split

    # One replica and one optimizer per slot; batches cycle through them.
    for i in range(num_of_models):
        models.append(copy.deepcopy(model))
        optim.append(
            torch.optim.SGD(models[i].parameters(),
                            lr=lr,
                            momentum=0.9,
                            weight_decay=0.0005,
                            nesterov=True))
    for replica in models:
        replica.cuda(rank)
    model.cuda(rank)

    # Number of averaging rounds per epoch: ceil(batch_num / lamda).
    steps = int(batch_num / lamda)
    if batch_num % lamda != 0:
        steps += 1
    lamda_back = lamda

    for epoch in range(epoch_num):
        s_t_u = resource_usage(RUSAGE_SELF)
        s_t = timestamp()
        start.record()
        loss_sum = 0
        model.train()

        # Per-epoch wall-clock totals of the four phases of a batch.
        td1 = 0
        td2 = 0
        td3 = 0
        td4 = 0

        # NOTE(review): the iterator is never read in this stage; it is still
        # created because doing so may have side effects in the loader
        # (confirm before removing).
        train_data = train_loader.__iter__()

        for step in range(1, steps + 1):
            lamda = lamda_back
            if step == steps:
                # The last round only gets the batches that are left over.
                lamda = batch_num - (step - 1) * lamda

            for time in range(1, lamda + 1):
                t1 = timestamp()

                # Receive the activation and the matching target.
                x = feed_q2.recv()
                x = x.cuda(rank, non_blocking=True)
                target = send_target.recv()
                target = target.cuda(rank, non_blocking=True).long()
                t2 = timestamp()

                # Index -1 (when time is a multiple of num_of_models) selects
                # the last replica, so all replicas are used in turn.
                model_idx = (time % num_of_models) - 1
                input_feat = Variable(x, requires_grad=True)

                output = models[model_idx].forward(input_feat)
                loss = loss_function(output, target)
                t3 = timestamp()

                optimizer = optim[model_idx]
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                t4 = timestamp()

                # Publish the gradient w.r.t. the received activation.
                grad_q2.send_wait()
                send_grad.copy_(input_feat.grad.data)
                grad_q2.async_send_signal()
                t5 = timestamp()

                loss_sum = loss_sum + loss.data
                td1 += t2 - t1
                td2 += t3 - t2
                td3 += t4 - t3
                td4 += t5 - t4

            # presumably zeroes model's parameters so that the accumulation
            # below yields the replicas' mean -- TODO confirm
            model.init_zero()

            with torch.cuda.device(rank):
                # Accumulate every replica's parameters / num_of_models.
                for replica in models:
                    for avg_p, rep_p in zip(model.parameters(),
                                            replica.parameters()):
                        avg_p.requires_grad_(False)
                        avg_p.copy_(avg_p.data + rep_p.data / num_of_models)

                # Copy the accumulated parameters back into every replica.
                for replica in models:
                    for rep_p, avg_p in zip(replica.parameters(),
                                            model.parameters()):
                        rep_p.requires_grad_(False)
                        rep_p.copy_(avg_p.data)
                        rep_p.requires_grad_(True)

        loss_tot = loss_sum / batch_num

        e_t_u = resource_usage(RUSAGE_SELF)
        e_t = timestamp()
        # ru_utime is the user-mode CPU time, matching the "user time" label
        # printed below (ru_stime would be system time).
        u_t = e_t_u.ru_utime - s_t_u.ru_utime
        t = e_t - s_t
        end.record()
        torch.cuda.synchronize()
        epoch_cuda_time = start.elapsed_time(end)
        cuda_time = cuda_time + epoch_cuda_time
        print(
            'node3 user time = %f time = %f cuda time = %f cuda tot time = %f loss_tot = %f'
            % (u_t, t, epoch_cuda_time, cuda_time, loss_tot))
        time_tot = time_tot + t
        print('rank =', rank, 'recv output =', td1)
        print('rank =', rank, 'forward =', td2)
        print('rank =', rank, 'backward =', td3)
        print('rank =', rank, 'send grad =', td4)

        # Evaluation: targets come from the loader, activations from the
        # previous stage.
        model.eval()
        correct = 0
        dev_loss_tot = 0

        for data, target in test_loader:
            x = feed_test.recv()
            x = x.cuda(rank)
            target = target.cuda(rank)

            output = model.forward(x)
            _, pred = torch.max(output.data, 1)
            dev_loss = loss_function(output, target)
            dev_loss_tot += dev_loss.item()
            correct += (pred == target).sum()

        print('epoch=', epoch, 'tot_time =', time_tot, 'accuracy =',
              (100 * correct / test_num), 'test_loss',
              dev_loss_tot / test_batch_num)

        # Step-wise learning-rate decay applied to every replica's optimizer.
        if epoch == 150 or epoch == 225:
            lr = lr * 0.1
            for opt in optim:
                for group in opt.param_groups:
                    group['lr'] = lr

示例#14

0

显示文件

def trace():
    """Print this process's ``ru_maxrss`` divided by 1024 and its user CPU time."""
    peak_rss = resource_usage(RUSAGE_SELF).ru_maxrss / 1024
    user_cpu = str(resource_usage(RUSAGE_SELF).ru_utime)
    print("Memory:", peak_rss, "CPUTime:", user_cpu)

示例#15

0

显示文件

文件： bp_pip_ver3.5_20190504_sendasync_dataloader.py 项目： hwlee11/PipelinedASGD

def input_layer(sh_list, sh_test, sh_c_list, shm_list, train_loader,
                test_loader, model, rank, split, batch_size, batch_num,
                test_batch_num, epoch_num, lamda, lr, cv):
    """Train and evaluate the first pipeline stage on CUDA device ``rank``.

    For every training batch the stage reads ``(data, target)`` from
    ``train_loader``, forwards the target through ``shm_list[0]``, runs one
    of ``split`` replicas of ``model`` forward and publishes the output
    through ``sh_c_list[rank]`` / ``sh_list[rank]``.  Gradients for earlier
    outputs are received from ``sh_list[rank + split - 1]`` with a lag of
    ``split - 1 - rank`` iterations and back-propagated through the replica
    that produced the output.  After each group of at most ``lamda`` batches
    the replicas' parameters are accumulated into ``model`` and copied back
    to every replica.  At the end of each epoch timing figures are printed
    and ``model``'s outputs for ``test_loader`` are sent to ``sh_test[rank]``.

    Args:
        sh_list, sh_test, shm_list: indexable collections of channel objects;
            only ``send``, ``recv``, ``send_wait`` and ``async_send_signal``
            are used here (project types whose semantics are not visible
            here).
        sh_c_list: indexable collection of tensors; the selected one is
            overwritten with the stage output via ``copy_``.
        train_loader: iterable of ``(data, target)`` training batches.
        test_loader: iterable of ``(data, target)`` evaluation batches.
        model: module that provides ``init_zero()`` in addition to
            ``parameters()``, ``forward()``, ``train()``, ``eval()``.
        rank: CUDA device index used by this stage.
        split: number of model replicas kept by this stage.
        batch_size: not used by this stage.
        batch_num: number of training batches per epoch.
        test_batch_num: not used by this stage.
        epoch_num: number of epochs to run.
        lamda: number of batches between two averaging rounds.
        lr: initial learning rate; multiplied by 0.1 at the end of the
            epochs with index 150 and 225.
        cv: not used by this stage.
    """
    feed_q1 = sh_list[rank]
    grad_q1 = sh_list[rank + split - 1]
    send_output = sh_c_list[rank]
    feed_test = sh_test[rank]
    send_target = shm_list[0]

    models = []
    outputs = []
    optim = []

    num_of_models = split
    # Number of iterations between sending an output and receiving its
    # gradient.
    delay = -1 * (rank - (split - 1))

    # One replica, one output slot and one optimizer per slot.
    for i in range(num_of_models):
        models.append(copy.deepcopy(model))
        outputs.append(0)
        optim.append(
            torch.optim.SGD(models[i].parameters(),
                            lr=lr,
                            momentum=0.9,
                            weight_decay=0.0005,
                            nesterov=True))
    for replica in models:
        replica.cuda(rank)
    model.cuda(rank)

    time_tot = 0

    # Number of averaging rounds per epoch: ceil(batch_num / lamda).  The
    # remainder is compared with 0 so that the rounds line up with those of
    # the hidden and output stages.
    steps = int(batch_num / lamda)
    if batch_num % lamda != 0:
        steps += 1
    lamda_back = lamda

    for epoch in range(epoch_num):
        s_t_u = resource_usage(RUSAGE_SELF)
        s_t = timestamp()
        model.train()
        for replica in models:
            replica.train()
        train_data = train_loader.__iter__()

        # Per-epoch wall-clock totals of the individual phases.
        td1 = 0
        td2 = 0
        td3 = 0
        td4 = 0
        td5 = 0

        for step in range(1, steps + 1):
            lamda = lamda_back
            if step == steps:
                # The last round only gets the batches that are left over.
                lamda = batch_num - (step - 1) * lamda

            # `delay` extra iterations drain the gradients still in flight.
            for time in range(1, lamda + delay + 1):

                if time <= lamda:
                    # Forward phase: load a batch, run it, publish the output.
                    t1 = timestamp()
                    data, target = next(train_data)
                    send_target.send(target)
                    data = data.cuda(rank, non_blocking=True)
                    t2 = timestamp()

                    # Index -1 selects the last replica, so all replicas are
                    # used in turn.
                    model_idx = (time % num_of_models) - 1
                    output = models[model_idx].forward(data)
                    outputs[model_idx] = output
                    t3 = timestamp()

                    feed_q1.send_wait()
                    send_output.copy_(output.data)
                    feed_q1.async_send_signal()
                    t4 = timestamp()

                    # Accumulate only when the phase actually ran, so stale
                    # timestamps are never counted twice.
                    td1 += t2 - t1
                    td2 += t3 - t2
                    td3 += t4 - t3

                if time > delay:
                    # Backward phase for the output sent `delay` iterations
                    # earlier.
                    t5 = timestamp()
                    pg = grad_q1.recv()
                    pg = pg.cuda(rank)
                    t6 = timestamp()

                    output_idx = ((time - delay) % num_of_models) - 1
                    optimizer = optim[output_idx]
                    optimizer.zero_grad()
                    output = outputs[output_idx]
                    output.backward(pg)
                    optimizer.step()
                    t7 = timestamp()

                    td4 += t6 - t5
                    td5 += t7 - t6

            # presumably zeroes model's parameters so that the accumulation
            # below yields the replicas' mean -- TODO confirm
            model.init_zero()

            with torch.cuda.device(rank):
                # Accumulate every replica's parameters / num_of_models.
                for replica in models:
                    for avg_p, rep_p in zip(model.parameters(),
                                            replica.parameters()):
                        avg_p.requires_grad_(False)
                        avg_p.copy_(avg_p.data + rep_p.data / num_of_models)

                # Copy the accumulated parameters back into every replica.
                for replica in models:
                    for rep_p, avg_p in zip(replica.parameters(),
                                            model.parameters()):
                        rep_p.requires_grad_(False)
                        rep_p.copy_(avg_p.data)
                        rep_p.requires_grad_(True)

        # Epoch totals; collected but not reported by this stage.
        e_t_u = resource_usage(RUSAGE_SELF)
        e_t = timestamp()
        u_t = e_t_u.ru_utime - s_t_u.ru_utime
        t = e_t - s_t
        time_tot = time_tot + t
        print('rank =', rank, 'recv output =', td1)
        print('rank =', rank, 'forward =', td2)
        print('rank =', rank, 'send output', td3)
        print('rank =', rank, 'recv grad =', td4)
        print('rank =', rank, 'backward =', td5)

        # Evaluation: push the outputs for every test batch to the next stage.
        model.eval()
        for replica in models:
            replica.eval()

        for data, target in test_loader:
            x = Variable(data).cuda(rank)
            output = model.forward(x)
            feed_test.send(output.data.to('cpu'))

        # Step-wise learning-rate decay applied to every replica's optimizer.
        if epoch == 150 or epoch == 225:
            lr = lr * 0.1
            for opt in optim:
                for group in opt.param_groups:
                    group['lr'] = lr

示例#16

0

显示文件

文件： bp_pip_ver3.5_20190504_sendasync_dataloader.py 项目： hwlee11/PipelinedASGD

def hidden_layer(sh_list, sh_test, sh_c_list, model, rank, split, batch_num,
                 test_batch_num, epoch_num, lamda, lr, cv):
    """Train and evaluate an intermediate pipeline stage on device ``rank``.

    For every training batch the stage receives an activation from
    ``sh_list[rank - 1]``, runs one of ``split`` replicas of ``model``
    forward and publishes the output through ``sh_c_list[2 * rank]`` /
    ``sh_list[rank]``.  Gradients for earlier outputs are received from
    ``sh_list[rank + split - 1]`` with a lag of ``split - 1 - rank``
    iterations, back-propagated through the replica that produced the
    output, and the resulting input gradient is published through
    ``sh_c_list[2 * rank - 1]`` / ``sh_list[rank + split - 2]``.  After each
    group of at most ``lamda`` batches the replicas' parameters are
    accumulated into ``model`` and copied back to every replica.  At the end
    of each epoch timing figures are printed and ``test_batch_num``
    activations from ``sh_test[rank - 1]`` are run through ``model`` and sent
    to ``sh_test[rank]``.

    Args:
        sh_list, sh_test: indexable collections of channel objects; only
            ``send``, ``recv``, ``send_wait`` and ``async_send_signal`` are
            used here (project types whose semantics are not visible here).
        sh_c_list: indexable collection of tensors; the selected ones are
            overwritten with the stage output / input gradient via ``copy_``.
        model: module that provides ``init_zero()`` in addition to
            ``parameters()``, ``forward()``, ``train()``, ``eval()``.
        rank: CUDA device index used by this stage.
        split: number of model replicas kept by this stage.
        batch_num: number of training batches per epoch.
        test_batch_num: number of evaluation batches per epoch.
        epoch_num: number of epochs to run.
        lamda: number of batches between two averaging rounds.
        lr: initial learning rate; multiplied by 0.1 at the end of the
            epochs with index 150 and 225.
        cv: not used by this stage.
    """
    feed_q1 = sh_list[rank - 1]
    grad_q1 = sh_list[rank + split - 2]
    send_output = sh_c_list[2 * rank]
    send_grad = sh_c_list[2 * rank - 1]

    feed_test = sh_test[rank - 1]

    # NOTE(review): the downstream channels are only bound when split > 2,
    # but the loops below use them unconditionally.
    if split > 2:
        feed_q2 = sh_list[rank]
        grad_q2 = sh_list[rank + split - 1]
        feed_test2 = sh_test[rank]

    models = []
    outputs = []
    inputs = []
    optim = []

    num_of_models = split
    # Number of iterations between sending an output and receiving its
    # gradient.
    delay = -1 * (rank - (split - 1))

    # Create exactly num_of_models replicas.  The replica index below cycles
    # through -1 .. num_of_models - 2, so with any other list length index -1
    # would select a replica that the averaging rounds never touch.
    for i in range(num_of_models):
        models.append(copy.deepcopy(model))
        outputs.append(0)
        inputs.append(0)
        optim.append(
            torch.optim.SGD(models[i].parameters(),
                            lr=lr,
                            momentum=0.9,
                            weight_decay=0.0005,
                            nesterov=True))
    for replica in models:
        replica.cuda(rank)
    model.cuda(rank)
    time_tot = 0

    # Number of averaging rounds per epoch: ceil(batch_num / lamda).
    steps = int(batch_num / lamda)
    if batch_num % lamda != 0:
        steps += 1
    lamda_back = lamda

    for epoch in range(epoch_num):
        s_t_u = resource_usage(RUSAGE_SELF)
        s_t = timestamp()

        model.train()
        for replica in models:
            replica.train()

        # Per-epoch wall-clock totals of the individual phases.
        td1 = 0
        td2 = 0
        td3 = 0
        td4 = 0
        td5 = 0
        td6 = 0

        for step in range(1, steps + 1):
            lamda = lamda_back
            if step == steps:
                # The last round only gets the batches that are left over.
                lamda = batch_num - (step - 1) * lamda

            # `delay` extra iterations drain the gradients still in flight.
            for time in range(1, lamda + delay + 1):

                if time <= lamda:
                    # Forward phase: receive, run, publish.
                    t1 = timestamp()
                    x = feed_q1.recv()
                    x = x.cuda(rank, non_blocking=True)
                    t2 = timestamp()

                    input_feat = Variable(x, requires_grad=True)

                    # Index -1 selects the last replica, so all replicas are
                    # used in turn.
                    model_idx = (time % num_of_models) - 1
                    output = models[model_idx].forward(input_feat)
                    inputs[model_idx] = input_feat
                    outputs[model_idx] = output
                    t3 = timestamp()

                    feed_q2.send_wait()
                    send_output.copy_(output.data)
                    feed_q2.async_send_signal()
                    t4 = timestamp()

                    # Accumulate only when the phase actually ran, so stale
                    # timestamps are never counted twice.
                    td1 += t2 - t1
                    td2 += t3 - t2
                    td3 += t4 - t3

                if time > delay:
                    # Backward phase for the output sent `delay` iterations
                    # earlier.
                    t5 = timestamp()
                    pg = grad_q2.recv()
                    pg = pg.cuda(rank)
                    t6 = timestamp()

                    output_idx = ((time - delay) % num_of_models) - 1
                    optimizer = optim[output_idx]
                    optimizer.zero_grad()
                    output = outputs[output_idx]
                    output.backward(pg)
                    optimizer.step()
                    t7 = timestamp()

                    # Pass the gradient w.r.t. this stage's input upstream.
                    grad_q1.send_wait()
                    send_grad.copy_(inputs[output_idx].grad.data)
                    grad_q1.async_send_signal()
                    t8 = timestamp()

                    td4 += t6 - t5
                    td5 += t7 - t6
                    td6 += t8 - t7

            # presumably zeroes model's parameters so that the accumulation
            # below yields the replicas' mean -- TODO confirm
            model.init_zero()

            with torch.cuda.device(rank):
                # Accumulate every replica's parameters / num_of_models.
                for replica in models:
                    for avg_p, rep_p in zip(model.parameters(),
                                            replica.parameters()):
                        avg_p.requires_grad_(False)
                        avg_p.copy_(avg_p.data + rep_p.data / num_of_models)

                # Copy the accumulated parameters back into every replica.
                for replica in models:
                    for rep_p, avg_p in zip(replica.parameters(),
                                            model.parameters()):
                        rep_p.requires_grad_(False)
                        rep_p.copy_(avg_p.data)
                        rep_p.requires_grad_(True)

        # Epoch totals; collected but not reported by this stage.
        e_t_u = resource_usage(RUSAGE_SELF)
        e_t = timestamp()
        u_t = e_t_u.ru_utime - s_t_u.ru_utime
        t = e_t - s_t
        time_tot = time_tot + t
        print('rank =', rank, 'recv output =', td1)
        print('rank =', rank, 'forward =', td2)
        print('rank =', rank, 'send output', td3)
        print('rank =', rank, 'recv grad =', td4)
        print('rank =', rank, 'backward =', td5)
        print('rank =', rank, 'send grad =', td6)

        # Evaluation: relay every test activation through the merged model.
        model.eval()
        for replica in models:
            replica.eval()
        for _ in range(test_batch_num):
            x = feed_test.recv()
            x = x.cuda(rank)
            output = model.forward(x)
            feed_test2.send(output.data.to('cpu'))

        # Step-wise learning-rate decay applied to every replica's optimizer.
        if epoch == 150 or epoch == 225:
            lr = lr * 0.1
            for opt in optim:
                for group in opt.param_groups:
                    group['lr'] = lr

示例#17

0

显示文件

文件： app.py 项目： Cyberdefence-Lab-Murcia/UMUDGA

def classify(domain, features_filter=None, keep_features=False):
    """Fetch the features of ``domain`` via ``process`` and classify them.

    The result dictionary returned by ``process`` is extended with a
    ``'class'`` entry (``'code'`` and ``'label'``) and with wall-clock /
    user-CPU timings for the ``'setup'``, ``'features' -> 'postprocess'``,
    ``'classification'`` and ``'total'`` stages.

    Args:
        domain: value forwarded to ``process``.
        features_filter: forwarded to ``process``; when it is ``None`` the
            columns of ``app.features`` and the ``app.lightgbm`` model are
            used, otherwise ``app.featuresfs`` and ``app.lightgbmfs``.
        keep_features: when false, the ``'features'`` entry is removed from
            the result.

    Returns:
        The (mutated) dictionary returned by ``process``.
    """
    t0_time, t0_resources = timestamp(), resource_usage(RUSAGE_SELF)
    if features_filter is None:
        evaluation_features = pd.DataFrame(columns=app.features.columns)
    else:
        evaluation_features = pd.DataFrame(columns=app.featuresfs.columns)

    t1_resources, t1_time = resource_usage(RUSAGE_SELF), timestamp()
    setup_times = {
        'wall': {
            'total': t1_time - t0_time
        },
        'user': {
            'total': t1_resources.ru_utime - t0_resources.ru_utime
        },
    }

    res = process(domain, features_filter)

    # NOTE(review): when process() hits an exception it returns a dict
    # without 'times' and 'features', in which case the lookups below raise
    # KeyError -- confirm whether that case should be handled here.
    res['times']['setup'] = setup_times

    t1_resources, t1_time = resource_usage(RUSAGE_SELF), timestamp()
    # DataFrame.append was removed in pandas 2.0; concatenating a one-row
    # frame adds the feature dict as a single row in the same way.
    evaluation_features = pd.concat(
        [evaluation_features, pd.DataFrame([res['features']])],
        ignore_index=True).astype('float')
    if not keep_features:
        del res['features']

    t2_resources, t2_time = resource_usage(RUSAGE_SELF), timestamp()
    res['times']['features']['postprocess'] = {
        'wall': {
            'total': t2_time - t1_time
        },
        'user': {
            'total': t2_resources.ru_utime - t1_resources.ru_utime
        },
    }

    res['class'] = {}
    if features_filter is None:
        res['class']['code'] = app.lightgbm.predict(evaluation_features)[0]
    else:
        res['class']['code'] = app.lightgbmfs.predict(evaluation_features)[0]
    t3_resources, t3_time = resource_usage(RUSAGE_SELF), timestamp()
    res['times']['classification'] = {
        'wall': {
            'total': t3_time - t2_time
        },
        'user': {
            'total': t3_resources.ru_utime - t2_resources.ru_utime
        },
    }

    # Look the label up with the raw prediction, then store the code as a
    # plain int.
    res['class']['label'] = app.category_map[res['class']['code']]
    res['class']['code'] = int(res['class']['code'])

    t4_resources, t4_time = resource_usage(RUSAGE_SELF), timestamp()
    res['times']['total'] = {
        'wall': {
            'total': t4_time - t0_time
        },
        'user': {
            'total': t4_resources.ru_utime - t0_resources.ru_utime
        },
    }

    return res

示例#18

0

显示文件

文件： app.py 项目： Cyberdefence-Lab-Murcia/UMUDGA

def process(domain, features_filter=None):
    """Request the features of ``domain`` from the feature service.

    The domain is POSTed as JSON to the hard-coded endpoint (the
    ``/filtered`` variant when ``features_filter`` is given, whose items are
    merged into the request body).  The JSON response is expected to be an
    iterable of dictionaries; they are merged into one feature dictionary
    from which the ``'class'`` and ``'domain'`` keys are dropped.

    Args:
        domain: value sent as ``"fqdn"``.
        features_filter: optional mapping merged into the request body.

    Returns:
        On success a dict with ``'domain'``, ``'status_code'``,
        ``'features'`` and ``'times'`` (wall / user timings of the
        ``'request'`` and ``'cleaning'`` phases plus the ``'server'`` times
        read from the ``wall-time-ms`` / ``cpu-time-ms`` response headers).
        If anything raises, a dict with ``'domain'`` and ``'exception'``.
    """
    headers = {'Content-Type': 'application/json'}
    times = {'features': {}}
    try:
        t0_time, t0_resources = timestamp(), resource_usage(RUSAGE_SELF)
        data = {"fqdn": domain}
        endpoint = "http://155.54.210.169:8080/DGA/domain/features"
        if features_filter is not None:
            data.update(features_filter)
            endpoint += "/filtered"

        r = requests.post(endpoint, data=json.dumps(data), headers=headers)
        t1_resources, t1_time = resource_usage(RUSAGE_SELF), timestamp()
        times['features']['request'] = {
            'wall': {
                'total': t1_time - t0_time
            },
            'user': {
                'total': t1_resources.ru_utime - t0_resources.ru_utime
            },
        }

        # Merge all response elements; later elements win on duplicate keys.
        features = {}
        for elem in r.json():
            features.update(elem)

        # These two keys are not features; drop them when present.
        features.pop('class', None)
        features.pop('domain', None)

        t2_resources, t2_time = resource_usage(RUSAGE_SELF), timestamp()
        times['features']['cleaning'] = {
            'wall': {
                'total': t2_time - t1_time
            },
            'user': {
                'total': t2_resources.ru_utime - t1_resources.ru_utime
            },
        }
        # Server-side timings arrive in milliseconds; store them in seconds.
        times['features']['server'] = {
            'wall': {
                'total': float(r.headers['wall-time-ms']) / 1000
            },
            'user': {
                'total': float(r.headers['cpu-time-ms']) / 1000
            },
        }

        return {
            'domain': domain,
            'status_code': r.status_code,
            'features': features,
            'times': times
        }
    except Exception as ex:
        # Best effort: report the failure to the caller instead of raising.
        return {'domain': domain, 'exception': ex}

示例#19

0

显示文件

文件： matrix_multiplication.py 项目： tuanhiep/expertpy

#       [3, 4]]
# y1 = [[5, 6, 1],
#       [1, 3, 2]]
#
# print(multi(x1, y1))
# Sizes of the matrix
# sizes = [250, 500, 1000, 1500, 2000]
sizes = [25, 50]
# One result slot per matrix size for wall-clock, user and system time.
total = [None for _ in sizes]
user = [None for _ in sizes]
sys = [None for _ in sizes]
for count, size in enumerate(sizes):
    # Two random size x size matrices as nested lists.
    m1 = [[random() for _col in range(size)] for _row in range(size)]
    m2 = [[random() for _col in range(size)] for _row in range(size)]

    start_time = timestamp()
    start_resources = resource_usage(RUSAGE_SELF)
    m3 = multi(m1, m2)
    end_time = timestamp()
    end_resources = resource_usage(RUSAGE_SELF)

    total[count] = end_time - start_time
    sys[count] = end_resources.ru_stime - start_resources.ru_stime
    user[count] = end_resources.ru_utime - start_resources.ru_utime
# The counter ends up equal to the number of sizes that were measured.
count = len(sizes)

for series in (user, sys, total):
    print(series)

# Plot the three timing series against the matrix size.
fig, ax = plt.subplots()
ax.set_prop_cycle(color=['red', 'green', 'blue'])
for series in (user, sys, total):
    plt.plot(sizes, series)

示例#20

0

显示文件

def input_layer(sh_list, sh_test, sh_c_list, shm_list, train_loader,
                test_loader, model, rank, split, batch_size, batch_num,
                test_batch_num, epoch_num, lamda, lr, gamma, cv):
    """Train and evaluate the first pipeline stage on CUDA device ``rank``.

    For every training batch the stage reads ``(data, target)`` from
    ``train_loader``, pads short batches up to ``batch_size`` by repeating
    their leading samples, forwards the target through ``shm_list[0]``, runs
    ``model`` forward and publishes the output through ``sh_c_list[rank]`` /
    ``sh_list[rank]``.  Once more than ``split - 1 - rank`` iterations of a
    round have passed, a gradient is received from
    ``sh_list[rank + split - 1]`` in every iteration and back-propagated
    through a stored output, followed by an SGD step.  At the end of each
    epoch timing figures are printed and ``model``'s outputs for
    ``test_loader`` are sent to ``sh_test[rank]``.  When all epochs are done
    ``sh_list[rank].terminate.value`` is set to 1.

    Args:
        sh_list, sh_test, shm_list: indexable collections of channel objects;
            only ``send``, ``recv``, ``send_wait``, ``async_send_signal`` and
            ``terminate`` are used here (project types whose semantics are
            not visible here).
        sh_c_list: indexable collection of tensors; the selected one is
            overwritten with the stage output via ``copy_``.
        train_loader: iterable of ``(data, target)`` training batches.
        test_loader: iterable of ``(data, target)`` evaluation batches.
        model: module trained in place by a single SGD optimizer.
        rank: CUDA device index used by this stage.
        split: used with ``rank`` to derive the gradient lag and the number
            of stored outputs.
        batch_size: size every training batch is padded to.
        batch_num: number of training batches per epoch.
        test_batch_num: not used by this stage.
        epoch_num: number of epochs to run.
        lamda: number of batches per round.
        lr: initial learning rate.
        gamma: factor applied to the learning rate at the end of the epochs
            with index 400 and 500.
        cv: not used by this stage.
    """
    feed_q1 = sh_list[rank]
    grad_q1 = sh_list[rank + split - 1]
    send_output = sh_c_list[rank]
    feed_test = sh_test[rank]
    send_target = shm_list[0]

    # Number of output slots and the gradient lag are both split - 1 - rank.
    num_of_models = -1 * (rank - (split - 1))
    delay = split - 1 - rank

    outputs = [0] * num_of_models

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=lr,
                                momentum=0.9,
                                weight_decay=0.0005,
                                nesterov=True)
    model.cuda(rank)
    time_tot = 0

    # NOTE(review): the remainder is compared with 1, not 0, so this is not a
    # plain ceil(batch_num / lamda); left unchanged because the other stages
    # of this variant are not visible here -- confirm against them.
    steps = int(batch_num / lamda)
    if batch_num % lamda != 1:
        steps += 1
    lamda_back = lamda

    for epoch in range(epoch_num):
        s_t_u = resource_usage(RUSAGE_SELF)
        s_t = timestamp()
        model.train()
        train_data = train_loader.__iter__()

        # Per-epoch wall-clock totals of the individual phases.
        td1 = 0
        td2 = 0
        td3 = 0
        td4 = 0
        td5 = 0

        for step in range(1, steps + 1):
            lamda = lamda_back
            if step == steps:
                # The last round only gets the batches that are left over.
                lamda = batch_num - (step - 1) * lamda

            for time in range(1, lamda + 1):

                if time <= lamda:
                    t1 = timestamp()
                    data, target = next(train_data)

                    # Pad a short batch by repeating its leading samples
                    # until it holds exactly batch_size items.
                    while len(data) != batch_size:
                        inputs_copy_len = min(batch_size - len(data),
                                              len(data))
                        data = torch.cat([data, data[0:inputs_copy_len]], 0)
                        target = torch.cat([target, target[0:inputs_copy_len]],
                                           0)

                    send_target.send(target)
                    data = data.cuda(rank, non_blocking=True)
                    t2 = timestamp()

                    model_idx = (time % num_of_models) - 1
                    output = model.forward(data)
                    outputs[model_idx] = output
                    t3 = timestamp()

                    feed_q1.send_wait()
                    send_output.copy_(output.data)
                    feed_q1.async_send_signal()
                    t4 = timestamp()

                    td1 += t2 - t1
                    td2 += t3 - t2
                    td3 += t4 - t3

                if time > delay:
                    t5 = timestamp()
                    pg = grad_q1.recv()
                    pg = pg.cuda(rank)
                    t6 = timestamp()

                    output_idx = ((time - delay) % num_of_models) - 1
                    optimizer.zero_grad()
                    output = outputs[output_idx]
                    output.backward(pg)
                    optimizer.step()
                    t7 = timestamp()

                    # Accumulate only when the phase actually ran, so stale
                    # timestamps are never counted twice.
                    td4 += t6 - t5
                    td5 += t7 - t6

        # Epoch totals; collected but not reported by this stage.
        e_t_u = resource_usage(RUSAGE_SELF)
        e_t = timestamp()
        u_t = e_t_u.ru_utime - s_t_u.ru_utime
        t = e_t - s_t
        time_tot = time_tot + t
        print('rank =', rank, 'recv output =', td1)
        print('rank =', rank, 'forward =', td2)
        print('rank =', rank, 'send output', td3)
        print('rank =', rank, 'recv grad =', td4)
        print('rank =', rank, 'backward =', td5)

        # Evaluation: push the outputs for every test batch to the next stage.
        model.eval()

        for data, target in test_loader:
            x = Variable(data).cuda(rank)
            output = model.forward(x)
            feed_test.send(output.data.to('cpu'))

        # Decay the learning rate of the optimizer that actually trains the
        # model.
        if epoch == 400 or epoch == 500:
            lr = lr * gamma
            for group in optimizer.param_groups:
                group['lr'] = lr

    feed_q1.terminate.value = 1

示例#21

0

显示文件

文件： benchmark_libraries.py 项目： gfrisch/sparsebm

def train_with_blockcluster(
    dataset_file,
    graph,
    nb_row_clusters,
    nb_column_clusters,
    row_clusters_index,
    column_clusters_index,
):
    """Benchmark the R ``blockcluster`` co-clustering on one dataset.

    The fit is timed (wall clock plus user/sys CPU time taken from
    ``resource_usage``), scored with ``CARI`` against the reference
    partitions, and the summary is pickled to
    ``results_folder + <dataset stem> + "_bc.pkl"``.  When that pickle is
    already listed in ``results_folder`` the run is skipped.

    Args:
        dataset_file: Path of the dataset.  Only the part of its base name
            before the first "." is used to name the result file; the full
            value is stored in the summary.
        graph: Matrix exposing ``todense()`` and ``shape`` (presumably a
            scipy sparse matrix -- confirm with the caller).
        nb_row_clusters: Number of row clusters requested.
        nb_column_clusters: Number of column clusters requested.
        row_clusters_index: Reference row labels handed to ``CARI``.
        column_clusters_index: Reference column labels handed to ``CARI``.

    Returns:
        The summary dict that was pickled, or ``None`` when the run was
        skipped because its result file already exists.
    """
    result_path = (results_folder + dataset_file.split("/")[-1].split(".")[0]
                   + "_bc.pkl")
    if result_path in glob.glob(results_folder + "*.pkl"):
        print("Already Done")
        return None

    print("BlockCluster :")
    # Convert sparse matrix to R matrix.
    B = graph.todense()
    nr, nc = B.shape
    Br = ro.r.matrix(B, nrow=nr, ncol=nc)
    # initmethod: method used to initialize model parameters. The valid
    #   values are "cemInitStep", "emInitStep" and "randomInit".
    # nbiterationsxem: number of EM iterations used during xem step.
    #   Default value is 50.
    # nbinitmax: maximal number of initializations to try. Default is 100.
    # nbinititerations: number of global iterations used in the
    #   initialization step. Default value is 10.
    # initepsilon: tolerance used during initialization. Default is 1e-2.
    # nbxem: number of xem steps. Default value is 5.
    strategy = blockcluster.coclusterStrategy(
        initmethod="randomInit",
        nbinitmax=100,
        nbinititerations=10,
        nbiterationsXEM=5000,
        nbiterationsxem=10,
        initepsilon=1e-2,
        epsilonxem=1e-4,
        epsilonXEM=1e-10,
        stopcriteria="Likelihood",
        nbtry=1,
        nbxem=100,
    )

    # Only the R call itself is timed; conversion and scoring are excluded.
    start_time, start_resources = timestamp(), resource_usage(RUSAGE_SELF)
    fit = blockcluster.cocluster(
        Br,
        "binary",
        nbcocluster=robjects.IntVector([nb_row_clusters, nb_column_clusters]),
        nbCore=1,
        strategy=strategy,
    )
    end_resources, end_time = resource_usage(RUSAGE_SELF), timestamp()
    print(end_time - start_time)

    rowclass = np.array(fit.slots["rowclass"])
    colclass = np.array(fit.slots["colclass"])
    icl = fit.slots["ICLvalue"][0]
    co_ari = CARI(row_clusters_index, column_clusters_index, rowclass,
                  colclass)

    # `real`, `sys` and `user` mirror the output of UNIX's `time` command:
    # `real` is the wall clock, and the CPU time used is `user` + `sys`.
    summary = {
        "lib": "blockcluster",
        "n1": graph.shape[0],
        "n2": graph.shape[1],
        "nq": nb_row_clusters,
        "nl": nb_column_clusters,
        "dataset_file": dataset_file,
        "icl": icl,
        "cari": co_ari,
        "real": end_time - start_time,
        "sys": end_resources.ru_stime - start_resources.ru_stime,
        "user": end_resources.ru_utime - start_resources.ru_utime,
    }
    print(f'BlockCluster tt time {summary["user"]+summary["sys"]}')
    # Context manager guarantees the handle is flushed and closed even if
    # pickling fails part-way.
    with open(result_path, "wb") as handle:
        pickle.dump(summary, handle)
    return summary

示例#22

0

显示文件

文件： benchmark_libraries.py 项目： gfrisch/sparsebm

def train_with_blockmodels(
    dataset_file,
    graph,
    nb_row_clusters,
    nb_column_clusters,
    row_clusters_index,
    column_clusters_index,
):
    """Benchmark the R ``blockmodels`` LBM on one dataset.

    One hundred random initialisations are generated, each is scored with a
    ``blockmodels.dispatcher`` call, and the best-scoring one is then passed
    to a final dispatcher call.  Scoring and the final call are timed
    together (wall clock plus user/sys CPU time from ``resource_usage``).
    The result is scored with ``CARI`` and the summary is pickled to
    ``results_folder + <dataset stem> + "_bm.pkl"``.  When that pickle is
    already listed in ``results_folder`` the run is skipped.

    Args:
        dataset_file: Path of the dataset.  Only the part of its base name
            before the first "." is used to name the result file; the full
            value is stored in the summary.
        graph: Matrix exposing ``todense()``, ``shape`` and ``nnz``
            (presumably a scipy sparse matrix -- confirm with the caller).
        nb_row_clusters: Number of row clusters requested.
        nb_column_clusters: Number of column clusters requested.
        row_clusters_index: Reference row labels handed to ``CARI``.
        column_clusters_index: Reference column labels handed to ``CARI``.

    Returns:
        The summary dict that was pickled, or ``None`` when the run was
        skipped because its result file already exists.
    """
    result_path = (results_folder + dataset_file.split("/")[-1].split(".")[0]
                   + "_bm.pkl")
    if result_path in glob.glob(results_folder + "*.pkl"):
        print("Already Done")
        return None

    print("blockmodels :")
    # Convert sparse matrix to R matrix.
    n1, n2 = graph.shape
    B = graph.todense()
    nr, nc = B.shape
    Br = ro.r.matrix(B, nrow=nr, ncol=nc)
    network = robjects.ListVector({"adjacency": Br})

    # NOTE(review): this minimal one-iteration fit looks like it only exists
    # so that `_init_LBM_random` can be called on the model below -- confirm.
    model = LBM(
        nb_row_clusters,
        nb_column_clusters,
        n_init=1,
        n_iter_early_stop=1,
        n_init_total_run=1,
        max_iter=1,
        verbosity=0,
    )
    model.fit(graph)

    # Build the candidate initialisations as R lists of (Z1, Z2) matrices.
    init_list = []
    for _ in range(100):
        _, _, tau_1_init, tau_2_init, _ = model._init_LBM_random(
            n1, n2, nb_row_clusters, nb_column_clusters, graph.nnz)
        nr, nc = tau_1_init.shape
        t1_init = ro.r.matrix(tau_1_init, nrow=nr, ncol=nc)
        nr, nc = tau_2_init.shape
        t2_init = ro.r.matrix(tau_2_init, nrow=nr, ncol=nc)
        init_list.append(robjects.ListVector({"Z1": t1_init, "Z2": t2_init}))

    start_time, start_resources = timestamp(), resource_usage(RUSAGE_SELF)
    best_icl = -np.inf
    best_init = None
    for i, init in enumerate(init_list):
        print(f"Init {i}/{len(init_list)}", end="\r")
        candidate = blockmodels.dispatcher("LBM", init, "bernoulli", network,
                                           False)
        icl_or_ll = candidate[2][0]
        if icl_or_ll > best_icl:
            # Remember the best score seen so far.  Previously the score was
            # written to `best_init`, so `best_icl` never left -inf and the
            # last initialisation always won regardless of its score.
            best_icl = icl_or_ll
            best_init = init
    print("\n Start training best")
    fit = blockmodels.dispatcher("LBM", best_init, "bernoulli", network,
                                 True)
    print("End training best")
    end_resources, end_time = resource_usage(RUSAGE_SELF), timestamp()

    icl = fit[2][0]
    res_tau_1 = np.array(fit[0][0])
    res_tau_2 = np.array(fit[0][2])
    co_ari = CARI(
        row_clusters_index,
        column_clusters_index,
        res_tau_1.argmax(1),
        res_tau_2.argmax(1),
    )
    summary = {
        "lib": "blockmodels",
        "n1": graph.shape[0],
        "n2": graph.shape[1],
        "nq": nb_row_clusters,
        "nl": nb_column_clusters,
        "dataset_file": dataset_file,
        "icl": icl,
        "cari": co_ari,
        "real": end_time - start_time,
        "sys": end_resources.ru_stime - start_resources.ru_stime,
        "user": end_resources.ru_utime - start_resources.ru_utime,
    }
    print(f'Blockmodels tt time {summary["user"]+summary["sys"]}')
    # Context manager guarantees the handle is flushed and closed even if
    # pickling fails part-way.
    with open(result_path, "wb") as handle:
        pickle.dump(summary, handle)
    return summary

示例#23

0

显示文件

文件： launch-perf.py 项目： edlane/python-debug-harness

def trueit_queue(q):
    """Put the single value ``True`` onto the queue-like object *q*."""
    payload = True
    q.put(payload)

def trueit_pipe(conn):
    """Send the single value ``True`` over *conn*, then close it."""
    payload = True
    conn.send(payload)
    conn.close()


# Command-line driver: `<script> REPS TEST [TEST ...]`.
# Runs each selected micro-benchmark REPS times.
# NOTE: `xrange` is a Python 2 builtin; this fragment does not run unchanged
# on Python 3.
if __name__ == '__main__':


    # First argument is the repetition count; 0 disables every benchmark.
    reps = int(sys.argv[1])
    if reps != 0:
        # Every remaining argument selects one benchmark by its id string.
        for test in sys.argv[2:]:
            # Wall-clock and rusage snapshot taken just before the benchmark.
            # The matching end snapshot is not part of this visible fragment.
            start_time, start_resources = timestamp(), resource_usage(RUSAGE_SELF)
            if test == '1':
                # Run the `true` command through the shell, `reps` times.
                print('os.system(\'true\') =')
                for i in xrange(0, reps):
                    os.system('true')

            elif test == '2':
                # Plain in-process call of `trueit_local`, which is defined
                # outside this fragment.
                print('python local call, \'True\' =')
                for i in xrange(0, reps):
                    trueit_local()

            elif test == '3':
                # Re-invoke this script through the shell with REPS=0, so the
                # child skips the `if reps != 0` body above.
                # NOTE(review): assumes sys.argv[0] is directly executable
                # by the shell -- confirm.
                print('os.system, python executable =')
                for i in xrange(0, reps):
                    command = sys.argv[0] + " 0 2"
                    os.system(command)

示例#24

0

显示文件

文件： train.py 项目： szaman19/Phytoplankton_Classifier

"""
    Code originally lifted from
    http://cv-tricks.com/tensorflow-tutorial/training-convolutional-neural-network-for-image-classification/
"""

# Seed NumPy's and TensorFlow's random generators so that random
# initialization is consistent between runs.
from numpy.random import seed
seed(1)
# NOTE(review): a top-level `tensorflow.set_random_seed` appears to be a
# TensorFlow 1.x API -- confirm the pinned TensorFlow version.
from tensorflow import set_random_seed
set_random_seed(3)
from time import time
import datetime
# Current local time as 'YYYY-MM-DD HH:MM:SS'; used below to tag the trial file.
st = datetime.datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S')
batch_size = 32
# Wall-clock and resource-usage snapshots taken at script start.
start_time = time()
start_resources = resource_usage(RUSAGE_SELF)
# The trial file name embeds the timestamp, so it contains a space and colons.
trial_name = "pc_no_unidentified_rs_3"+st+".txt"
# Prepare input data: the class labels used for training.
classes = ['Asterionella','Aulocoseira','Colonial Cyanobacteria','Cryptomonas','Detritus','Dolichospermum','Filamentous cyanobacteria','Romeria','Staurastrum']
#classes = ['Snowella', 'Staurastrum']
num_classes = len(classes)

# 20% of the data will automatically be used for validation
validation_size = 0.20
# presumably the image side length in pixels and the number of colour
# channels -- confirm against the dataset loader
img_size = 256
num_channels = 3
# Move the working directory to its parent, then point train_path at the
# 'extracted_images/' folder beneath it.
os.chdir('..')
train_path=os.getcwd()
train_path += '/extracted_images/'
# We shall load all the training and validation images and labels into memory using openCV and use that during training
#data = dataset.read_train_sets(train_path, img_size, classes, validation_size=validation_size)
# We shall load all the training and validation images and labels into memory using openCV and use that during training
#data = dataset.read_train_sets(train_path, img_size, classes, validation_size=validation_size)

示例#25

0

显示文件

文件： PRIMeval_local.py 项目： rczms/primeval

 def _unix_runtime(self, function, args=tuple(), kwargs={}):
     start_time, start_resources = timestamp(), resource_usage(RUSAGE_SELF)
     function(*args, **kwargs)
     end_resources, end_time = resource_usage(RUSAGE_SELF), timestamp()