示例#1
0
def run_nn(data_name,data_set,data_end_index,fea_dict,lab_dict,arch_dict,cfg_file,processed_first,next_config_file):
    """Process one data chunk (train, validate or forward) as described by ``cfg_file``.

    While the current chunk is being processed, a background thread reads the
    chunk described by ``next_config_file`` so that it is available when this
    function returns.

    Args:
        data_name, data_set, data_end_index, fea_dict, lab_dict, arch_dict:
            Data of the current chunk (normally the return value of the
            previous call). They are re-read from ``cfg_file`` and overwritten
            when ``processed_first`` is true.
        cfg_file: Path of the chunk-specific config file.
        processed_first: When true, the current chunk is loaded here instead of
            being taken from the arguments.
        next_config_file: Config file of the chunk to pre-load in the background.

    Returns:
        ``[data_name, data_set, data_end_index, fea_dict, lab_dict, arch_dict]``
        for the next chunk, with ``data_set`` converted to a float tensor.

    Raises:
        SystemExit: With status 1 when ``cfg_file`` does not exist.
        ValueError: When ``exp/to_do`` is not 'train', 'valid' or 'forward'.
    """

    # Reading chunk-specific cfg file (first argument-mandatory file)
    if not os.path.exists(cfg_file):
        sys.stderr.write('ERROR: The config file %s does not exist!\n'%(cfg_file))
        # A missing config is a failure, so report a non-zero exit status.
        sys.exit(1)

    config = configparser.ConfigParser()
    config.read(cfg_file)

    # Seeding every random number generator used below
    seed=int(config['exp']['seed'])
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

    # Reading config parameters
    output_folder=config['exp']['out_folder']
    use_cuda=strtobool(config['exp']['use_cuda'])
    multi_gpu=strtobool(config['exp']['multi_gpu'])

    to_do=config['exp']['to_do']
    info_file=config['exp']['out_info']

    model=config['model']['model'].split('\n')

    forward_outs=config['forward']['forward_out'].split(',')
    forward_normalize_post=list(map(strtobool,config['forward']['normalize_posteriors'].split(',')))
    forward_count_files=config['forward']['normalize_with_counts_from'].split(',')
    require_decodings=list(map(strtobool,config['forward']['require_decoding'].split(',')))

    save_gpumem=strtobool(config['exp']['save_gpumem'])
    is_production=strtobool(config['exp']['production'])

    if to_do=='train':
        batch_size=int(config['batches']['batch_size_train'])
    elif to_do=='valid':
        batch_size=int(config['batches']['batch_size_valid'])
    elif to_do=='forward':
        # forwarding is done one sentence at a time
        batch_size=1
    else:
        # Fail early with a clear message instead of a later NameError on batch_size.
        raise ValueError("Unsupported to_do value %r (expected 'train', 'valid' or 'forward')" % (to_do,))

    def _as_tensor(array):
        # The whole chunk is kept on the GPU only when CUDA is enabled and
        # GPU memory does not have to be saved; otherwise it stays on the CPU.
        tensor=torch.from_numpy(array).float()
        if use_cuda and not save_gpumem:
            tensor=tensor.cuda()
        return tensor

    # ***** Reading the Data********
    if processed_first:

        # Reading all the features and labels for this chunk
        shared_list=[]

        p=threading.Thread(target=read_lab_fea, args=(cfg_file,is_production,shared_list,output_folder,))
        p.start()
        p.join()

        data_name=shared_list[0]
        data_end_index=shared_list[1]
        fea_dict=shared_list[2]
        lab_dict=shared_list[3]
        arch_dict=shared_list[4]
        data_set=_as_tensor(shared_list[5])

    # Reading all the features and labels for the next chunk (joined at the end)
    shared_list=[]
    p=threading.Thread(target=read_lab_fea, args=(next_config_file,is_production,shared_list,output_folder,))
    p.start()

    # Reading model and initialize networks
    inp_out_dict=fea_dict

    [nns,costs]=model_init(inp_out_dict,model,config,arch_dict,use_cuda,multi_gpu,to_do)

    # optimizers initialization
    optimizers=optimizer_init(nns,config,arch_dict)

    # pre-training and multi-gpu init
    for net in nns.keys():
        pt_file_arch=config[arch_dict[net][0]]['arch_pretrain_file']

        if pt_file_arch!='none':
            # Without CUDA the checkpoint tensors must be remapped to the CPU.
            if use_cuda:
                checkpoint_load = torch.load(pt_file_arch)
            else:
                checkpoint_load = torch.load(pt_file_arch, map_location='cpu')
            nns[net].load_state_dict(checkpoint_load['model_par'])
            # Only restore optimizer state for networks that own an optimizer.
            if net in optimizers:
                optimizers[net].load_state_dict(checkpoint_load['optimizer_par'])
                optimizers[net].param_groups[0]['lr']=float(config[arch_dict[net][0]]['arch_lr']) # loading lr of the cfg file for pt

        if multi_gpu:
            nns[net] = torch.nn.DataParallel(nns[net])

    if to_do=='forward':

        # one output archive per requested forward output
        post_file={}
        for out_id, out_name in enumerate(forward_outs):
            if require_decodings[out_id]:
                out_file=info_file.replace('.info','_'+out_name+'_to_decode.ark')
            else:
                out_file=info_file.replace('.info','_'+out_name+'.ark')
            post_file[out_name]=open_or_fd(out_file,output_folder,'wb')

    # check automatically if the model is sequential
    seq_model=is_sequential_dict(config,arch_dict)

    # ***** Minibatch Processing loop********
    if seq_model or to_do=='forward':
        # batches are made of whole sentences
        N_snt=len(data_name)
        N_batches=int(N_snt/batch_size)
    else:
        # batches are made of individual rows of data_set
        N_ex_tr=data_set.shape[0]
        N_batches=int(N_ex_tr/batch_size)

    beg_batch=0
    end_batch=batch_size

    snt_index=0
    beg_snt=0

    start_time = time.time()

    # array of sentence lengths
    arr_snt_len=shift(shift(data_end_index, -1,0)-data_end_index,1,0)
    arr_snt_len[0]=data_end_index[0]

    loss_sum=0
    err_sum=0

    # log-priors used to normalize posteriors, computed once per output
    log_priors={}

    inp_dim=data_set.shape[1]
    for i in range(N_batches):

        max_len=0

        if seq_model:

            max_len=int(max(arr_snt_len[snt_index:snt_index+batch_size]))
            inp= torch.zeros(max_len,batch_size,inp_dim).contiguous()

            for k in range(batch_size):

                snt_len=data_end_index[snt_index]-beg_snt
                N_zeros=max_len-snt_len

                # Putting a random number of zeros before the sentence, the others are at the end.
                # Randomizing could have a regularization effect.
                N_zeros_left=random.randint(0,N_zeros)

                inp[N_zeros_left:N_zeros_left+snt_len,k,:]=data_set[beg_snt:beg_snt+snt_len,:]

                beg_snt=data_end_index[snt_index]
                snt_index=snt_index+1

        else:
            # features and labels for batch i
            if to_do!='forward':
                inp= data_set[beg_batch:end_batch,:].contiguous()
            else:
                snt_len=data_end_index[snt_index]-beg_snt
                inp= data_set[beg_snt:beg_snt+snt_len,:].contiguous()
                beg_snt=data_end_index[snt_index]
                snt_index=snt_index+1

        # use cuda
        if use_cuda:
            inp=inp.cuda()

        if to_do=='train':
            # Forward input, with autograd graph active
            outs_dict=forward_model(fea_dict,lab_dict,arch_dict,model,nns,costs,inp,inp_out_dict,max_len,batch_size,to_do,forward_outs)

            for opt in optimizers.keys():
                optimizers[opt].zero_grad()

            outs_dict['loss_final'].backward()

            # Frozen architectures keep their parameters: only the others are stepped.
            for opt in optimizers.keys():
                if not(strtobool(config[arch_dict[opt][0]]['arch_freeze'])):
                    optimizers[opt].step()
        else:
            with torch.no_grad(): # Forward input without autograd graph (save memory)
                outs_dict=forward_model(fea_dict,lab_dict,arch_dict,model,nns,costs,inp,inp_out_dict,max_len,batch_size,to_do,forward_outs)

        if to_do=='forward':
            for out_id, out_name in enumerate(forward_outs):

                out_save=outs_dict[out_name].data.cpu().numpy()

                if forward_normalize_post[out_id]:
                    # The counts file does not change between sentences, so read it only once.
                    if out_id not in log_priors:
                        counts = load_counts(forward_count_files[out_id])
                        log_priors[out_id]=np.log(counts/np.sum(counts))
                    out_save=out_save-log_priors[out_id]

                # save the output (batch_size is 1 here, so batch i is sentence i)
                write_mat(output_folder,post_file[out_name], out_save, data_name[i])
        else:
            loss_sum=loss_sum+outs_dict['loss_final'].detach()
            err_sum=err_sum+outs_dict['err_final'].detach()

        # update it to the next batch
        beg_batch=end_batch
        end_batch=beg_batch+batch_size

        # Progress bar
        if to_do == 'train':
            status_string="Training | (Batch "+str(i+1)+"/"+str(N_batches)+")"+" | L:" +str(round(loss_sum.cpu().item()/(i+1),3))
            if i==N_batches-1:
                # the running loss is left out of the final status line
                status_string="Training | (Batch "+str(i+1)+"/"+str(N_batches)+")"

        if to_do == 'valid':
            status_string="Validating | (Batch "+str(i+1)+"/"+str(N_batches)+")"
        if to_do == 'forward':
            status_string="Forwarding | (Batch "+str(i+1)+"/"+str(N_batches)+")"

        progress(i, N_batches, status=status_string)

    elapsed_time_chunk=time.time() - start_time

    loss_tot=loss_sum/N_batches
    err_tot=err_sum/N_batches

    # clearing memory
    del inp, outs_dict, data_set

    # save the model
    if to_do=='train':

        for net in nns.keys():
            checkpoint={}
            # DataParallel wraps the real network in .module
            if multi_gpu:
                checkpoint['model_par']=nns[net].module.state_dict()
            else:
                checkpoint['model_par']=nns[net].state_dict()

            checkpoint['optimizer_par']=optimizers[net].state_dict()

            out_file=info_file.replace('.info','_'+arch_dict[net][0]+'.pkl')
            torch.save(checkpoint, out_file)

    if to_do=='forward':
        for out_name in forward_outs:
            post_file[out_name].close()

    # Write info file (the with-statement closes it)
    with open(info_file, "w") as text_file:
        text_file.write("[results]\n")
        if to_do!='forward':
            text_file.write("loss=%s\n" % loss_tot.cpu().numpy())
            text_file.write("err=%s\n" % err_tot.cpu().numpy())
        text_file.write("elapsed_time_chunk=%f\n" % elapsed_time_chunk)

    # Getting the data for the next chunk (read in parallel)
    p.join()
    data_name=shared_list[0]
    data_end_index=shared_list[1]
    fea_dict=shared_list[2]
    lab_dict=shared_list[3]
    arch_dict=shared_list[4]

    # converting numpy tensors into pytorch tensors and put them on GPUs if specified
    data_set=_as_tensor(shared_list[5])

    return [data_name,data_set,data_end_index,fea_dict,lab_dict,arch_dict]
示例#2
0
  def main(self,rank):
      """Run training, evaluation or forwarding for one process of the parameter-server setup.

      The run is configured through ``read_conf()`` and ``sys.argv``. When
      training, rank 0 calls ``Parameter_Server.ps_server`` and the training
      data is split evenly among the ``world_size - 1`` remaining ranks. When
      forwarding is disabled, ``num_epoch`` epochs are run; from the second
      epoch on, each epoch starts with a pass over the hard-coded dev set.
      Gradients are handed to ``MAIN_CLASS.ensure_shared_params`` after every
      backward pass. A checkpoint is written to ``options.out_file`` and
      loss/error/timing figures are appended to the matching ``.info`` file.

      Args:
          rank: Rank of this process; it is converted with ``int()`` wherever
              it is compared.

      Raises:
          ValueError: If ``options.NN_type`` is not 'RNN', 'LSTM', 'GRU' or 'MLP'.
      """
      os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3"
      options=read_conf()

      do_training=bool(int(options.do_training))
      do_eval=bool(int(options.do_eval))
      do_forward=bool(int(options.do_forward))
      if do_forward:
          torch.cuda.set_device(0)
          device = "cuda:{}".format(0)
      else:
          # NOTE(review): presumably rank 0 is the parameter server, so rank k uses GPU k-1 -- confirm
          torch.cuda.set_device(dist.get_rank()-1)
          device = "cuda:{}".format(dist.get_rank()-1)
      PS = Parameter_Server()
      if int(rank)==0 and do_training:
          PS.ps_server(rank)
      # port and ip_add are not used below; they are still read so that a
      # too-short command line fails here as before
      port = sys.argv[1]
      world_size = sys.argv[3]
      ip_add = sys.argv[4]

      fea_scp=options.fea_scp
      fea_opts=options.fea_opts
      lab_folder=options.lab_folder
      lab_opts=options.lab_opts

      # the development set is hard-coded rather than read from the options
      dev_fea_scp="/home/slave3/kaldi/egs/timit/s5/pytorch-kaldi/exp/mfcc_shu/dev_split.000"
      dev_fea_opts="apply-cmvn --utt2spk=ark:$KALDI_ROOT/egs/timit/s5/data/dev/utt2spk  ark:$PYTORCH_EXP/mfcc_shu/dev_cmvn_speaker.ark ark:- ark:- | add-deltas --delta-order=2 ark:- ark:- |"
      dev_lab_folder='/home/slave3/kaldi/egs/timit/s5/exp/dnn4_pretrain-dbn_dnn_ali_dev'
      dev_lab_opts='ali-to-pdf'

      out_file=options.out_file

      count_file=options.count_file
      pt_file=options.pt_file

      left=int(options.cw_left)
      right=int(options.cw_right)
      seed=int(options.seed)
      use_cuda=bool(int(options.use_cuda))
      multi_gpu=bool(int(options.multi_gpu))
      NN_type=options.NN_type

      batch_size=int(options.batch_size)
      lr=float(options.lr)
      save_gpumem=int(options.save_gpumem)
      opt=options.optimizer  # read but not used: the optimizer is always SGD (see below)

      # rnn tells whether batches are built from whole sentences (1) or single rows (0)
      if NN_type=='RNN':
          from neural_nets import RNN as ann
          rnn=1
      elif NN_type=='LSTM':
          from neural_nets import LSTM as ann
          rnn=1
      elif NN_type=='GRU':
          from neural_nets import GRU as ann
          rnn=1
      elif NN_type=='MLP':
          from neural_nets import MLP as ann
          rnn=0
      else:
          # Fail with a clear message instead of a later UnboundLocalError on ann/rnn.
          raise ValueError("Unsupported NN_type %r (expected 'RNN', 'LSTM', 'GRU' or 'MLP')" % (NN_type,))

      # NOTE(review): the network is built with these hard-coded sizes; the values
      # computed from the data further down are only stored back into options.
      options.input_dim=429
      options.num_classes=1944

      net = ann(options)
      if use_cuda:
          net.cuda(device=device)

      # per-step timings that are accumulated even when the step is skipped
      update_time=0
      model_time=0
      load_time=0

      # accumulated CPU (user) times written to the info file
      sum_update_time=0
      sum_shu_time=0
      sum_model_time=0
      sum_load_time=0
      sum_val_time=0
      sum_epoch_time=0

      _, st_train_time= timestamp(), resource_usage(RUSAGE_SELF)

      torch.manual_seed(seed)
      random.seed(seed)
      print("[INFO] Batch size: ",batch_size)
      if rnn or do_eval or do_forward:
          # NOTE(review): presumably -1 disables shuffling in load_chunk -- confirm
          seed=-1
      _, st_data_time= timestamp(), resource_usage(RUSAGE_SELF)
      if do_forward:
          dev_data_name=[0]
      else:
          [dev_data_name,dev_data_set_ori,dev_data_end_index]=load_chunk(dev_fea_scp,dev_fea_opts,dev_lab_folder,dev_lab_opts,left,right,-1)

      [data_name,data_set_ori,data_end_index]=load_chunk(fea_scp,fea_opts,lab_folder,lab_opts,left,right,seed)

      # every rank but rank 0 is a worker and gets an equal share of the data
      n_workers=int(world_size)-1
      data_len = int(len(data_set_ori)/n_workers)
      if do_training:
          if n_workers==1:
              print("Partition data 1")
          elif n_workers>1:
              print("partition data %d" % n_workers)
              worker=int(rank)
              if 1<=worker<=n_workers:
                  start=data_len*(worker-1)
                  # the last worker also takes the remainder of the division
                  stop=data_len*worker if worker<n_workers else None
                  data_set_ori = data_set_ori[start:stop]
          data_len = len(data_set_ori)

      end_data_time,_  = resource_usage(RUSAGE_SELF), timestamp()
      data_time = end_data_time.ru_utime - st_data_time.ru_utime
      print("data generate time: ", data_time)

      print(np.shape(data_set_ori))

      if not(save_gpumem):
          data_set=torch.from_numpy(data_set_ori).float().cuda(device=device)
      else:
          data_set=torch.from_numpy(data_set_ori).float()
      if not do_forward:
          if not(save_gpumem):
              dev_data_set=torch.from_numpy(dev_data_set_ori).float().cuda(device=device)
          else:
              dev_data_set=torch.from_numpy(dev_data_set_ori).float()

      # the last column of data_set holds the label, the others the features
      N_fea=data_set.shape[1]-1
      options.input_dim=N_fea
      N_out=int(data_set[:,N_fea].max()-data_set[:,N_fea].min()+1)
      options.num_classes=N_out

      if multi_gpu:
          net = nn.DataParallel(net)

      # SGD is the only optimizer that was ever reachable here
      optimizer_worker = optim.SGD(net.parameters(), lr=lr)

      if do_forward:
          if pt_file!='none':
              checkpoint_load = torch.load(pt_file)
              net.load_state_dict(checkpoint_load['model_par'])
              optimizer_worker.load_state_dict(checkpoint_load['optimizer_par'])
              optimizer_worker.param_groups[0]['lr']=lr

      dev_N_snt=len(dev_data_name)
      N_snt=len(data_name)

      if do_training:
          print("do training")
          net.train()
          test_flag=0

          N_batches=int((N_snt/batch_size)/(int(world_size)-1))

          if rnn==0:
              N_ex_tr=data_set.shape[0]
              N_batches=int(N_ex_tr/batch_size)

      if do_eval:
          # evaluation processes one sentence per batch
          N_batches=N_snt
          net.eval()
          test_flag=1
          batch_size=1

          if do_forward:
              post_file=kaldi_io.open_or_fd(out_file,'wb')
              counts = load_counts(count_file)

      beg_batch=0
      end_batch=beg_batch+batch_size

      snt_index=0
      beg_snt=0
      dev_beg_snt=0
      loss_sum=0
      err_sum=0
      dev_loss_sum=0
      dev_err_sum=0
      temp_err=0
      dev_err_sum_tot=0
      dev_N_batches=0

      num_epoch=24
      main_class = MAIN_CLASS()
      if do_forward:
          for i in range(N_batches):
              if do_training:

                  if rnn==1:
                      # NOTE(review): takes the last sentence of the batch as the longest one,
                      # which presumably relies on sentences sorted by length -- confirm
                      max_len=data_end_index[snt_index+batch_size-1]-data_end_index[snt_index+batch_size-2]

                      inp= Variable(torch.zeros(max_len,batch_size,N_fea)).contiguous()
                      lab= Variable(torch.zeros(max_len,batch_size)).contiguous().long()

                      for k in range(batch_size):
                          snt_len=data_end_index[snt_index]-beg_snt
                          N_zeros=max_len-snt_len
                          # random number of zeros before the sentence, the rest after it
                          N_zeros_left=random.randint(0,N_zeros)
                          inp[N_zeros_left:N_zeros_left+snt_len,k,:]=data_set[beg_snt:beg_snt+snt_len,0:N_fea]
                          lab[N_zeros_left:N_zeros_left+snt_len,k]=data_set[beg_snt:beg_snt+snt_len,-1]

                          beg_snt=data_end_index[snt_index]
                          snt_index=snt_index+1

                  else:

                      inp= Variable(data_set[beg_batch:end_batch,0:N_fea]).contiguous().cuda(device=device)
                      lab= Variable(data_set[beg_batch:end_batch,N_fea]).contiguous().long().cuda(device=device)

              if do_eval:
                  end_snt=data_end_index[i]
                  inp= Variable(data_set[beg_snt:end_snt,0:N_fea],volatile=True).contiguous().cuda(device=device)
                  lab= Variable(data_set[beg_snt:end_snt,N_fea],volatile=True).contiguous().long().cuda(device=device)
                  if rnn==1:
                      inp=inp.view(inp.shape[0],1,inp.shape[1])
                      lab=lab.view(lab.shape[0],1)
                  beg_snt=data_end_index[i]

              [loss,err,pout] = net(inp,lab,test_flag,rank)

              if multi_gpu:
                  loss=loss.mean()
                  err=err.mean()

              # do_forward is always true in this branch: write count-normalized posteriors
              if rnn==1:
                  pout=pout.view(pout.shape[0]*pout.shape[1],pout.shape[2])
              if int(rank)==0:
                  kaldi_io.write_mat(post_file, pout.data.cpu().numpy()-np.log(counts/np.sum(counts)), data_name[i])

              if do_training:
                  # optimizer_worker is the only optimizer defined in this method
                  optimizer_worker.zero_grad()

                  loss.backward()

                  optimizer_worker.step()

              loss_sum=loss_sum+loss.data
              err_sum=err_sum+err.data

              beg_batch=end_batch
              end_batch=beg_batch+batch_size

      else:

          m=0
          for e in range(num_epoch):
              # m counts the batches processed so far
              print("Batch size: ",m)
              _, st_epoch_time= timestamp(), resource_usage(RUSAGE_SELF)
              if e>0:
                  # ---- validation on the dev set, one sentence per batch ----
                  dev_N_batches=dev_N_snt
                  if e>1:
                      temp_err=dev_err_sum_tot

                  net.eval()
                  # NOTE(review): test_flag is not set back to 0 after validation, so the
                  # training batches of later epochs are also run with test_flag=1 -- confirm intent
                  test_flag=1
                  dev_loss_sum=0
                  dev_err_sum=0
                  dev_beg_snt=0
                  _, st_val_time= timestamp(), resource_usage(RUSAGE_SELF)

                  for j in range(dev_N_batches):

                      end_snt=dev_data_end_index[j]
                      dev_inp= Variable(dev_data_set[dev_beg_snt:end_snt,0:N_fea],volatile=True).contiguous().cuda(device=device)
                      dev_lab= Variable(dev_data_set[dev_beg_snt:end_snt,N_fea],volatile=True).contiguous().long().cuda(device=device)
                      if rnn==1:
                          # reshape the dev batch itself (not the last training batch)
                          dev_inp=dev_inp.view(dev_inp.shape[0],1,dev_inp.shape[1])
                          dev_lab=dev_lab.view(dev_lab.shape[0],1)
                      dev_beg_snt=dev_data_end_index[j]

                      [dev_loss,dev_err,dev_pout] = net(dev_inp,dev_lab,test_flag,rank)

                      dev_loss_sum=dev_loss_sum+dev_loss.data
                      dev_err_sum=dev_err_sum+dev_err.data

                  end_val_time,_  = resource_usage(RUSAGE_SELF), timestamp()
                  val_time = end_val_time.ru_utime - st_val_time.ru_utime
                  sum_val_time=sum_val_time+val_time
                  print('[INFO] EPOCH: %d, In Worker: %d, val_Err: %0.3f, val_loss: %0.3f, val_time: %0.3f' % ((e+1), int(rank),dev_err_sum/dev_N_batches, dev_loss_sum/dev_N_batches, sum_val_time))
                  dev_err_sum_tot=dev_err_sum/dev_N_batches
                  if e>1:
                      # relative improvement of the dev error over the previous epoch
                      threshold = (temp_err-dev_err_sum_tot)/dev_err_sum_tot

                      if threshold<0.0005:
                          # NOTE(review): only the local lr is halved; it is not written
                          # into optimizer_worker here -- confirm where it is consumed
                          lr = lr * 0.5

                  net.train()

                  # restart batching from the beginning of the chunk
                  beg_batch=0
                  end_batch=beg_batch+batch_size

                  beg_snt=0
                  # the sentence cursor must restart together with beg_snt
                  snt_index=0

                  _, st_shu_time= timestamp(), resource_usage(RUSAGE_SELF)

                  # reshuffle the rows of the training data for the new epoch
                  np.random.shuffle(data_set_ori)

                  if not(save_gpumem):
                      data_set=torch.from_numpy(data_set_ori).float().cuda(device=device)
                  else:
                      data_set=torch.from_numpy(data_set_ori).float()

                  N_fea=data_set.shape[1]-1
                  options.input_dim=N_fea
                  N_out=int(data_set[:,N_fea].max()-data_set[:,N_fea].min()+1)
                  options.num_classes=N_out
                  end_shu_time,_  = resource_usage(RUSAGE_SELF), timestamp()
                  shu_time = end_shu_time.ru_utime - st_shu_time.ru_utime
                  sum_shu_time=sum_shu_time+shu_time
                  loss_sum=0
                  err_sum=0

              for i in range(N_batches):

                  # nothing happens between the two samples, so load_time stays close to zero
                  _, st_load_time= timestamp(), resource_usage(RUSAGE_SELF)

                  end_load_time,_  = resource_usage(RUSAGE_SELF), timestamp()
                  load_time = end_load_time.ru_utime - st_load_time.ru_utime
                  if do_training:

                      if rnn==1:
                          # NOTE(review): takes the last sentence of the batch as the longest one,
                          # which presumably relies on sentences sorted by length -- confirm
                          max_len=data_end_index[snt_index+batch_size-1]-data_end_index[snt_index+batch_size-2]

                          inp= Variable(torch.zeros(max_len,batch_size,N_fea)).contiguous()
                          lab= Variable(torch.zeros(max_len,batch_size)).contiguous().long()

                          for k in range(batch_size):
                              snt_len=data_end_index[snt_index]-beg_snt
                              N_zeros=max_len-snt_len

                              # random number of zeros before the sentence, the rest after it
                              N_zeros_left=random.randint(0,N_zeros)

                              inp[N_zeros_left:N_zeros_left+snt_len,k,:]=data_set[beg_snt:beg_snt+snt_len,0:N_fea]
                              lab[N_zeros_left:N_zeros_left+snt_len,k]=data_set[beg_snt:beg_snt+snt_len,-1]

                              beg_snt=data_end_index[snt_index]
                              snt_index=snt_index+1

                      else:

                          inp= Variable(data_set[beg_batch:end_batch,0:N_fea]).contiguous().cuda(device=device)
                          lab= Variable(data_set[beg_batch:end_batch,N_fea]).contiguous().long().cuda(device=device)

                  if do_eval:
                      end_snt=data_end_index[i]
                      inp= Variable(data_set[beg_snt:end_snt,0:N_fea],volatile=True).contiguous().cuda(device=device)
                      lab= Variable(data_set[beg_snt:end_snt,N_fea],volatile=True).contiguous().long().cuda(device=device)
                      if rnn==1:
                          inp=inp.view(inp.shape[0],1,inp.shape[1])
                          lab=lab.view(lab.shape[0],1)
                      beg_snt=data_end_index[i]

                  [loss,err,pout] = net(inp,lab,test_flag,rank)

                  if multi_gpu:
                      loss=loss.mean()
                      err=err.mean()

                  if do_training:

                      optimizer_worker.zero_grad()

                      loss.backward()

                      _,st_update_time = timestamp(), resource_usage(RUSAGE_SELF)

                      # the gradients are handed over here; no local optimizer step is taken
                      main_class.ensure_shared_params(net,rank)
                      end_update_time,_  = resource_usage(RUSAGE_SELF), timestamp()
                      update_time = end_update_time.ru_utime-st_update_time.ru_utime

                      # nothing happens between the two samples, so model_time stays close to zero
                      _,st_model_time = timestamp(), resource_usage(RUSAGE_SELF)

                      end_model_time,_  = resource_usage(RUSAGE_SELF), timestamp()
                      model_time = end_model_time.ru_utime-st_model_time.ru_utime

                  sum_update_time=sum_update_time + update_time
                  sum_load_time=sum_load_time+load_time
                  sum_model_time= sum_model_time+model_time
                  loss_sum=loss_sum+loss.data
                  err_sum=err_sum+err.data

                  # running averages every 100 batches (skipping batch 0 to avoid dividing by zero)
                  if i%100==0:

                      if i!=0:

                          print('[INFO] EPOCH: %d, Batch: %d, In Worker: %d, Err: %0.3f, loss: %0.3f, update_time: %0.3f, load_time: %0.3f' % ((e+1),i, int(rank),err_sum/i, loss_sum/i,sum_update_time,sum_load_time))

                  beg_batch=end_batch
                  end_batch=beg_batch+batch_size

                  m=m+1
              end_epoch_time,_  = resource_usage(RUSAGE_SELF), timestamp()
              epoch_time = end_epoch_time.ru_utime - st_epoch_time.ru_utime
              sum_epoch_time= sum_epoch_time+epoch_time

              # checkpoint after every epoch
              if do_training:
                  checkpoint={'model_par': net.state_dict(),
                              'optimizer_par' : optimizer_worker.state_dict()}
                  torch.save(checkpoint,options.out_file)

      loss_tot=loss_sum/(N_batches)
      err_tot=err_sum/(N_batches)
      end_train_time,_  = resource_usage(RUSAGE_SELF), timestamp()
      train_time = end_train_time.ru_utime - st_train_time.ru_utime

      if do_training:
          checkpoint={'model_par': net.state_dict(),
                      'optimizer_par' : optimizer_worker.state_dict()}
          torch.save(checkpoint,options.out_file)

      info_file=out_file.replace(".pkl",".info")

      # results are appended; the with-statement closes the file
      with open(info_file, "a") as inf:
          inf.write("model_in=%s\n" %(pt_file))
          inf.write("fea_in=%s\n" %(fea_scp))
          inf.write("loss=%f\n" %(loss_tot))
          inf.write("err=%f\n" %(err_tot))
          inf.write("all_time=%f\n" %(train_time))
          inf.write("shu_time=%f\n" %(sum_shu_time))
          inf.write("model load time=%f\n" %(sum_load_time))
          inf.write("gradient send time=%f\n" %(sum_update_time))
          inf.write("val data calculate time=%f\n" %(sum_val_time))
          inf.write("data generate time=%f\n" %(data_time))
          inf.write("model update time=%f\n" %(sum_model_time))
          inf.write("epoch time=%f\n" %((sum_epoch_time-sum_load_time-sum_update_time-sum_model_time-sum_val_time)/num_epoch))
          inf.write("training time=%f\n" %(train_time-sum_load_time-sum_update_time-sum_val_time-data_time-sum_model_time-sum_shu_time))

      if do_forward:
          post_file.close()
示例#3
0
        # Gradient Clipping (th 0.1)
        #for net in nns.keys():
        #    torch.nn.utils.clip_grad_norm_(nns[net].parameters(), 0.1)

        for opt in optimizers.keys():
            if not (strtobool(config[arch_dict[opt][0]]['arch_freeze'])):
                optimizers[opt].step()

    if to_do == 'forward':
        for out_id in range(len(forward_outs)):

            out_save = outs_dict[forward_outs[out_id]].data.cpu().numpy()

            if forward_normalize_post[out_id]:
                # read the config file
                counts = load_counts(forward_count_files[out_id])
                out_save = out_save - np.log(counts / np.sum(counts))

            # save the output
            kaldi_io.write_mat(post_file[forward_outs[out_id]], out_save,
                               data_name[i])
    else:
        loss_sum = loss_sum + outs_dict['loss_final'].detach()
        err_sum = err_sum + outs_dict['err_final'].detach()

    # update it to the next batch
    beg_batch = end_batch
    end_batch = beg_batch + batch_size

    # Progress bar
    if to_do == 'train':
    # set the next epoch learning rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # ---EVALUATION OF TEST---#
    beg_snt = 0
    err_sum = 0.0
    loss_sum = 0.0
    n_te_snt = len(te_name)
    net.eval()

    if ep == N_ep:
        # set folder for posteriors ark
        post_file = kaldi_io.open_or_fd(options.out_folder + '/pout_test.ark',
                                        'wb')
        counts = load_counts(count_file)

    for i in range(n_te_snt):

        end_snt = te_end_index[i]
        inp = Variable(te_set[beg_snt:end_snt, 0:N_fea], volatile=True)
        lab = Variable(te_set[beg_snt:end_snt, N_fea], volatile=True)

        if save_gpumem and use_cuda:
            inp = inp.cuda()
            lab = lab.cuda()

        [loss, err, pout, pred] = net(inp, lab)

        if ep == N_ep:
            # writing the ark containing the normalized posterior probabilities (needed for kaldi decoding)
示例#5
0
  net.train()
  test_flag=0   
  N_batches=int(N_snt/batch_size)
  if rnn==0:
   N_ex_tr=data_set.shape[0]
   N_batches=int(N_ex_tr/batch_size)
   
if do_eval:
 N_batches=N_snt
 net.eval()
 test_flag=1
 batch_size=1
 
 if do_forward:
  post_file=kaldi_io.open_or_fd(out_file,'wb')
  counts = load_counts(count_file)
  

beg_batch=0
end_batch=batch_size   

snt_index=0
beg_snt=0 

loss_sum=0
err_sum=0


  
for i in range(N_batches):
   
示例#6
0
def run_nn_refac01(data_name,data_set,data_end_index,fea_dict,lab_dict,arch_dict,cfg_file,processed_first,next_config_file):
    """Run one chunk of training, validation or forwarding as described by ``cfg_file``.

    While the current chunk is being processed, the data of the next chunk
    (``next_config_file``) is loaded through
    ``read_next_chunk_into_shared_list_with_subprocess`` and handed back to the
    caller.

    Args:
        data_name: per-sentence names of the current chunk (indexed by batch
            number when forwarding). Overwritten when ``processed_first`` is true.
        data_set: dict with the keys ``'input'`` and ``'ref'``; only read when
            ``processed_first`` is false.
        data_end_index: dict with the keys ``'fea'`` and ``'lab'`` holding the
            cumulative sentence end indexes; only read when ``processed_first``
            is false.
        fea_dict, lab_dict, arch_dict: feature / label / architecture
            descriptions forwarded to ``model_init`` and ``forward_model``.
            Overwritten when ``processed_first`` is true.
        cfg_file: path of the chunk-specific config file.
        processed_first: when true, the data of this chunk is read here from
            ``cfg_file`` instead of being taken from the arguments.
        next_config_file: config file of the next chunk, or ``None`` when there
            is nothing to pre-load.

    Returns:
        ``[data_name, data_set, data_end_index, fea_dict, lab_dict, arch_dict]``
        for the next chunk, or a list of six ``None`` when ``next_config_file``
        is ``None``.

    Raises:
        SystemExit: if ``cfg_file`` does not exist.
        ValueError: if ``to_do`` in the config is not train/valid/forward.

    Side effects: writes the ``out_info`` file, the model checkpoints (train)
    and the output ark files (forward).
    """

    def _read_chunk_specific_config(cfg_file):
        # Parse the chunk config; terminate the process when the file is missing.
        if not os.path.exists(cfg_file):
            sys.stderr.write('ERROR: The config file %s does not exist!\n'%(cfg_file))
            # NOTE(review): exit status 0 on an error is kept on purpose so this
            # function behaves like run_nn; consider a non-zero status file-wide.
            sys.exit(0)
        config = configparser.ConfigParser()
        config.read(cfg_file)
        return config

    def _get_batch_size_from_config(config, to_do):
        # Forwarding always processes one sentence at a time.
        if to_do == 'train':
            return int(config['batches']['batch_size_train'])
        if to_do == 'valid':
            return int(config['batches']['batch_size_valid'])
        if to_do == 'forward':
            return 1
        # Previously an unknown value surfaced later as an UnboundLocalError.
        raise ValueError("Unsupported to_do value %r (expected 'train', 'valid' or 'forward')" % (to_do,))

    def _initialize_random_seed(config):
        # Seed torch, random and numpy with the same value for reproducibility.
        seed = int(config['exp']['seed'])
        torch.manual_seed(seed)
        random.seed(seed)
        np.random.seed(seed)

    def _load_model_and_optimizer(fea_dict,model,config,arch_dict,use_cuda,multi_gpu,to_do):
        # Build networks/costs/optimizers and restore pre-trained parameters if any.
        inp_out_dict = fea_dict
        nns, costs = model_init(inp_out_dict,model,config,arch_dict,use_cuda,multi_gpu,to_do)
        optimizers = optimizer_init(nns,config,arch_dict)
        for net in nns.keys():
            arch_name = arch_dict[net][0]
            pt_file_arch = config[arch_name]['arch_pretrain_file']
            if pt_file_arch != 'none':
                if use_cuda:
                    checkpoint_load = torch.load(pt_file_arch)
                else:
                    checkpoint_load = torch.load(pt_file_arch, map_location='cpu')
                nns[net].load_state_dict(checkpoint_load['model_par'])
                if net in optimizers:
                    optimizers[net].load_state_dict(checkpoint_load['optimizer_par'])
                    # the learning rate of the cfg file wins over the stored one
                    optimizers[net].param_groups[0]['lr'] = float(config[arch_name]['arch_lr'])
            if multi_gpu:
                nns[net] = torch.nn.DataParallel(nns[net])
        return nns, costs, optimizers, inp_out_dict

    def _open_forward_output_files_and_get_file_handles(forward_outs, require_decodings, info_file, output_folder):
        # One ark file per forward output; outputs to be decoded get a '_to_decode' suffix.
        post_file = {}
        for out_id, out_name in enumerate(forward_outs):
            if require_decodings[out_id]:
                out_file = info_file.replace('.info','_'+out_name+'_to_decode.ark')
            else:
                out_file = info_file.replace('.info','_'+out_name+'.ark')
            post_file[out_name] = open_or_fd(out_file,output_folder,'wb')
        return post_file

    def _get_batch_config(data_set_input, seq_model, to_do, data_name, batch_size):
        # Sequence models and forwarding iterate over sentences, otherwise over rows.
        N_snt = None
        N_ex_tr = None
        if seq_model or to_do == 'forward':
            N_snt = len(data_name)
            N_batches = int(N_snt/batch_size)
        else:
            N_ex_tr = data_set_input.shape[0]
            N_batches = int(N_ex_tr/batch_size)
        return N_snt, N_ex_tr, N_batches

    def _prepare_input(snt_index, batch_size, inp_dim, ref_dim, beg_snt_fea, beg_snt_lab, data_end_index_fea, data_end_index_lab, beg_batch, end_batch, seq_model, arr_snt_len_fea, arr_snt_len_lab, data_set_inp, data_set_ref, use_cuda, to_do):
        # Assemble the input/reference tensors of one batch and advance the
        # sentence cursors (beg_snt_fea, beg_snt_lab, snt_index).
        def _zero_padding(inp, ref, max_len_fea, max_len_lab, data_end_index_fea, data_end_index_lab, data_set_inp, data_set_ref, beg_snt_fea, beg_snt_lab, snt_index, k):
            # Copy sentence `snt_index` into column k of the zero-initialised batch.
            snt_len_fea = data_end_index_fea[snt_index] - beg_snt_fea
            snt_len_lab = data_end_index_lab[snt_index] - beg_snt_lab
            N_zeros_fea = max_len_fea - snt_len_fea
            N_zeros_lab = max_len_lab - snt_len_lab
            if N_zeros_fea == N_zeros_lab:
                # same amount of padding on both sides: use one random left
                # offset for features and labels so they stay aligned
                N_zeros_fea_left = random.randint(0,N_zeros_fea)
                N_zeros_lab_left = N_zeros_fea_left
            else:
                N_zeros_fea_left = 0
                N_zeros_lab_left = 0
            inp[N_zeros_fea_left:N_zeros_fea_left+snt_len_fea,k,:] = data_set_inp[beg_snt_fea:beg_snt_fea+snt_len_fea,:]
            ref[N_zeros_lab_left:N_zeros_lab_left+snt_len_lab,k,:] = data_set_ref[beg_snt_lab:beg_snt_lab+snt_len_lab,:]
            return inp, ref, snt_len_fea, snt_len_lab

        if len(data_set_ref.shape) == 1:
            # FIX: view() must be called on the tensor, not on its shape
            data_set_ref = data_set_ref.view((data_set_ref.shape[0], 1))
        # FIX: defaults so that every branch can return these values.
        # NOTE(review): 0 / None for non-sequential batches - confirm that
        # forward_model does not rely on them in that case.
        max_len_fea = 0
        max_len_lab = 0
        snt_len_fea = None
        snt_len_lab = None
        if seq_model:
            max_len_fea = int(max(arr_snt_len_fea[snt_index:snt_index+batch_size]))
            max_len_lab = int(max(arr_snt_len_lab[snt_index:snt_index+batch_size]))
            inp = torch.zeros(max_len_fea,batch_size,inp_dim).contiguous()
            ref = torch.zeros(max_len_lab,batch_size,ref_dim).contiguous()
            for k in range(batch_size):
                inp, ref, snt_len_fea, snt_len_lab = _zero_padding(inp, ref, max_len_fea, max_len_lab, data_end_index_fea, data_end_index_lab, data_set_inp, data_set_ref, beg_snt_fea, beg_snt_lab, snt_index, k)
                beg_snt_fea = data_end_index_fea[snt_index]
                beg_snt_lab = data_end_index_lab[snt_index]
                snt_index = snt_index + 1
        elif to_do != 'forward':
            # FIX: slice the separated input/reference tensors; the old code
            # indexed the outer `data_set` dict and never defined `ref`.
            inp = data_set_inp[beg_batch:end_batch,:].contiguous()
            ref = data_set_ref[beg_batch:end_batch,:].contiguous()
        else:
            snt_len_fea = data_end_index_fea[snt_index] - beg_snt_fea
            snt_len_lab = data_end_index_lab[snt_index] - beg_snt_lab
            inp = data_set_inp[beg_snt_fea:beg_snt_fea+snt_len_fea,:].contiguous()
            ref = data_set_ref[beg_snt_lab:beg_snt_lab+snt_len_lab,:].contiguous()
            beg_snt_fea = data_end_index_fea[snt_index]
            beg_snt_lab = data_end_index_lab[snt_index]
            snt_index = snt_index + 1
        if use_cuda:
            inp = inp.cuda()
            ref = ref.cuda()
        return inp, ref, max_len_fea, max_len_lab, snt_len_fea, snt_len_lab, beg_snt_fea, beg_snt_lab, snt_index

    def _optimization_step(optimizers, outs_dict, config, arch_dict):
        # Back-propagate the final loss and update every non-frozen architecture.
        for opt in optimizers.keys():
            optimizers[opt].zero_grad()
        outs_dict['loss_final'].backward()
        for opt in optimizers.keys():
            if not(strtobool(config[arch_dict[opt][0]]['arch_freeze'])):
                optimizers[opt].step()

    def _update_progress_bar(to_do, i, N_batches, loss_sum):
        batch_info = "(Batch "+str(i+1)+"/"+str(N_batches)+")"
        if to_do == 'train':
            if i == N_batches-1:
                # the running loss is omitted on the last batch
                status_string = "Training | "+batch_info
            else:
                status_string = "Training | "+batch_info+" | L:"+str(round(loss_sum.cpu().item()/(i+1),3))
        elif to_do == 'valid':
            status_string = "Validating | "+batch_info
        elif to_do == 'forward':
            status_string = "Forwarding | "+batch_info
        progress(i, N_batches, status=status_string)

    def _write_info_file(info_file, to_do, loss_tot, err_tot, elapsed_time_chunk):
        # loss/err are only reported for train and valid chunks.
        with open(info_file, "w") as text_file:
            text_file.write("[results]\n")
            if to_do != 'forward':
                text_file.write("loss=%s\n" % loss_tot.cpu().numpy())
                text_file.write("err=%s\n" % err_tot.cpu().numpy())
            text_file.write("elapsed_time_chunk=%f\n" % elapsed_time_chunk)

    def _save_model(to_do, nns, multi_gpu, optimizers, info_file, arch_dict):
        # Checkpoints are only written after a training chunk.
        if to_do != 'train':
            return
        for net in nns.keys():
            checkpoint = {}
            if multi_gpu:
                # unwrap DataParallel so the state dict keys stay portable
                checkpoint['model_par'] = nns[net].module.state_dict()
            else:
                checkpoint['model_par'] = nns[net].state_dict()
            if net in optimizers:
                checkpoint['optimizer_par'] = optimizers[net].state_dict()
            else:
                checkpoint['optimizer_par'] = dict()
            out_file = info_file.replace('.info','_'+arch_dict[net][0]+'.pkl')
            torch.save(checkpoint, out_file)

    def _get_dim_from_data_set(data_set_inp, data_set_ref):
        # A 1-D reference is treated as having a single column.
        inp_dim = data_set_inp.shape[1]
        ref_dim = 1
        if len(data_set_ref.shape) > 1:
            ref_dim = data_set_ref.shape[1]
        return inp_dim, ref_dim

    # Validate the config first so a missing file is reported before anything
    # else is imported or initialised.
    config = _read_chunk_specific_config(cfg_file)

    from data_io import read_lab_fea_refac01 as read_lab_fea
    from utils import forward_model_refac01 as forward_model

    _initialize_random_seed(config)

    output_folder = config['exp']['out_folder']
    use_cuda = strtobool(config['exp']['use_cuda'])
    multi_gpu = strtobool(config['exp']['multi_gpu'])
    to_do = config['exp']['to_do']
    info_file = config['exp']['out_info']
    model = config['model']['model'].split('\n')
    forward_outs = config['forward']['forward_out'].split(',')
    forward_normalize_post = list(map(strtobool,config['forward']['normalize_posteriors'].split(',')))
    forward_count_files = config['forward']['normalize_with_counts_from'].split(',')
    require_decodings = list(map(strtobool,config['forward']['require_decoding'].split(',')))
    save_gpumem = strtobool(config['exp']['save_gpumem'])
    is_production = strtobool(config['exp']['production'])
    batch_size = _get_batch_size_from_config(config, to_do)

    # ***** Data of the current chunk *****
    if processed_first:
        # Nothing was pre-loaded for this chunk: read it now and wait for it.
        shared_list = list()
        read_next_chunk_into_shared_list_with_subprocess(read_lab_fea, shared_list, cfg_file, is_production, output_folder, wait_for_process=True)
        data_name, data_end_index_fea, data_end_index_lab, fea_dict, lab_dict, arch_dict, data_set_dict = extract_data_from_shared_list(shared_list)
        data_set_inp, data_set_ref = convert_numpy_to_torch(data_set_dict, save_gpumem, use_cuda)
    else:
        data_set_inp = data_set['input']
        data_set_ref = data_set['ref']
        data_end_index_fea = data_end_index['fea']
        data_end_index_lab = data_end_index['lab']

    # ***** Start loading the next chunk without waiting for it *****
    shared_list = list()
    data_loading_process = None
    if next_config_file is not None:
        data_loading_process = read_next_chunk_into_shared_list_with_subprocess(read_lab_fea, shared_list, next_config_file, is_production, output_folder, wait_for_process=False)

    nns, costs, optimizers, inp_out_dict = _load_model_and_optimizer(fea_dict,model,config,arch_dict,use_cuda,multi_gpu,to_do)
    if to_do == 'forward':
        post_file = _open_forward_output_files_and_get_file_handles(forward_outs, require_decodings, info_file, output_folder)

    seq_model = is_sequential_dict(config,arch_dict)
    N_snt, N_ex_tr, N_batches = _get_batch_config(data_set_inp, seq_model, to_do, data_name, batch_size)
    beg_batch = 0
    end_batch = batch_size
    snt_index = 0
    beg_snt_fea = 0
    beg_snt_lab = 0
    # per-sentence lengths derived from the cumulative end indexes
    arr_snt_len_fea = shift(shift(data_end_index_fea, -1,0) - data_end_index_fea,1,0)
    arr_snt_len_lab = shift(shift(data_end_index_lab, -1,0) - data_end_index_lab,1,0)
    arr_snt_len_fea[0] = data_end_index_fea[0]
    arr_snt_len_lab[0] = data_end_index_lab[0]
    data_set_inp_dim, data_set_ref_dim = _get_dim_from_data_set(data_set_inp, data_set_ref)

    # out_id -> log prior; filled lazily so each count file is read only once
    log_priors = {}
    loss_sum = 0
    err_sum = 0
    start_time = time.time()
    for i in range(N_batches):
        inp, ref, max_len_fea, max_len_lab, snt_len_fea, snt_len_lab, beg_snt_fea, beg_snt_lab, snt_index = _prepare_input(snt_index, batch_size, data_set_inp_dim, data_set_ref_dim, beg_snt_fea, beg_snt_lab, data_end_index_fea, data_end_index_lab, beg_batch, end_batch, seq_model, arr_snt_len_fea, arr_snt_len_lab, data_set_inp, data_set_ref, use_cuda, to_do)
        if to_do == 'train':
            outs_dict = forward_model(fea_dict, lab_dict, arch_dict, model, nns, costs, inp, ref, inp_out_dict, max_len_fea, max_len_lab, batch_size, to_do, forward_outs)
            _optimization_step(optimizers, outs_dict, config, arch_dict)
        else:
            with torch.no_grad():
                outs_dict = forward_model(fea_dict, lab_dict, arch_dict, model, nns, costs, inp, ref, inp_out_dict, max_len_fea, max_len_lab, batch_size, to_do, forward_outs)
        if to_do == 'forward':
            for out_id, out_name in enumerate(forward_outs):
                out_save = outs_dict[out_name].data.cpu().numpy()
                if forward_normalize_post[out_id]:
                    if out_id not in log_priors:
                        counts = load_counts(forward_count_files[out_id])
                        log_priors[out_id] = np.log(counts/np.sum(counts))
                    out_save = out_save - log_priors[out_id]
                write_mat(output_folder,post_file[out_name], out_save, data_name[i])
        else:
            loss_sum = loss_sum+outs_dict['loss_final'].detach()
            err_sum = err_sum+outs_dict['err_final'].detach()
        beg_batch = end_batch
        end_batch = beg_batch+batch_size
        _update_progress_bar(to_do, i, N_batches, loss_sum)
    elapsed_time_chunk = time.time() - start_time

    if to_do == 'forward':
        # no loss/err is accumulated (nor written) when forwarding
        loss_tot = None
        err_tot = None
    else:
        loss_tot = loss_sum/N_batches
        err_tot = err_sum/N_batches
    if N_batches > 0:
        # drop the references to the last batch before the next chunk is converted
        del inp, ref, outs_dict

    _save_model(to_do, nns, multi_gpu, optimizers, info_file, arch_dict)
    if to_do == 'forward':
        for out_name in forward_outs:
            post_file[out_name].close()
    _write_info_file(info_file, to_do, loss_tot, err_tot, elapsed_time_chunk)

    if data_loading_process is None:
        return [None,None,None,None,None,None]

    # ***** Collect the pre-loaded data of the next chunk *****
    data_loading_process.join()
    data_name, data_end_index_fea, data_end_index_lab, fea_dict, lab_dict, arch_dict, data_set_dict = extract_data_from_shared_list(shared_list)
    data_set_inp, data_set_ref = convert_numpy_to_torch(data_set_dict, save_gpumem, use_cuda)
    data_set = {'input': data_set_inp, 'ref': data_set_ref}
    data_end_index = {'fea': data_end_index_fea,'lab': data_end_index_lab}
    return [data_name,data_set,data_end_index,fea_dict,lab_dict,arch_dict]