def train():
    cuda_available = torch.cuda.is_available()
    train_params, model_params, dataset_params = get_arguments()
    net = wavenet_autoencoder(**model_params)
    epoch_trained = 0
    if train_params['restore_model']:
        net = load_model(net, train_params['restore_dir'],
                         train_params['restore_model'])
        if net is None:
            print("Initialize network and train from scratch.")
            net = wavenet_autoencoder(**model_params)
        else:
            # Recover the epoch count from the checkpoint file name.
            epoch_trained = train_params["restore_model"].split('.')[0]
            epoch_trained = int(epoch_trained[7:])
    dataloader = audio_data_loader(**dataset_params)
    if cuda_available is False:
        warnings.warn("Cuda is not available, cannot train model using multi-gpu.")
    if cuda_available:
        if train_params["device_ids"]:
            batch_size = dataset_params["batch_size"]
            num_gpu = len(train_params["device_ids"])
            assert batch_size % num_gpu == 0
            net = nn.DataParallel(net, device_ids=train_params['device_ids'])
        torch.backends.cudnn.benchmark = True
        net = net.cuda()
    optimizer = get_optimizer(net, train_params['optimizer_type'],
                              train_params['learning_rate'],
                              train_params['momentum'])
    loss_func = nn.CrossEntropyLoss()
    if cuda_available:
        loss_func = loss_func.cuda()
    if not os.path.exists(train_params['log_dir']):
        os.makedirs(train_params['log_dir'])
    if not os.path.exists(train_params['restore_dir']):
        os.makedirs(train_params['restore_dir'])
    loss_log_file = open(train_params['log_dir'] + 'loss_log.log', 'a')
    store_log_file = open(train_params['log_dir'] + 'store_log.log', 'a')
    total_loss = 0
    # Recover the running piece count from the last logged loss line.
    with open(train_params['log_dir'] + 'loss_log.log', 'r') as f:
        lines = f.readlines()
        if len(lines) > 0:
            num_trained = int(lines[-1].split(' ')[2])
        else:
            num_trained = 0
    for epoch in range(train_params['num_epochs']):
        for i_batch, sample_batch in enumerate(dataloader):
            print(i_batch)
            optimizer.zero_grad()
            music_piece = sample_batch['audio_piece']
            target_piece = sample_batch['audio_target']
            if cuda_available:
                # Note: `async` was renamed `non_blocking` in PyTorch >= 0.4.
                music_piece = music_piece.cuda(async=True)
                target_piece = target_piece.cuda(async=True)
            music_piece = Variable(music_piece)
            target_piece = Variable(target_piece.view(-1))
            outputs = net(music_piece)
            loss = loss_func(outputs, target_piece)
            total_loss += loss.data[0]
            loss.backward()
            optimizer.step()
            num_trained += 1
            if num_trained % train_params['print_every'] == 0:
                avg_loss = total_loss / train_params['print_every']
                line = ('Trained over ' + str(num_trained) +
                        ' pieces, average loss is ' + str(avg_loss) + '\n')
                loss_log_file.writelines(line)
                loss_log_file.flush()
                total_loss = 0
        if (epoch + 1) % train_params['check_point_every'] == 0:
            print(epoch_trained)
            save_model(net, epoch_trained + epoch + 1,
                       train_params['restore_dir'])
            line = 'Epoch ' + str(epoch_trained + epoch + 1) + ', model saved!\n'
            store_log_file.writelines(line)
            store_log_file.flush()
    loss_log_file.close()
    store_log_file.close()
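# The train() variants in this file all call a get_optimizer(net,
# optimizer_name, learning_rate, momentum) helper that is not shown here.
# The sketch below is only an assumption about what such a helper might look
# like; the supported names ('sgd', 'adam', 'rmsprop') are illustrative and
# the repository's actual implementation may differ.
def get_optimizer(net, optimizer_type, learning_rate, momentum):
    params = net.parameters()
    if optimizer_type == 'sgd':
        return torch.optim.SGD(params, lr=learning_rate, momentum=momentum)
    elif optimizer_type == 'adam':
        return torch.optim.Adam(params, lr=learning_rate)
    elif optimizer_type == 'rmsprop':
        return torch.optim.RMSprop(params, lr=learning_rate, momentum=momentum)
    else:
        raise ValueError('Unknown optimizer type: {}'.format(optimizer_type))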
def train():
    cuda_available = torch.cuda.is_available()
    train_params, model_params, dataset_params = get_arguments()
    net = WavenetAutoencoder(**model_params)
    epoch_trained = 0
    if train_params['restore_model']:
        net = load_model(net, train_params['restore_dir'],
                         train_params['restore_model'])
        if net is None:
            print("Initialize network and train from scratch.")
            net = WavenetAutoencoder(**model_params)
        else:
            # epoch_trained = train_params["restore_model"].split('.')[0]
            # epoch_trained = int(epoch_trained[7:])
            epoch_trained = 0
    dataloader = audio_data_loader(**dataset_params)
    if cuda_available is False:
        warnings.warn("Cuda is not available, cannot train model using multi-gpu.")
    if cuda_available:
        # Leave train_params["device_ids"] empty for single-GPU training.
        if train_params["device_ids"]:
            batch_size = dataset_params["batch_size"]
            num_gpu = len(train_params["device_ids"])
            assert batch_size % num_gpu == 0
            net = nn.DataParallel(net, device_ids=train_params['device_ids'])
        torch.backends.cudnn.benchmark = True
        net = net.cuda()
    optimizer = get_optimizer(net, train_params['optimizer'],
                              train_params['learning_rate'],
                              train_params['momentum'])
    loss_func = nn.CrossEntropyLoss()
    if cuda_available:
        loss_func = loss_func.cuda()
    if not os.path.exists(train_params['log_dir']):
        os.makedirs(train_params['log_dir'])
    if not os.path.exists(train_params['restore_dir']):
        os.makedirs(train_params['restore_dir'])
    loss_log_file = open(train_params['log_dir'] + 'loss_log.log', 'a')
    store_log_file = open(train_params['log_dir'] + 'store_log.log', 'a')
    total_loss = 0
    num_trained = 0
    with open(train_params['log_dir'] + 'loss_log.log', 'r') as f:
        # Recover the running piece count from the last loss line, skipping
        # the session start/end timestamp lines.
        for logged_line in reversed(f.readlines()):
            if logged_line.startswith('Trained over'):
                num_trained = int(logged_line.split(' ')[2])
                break
    # Log the time training started.
    line = 'Training started at ' + str(datetime.now()) + ' !!!\n'
    loss_log_file.writelines(line)
    loss_log_file.flush()
    for epoch in range(train_params['num_epochs']):
        net.train()
        for i_batch, sample_batch in enumerate(dataloader):
            optimizer.zero_grad()
            music_piece = sample_batch['audio_piece']
            target_piece = sample_batch['audio_target']
            if cuda_available:
                music_piece = music_piece.cuda(async=True)
                target_piece = target_piece.cuda(async=True)
            print("music_piece size = ", music_piece.size())
            music_piece = Variable(music_piece)
            target_piece = Variable(target_piece.view(-1))
            outputs = net(music_piece)
            print('target size = ', target_piece.data.size())
            print('outputs size = ', outputs.data.size())
            loss = loss_func(outputs, target_piece)
            print("loss is ", loss)
            loss.backward()
            # Skip the update if the gradient is non-finite or too large.
            if check_grad(net.parameters(), train_params['clip_grad'],
                          train_params['ignore_grad']):
                print('Not a finite gradient or too big, ignoring.')
                optimizer.zero_grad()
                continue
            optimizer.step()
            total_loss += loss.data[0]
            print(num_trained)
            num_trained += 1
            if num_trained % train_params['print_every'] == 0:
                avg_loss = total_loss / train_params['print_every']
                line = ('Trained over ' + str(num_trained) +
                        ' pieces, average loss is ' + str(avg_loss) + '\n')
                loss_log_file.writelines(line)
                loss_log_file.flush()
                total_loss = 0
        if (epoch + 1) % train_params['check_point_every'] == 0:
            stored_models = glob.glob(train_params['restore_dir'] + '*.model')
            # Delete the oldest checkpoint once max_check_points is reached.
            if len(stored_models) == train_params['max_check_points']:
                def cmp(x, y):
                    x = os.path.split(os.path.splitext(x)[0])[-1]
                    y = os.path.split(os.path.splitext(y)[0])[-1]
                    return int(x[7:]) - int(y[7:])
                sorted_models = sorted(stored_models, key=cmp_to_key(cmp))
                os.remove(sorted_models[0])
            print(epoch_trained)
            save_model(net, epoch_trained + epoch + 1,
                       train_params['restore_dir'])
            line = 'Epoch ' + str(epoch_trained + epoch + 1) + ', model saved!\n'
            store_log_file.writelines(line)
            store_log_file.flush()
    # Log the time training ended.
    line = 'Training ended at ' + str(datetime.now()) + ' !!!\n'
    loss_log_file.writelines(line)
    loss_log_file.flush()
    loss_log_file.close()
    store_log_file.close()
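# The variant above skips a parameter update whenever check_grad(...) reports
# a bad gradient, but the helper itself is not shown. Below is a minimal
# sketch of one way it could work, assuming it returns True when the global
# gradient norm is non-finite or exceeds ignore_grad (so the caller skips the
# batch) and otherwise clips the norm to clip_grad in place. The repository's
# actual check_grad may behave differently.
import math

def check_grad(params, clip_grad, ignore_grad):
    # Only consider parameters that actually received a gradient.
    params = [p for p in params if p.grad is not None]
    total_norm = 0.0
    for p in params:
        total_norm += float(p.grad.data.norm()) ** 2
    total_norm = math.sqrt(total_norm)
    # Signal the caller to skip this batch on non-finite or huge gradients.
    if math.isnan(total_norm) or math.isinf(total_norm) or total_norm > ignore_grad:
        return True
    # Otherwise clip the global gradient norm in place.
    if clip_grad is not None and total_norm > clip_grad:
        coef = clip_grad / (total_norm + 1e-6)
        for p in params:
            p.grad.data.mul_(coef)
    return False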
def train():
    '''
    Check whether cuda is available.
    '''
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        torch.backends.cudnn.benchmark = True
    '''
    Get all needed parameters. All parameters are stored in json files in the
    directory './params'. To change the settings, simply modify the json
    files in './params/'.
    '''
    train_params, wavenet_params, dataset_params = get_arguments()
    '''
    Launch instances of the wavenet model and the dataloader.
    '''
    net = wavenet(**wavenet_params)
    epoch_trained = 0
    if train_params["restore_model"]:
        net = load_model(net, train_params["restore_dir"],
                         train_params["restore_model"])
        if net is None:
            print("Initialize network and train from scratch.")
            net = wavenet(**wavenet_params)
        else:
            # Recover the epoch count from the checkpoint file name.
            epoch_trained = train_params["restore_model"].split('.')[0]
            epoch_trained = int(epoch_trained[7:])
    dataloader = audio_data_loader(**dataset_params)
    '''
    Decide whether to train the network on gpu, and whether to use
    multiple gpus.
    '''
    if cuda_available is False and train_params["device_ids"] is not None:
        raise ValueError("Cuda is not available, "
                         "cannot train model using multi-gpu.")
    if cuda_available:
        if train_params["device_ids"]:
            batch_size = dataset_params["batch_size"]
            num_gpu = len(train_params["device_ids"])
            assert batch_size % num_gpu == 0
            net = nn.DataParallel(net, device_ids=train_params["device_ids"])
        net = net.cuda()
    '''
    Start training.
    Save the model every train_params["check_point_every"] epochs to
    train_params["restore_dir"], keeping at most
    train_params["max_check_points"] models; once the number of models in
    restore_dir reaches max_check_points, the oldest model is replaced by
    the newest one.
    Write logging information (the average loss every
    train_params["print_every"] pieces) to train_params["log_dir"].
    '''
    print("Start training.")
    print("Writing logging information to {}".format(train_params["log_dir"]))
    print("Models are saved in {}".format(train_params["restore_dir"]))
    '''
    Define optimizer and loss function.
    '''
    optimizer = get_optimizer(net, train_params["optimizer"],
                              train_params["learning_rate"],
                              train_params["momentum"])
    loss_func = nn.CrossEntropyLoss()
    if cuda_available:
        loss_func = loss_func.cuda()
    if not os.path.exists(train_params["log_dir"]):
        os.makedirs(train_params["log_dir"])
    if not os.path.exists(train_params["restore_dir"]):
        os.makedirs(train_params["restore_dir"])
    loss_log_file = open(train_params["log_dir"] + 'loss_log.log', 'a')
    store_log_file = open(train_params["log_dir"] + 'store_log.log', 'a')
    '''
    Train in epochs.
    '''
    total_loss = 0.0
    with open(train_params["log_dir"] + 'loss_log.log', 'r') as f:
        lines = f.readlines()
        if len(lines) > 0:
            num_trained = int(lines[-1].split(' ')[2])
        else:
            num_trained = 0
    for epoch in range(train_params["num_epochs"]):
        for i_batch, sampled_batch in enumerate(dataloader):
            optimizer.zero_grad()
            piece = sampled_batch["audio_piece"]
            target = sampled_batch["audio_target"]
            if cuda_available:
                piece = piece.cuda(async=True)
                target = target.cuda(async=True)
            piece, target = Variable(piece), Variable(target.view(-1))
            logits = net(piece)
            loss = loss_func(logits, target)
            total_loss += loss.data[0]
            loss.backward()
            optimizer.step()
            '''
            Check whether to write loss information to the log file.
            '''
            num_trained += 1
            if num_trained % train_params["print_every"] == 0:
                avg_loss = total_loss / train_params["print_every"]
                line = "Trained over " + str(num_trained) + " pieces, "
                line += "average loss is " + str(avg_loss) + "\n"
                loss_log_file.writelines(line)
                loss_log_file.flush()
                total_loss = 0.0
        '''
        Store the model every check_point_every epochs.
        '''
        if (epoch + 1) % train_params["check_point_every"] == 0:
            stored_models = glob.glob(train_params["restore_dir"] + "*.model")
            # First decide whether to delete the oldest model.
            if len(stored_models) == train_params["max_check_points"]:
                def cmp(x, y):
                    x = x.split('/')[-1].split('.')[0]
                    y = y.split('/')[-1].split('.')[0]
                    return int(x[7:]) - int(y[7:])
                stored_models = sorted(stored_models, key=cmp_to_key(cmp))
                os.remove(stored_models[0])
            # Then store the newest model.
            save_model(net, epoch_trained + epoch + 1,
                       train_params["restore_dir"])
            line = "Epoch " + str(epoch_trained + epoch + 1) + \
                   ", model saved!\n"
            store_log_file.writelines(line)
            store_log_file.flush()
    loss_log_file.close()
    store_log_file.close()
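# For reference, these are the train_params keys that the code above reads,
# collected into one illustrative configuration. Only the key names come from
# the code; the values, file layout, and the entry point at the bottom are
# assumptions.
example_train_params = {
    "num_epochs": 100,
    "optimizer": "adam",          # passed to get_optimizer()
    "learning_rate": 1e-3,
    "momentum": 0.9,
    "device_ids": [0],            # GPU ids for nn.DataParallel, or None
    "restore_model": None,        # name of a saved '*.model' file, or None
    "restore_dir": "./restore/",  # trailing slash matters: paths are concatenated
    "log_dir": "./log/",
    "print_every": 100,           # log the average loss every N pieces
    "check_point_every": 5,       # save a checkpoint every N epochs
    "max_check_points": 5,        # keep at most N '*.model' files
}

if __name__ == '__main__':
    train()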