def run(args):
    """Train CNNModel on the ImageFolder dataset under ``args.data``.

    Builds train/val loaders, wraps the model in DataParallel, optionally
    restores a checkpoint, trains for ``args.epochs`` epochs with Adam and
    a StepLR schedule, and saves a checkpoint after every epoch. Uses the
    module-level ``writer`` plus the ``train``/``validation`` helpers and
    transforms defined elsewhere in this file.
    """
    train_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(args.data + '/train', transform=data_transforms),
        batch_size=args.batch_size, shuffle=True, num_workers=16)
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(args.data + '/val', transform=validation_data_transforms),
        batch_size=args.batch_size, shuffle=False, num_workers=16)

    model = CNNModel()
    model = nn.DataParallel(model)
    model = model.to(args.device)
    if args.checkpoint is not None:
        model.load_state_dict(torch.load(args.checkpoint))

    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-3)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=args.step_size)

    for epoch in range(1, args.epochs + 1):
        train(epoch, model, optimizer, train_loader, args.log_interval)
        validation(epoch, model, val_loader)
        # BUGFIX: step the LR scheduler AFTER the epoch's optimizer updates.
        # The original stepped it before train(), which on PyTorch >= 1.1
        # skips the initial learning rate and decays one epoch too early.
        scheduler.step()
        # Checkpoint every epoch so training can be resumed or compared.
        model_file = 'model_' + str(epoch) + '.pth'
        torch.save(model.state_dict(), model_file)
    writer.close()
def run(args):
    """Train CNNModel on the fixed ssl_data_96 supervised split.

    Like the other ``run`` variant but with hard-coded data paths, RMSprop
    with momentum, and a fixed StepLR period of 5 epochs. Saves a model
    checkpoint after every epoch and closes the module-level ``writer``.
    """
    train_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder('../../ssl_data_96/supervised/train', transform=data_transforms),
        batch_size=args.batch_size, shuffle=True, num_workers=4)  # n_worker 4, to use 4 GPUs
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder('../../ssl_data_96/supervised/val', transform=validation_data_transforms),
        batch_size=args.batch_size, shuffle=False, num_workers=4)  # n_worker 4, to use 4 GPUs

    model = CNNModel()
    model.cuda()

    optimizer = optim.RMSprop(model.parameters(), lr=args.lr,
                              momentum=args.momentum, weight_decay=1e-3)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5)

    for epoch in range(1, args.epochs + 1):
        train(epoch, model, optimizer, train_loader, args.log_interval)
        validation(epoch, model, val_loader)
        # BUGFIX: scheduler.step() moved AFTER the epoch's optimizer updates
        # (PyTorch >= 1.1 ordering); calling it first skips the initial LR
        # and decays one epoch too early.
        scheduler.step()
        model_file = 'model_' + str(epoch) + '.pth'
        torch.save(model.state_dict(), model_file)
    writer.close()
def on_message(client, userdata, msg):
    """MQTT callback: collect a trainer's weights; aggregate when all arrive.

    Deserializes the model state dict from ``msg.payload``, appends a deep
    copy to the global ``trainer_weights`` list, and once ``NUM_TRAINERS``
    sets have been collected calls ``update_global_weights_and_send`` and
    clears the buffer.
    """
    try:
        print("Model from trainer received!")
        print('Topic: ', msg.topic)
        #print('Message: ', msg.payload)
        model_str = msg.payload
        buff = io.BytesIO(bytes(model_str))
        # Create a dummy model to read weights into.
        # SECURITY: torch.load unpickles arbitrary data — only run this
        # against a broker restricted to trusted trainers.
        model = CNNModel()
        model.load_state_dict(torch.load(buff))
        global trainer_weights
        trainer_weights.append(copy.deepcopy(model.state_dict()))
        # Wait until we get trained weights from all trainers.
        if len(trainer_weights) == NUM_TRAINERS:
            update_global_weights_and_send(trainer_weights)
            trainer_weights.clear()
    except Exception:
        # BUGFIX: narrowed from bare `except:` so KeyboardInterrupt and
        # SystemExit still propagate; print full exception info to debug.
        print("Unexpected error:", sys.exc_info())
inputs, lbl = inputs.cuda(), lbl.cuda() # set the gradient for each parameters zero optimizer.zero_grad() outputs = model(inputs) loss = criterion(outputs, lbl) loss.backward() optimizer.step() print('-[step: %d, loss: %f]' % (i + 1, loss.item())) scheduler.step() print('Finished Training') if __name__ == '__main__': cnn = CNNModel() batch = 2000 if torch.cuda.is_available(): cnn.cuda() trainingDataset = LoadTrainingData() dataLoader = DataLoader(dataset=trainingDataset, batch_size=batch, shuffle=True, num_workers=2) train_model(cnn, dataLoader, epoch=40, batch_size=batch) # save model torch.save(cnn.state_dict(), './trained_model.pth')
# NOTE(review): validation + early-stopping tail of a seq2seq training loop.
# The enclosing epoch/batch loops and the accumulators (``nonzeros``,
# ``total_loss``, ``total_acc``, ``total_ed``, ``best_ed``,
# ``early_stop_cnt``) are initialized outside this chunk, so the
# indentation below is reconstructed — verify against the full file.
        output = model(x_batch)
        # Flatten (batch, seq, vocab) -> (batch*seq, vocab) for the loss.
        loss = loss_func(output.view(-1, output.size(-1)), y_batch.view(-1))
        y_pred = torch.max(output, -1)[1]
        # Zero predictions wherever the target is 0 (padding) so padded
        # positions cannot be counted as hits below.
        y_pred = y_pred.masked_fill_((y_batch == 0), 0)
        nonzeros += (y_batch != 0).data.sum()
        # NOTE(review): ``loss.data[0]`` is the pre-0.4 PyTorch API; on
        # PyTorch >= 0.4 this should be ``loss.item()``.
        total_loss += loss.data[0]
        # Correct non-padding tokens: matches minus padding positions
        # (padding always "matches" after the masked_fill above).
        total_acc += (y_pred == y_batch).data.sum() - (y_batch == 0).data.sum()
        total_ed += avg_ed(encode(y_pred.data.cpu().numpy()), encode(y_batch.data.cpu().numpy()))
    print('Validation: loss:{:.4f}, acc:{:.4f}, ed:{:.4f}'.format(
        total_loss / (i + 1), total_acc / nonzeros, total_ed / y_valid.shape[0]))
    early_stop_cnt += 1
    # New best mean edit distance -> reset patience and save the model.
    if (total_ed / y_valid.shape[0]) < best_ed:
        early_stop_cnt = 0
        best_ed = total_ed / y_valid.shape[0]
        print('Save best model: ed={:.4f}'.format(best_ed))
        # NOTE(review): the ``.format(best_ed)`` on the filename is a no-op —
        # the string has no placeholder, so the path is always
        # 'model/model_best.pt'.
        with open('model/model_best.pt'.format(best_ed), 'wb') as file:
            torch.save(model.state_dict(), file)
    # Stop only once a reasonable model exists (best_ed < 15) and there has
    # been no improvement for 20 epochs.
    if early_stop_cnt >= 20 and best_ed < 15:
        print('No improvement for 20 epochs. Stop training.')
        break
# A2C training on Breakout: collect batches from vectorized envs, update
# the shared CNN policy, periodically evaluate and checkpoint.
train_epochs = 300000
test_episode = 10
log_interval = 100
test_interval = 1000
save_interval = 1000

env = make_env('BreakoutNoFrameskip-v4', seed, num_procs)
in_ch = env.observation_space.shape[-1]  # channels-last observation layout
n_action = env.action_space.n

# BUGFIX: removed leftover `import ipdb; ipdb.set_trace()` debugger
# breakpoint that halted every run here.
model = CNNModel(in_ch, n_action)
obs_preproc = ObsPreproc(device=device)
agent = A2CAgent(model, env, obs_preproc, device, lr, gamma, entropy_coef, value_loss_coef)
# Separate unclipped-reward env so evaluation reports true game scores.
test_env = make_env('BreakoutNoFrameskip-v4', seed, 1, clip_reward=False)
test_agent = TestAgent(model, test_env, obs_preproc, device, test_episode)

for i in range(train_epochs):
    batch, log = agent.collect_batch(num_frames_per_proc)
    info = agent.update_parameters(batch)
    if i % log_interval == 0:
        print_dict({'step': i}, info, log)
    if i % test_interval == 0:
        print('=' * 20 + 'Test Agent' + '=' * 20)
        info = test_agent.evaluate()
        print_dict(info)
    if i % save_interval == 0:
        print('Save Model')
        torch.save(model.state_dict(), 'ckpt.pth')
# NOTE(review): this chunk begins inside an MQTT message callback whose
# ``def`` and ``try`` lie outside the visible region; the indented lines
# below are its tail, with reconstructed indentation.
        print("Model received from coordinator!")
        print(msg.topic + ' ' + str(msg.payload))
    except:
        # NOTE(review): bare except swallows everything (including
        # KeyboardInterrupt) and only prints the exception type.
        print("Unexpected error:", sys.exc_info()[0])


# Forwarder setup: connect to the local broker and publish the test model.
local_mqttclient = mqtt.Client()
local_mqttclient.connect(LOCAL_MQTT_HOST, LOCAL_MQTT_PORT, 60)
# NOTE(review): callbacks are assigned AFTER connect(); events arriving
# before the assignment may be missed — confirm this ordering is intended.
local_mqttclient.on_connect = on_connect_local
local_mqttclient.on_message = on_message

# Read test model
model = CNNModel()
model.load_state_dict(torch.load('models/mnist_cnn.pt'))

# Serialize the state dict into an in-memory buffer, then rewind it.
buff = io.BytesIO()
torch.save(model.state_dict(), buff)
buff.seek(0)
# Convert model to string for transmission
model_str = buff.getvalue()
local_mqttclient.publish(LOCAL_MQTT_TOPIC, payload=model_str, qos=0, retain=False)
#local_mqttclient.publish(LOCAL_MQTT_TOPIC, payload="test message", qos=0, retain=False)

# Block forever servicing network traffic and dispatching callbacks.
local_mqttclient.loop_forever()
# Build MFCC speech datasets and train the CNN classifier.
train_set = SpeechDataset(filelist='data/digits/short_train.lst', rootdir='data/digits', n_mfcc=20)
test_set = SpeechDataset(filelist='data/digits/short_test.lst', rootdir='data/digits', n_mfcc=20)

# BUGFIX: shuffle=True on the training loader — the original iterated the
# training set in a fixed order every epoch, which biases SGD. The test
# loader stays deterministic.
train_dl = DataLoader(train_set, batch_size=64, shuffle=True, num_workers=16, pin_memory=True)
test_dl = DataLoader(test_set, batch_size=64, shuffle=False, num_workers=16, pin_memory=True)

device = get_default_device()
model = CNNModel(pool_method=args.pool_method).to(device)
fit(model, train_dl, test_dl, epochs=10, lr=0.001)

# Make sure the output directory exists before saving the weights.
Path("models").mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), args.model_file)

# Alternative ResNet experiment, kept disabled:
"""
device = get_default_device()
model = ResNetModel(pool_method=args.pool_method).to(device)
fit(model, train_dl, test_dl, epochs=10, lr=0.001)
Path("models").mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), args.model_file)
"""
# NOTE(review): tail of a training routine — the epoch/batch loop headers,
# ``model``, ``optimizer``, ``criterion``, the loaders and the early-stop
# state are defined outside this chunk, so the indentation below is
# reconstructed; verify against the full file.
        label = label.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(tokens, pos1, pos2)
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % PRINT_PER_STEP == PRINT_PER_STEP - 1:
            # NOTE(review): this evaluates on the ENTIRE train_loader every
            # PRINT_PER_STEP steps, which is expensive — confirm intended.
            acc, precision, recall, f1_micro, f1_macro = evaluate(model, train_loader, DEVICE)
            print(' [%d, %5d] AVG-Loss: %.4f - TRAIN >>> ACC: %.4f, Precision: %.4f, Recall: %.4f, F1-micro: %.4f, F1-macro: %.4f\r' \
                % (epoch+1, i+1, running_loss / PRINT_PER_STEP, acc, precision, recall, f1_micro, f1_macro), end='')
            running_loss = 0.0
    # End-of-epoch evaluation on the held-out test set.
    acc, precision, recall, f1_micro, f1_macro = evaluate(model, test_loader, DEVICE)
    print('\nTEST >>> ACC: %.4f, Precision: %.4f, Recall: %.4f, F1-micro: %.4f, F1-macro: %.4f\n' \
        % (acc, precision, recall, f1_micro, f1_macro))
    # Keep only the checkpoint with the best micro-F1; otherwise count the
    # epoch as stale for early stopping.
    if f1_micro > best_f1_micro:
        print('Best model, storing...\n')
        torch.save(model.state_dict(), BEST_MODEL_SAVE_PATH)
        best_f1_micro = f1_micro
    else:
        waste_epoch += 1
    # EARLY_STOP_EPOCH <= 0 disables early stopping entirely.
    if EARLY_STOP_EPOCH > 0:
        if waste_epoch >= EARLY_STOP_EPOCH:
            break
# NOTE(review): "Traning" is a typo in this user-facing message; left
# unchanged here because this edit only adds comments.
print('Traning finished. Best f1-micro score: %.4f' % best_f1_micro)
# target = batch_y.unsqueeze(2).cuda() LSTM MODEL UNCOMMENT THIS target = batch_y.cuda() # LSTML MODEL COMMENT THIS data = data.cuda() target = target.cuda() pred = model(data) lossB = torch.abs(pred - target).mean() # lossA = -(pred * (target*2-1)).mean() lossC = F.cosine_similarity(pred, target) loss = torch.exp(-lossC).mean() + lossB loss.backward() optimizer.step() if step % 10 == 0 and step > 0: print('%d epoch\'s %d step has total loss %f, the L1 loss is %f'%(epoch, step, loss.item(), lossB.item())) if step % 2000 == 0 and step > 0: # print( torch.min(pred), torch.max(pred), torch.mean(pred)) torch.save(model.state_dict(), './models/model_epoch'+str(epoch)+'_iter'+str(step)+'.pth') W_distance = 0.0 model.eval() for step, (batch_x, batch_y) in enumerate(valid_loader): # data = batch_x.unsqueeze(2).cuda() # bs, seq, 1 LSTM MODEL UNCOMMENT THIS data = batch_x.unsqueeze(1).cuda() data = (data - torch.mean(data, dim=2, keepdim=True)) / torch.std(data, dim=2, keepdim=True) # target = batch_y.unsqueeze(2).cuda() LSTM MODEL UNCOMMENT THIS target = batch_y.cuda() pred = model(data) # for d in range(1029): # test_debug = pred.squeeze().cpu().detach() # print(test_debug[10][d]) #print('debug ', pred.squeeze().cpu().detach().shape, batch_y.squeeze().cpu().detach().shape)
def train_dann(dataset_source, dataset_target, n_epoch, batch_size, in_dim, h_dims, out_dim, ckpt_save_path):
    """Train a Domain-Adversarial Neural Network (DANN).

    Each iteration combines the source-domain classification loss with
    domain-confusion losses on both source (domain label 0) and target
    (domain label 1) batches, using a gradient-reversal strength ``alpha``
    annealed over training. After every epoch both domains are evaluated;
    the checkpoint with the best target accuracy is kept, and per-epoch
    accuracy/loss curves are pickled next to ``ckpt_save_path``.
    """
    lr = 1e-3
    l_d = 0.1  # weight of the domain-confusion terms in the total loss
    dataloader_source = torch.utils.data.DataLoader(
        dataset=dataset_source,
        batch_size=batch_size,
        shuffle=True,
    )
    dataloader_target = torch.utils.data.DataLoader(
        dataset=dataset_target,
        batch_size=batch_size,
        shuffle=True,
    )
    model = CNNModel(in_dim, h_dims, out_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_class = torch.nn.CrossEntropyLoss()
    loss_domain = torch.nn.CrossEntropyLoss()
    if cuda:
        model = model.cuda()
        loss_class = loss_class.cuda()
        loss_domain = loss_domain.cuda()
    for p in model.parameters():
        p.requires_grad = True

    # training
    best_acc = 0.0
    best_ep = 0
    tr_acc_ls = []
    te_acc_ls = []
    loss_ls = []
    for epoch in range(n_epoch):
        model.train()
        # One pass is bounded by the shorter of the two loaders.
        len_dataloader = min(len(dataloader_source), len(dataloader_target))
        data_source_iter = iter(dataloader_source)
        data_target_iter = iter(dataloader_target)
        loss_sum = 0.0
        n_s = 0  # number of source samples seen this epoch
        for i in range(len_dataloader):
            # Gradient-reversal strength, annealed 0 -> 1 over training.
            p = float(i + epoch * len_dataloader) / n_epoch / len_dataloader
            alpha = 2. / (1. + np.exp(-10 * p)) - 1

            # --- source batch: class loss + domain loss (label 0) ---
            # BUGFIX: `iterator.next()` is Python 2 only; Python 3 requires
            # the builtin next().
            data_s, label_s = next(data_source_iter)
            batch_size_s = len(label_s)
            n_s += batch_size_s
            domain_label = torch.zeros(batch_size_s).long()
            if cuda:
                data_s = data_s.cuda()
                label_s = label_s.cuda()
                domain_label = domain_label.cuda()
            class_output, domain_output = model(input_data=data_s, alpha=alpha)
            loss_c = loss_class(class_output, label_s)
            loss_ds = loss_domain(domain_output, domain_label)

            # --- target batch: domain loss only (label 1) ---
            data_t, _ = next(data_target_iter)
            batch_size_t = len(data_t)
            domain_label = torch.ones(batch_size_t).long()
            if cuda:
                data_t = data_t.cuda()
                domain_label = domain_label.cuda()
            _, domain_output = model(input_data=data_t, alpha=alpha)
            loss_dt = loss_domain(domain_output, domain_label)

            # Compute overall loss and backprop.
            loss = loss_c + l_d * (loss_dt + loss_ds)
            loss_sum += loss.item() * batch_size_s
            model.zero_grad()
            loss.backward()
            optimizer.step()
            # logger.info('epoch: {:>4}, [iter: {:>4} / all {:>4}], loss {:8.4f}, '
            #             'loss_c: {:8.4f}, loss_ds: {:8.4f}, loss_dt: {:8.4f}\n'
            #             .format(epoch, i+1, len_dataloader, loss.item(), loss_c.item(), loss_ds.item(), loss_dt.item()))

        tr_acc, tr_f1 = evaluate_dann(model, dataset_source, batch_size)
        te_acc, te_f1 = evaluate_dann(model, dataset_target, batch_size)
        tr_acc_ls.append(tr_acc)
        te_acc_ls.append(te_acc)
        loss_ls.append(loss_sum)
        # If we find a better target-domain result, save the model.
        if te_acc > best_acc:
            best_acc = te_acc
            best_ep = epoch
            checkpoint = {"epoch": epoch, "state_dict": model.state_dict()}
            torch.save(checkpoint, ckpt_save_path + '.ckpt')
        logger.info(
            'epoch: {:>4}, loss: {:8.4f}, train acc: {:8.4f}, train f1: {:8.4f},'
            ' eval acc: {:8.4f}, eval f1: {:8.4f}'.format(
                epoch, loss_sum, tr_acc, tr_f1, te_acc, te_f1))

    logger.info('=' * 10)
    logger.info('best epoch: {:>4}, best acc: {:8.4f}'.format(
        best_ep, best_acc))
    # BUGFIX: persist the learning curves via context managers — the
    # original `pickle.dump(x, open(...))` never closed the file handles.
    with open(ckpt_save_path + '.tracc', 'wb') as f:
        pickle.dump(tr_acc_ls, f)
    with open(ckpt_save_path + '.teacc', 'wb') as f:
        pickle.dump(te_acc_ls, f)
    with open(ckpt_save_path + '.loss', 'wb') as f:
        pickle.dump(loss_ls, f)