def get_masked_lm_loss(
    logit_blob,
    masked_lm_positions,
    masked_lm_labels,
    label_weights,
    max_prediction_per_seq=20,
):
    # gather valid position indices
    logit_blob = flow.gather(
        logit_blob,
        index=masked_lm_positions.unsqueeze(2).repeat(1, 1, 30522),
        dim=1,
    )
    logit_blob = flow.reshape(logit_blob, [-1, 30522])
    label_id_blob = flow.reshape(masked_lm_labels, [-1])

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    pre_example_loss = nn.CrossEntropyLoss(reduction="none")(logit_blob, label_id_blob)
    pre_example_loss = flow.reshape(pre_example_loss, [-1, max_prediction_per_seq])
    sum_label_weight = flow.sum(label_weights, dim=-1)
    sum_label_weight = sum_label_weight / label_weights.shape[0]
    numerator = flow.sum(pre_example_loss * label_weights)
    denominator = flow.sum(label_weights) + 1e-5
    loss = numerator / denominator
    return logit_blob, loss
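# A minimal NumPy sketch (not part of the original script) of the two steps above:
# gathering logits at the masked positions, and averaging the per-position loss with
# `label_weights` so that zero-padded prediction slots are ignored. The shapes and
# values here are made up for illustration.
import numpy as np

logits = np.random.randn(2, 8, 30522)          # [batch, seq_len, vocab]
positions = np.array([[1, 4, 0], [2, 5, 6]])   # [batch, max_predictions]
# Equivalent of flow.gather(..., dim=1) with the index repeated over the vocab axis:
gathered = np.take_along_axis(logits, positions[:, :, None], axis=1)  # [2, 3, 30522]

per_position_loss = np.array([[2.0, 1.0, 3.0], [0.5, 1.5, 2.5]])  # toy CE losses
label_weights = np.array([[1.0, 1.0, 0.0], [1.0, 1.0, 1.0]])      # 0.0 marks padding
loss = np.sum(per_position_loss * label_weights) / (np.sum(label_weights) + 1e-5)
# Only the five real predictions contribute: loss ~ (2 + 1 + 0.5 + 1.5 + 2.5) / 5 = 1.5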
def cal_loss(pred, gold, smoothing=0.0):
    """Calculate cross entropy loss, applying label smoothing if needed."""
    if smoothing > 0.0:
        eps = smoothing
        n_class = pred.size(1)

        # Build an N x C one-hot matrix: 1 at the label position, 0 elsewhere.
        # `gold` may contain -1 (IGNORE_ID), which F.one_hot cannot handle,
        # so those entries are clamped to 0 first and masked out below.
        gold_for_scatter = gold.ne(IGNORE_ID).long() * gold
        one_hot = F.one_hot(gold_for_scatter, n_class).to(dtype=flow.float32)
        one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / n_class

        sof_prb = F.softmax(pred, dim=1)
        log_prb = flow.log(sof_prb)
        non_pad_mask = gold.ne(IGNORE_ID)
        n_word = float(non_pad_mask.sum().numpy())
        loss = -(one_hot * log_prb).sum(dim=1)
        loss = loss.masked_select(non_pad_mask).sum() / n_word
    else:
        loss_fn = nn.CrossEntropyLoss(ignore_index=IGNORE_ID).to(pred.device)
        loss = loss_fn(pred, gold)
    return loss
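# A small NumPy illustration (not from the original code) of the smoothed target
# built above: with eps = 0.1 and n_class = 4, the true class keeps 1 - eps and
# every other class gets eps / n_class, matching the formula in cal_loss.
import numpy as np

eps, n_class = 0.1, 4
one_hot = np.array([0.0, 1.0, 0.0, 0.0])  # gold label = class 1
smoothed = one_hot * (1 - eps) + (1 - one_hot) * eps / n_class
# smoothed == [0.025, 0.9, 0.025, 0.025]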
def _test_train_and_eval(test_case):
    if os.getenv("ONEFLOW_TEST_CPU_ONLY"):
        device = flow.device("cpu")
    else:
        device = flow.device("cuda")

    net = LeNet()
    lr, num_epochs = 0.02, 1
    optimizer = flow.optim.SGD(net.parameters(), lr=lr, momentum=0.9)
    net.to(device)

    batch_size = 256
    data_dir = os.path.join(
        os.getenv("ONEFLOW_TEST_CACHE_DIR", "./data-test"), "fashion-mnist-lenet"
    )
    source_url = "https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/mnist/Fashion-MNIST/"
    train_iter, test_iter = load_data_fashion_mnist(
        batch_size=batch_size,
        resize=None,
        root=data_dir,
        download=True,
        source_url=source_url,
        num_workers=0,
    )
    loss = nn.CrossEntropyLoss()
    loss.to(device)

    final_accuracy = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, batch_count = 0.0, 0.0, 0, 0
        start = time.time()
        for X, y in train_iter:
            X = X.to(device=device)
            y = y.to(device=device)
            # forward
            y_hat = net(X)
            l = loss(y_hat, y).sum()
            # backward
            l.backward()
            optimizer.step()
            optimizer.zero_grad()

            train_l_sum += l.numpy()
            train_acc_sum += (y_hat.argmax(dim=1).numpy() == y.numpy()).sum()
            n += y.shape[0]
            batch_count += 1
            if batch_count == 20:
                break

        test_acc = evaluate_accuracy(test_iter, net)
        final_accuracy = train_acc_sum / n
        print(
            "epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec"
            % (
                epoch + 1,
                train_l_sum / batch_count,
                final_accuracy,
                test_acc,
                time.time() - start,
            )
        )
def main(args):
    random.seed(args.seed)
    dataset_path = "./data/names"
    n_categories = processDataset(dataset_path)

    n_hidden = 128
    rnn = LSTM(n_letters, n_hidden, n_categories)
    criterion = nn.CrossEntropyLoss()
    rnn.to("cuda")
    criterion.to("cuda")
    of_sgd = optim.SGD(rnn.parameters(), lr=learning_rate)

    # Keep track of losses for plotting
    current_loss = 0
    all_losses = []
    start = time.time()
    samples = 0.0
    correct_guess = 0.0
    for iter in range(1, n_iters + 1):
        category, line, category_tensor, line_tensor = randomTrainingExample()
        output, loss = train(category_tensor, line_tensor, rnn, criterion, of_sgd)
        current_loss += loss

        # Print iter number, loss, name and guess
        if iter % print_every == 0:
            start, time_str = timeSince(start)
            guess, guess_i = categoryFromOutput(output)
            correct = "✓" if guess == category else "✗ (%s)" % category
            if correct == "✓":
                correct_guess += 1
            samples += 1
            print(
                "iter: %d / %f%%, time_for_every_%d_iter: %s, loss: %.4f, "
                "predict: %s / %s, correct? %s, acc: %f"
                % (
                    iter,
                    float(iter) / n_iters * 100,
                    print_every,
                    time_str,
                    loss,
                    line,
                    guess,
                    correct,
                    correct_guess / samples,
                )
            )

        # Add current loss avg to list of losses
        if iter % plot_every == 0:
            all_losses.append(current_loss / plot_every)
            current_loss = 0

    with open("all_losses.txt", "w") as writer:
        for o in all_losses:
            writer.write("%f\n" % o)
def train(opt):
    with open(opt.label_dict, "r") as f:
        lab_dict = json.load(f)

    cnn = simple_CNN(opt.num_speakers)
    cnn.to("cuda")
    cost = nn.CrossEntropyLoss()
    cost.to("cuda")
    optimizer = optim.RMSprop(cnn.parameters(), lr=opt.lr, alpha=opt.alpha, eps=opt.eps)

    output_folder = opt.output_path
    N_batches = opt.N_batches
    N_epoches = opt.N_epoches
    for epoch in range(N_epoches):
        cnn.train()
        loss_sum = 0
        err_sum = 0
        for i in range(N_batches):
            inp, lab = create_batches_rnd(
                lab_dict,
                batch_size=opt.batch_size,
                wlen=opt.wlen,
                fact_amp=opt.fact_amp,
                train=True,
            )
            inp = inp.unsqueeze(1)
            lab -= 1  # shift labels to start from 0 for CrossEntropyLoss
            pout = cnn(inp)
            pred = flow.argmax(pout, dim=1)
            loss = cost(pout, lab.long())
            err = np.mean(pred.numpy() != lab.long().numpy())

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            loss_sum = loss_sum + loss.detach()
            err_sum = err_sum + err

        loss_tot = loss_sum / N_batches
        err_tot = err_sum / N_batches
        if epoch % 10 == 0:
            print("epoch %i, loss_tr=%f err_tr=%f" % (epoch, loss_tot.numpy(), err_tot))

    flow.save(cnn.state_dict(), os.path.join(output_folder, "CNN_model"))
def main():
    print("Generating data...", end="")
    voc_size = args.vocab_sz
    inp = np.arange(2, voc_size, 2)
    tgt = np.arange(3, voc_size, 2)
    data_x, data_y = get_numbers(inp, tgt)
    train_len = int(len(data_x) * 0.9)
    train_x, val_x = data_x[:train_len], data_x[train_len:]
    train_y, val_y = data_y[:train_len], data_y[train_len:]
    print("Done")

    print("Setting model...", end="")
    model = TransformerModel(
        input_sz=voc_size,
        output_sz=voc_size,
        d_model=args.d_model,
        nhead=args.n_head,
        num_encoder_layers=args.n_encoder_layers,
        num_decoder_layers=args.n_decoder_layers,
        dim_feedforward=args.dim_feedforward,
        dropout=args.dropout,
    )
    if args.load_dir != ".":
        model.load_state_dict(flow.load(args.load_dir))
    model = to_cuda(model)
    criterion = to_cuda(nn.CrossEntropyLoss())
    optimizer = flow.optim.Adam(model.parameters(), lr=args.lr)
    print("Done")

    print("Training...")
    min_loss = 100
    for i in range(1, args.n_epochs + 1):
        epoch_loss = train(model, criterion, optimizer, train_x, train_y)
        epoch_loss_val = validation(model, criterion, val_x, val_y)
        print("epoch: {} train loss: {}".format(i, epoch_loss))
        print("epoch: {} val loss: {}".format(i, epoch_loss_val))
        if epoch_loss < min_loss:
            min_loss = epoch_loss  # track the best loss so only improved models are saved
            if not os.path.exists(args.save_dir):
                os.mkdir(args.save_dir)
            else:
                shutil.rmtree(args.save_dir)
                assert not os.path.exists(args.save_dir)
                os.mkdir(args.save_dir)
            flow.save(model.state_dict(), args.save_dir)
        if i % 3 == 2:
            print(test(model, test_times=10))
def __init__(self):
    super().__init__()
    self.squad_model = squad_model
    self.criterion = nn.CrossEntropyLoss()
    self.add_optimizer(optimizer, lr_sch=lr_scheduler)
    self._decoders = train_decoders

    if args.use_fp16:
        self.config.enable_amp(True)
        grad_scaler = flow.amp.GradScaler(
            init_scale=2 ** 30,
            growth_factor=2.0,
            backoff_factor=0.5,
            growth_interval=2000,
        )
        self.set_grad_scaler(grad_scaler)
def forward(
    self,
    input_ids,
    position_ids=None,
    token_type_ids=None,
    labels=None,
    past_key_values=None,
    use_cache=False,
    output_attentions=False,
    output_hidden_states=False,
):
    transformer_outputs = self.transformer(
        input_ids,
        position_ids,
        token_type_ids,
        past_key_values,
        use_cache,
        output_attentions,
        output_hidden_states,
    )
    hidden_states = transformer_outputs[0]
    lm_logits = self.lm_head(hidden_states)

    loss = None
    if labels is not None:
        # Shift so that tokens < n predict n
        seq_len = lm_logits.size(1)
        shift_logits = lm_logits[..., : seq_len - 1, :]
        shift_labels = labels[..., 1:]
        # Flatten the tokens
        loss_fct = nn.CrossEntropyLoss()
        shift_logits = shift_logits.view(-1, shift_logits.size(-1))
        shift_labels = shift_labels.view(-1)
        loss = loss_fct(shift_logits, shift_labels)

    output = (lm_logits,) + transformer_outputs[1:]
    if loss is not None:
        return (loss,) + output
    else:
        return output
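# A tiny illustration (not from the original model) of the "tokens < n predict n"
# shift above, using a made-up 5-token sequence: the logits at position t are scored
# against the label at position t + 1, so the last logit and the first label are dropped.
import numpy as np

seq_len = 5
token_ids = np.array([[11, 12, 13, 14, 15]])   # hypothetical batch of one sequence
logits = np.random.randn(1, seq_len, 100)      # [batch, seq_len, vocab]
shift_logits = logits[..., : seq_len - 1, :]   # predictions made at positions 0..3
shift_labels = token_ids[..., 1:]              # targets are the next tokens 12..15
# After flattening, CrossEntropyLoss compares each position's prediction with the
# token that actually follows it.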
def _test_train_and_eval(test_case):
    if os.getenv("ONEFLOW_TEST_CPU_ONLY"):
        device = flow.device("cpu")
    else:
        device = flow.device("cuda")

    model = Net()
    model.to(device)
    loss = nn.CrossEntropyLoss().to(device)
    optimizer = flow.optim.SGD(model.parameters(), lr=0.10)

    num_epochs = 1
    for epoch in range(num_epochs):
        train_loss, n_correct, n_samples = 0.0, 0.0, 0
        for images, labels in train_iter:
            images = images.reshape(-1, 28 * 28)
            images = images.to(device=device)
            labels = labels.to(device=device)
            features = model(images)
            l = loss(features, labels).sum()
            optimizer.zero_grad()
            l.backward()
            optimizer.step()

            train_loss += l.numpy()
            n_correct += (features.argmax(dim=1).numpy() == labels.numpy()).sum()
            n_samples += images.shape[0]
            if n_samples > 2000:
                break

        test_acc = evaluate_accuracy(test_iter, model, device)
        train_acc = n_correct / n_samples
        print(
            "epoch %d, train loss %.4f, train acc %.3f, test acc %.3f"
            % (epoch + 1, train_loss / n_samples, train_acc, test_acc)
        )
def main():
    args = get_config()

    world_size = flow.env.get_world_size()
    if args.train_global_batch_size is None:
        args.train_global_batch_size = args.train_batch_size * world_size
    else:
        assert args.train_global_batch_size % args.train_batch_size == 0

    if args.val_global_batch_size is None:
        args.val_global_batch_size = args.val_batch_size * world_size
    else:
        assert args.val_global_batch_size % args.val_batch_size == 0

    flow.boxing.nccl.set_fusion_threshold_mbytes(args.nccl_fusion_threshold_mb)
    flow.boxing.nccl.set_fusion_max_ops_num(args.nccl_fusion_max_ops)

    if args.with_cuda:
        device = "cuda"
    else:
        device = "cpu"
    print("Device is: ", device)

    print("Creating Dataloader")
    train_data_loader = OfRecordDataLoader(
        ofrecord_dir=args.ofrecord_path,
        mode="train",
        dataset_size=args.train_dataset_size,
        batch_size=args.train_global_batch_size,
        data_part_num=args.train_data_part,
        seq_length=args.seq_length,
        max_predictions_per_seq=args.max_predictions_per_seq,
        consistent=args.use_consistent,
    )

    test_data_loader = OfRecordDataLoader(
        ofrecord_dir=args.ofrecord_path,
        mode="test",
        dataset_size=1024,
        batch_size=args.val_global_batch_size,
        data_part_num=4,
        seq_length=args.seq_length,
        max_predictions_per_seq=args.max_predictions_per_seq,
        consistent=args.use_consistent,
    )

    print("Building BERT Model")
    hidden_size = 64 * args.num_attention_heads
    intermediate_size = 4 * hidden_size
    bert_model = BertForPreTraining(
        args.vocab_size,
        args.seq_length,
        hidden_size,
        args.num_hidden_layers,
        args.num_attention_heads,
        intermediate_size,
        nn.GELU(),
        args.hidden_dropout_prob,
        args.attention_probs_dropout_prob,
        args.max_position_embeddings,
        args.type_vocab_size,
    )

    # Load the same initial parameters with lazy model.
    # from utils.compare_lazy_outputs import load_params_from_lazy
    # load_params_from_lazy(
    #     bert_model.state_dict(),
    #     "../../OneFlow-Benchmark/LanguageModeling/BERT/initial_model",
    # )

    assert id(bert_model.cls.predictions.decoder.weight) == id(
        bert_model.bert.embeddings.word_embeddings.weight
    )

    ns_criterion = nn.CrossEntropyLoss(reduction="mean")
    mlm_criterion = nn.CrossEntropyLoss(reduction="none")

    if args.use_consistent:
        placement = flow.env.all_device_placement("cuda")
        bert_model = bert_model.to_consistent(
            placement=placement, sbp=flow.sbp.broadcast
        )
    else:
        bert_model.to(device)
        ns_criterion.to(device)
        mlm_criterion.to(device)

    optimizer = build_optimizer(
        args.optim_name,
        bert_model,
        args.lr,
        args.weight_decay,
        weight_decay_excludes=["bias", "LayerNorm", "layer_norm"],
        clip_grad_max_norm=1,
        clip_grad_norm_type=2.0,
    )

    steps = args.epochs * len(train_data_loader)
    warmup_steps = int(steps * args.warmup_proportion)

    lr_scheduler = PolynomialLR(optimizer, steps=steps, end_learning_rate=0.0)
    lr_scheduler = flow.optim.lr_scheduler.WarmUpLR(
        lr_scheduler, warmup_factor=0, warmup_iters=warmup_steps, warmup_method="linear"
    )

    def get_masked_lm_loss(
        logit,
        masked_lm_labels,
        label_weights,
        max_predictions_per_seq,
    ):
        label_id = flow.reshape(masked_lm_labels, [-1])

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        pre_example_loss = mlm_criterion(logit, label_id)
        pre_example_loss = flow.reshape(pre_example_loss, [-1, max_predictions_per_seq])
        numerator = flow.sum(pre_example_loss * label_weights)
        denominator = flow.sum(label_weights) + 1e-5
        loss = numerator / denominator
        return loss

    class BertGraph(nn.Graph):
        def __init__(self):
            super().__init__()
            self.bert = bert_model
            self.ns_criterion = ns_criterion
            self.masked_lm_criterion = partial(
                get_masked_lm_loss, max_predictions_per_seq=args.max_predictions_per_seq
            )
            self.add_optimizer(optimizer, lr_sch=lr_scheduler)
            self._train_data_loader = train_data_loader
            if args.grad_acc_steps > 1:
                self.config.set_gradient_accumulation_steps(args.grad_acc_steps)
            if args.use_fp16:
                self.config.enable_amp(True)
                grad_scaler = flow.amp.GradScaler(
                    init_scale=2 ** 30,
                    growth_factor=2.0,
                    backoff_factor=0.5,
                    growth_interval=2000,
                )
                self.set_grad_scaler(grad_scaler)
            self.config.allow_fuse_add_to_output(True)
            self.config.allow_fuse_model_update_ops(True)

        def build(self):
            (
                input_ids,
                next_sentence_labels,
                input_mask,
                segment_ids,
                masked_lm_ids,
                masked_lm_positions,
                masked_lm_weights,
            ) = self._train_data_loader()
            input_ids = input_ids.to(device=device)
            input_mask = input_mask.to(device=device)
            segment_ids = segment_ids.to(device=device)
            next_sentence_labels = next_sentence_labels.to(device=device)
            masked_lm_ids = masked_lm_ids.to(device=device)
            masked_lm_positions = masked_lm_positions.to(device=device)
            masked_lm_weights = masked_lm_weights.to(device=device)

            # 1. forward the next_sentence_prediction and masked_lm model
            prediction_scores, seq_relationship_scores = self.bert(
                input_ids, segment_ids, input_mask, masked_lm_positions
            )

            # 2-1. loss of is_next classification result
            next_sentence_loss = self.ns_criterion(
                seq_relationship_scores.reshape(-1, 2), next_sentence_labels.reshape(-1)
            )

            masked_lm_loss = self.masked_lm_criterion(
                prediction_scores, masked_lm_ids, masked_lm_weights
            )

            total_loss = masked_lm_loss + next_sentence_loss
            total_loss.backward()
            return (
                seq_relationship_scores,
                next_sentence_labels,
                total_loss,
                masked_lm_loss,
                next_sentence_loss,
            )

    bert_graph = BertGraph()

    class BertEvalGraph(nn.Graph):
        def __init__(self):
            super().__init__()
            self.bert = bert_model
            self._test_data_loader = test_data_loader
            self.config.allow_fuse_add_to_output(True)

        def build(self):
            (
                input_ids,
                next_sent_labels,
                input_masks,
                segment_ids,
                masked_lm_ids,
                masked_lm_positions,
                masked_lm_weights,
            ) = self._test_data_loader()
            input_ids = input_ids.to(device=device)
            input_masks = input_masks.to(device=device)
            segment_ids = segment_ids.to(device=device)
            next_sent_labels = next_sent_labels.to(device=device)
            masked_lm_ids = masked_lm_ids.to(device=device)
            masked_lm_positions = masked_lm_positions.to(device)

            with flow.no_grad():
                # 1. forward the next_sentence_prediction and masked_lm model
                _, seq_relationship_scores = self.bert(
                    input_ids, input_masks, segment_ids
                )

            return seq_relationship_scores, next_sent_labels

    bert_eval_graph = BertEvalGraph()

    train_total_losses = []
    for epoch in range(args.epochs):
        metric = Metric(
            desc="bert pretrain",
            print_steps=args.loss_print_every_n_iters,
            batch_size=args.train_global_batch_size * args.grad_acc_steps,
            keys=["total_loss", "mlm_loss", "nsp_loss", "pred_acc"],
        )

        # Train
        bert_model.train()
        for step in range(len(train_data_loader)):
            bert_outputs = pretrain(bert_graph, args.metric_local)
            if flow.env.get_rank() == 0:
                metric.metric_cb(step, epoch=epoch)(bert_outputs)
            train_total_losses.append(bert_outputs["total_loss"])

        # Eval
        bert_model.eval()
        val_acc = validation(
            epoch,
            len(test_data_loader),
            bert_eval_graph,
            args.val_print_every_n_iters,
            args.metric_local,
        )

        save_model(bert_model, args.checkpoint_path, epoch, val_acc, args.use_consistent)
def test(test_case):
    num_inputs, num_outputs, num_hiddens = 784, 10, 256
    net = nn.Sequential(
        FlattenLayer(),
        nn.Linear(num_inputs, num_hiddens),
        nn.ReLU(),
        nn.Linear(num_hiddens, num_outputs),
    )

    if os.getenv("ONEFLOW_TEST_CPU_ONLY"):
        device = flow.device("cpu")
    else:
        device = flow.device("cuda")
    net.to(device)

    batch_size = 256
    num_epochs = 1
    data_dir = os.path.join(
        os.getenv("ONEFLOW_TEST_CACHE_DIR", "./data-test"), "fashion-mnist"
    )
    source_url = "https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/mnist/Fashion-MNIST/"
    train_iter, test_iter = load_data_fashion_mnist(
        batch_size, resize=None, root=data_dir, download=True, source_url=source_url
    )
    loss = nn.CrossEntropyLoss()
    loss.to(device)
    optimizer = flow.optim.SGD(net.parameters(), lr=0.1)

    final_accuracy = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        start = time.time()
        for X, y in train_iter:
            X = X.to(device=device)
            y = y.to(device=device)
            y_hat = net(X)
            l = loss(y_hat, y).sum()
            optimizer.zero_grad()
            l.backward()
            optimizer.step()

            train_l_sum += l.numpy()
            train_acc_sum += (y_hat.argmax(dim=1).numpy() == y.numpy()).sum()
            n += y.shape[0]
            if n > 200:
                break

        test_acc = evaluate_accuracy(test_iter, net)
        final_accuracy = train_acc_sum / n
        print(
            "epoch %d, loss %.4f, train acc %.3f, test acc %.3f, cost >>>>>>> %s(s)"
            % (
                epoch + 1,
                train_l_sum / n,
                final_accuracy,
                test_acc,
                str(time.time() - start),
            )
        )

    final_accuracy = train_acc_sum / n
def _test(test_case):
    if os.getenv("ONEFLOW_TEST_CPU_ONLY"):
        device = flow.device("cpu")
    else:
        device = flow.device("cuda")

    net = Net()
    net.to(device)

    optimizer = optim.SGD(net.parameters(), lr=0.002, momentum=0.9)
    criterion = nn.CrossEntropyLoss()
    criterion.to(device)

    transform = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]
    )

    train_epoch = 1
    batch_size = 4
    num_workers = 0
    data_dir = os.path.join(os.getenv("ONEFLOW_TEST_CACHE_DIR", "./data-test"), "cifar10")

    train_iter, test_iter = load_data_cifar10(
        batch_size=batch_size,
        data_dir=data_dir,
        download=True,
        transform=transform,
        source_url="https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/cifar/cifar-10-python.tar.gz",
        num_workers=num_workers,
    )

    final_loss = 0
    for epoch in range(1, train_epoch + 1):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(train_iter, 1):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs = inputs.to(dtype=flow.float32, device=device)
            labels = labels.to(dtype=flow.int64, device=device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 200 == 0:  # print every 200 mini-batches
                final_loss = running_loss / 200
                print("epoch: %d  step: %5d  loss: %.3f " % (epoch, i, final_loss))
                running_loss = 0.0
                break

    print("final loss : ", final_loss)
def loss(self, cls_score, labels):
    # nn.CrossEntropyLoss is a Module: construct it first, then call it on the logits.
    losses = nn.CrossEntropyLoss()(cls_score, labels)
    return losses
# net = EfficientNetB0()
# net = RegNetX_200MF()
# net = SimpleDLA()
net = net.to(device)
net.train()

if args.resume:
    # Load checkpoint.
    print('==> Resuming from checkpoint..')
    assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
    checkpoint = flow.load('./checkpoint/ckpt.pth')
    net.load_state_dict(checkpoint['net'])
    best_acc = checkpoint['acc']
    start_epoch = checkpoint['epoch']

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
scheduler = flow.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)


class ResNet18TrainGraph(flow.nn.Graph):
    def __init__(self):
        super().__init__()
        self.model = net
        self.loss_fn = criterion
        self.add_optimizer(optimizer, lr_sch=scheduler)

    def build(self, x, y):
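# The body of `build` is cut off above. As a rough sketch only (modeled on the
# BertGraph.build pattern shown elsewhere in this section, not the original
# implementation), a train-graph build step usually runs the forward pass,
# computes the loss, and calls backward so the attached optimizer can update:
#
#     def build(self, x, y):
#         y_hat = self.model(x)
#         loss = self.loss_fn(y_hat, y)
#         loss.backward()
#         return loss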
def main(args):
    transform = vision.transforms.Compose(
        [
            vision.transforms.ToTensor(),
            vision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]
    )
    trainset = vision.datasets.CIFAR10(
        root=args.data_root, train=True, download=True, transform=transform
    )
    trainloader = flow.utils.data.DataLoader(
        trainset, batch_size=args.train_batch_size, shuffle=True, num_workers=1
    )
    testset = vision.datasets.CIFAR10(
        root=args.data_root, train=False, download=True, transform=transform
    )
    testloader = flow.utils.data.DataLoader(
        testset, batch_size=args.val_batch_size, shuffle=False, num_workers=1
    )
    classes = (
        "plane",
        "car",
        "bird",
        "cat",
        "deer",
        "dog",
        "frog",
        "horse",
        "ship",
        "truck",
    )
    device = flow.device("cuda")

    expert_network = MLP(input_size=3072, output_size=10, hidden_size=256)
    net = MoE(expert_network, 3072, 10, num_experts=10, noisy_gating=True, k=4)
    net.to(device)

    optimizer = optim.SGD(net.parameters(), lr=args.learning_rate, momentum=args.mom)
    criterion = nn.CrossEntropyLoss()
    criterion.to(device)

    for epoch in range(args.epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            inputs = inputs.view(inputs.shape[0], -1)
            outputs, aux_loss = net(inputs)
            loss = criterion(outputs, labels)
            # add the MoE auxiliary loss returned by the gating network
            total_loss = loss + aux_loss
            total_loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 100 == 99:  # print every 100 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / 100))
                running_loss = 0.0

    print("Finished Training")

    correct = 0
    total = 0
    with flow.no_grad():
        for i, data in enumerate(testloader, 0):
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs, _ = net(images.view(images.shape[0], -1))
            _, predicted = flow.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(
        "Accuracy of the network on the 10000 test images: %d %%"
        % (100 * correct / total)
    )
def train_eval(config):
    """
    training and testing model

    Args:
        config: configuration items

    Returns:
        the trained model and the evaluation results
    """
    # loading the features preprocessed by preprocess.py
    if config.feature_method == "o":
        x_train, x_test, y_train, y_test = of.load_feature(
            config, config.train_feature_path_opensmile, train=True
        )
    elif config.feature_method == "l":
        x_train, x_test, y_train, y_test = lf.load_feature(
            config, config.train_feature_path_librosa, train=True
        )

    n_feats = x_train.shape[1]
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    train_dataset = SpeechDataset(x_train, y_train)
    test_dataset = SpeechDataset(x_test, y_test)
    train_iter = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    test_iter = DataLoader(test_dataset, batch_size=config.batch_size, shuffle=True)

    if config.model == "lstm":
        model = lstm_ser(n_feats, config.rnn_size, len(config.class_labels), config.batch_size)
    else:
        model = cnn1d_ser(1, config.n_kernels, n_feats, config.hidden_size, len(config.class_labels))

    loss_fn = nn.CrossEntropyLoss()
    model.to("cuda")
    loss_fn.to("cuda")
    optimizer = flow.optim.Adam(model.parameters(), lr=config.lr)

    def train(iter, model, loss_fn, optimizer):
        size = len(iter.dataset)
        num_batches = len(iter)
        train_loss, correct = 0, 0
        for batch, (x, y) in enumerate(iter):
            x = x.reshape(1, x.shape[0], x.shape[1])
            x = flow.tensor(x, dtype=flow.float32, device="cuda")
            y = flow.tensor(y, dtype=flow.int32, device="cuda")

            # Compute prediction error
            pred = model(x)
            loss = loss_fn(pred, y)
            bool_value = np.argmax(pred.numpy(), 1) == y.numpy()
            correct += float(bool_value.sum())
            train_loss += loss

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            current = batch * config.batch_size
            if batch % 15 == 0:
                print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

        return train_loss / num_batches, 100 * correct / size

    def test(iter, model, loss_fn):
        size = len(iter.dataset)
        num_batches = len(iter)
        model.eval()
        test_loss, correct = 0, 0
        flag = 0
        with flow.no_grad():
            for x, y in iter:
                # pad the last incomplete batch with zeros so the input shape stays fixed
                if x.shape[0] != config.batch_size:
                    flag = 1
                    n = config.batch_size - x.shape[0]
                    x_comp = flow.zeros((n, x.shape[1]))
                    y_comp = flow.zeros(y.shape[0])
                    x = flow.tensor(np.vstack((x.numpy(), x_comp.numpy())))
                    y = flow.tensor(np.hstack((y.numpy(), y_comp.numpy())))
                x = x.reshape(1, x.shape[0], x.shape[1])
                x = flow.tensor(x, dtype=flow.float32, device="cuda")
                y = flow.tensor(y, dtype=flow.int32, device="cuda")
                pred = model(x)
                test_loss += loss_fn(pred, y)
                if flag == 0:
                    bool_value = np.argmax(pred.numpy(), 1) == y.numpy()
                else:
                    bool_value = np.argmax(pred.numpy()[0:16], 1) == y.numpy()[0:16]
                correct += float(bool_value.sum())

        test_loss /= num_batches
        print("test_loss", test_loss, "num_batches ", num_batches)
        correct /= size
        print(
            f"Test Error: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f}"
        )
        return test_loss, 100 * correct

    train_loss, train_acc, test_loss, test_acc = [], [], [], []
    for e in range(config.epochs):
        print(f"Epoch {e + 1}\n-------------------------------")
        tr_loss, tr_acc = train(train_iter, model, loss_fn, optimizer)
        train_loss.append(tr_loss.numpy())
        train_acc.append(tr_acc)
        te_loss, te_acc = test(test_iter, model, loss_fn)
        test_loss.append(te_loss.numpy())
        test_acc.append(te_acc)
    print("Done!")

    # Saving the trained model
    model_path = os.path.join(config.checkpoint_path, config.checkpoint_name)
    if os.path.exists(model_path):
        shutil.rmtree(model_path)
    flow.save(model.state_dict(), model_path)

    # Visualize the training process
    if config.vis:
        curve(train_acc, test_acc, "Accuracy", "acc")
        curve(train_loss, test_loss, "Loss", "loss")

    return train_loss, test_loss, train_acc, test_acc
def train():
    train_data, train_labels, test_data, test_labels = prepare_data()
    best_accuracy = 0.0
    best_epoch = 0

    print("Setting model...")
    model = TransformerEncoderModel(
        emb_sz=args.vocab_sz,
        n_classes=args.n_classes,
        d_model=args.d_model,
        nhead=args.n_head,
        num_encoder_layers=args.n_encoder_layers,
        dim_feedforward=args.dim_feedforward,
        dropout=args.dropout,
        batch_first=True,
    )
    criterion = nn.CrossEntropyLoss()
    model.to("cuda")
    criterion.to("cuda")
    of_adam = flow.optim.Adam(model.parameters(), lr=args.lr)
    if args.load_dir != ".":
        model.load_state_dict(flow.load(args.load_dir))

    print("Starting training...")
    training_time = 0
    for epoch in range(1, args.n_epochs + 1):
        print("[Epoch:{}]".format(epoch))
        model.train()
        data, label = shuffle_batch(train_data, train_labels, args.batch_size)
        s_t = time.time()
        epoch_loss = 0
        for i, (texts, labels) in enumerate(zip(data, label)):
            output = model(texts)
            loss = criterion(output, labels)
            loss.backward()
            of_adam.step()
            of_adam.zero_grad()
            epoch_loss += loss.numpy()
            if i % 50 == 0 or i == data.shape[0] - 1:
                print("{0:d}/{1:d}, loss:{2:.4f}".format(i + 1, data.shape[0], loss.numpy()))
        epoch_loss /= data.shape[0]
        e_t = time.time() - s_t
        training_time += e_t
        print("Epoch:{0:d} training time:{1:.2f}s, loss:{2:.4f}".format(epoch, e_t, epoch_loss))

        model.eval()
        data, label = shuffle_batch(test_data, test_labels, args.batch_size)
        g = {"correct": 0, "total": 0}
        for i, (texts, labels) in enumerate(zip(data, label)):
            logits = model(texts)
            acc(labels, logits, g)

        accuracy = g["correct"] * 100 / g["total"]
        print("[Epoch:{0:d} ] accuracy: {1:.1f}%".format(epoch, accuracy))
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_epoch = epoch
            if not os.path.exists(args.save_dir):
                os.mkdir(args.save_dir)
            else:
                shutil.rmtree(args.save_dir)
                assert not os.path.exists(args.save_dir)
                os.mkdir(args.save_dir)
            print("Epoch:{} save best model.".format(best_epoch))
            flow.save(model.state_dict(), args.save_dir)

    print("Epoch:{} get best accuracy:{}, average training time:{}s".format(
        best_epoch, best_accuracy, training_time / args.n_epochs))
input_mask = lazy_info["input_mask"]
segment_ids = lazy_info["segment_ids"]
masked_lm_ids = lazy_info["masked_lm_ids"]
masked_lm_positions = lazy_info["masked_lm_positions"]
masked_lm_weights = lazy_info["masked_lm_weights"]

bert_module.to("cuda")

prediction_scores, seq_relationship_scores = bert_module(
    flow.tensor(input_ids).to("cuda"),
    flow.tensor(segment_ids).to("cuda"),
    flow.tensor(input_mask).to("cuda"),
)

next_sentence_loss = nn.CrossEntropyLoss()(
    seq_relationship_scores.view(-1, 2),
    flow.tensor(next_sentence_labels).view(-1).to("cuda"),
)
logit_prob, masked_lm_loss = get_masked_lm_loss(
    prediction_scores,
    flow.tensor(masked_lm_positions).to("cuda"),
    flow.tensor(masked_lm_ids).to("cuda"),
    flow.tensor(masked_lm_weights).to("cuda"),
)
eager_total_loss = next_sentence_loss + masked_lm_loss

# Loss equal
assert np.allclose(total_loss, eager_total_loss.numpy()), "total loss is not equal!"
assert np.allclose(
def train(self):
    # Learning rate cache for decaying.
    g_lr = self.g_lr
    d_lr = self.d_lr
    c_lr = self.c_lr
    start_iters = 0
    if self.resume_iters:
        pass

    norm = Normalizer()
    data_iter = iter(self.data_loader)

    print("Start training......")
    start_time = datetime.now()
    for i in range(start_iters, self.num_iters):
        # Preprocess input data
        # Fetch real images and labels.
        try:
            x_real, speaker_idx_org, label_org = next(data_iter)
        except:
            data_iter = iter(self.data_loader)
            x_real, speaker_idx_org, label_org = next(data_iter)

        # Generate target domain labels randomly.
        rand_idx = flow.randperm(label_org.size(0))
        label_trg = label_org[rand_idx]
        speaker_idx_trg = speaker_idx_org[rand_idx]

        x_real = x_real.to(self.device)
        # Original domain one-hot labels.
        label_org = label_org.to(self.device)
        # Target domain one-hot labels.
        label_trg = label_trg.to(self.device)
        speaker_idx_org = speaker_idx_org.to(self.device)
        speaker_idx_trg = speaker_idx_trg.to(self.device)

        # Train the discriminator
        # Compute loss with real audio frame.
        CELoss = nn.CrossEntropyLoss()
        cls_real = self.C(x_real)
        cls_loss_real = CELoss(input=cls_real, target=speaker_idx_org)

        self.reset_grad()
        cls_loss_real.backward()
        self.c_optimizer.step()

        # Logging.
        loss = {}
        loss["C/C_loss"] = cls_loss_real.item()

        out_r = self.D(x_real, label_org)
        # Compute loss with fake audio frame.
        x_fake = self.G(x_real, label_trg)
        out_f = self.D(x_fake.detach(), label_trg)
        d_loss_t = nn.BCEWithLogitsLoss()(
            input=out_f, target=flow.zeros_like(out_f).float()
        ) + nn.BCEWithLogitsLoss()(input=out_r, target=flow.ones_like(out_r).float())

        out_cls = self.C(x_fake)
        d_loss_cls = CELoss(input=out_cls, target=speaker_idx_trg)

        # Compute loss for gradient penalty.
        alpha = flow.rand(x_real.size(0), 1, 1, 1).to(self.device)
        x_hat = (alpha * x_real + (1 - alpha) * x_fake).detach().requires_grad_(True)
        out_src = self.D(x_hat, label_trg)

        # TODO: Second-order derivation is not currently supported in oneflow,
        # so gradient penalty cannot be used temporarily.
        if self.use_gradient_penalty:
            d_loss_gp = self.gradient_penalty(out_src, x_hat)
            d_loss = d_loss_t + self.lambda_cls * d_loss_cls + 5 * d_loss_gp
        else:
            d_loss = d_loss_t + self.lambda_cls * d_loss_cls

        self.reset_grad()
        d_loss.backward()
        self.d_optimizer.step()

        loss["D/D_loss"] = d_loss.item()

        # Train the generator
        if (i + 1) % self.n_critic == 0:
            # Original-to-target domain.
            x_fake = self.G(x_real, label_trg)
            g_out_src = self.D(x_fake, label_trg)
            g_loss_fake = nn.BCEWithLogitsLoss()(
                input=g_out_src, target=flow.ones_like(g_out_src).float()
            )

            out_cls = self.C(x_real)
            g_loss_cls = CELoss(input=out_cls, target=speaker_idx_org)

            # Target-to-original domain.
            x_reconst = self.G(x_fake, label_org)
            g_loss_rec = nn.L1Loss()(x_reconst, x_real)

            # Original-to-Original domain (identity).
            x_fake_iden = self.G(x_real, label_org)
            id_loss = nn.L1Loss()(x_fake_iden, x_real)

            # Backward and optimize.
            g_loss = (
                g_loss_fake
                + self.lambda_cycle * g_loss_rec
                + self.lambda_cls * g_loss_cls
                + self.lambda_identity * id_loss
            )
            self.reset_grad()
            g_loss.backward()
            self.g_optimizer.step()

            # Logging.
            loss["G/loss_fake"] = g_loss_fake.item()
            loss["G/loss_rec"] = g_loss_rec.item()
            loss["G/loss_cls"] = g_loss_cls.item()
            loss["G/loss_id"] = id_loss.item()
            loss["G/g_loss"] = g_loss.item()

        # Miscellaneous
        # Print out training information.
        if (i + 1) % self.log_step == 0:
            et = datetime.now() - start_time
            et = str(et)[:-7]
            log = "Elapsed [{}], Iteration [{}/{}]".format(et, i + 1, self.num_iters)
            for tag, value in loss.items():
                log += ", {}: {:.4f}".format(tag, value)
            print(log)

        # Translate fixed images for debugging.
        if (i + 1) % self.sample_step == 0:
            with flow.no_grad():
                d, speaker = TestSet(self.test_dir).test_data()
                target = random.choice([x for x in speakers if x != speaker])
                label_t = self.spk_enc.transform([target])[0]
                label_t = np.asarray([label_t])

                for filename, content in d.items():
                    f0 = content["f0"]
                    ap = content["ap"]
                    sp_norm_pad = self.pad_coded_sp(content["coded_sp_norm"])

                    convert_result = []
                    for start_idx in range(0, sp_norm_pad.shape[1] - FRAMES + 1, FRAMES):
                        one_seg = sp_norm_pad[:, start_idx : start_idx + FRAMES]

                        one_seg = flow.Tensor(one_seg).to(self.device)
                        one_seg = one_seg.view(1, 1, one_seg.size(0), one_seg.size(1))
                        l = flow.Tensor(label_t)
                        one_seg = one_seg.to(self.device)
                        l = l.to(self.device)
                        one_set_return = self.G(one_seg, l).detach().cpu().numpy()
                        one_set_return = np.squeeze(one_set_return)
                        one_set_return = norm.backward_process(one_set_return, target)
                        convert_result.append(one_set_return)

                    convert_con = np.concatenate(convert_result, axis=1)
                    convert_con = convert_con[:, 0 : content["coded_sp_norm"].shape[1]]
                    contigu = np.ascontiguousarray(convert_con.T, dtype=np.float64)
                    decoded_sp = decode_spectral_envelope(
                        contigu, SAMPLE_RATE, fft_size=FFTSIZE
                    )
                    f0_converted = norm.pitch_conversion(f0, speaker, target)
                    wav = synthesize(f0_converted, decoded_sp, ap, SAMPLE_RATE)

                    name = f"{speaker}-{target}_iter{i + 1}_{filename}"
                    path = os.path.join(self.sample_dir, name)
                    print(f"[save]:{path}")
                    sf.write(path, wav, SAMPLE_RATE)

        # Save model checkpoints.
        if (i + 1) % self.model_save_step == 0:
            G_path = os.path.join(self.model_save_dir, "{}-G".format(i + 1))
            D_path = os.path.join(self.model_save_dir, "{}-D".format(i + 1))
            C_path = os.path.join(self.model_save_dir, "{}-C".format(i + 1))
            flow.save(self.G.state_dict(), G_path)
            flow.save(self.D.state_dict(), D_path)
            flow.save(self.C.state_dict(), C_path)
            print("Saved model checkpoints into {}...".format(self.model_save_dir))

        # Decay learning rates.
        if (i + 1) % self.lr_update_step == 0 and (i + 1) > (
            self.num_iters - self.num_iters_decay
        ):
            g_lr -= self.g_lr / float(self.num_iters_decay)
            d_lr -= self.d_lr / float(self.num_iters_decay)
            c_lr -= self.c_lr / float(self.num_iters_decay)
            self.update_lr(g_lr, d_lr, c_lr)
            print("Decayed learning rates, g_lr: {}, d_lr: {}.".format(g_lr, d_lr))
def main():
    args = get_config()

    if args.with_cuda:
        device = flow.device("cuda")
    else:
        device = flow.device("cpu")

    print("Creating Dataloader")
    train_data_loader = OfRecordDataLoader(
        ofrecord_dir=args.ofrecord_path,
        mode="train",
        dataset_size=args.train_dataset_size,
        batch_size=args.train_batch_size,
        data_part_num=args.train_data_part,
        seq_length=args.seq_length,
        max_predictions_per_seq=args.max_predictions_per_seq,
        consistent=False,
    )

    test_data_loader = OfRecordDataLoader(
        ofrecord_dir=args.ofrecord_path,
        mode="test",
        dataset_size=1024,
        batch_size=args.val_batch_size,
        data_part_num=4,
        seq_length=args.seq_length,
        max_predictions_per_seq=args.max_predictions_per_seq,
        consistent=False,
    )

    print("Building BERT Model")
    hidden_size = 64 * args.num_attention_heads
    intermediate_size = 4 * hidden_size
    bert_model = BertForPreTraining(
        args.vocab_size,
        args.seq_length,
        hidden_size,
        args.num_hidden_layers,
        args.num_attention_heads,
        intermediate_size,
        nn.GELU(),
        args.hidden_dropout_prob,
        args.attention_probs_dropout_prob,
        args.max_position_embeddings,
        args.type_vocab_size,
    )

    # Load the same initial parameters with lazy model.
    # from utils.compare_lazy_outputs import load_params_from_lazy
    # load_params_from_lazy(
    #     bert_model.state_dict(),
    #     "../../OneFlow-Benchmark/LanguageModeling/BERT/initial_model",
    # )

    bert_model = bert_model.to(device)
    if args.use_ddp:
        bert_model = ddp(bert_model)

    optimizer = build_optimizer(
        args.optim_name,
        bert_model,
        args.lr,
        args.weight_decay,
        weight_decay_excludes=["bias", "LayerNorm", "layer_norm"],
        clip_grad_max_norm=1,
        clip_grad_norm_type=2.0,
    )

    steps = args.epochs * len(train_data_loader)
    warmup_steps = int(steps * args.warmup_proportion)

    lr_scheduler = PolynomialLR(optimizer, steps=steps, end_learning_rate=0.0)
    lr_scheduler = flow.optim.lr_scheduler.WarmUpLR(
        lr_scheduler, warmup_factor=0, warmup_iters=warmup_steps, warmup_method="linear"
    )

    ns_criterion = nn.CrossEntropyLoss(reduction="mean")
    mlm_criterion = nn.CrossEntropyLoss(reduction="none")

    def get_masked_lm_loss(
        logit_blob,
        masked_lm_positions,
        masked_lm_labels,
        label_weights,
        max_prediction_per_seq,
    ):
        # gather valid position indices
        logit_blob = flow.gather(
            logit_blob,
            index=masked_lm_positions.unsqueeze(2).repeat(1, 1, args.vocab_size),
            dim=1,
        )
        logit_blob = flow.reshape(logit_blob, [-1, args.vocab_size])
        label_id_blob = flow.reshape(masked_lm_labels, [-1])

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        pre_example_loss = mlm_criterion(logit_blob, label_id_blob)
        pre_example_loss = flow.reshape(pre_example_loss, [-1, max_prediction_per_seq])
        numerator = flow.sum(pre_example_loss * label_weights)
        denominator = flow.sum(label_weights) + 1e-5
        loss = numerator / denominator
        return loss

    train_total_losses = []
    for epoch in range(args.epochs):
        metric = Metric(
            desc="bert pretrain",
            print_steps=args.loss_print_every_n_iters,
            batch_size=args.train_batch_size,
            keys=["total_loss", "mlm_loss", "nsp_loss", "pred_acc"],
        )

        # Train
        bert_model.train()
        for step in range(len(train_data_loader)):
            bert_outputs = pretrain(
                train_data_loader,
                bert_model,
                ns_criterion,
                partial(
                    get_masked_lm_loss,
                    max_prediction_per_seq=args.max_predictions_per_seq,
                ),
                optimizer,
                lr_scheduler,
            )
            if flow.env.get_rank() == 0:
                metric.metric_cb(step, epoch=epoch)(bert_outputs)
            train_total_losses.append(bert_outputs["total_loss"])

        # Eval
        bert_model.eval()
        val_acc = validation(
            epoch, test_data_loader, bert_model, args.val_print_every_n_iters
        )

        save_model(bert_model, args.checkpoint_path, epoch, val_acc, False)