def make_data_loader(args, mode, is_consistent=False, synthetic=False):
    """Construct a data loader for training or validation.

    When ``synthetic`` is set, returns a ``SyntheticDataLoader`` moved to CUDA;
    otherwise returns an ``OFRecordDataLoader`` reading ``args.ofrecord_path``.
    In consistent (global) mode the loader is placed on all CPU devices and
    split along the batch dimension.
    """
    assert mode in ("train", "validation")

    if mode == "train":
        total_batch_size = args.batch_size * flow.env.get_world_size()
        batch_size = args.batch_size
        num_samples = args.num_image
    else:
        total_batch_size = args.val_global_batch_size
        batch_size = args.val_batch_size
        num_samples = args.val_samples_per_epoch

    placement, sbp = None, None
    if is_consistent:
        # Global mode: a single logical loader over every device, batch-split.
        placement = flow.env.all_device_placement("cpu")
        sbp = flow.sbp.split(0)
        batch_size = total_batch_size

    if synthetic:
        synthetic_loader = SyntheticDataLoader(
            batch_size=batch_size,
            num_classes=args.num_classes,
            placement=placement,
            sbp=sbp,
        )
        return synthetic_loader.to("cuda")

    return OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode=mode,
        dataset_size=num_samples,
        batch_size=batch_size,
        total_batch_size=total_batch_size,
        data_part_num=args.ofrecord_part_num,
        placement=placement,
        sbp=sbp,
    )
def main(args):
    """Train MobileNetV2 on OFRecord data, validating after every epoch.

    Optionally restores a checkpoint, trains with SGD + cross-entropy on CUDA,
    prints the loss every ``print_interval`` iterations, saves a checkpoint per
    epoch named after its validation accuracy, and writes the recorded losses
    to ``of_losses.txt``.
    """
    train_data_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode="train",
        dataset_size=9469,
        batch_size=args.train_batch_size,
    )
    val_data_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode="val",
        dataset_size=3925,
        batch_size=args.val_batch_size,
    )

    # oneflow init
    start_t = time.time()
    mobilenetv2_module = mobilenet_v2()
    if args.load_checkpoint != "":
        print("load_checkpoint >>>>>>>>> ", args.load_checkpoint)
        mobilenetv2_module.load_state_dict(flow.load(args.load_checkpoint))
    end_t = time.time()
    print("init time : {}".format(end_t - start_t))

    of_cross_entropy = flow.nn.CrossEntropyLoss()
    mobilenetv2_module.to("cuda")
    of_cross_entropy.to("cuda")
    of_sgd = flow.optim.SGD(
        mobilenetv2_module.parameters(), lr=args.learning_rate, momentum=args.mom
    )

    of_losses = []
    all_samples = len(val_data_loader) * args.val_batch_size
    print_interval = 20

    for epoch in range(args.epochs):
        mobilenetv2_module.train()
        for b in range(len(train_data_loader)):
            image, label = train_data_loader.get_batch()
            # oneflow train
            start_t = time.time()
            image = image.to("cuda")
            label = label.to("cuda")
            logits = mobilenetv2_module(image)
            loss = of_cross_entropy(logits, label)
            loss.backward()
            of_sgd.step()
            of_sgd.zero_grad()
            end_t = time.time()
            if b % print_interval == 0:
                # FIX: renamed ambiguous local `l` -> `loss_value` (PEP 8 E741).
                loss_value = loss.numpy()
                of_losses.append(loss_value)
                print(
                    "epoch {} train iter {} oneflow loss {}, train time : {}".format(
                        epoch, b, loss_value, end_t - start_t
                    )
                )

        print("epoch %d train done, start validation" % epoch)
        mobilenetv2_module.eval()
        correct_of = 0.0
        for b in range(len(val_data_loader)):
            image, label = val_data_loader.get_batch()
            start_t = time.time()
            image = image.to("cuda")
            with flow.no_grad():
                logits = mobilenetv2_module(image)
                predictions = logits.softmax()
            of_predictions = predictions.numpy()
            clsidxs = np.argmax(of_predictions, axis=1)
            label_nd = label.numpy()
            for i in range(args.val_batch_size):
                if clsidxs[i] == label_nd[i]:
                    correct_of += 1
            end_t = time.time()

        print("epoch %d, oneflow top1 val acc: %f" % (epoch, correct_of / all_samples))
        flow.save(
            mobilenetv2_module.state_dict(),
            os.path.join(
                args.save_checkpoint_path,
                "epoch_%d_val_acc_%f" % (epoch, correct_of / all_samples),
            ),
        )

    # FIX: context manager guarantees the file is closed even if a write fails
    # (the original open()/close() pair leaked the handle on exception).
    with open("of_losses.txt", "w") as writer:
        for o in of_losses:
            writer.write("%f\n" % o)
def setup(args):
    """Build everything the eager/graph comparison run needs.

    Creates train/val OFRecord loaders, two ResNet-50 models initialised with
    identical weights (one for eager mode, one wrapped in nn.Graph), their SGD
    optimizers, a shared CUDA cross-entropy criterion, and the compiled
    train/eval graphs.

    Returns:
        dict with keys ``train_dataloader``, ``val_dataloader``,
        ``eager`` ([model, optimizer, criterion]) and
        ``graph`` ([model, train_graph, eval_graph]).
    """
    train_data_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode="train",
        dataset_size=9469,
        batch_size=args.train_batch_size,
    )
    val_data_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode="val",
        dataset_size=3925,
        batch_size=args.val_batch_size,
    )

    # model setup: give both models identical initial weights so eager and
    # graph execution can be compared step-for-step.
    eager_model = resnet50()
    graph_model = resnet50()
    graph_model.load_state_dict(eager_model.state_dict())

    eager_model.to("cuda")
    graph_model.to("cuda")

    # optimizer setup
    eager_optimizer = flow.optim.SGD(
        eager_model.parameters(), lr=args.learning_rate, momentum=args.mom
    )
    graph_optimizer = flow.optim.SGD(
        graph_model.parameters(), lr=args.learning_rate, momentum=args.mom
    )

    # criterion setup
    # FIX: the original constructed CrossEntropyLoss twice; the first instance
    # was dead code, immediately shadowed here. Construct it once.
    criterion = flow.nn.CrossEntropyLoss()
    criterion = criterion.to("cuda")

    class ModelTrainGraph(flow.nn.Graph):
        def __init__(self):
            super().__init__()
            self.graph_model = graph_model
            self.criterion = criterion
            self.add_optimizer(graph_optimizer)

        def build(self, image, label):
            logits = self.graph_model(image)
            loss = self.criterion(logits, label)
            loss.backward()
            return loss

    class ModelEvalGraph(flow.nn.Graph):
        def __init__(self):
            super().__init__()
            self.graph_model = graph_model

        def build(self, image):
            with flow.no_grad():
                logits = self.graph_model(image)
                predictions = logits.softmax()
            return predictions

    model_train_graph = ModelTrainGraph()
    model_eval_graph = ModelEvalGraph()

    dic = {
        "train_dataloader": train_data_loader,
        "val_dataloader": val_data_loader,
        "eager": [eager_model, eager_optimizer, criterion],
        "graph": [graph_model, model_train_graph, model_eval_graph],
    }
    return dic
def main(args):
    """Full eager-mode training driver.

    Builds the data loaders and model, runs ``args.epochs`` epochs of
    train/validate, checkpoints after every epoch (keeping a separate copy of
    the best model), and saves loss/accuracy histories to text files.
    """
    # Data Setup
    train_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode="train",
        dataset_size=9469,
        batch_size=args.train_batch_size,
    )
    val_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode="val",
        dataset_size=3925,
        batch_size=args.val_batch_size,
    )

    # Model Setup
    print("***** Initialization *****")
    init_start = time.time()
    model = build_model(args)
    if args.load_checkpoint != "":
        print("load_checkpoint >>>>>>>>> ", args.load_checkpoint)
        model.load_state_dict(flow.load(args.load_checkpoint))
    init_end = time.time()
    print("init time : {}".format(init_end - init_start))

    # Training Setup
    loss_fn = flow.nn.CrossEntropyLoss()
    model.to("cuda")
    loss_fn.to("cuda")
    sgd = flow.optim.SGD(
        model.parameters(),
        lr=args.learning_rate,
        momentum=args.mom,
        weight_decay=args.weight_decay,
    )
    scheduler = flow.optim.lr_scheduler.StepLR(sgd, step_size=30, gamma=0.1)

    losses, accuracies = [], []
    best_accuracy = 0.0

    for epoch in range(args.epochs):
        print("***** Runing Training *****")
        epoch_loss = train_one_epoch(
            args, model, loss_fn, train_loader, sgd, epoch, scheduler
        )

        print("***** Run Validation *****")
        val_acc = valid(args, model, loss_fn, val_loader)

        # save model after each epoch
        print("***** Save Checkpoint *****")
        ckpt_dir = os.path.join(
            args.save_checkpoint_path, "epoch_%d_val_acc_%f" % (epoch, val_acc)
        )
        save_checkpoint(model, ckpt_dir)
        print("Save checkpoint to: ", ckpt_dir)

        # save best model
        if best_accuracy < val_acc:
            best_dir = os.path.join(args.save_checkpoint_path, "best_model")
            if os.path.exists(best_dir):
                shutil.rmtree(best_dir, True)
            save_checkpoint(model, best_dir)
            best_accuracy = val_acc

        losses.append(epoch_loss)
        accuracies.append(val_acc)

    print("End Training!")
    print("Max Accuracy: ", best_accuracy)

    # saving training information
    print("***** Save Logs *****")
    save_logs(losses, "eager/losses.txt")
    print("Save loss info to: ", "eager/losses.txt")
    save_logs(accuracies, "eager/accuracy.txt")
    print("Save acc info to: ", "eager/accuracy.txt")
def main(args):
    """Train a ViT model with OneFlow nn.Graph and validate each epoch.

    Compiles a training graph (loader + forward + loss + backward + SGD step)
    and an eval graph (forward + softmax under no_grad), runs ``args.epochs``
    epochs, checkpoints after each epoch, and writes loss/accuracy histories
    under ``graph/``.
    """

    def _dump_floats(path, values):
        # Helper: one "%f" value per line; `with` closes the file even on error
        # (the original open()/close() pairs leaked the handle on exception).
        with open(path, "w") as f:
            for v in values:
                f.write("%f\n" % v)

    # path setup
    training_results_path = os.path.join(args.results, args.tag)
    os.makedirs(training_results_path, exist_ok=True)

    # build dataloader
    train_data_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode="train",
        dataset_size=9469,
        batch_size=args.train_batch_size,
        image_size=args.image_size,
    )
    val_data_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode="val",
        dataset_size=3925,
        batch_size=args.val_batch_size,
        image_size=args.image_size,
    )

    # oneflow init
    start_t = time.time()
    model = build_model(args)
    if args.load_checkpoint != "":
        print("load_checkpoint >>>>>>>>> ", args.load_checkpoint)
        model.load_state_dict(flow.load(args.load_checkpoint))
    end_t = time.time()
    print("init time : {}".format(end_t - start_t))

    of_cross_entropy = flow.nn.CrossEntropyLoss()
    model.to("cuda")
    of_cross_entropy.to("cuda")
    of_sgd = flow.optim.SGD(
        model.parameters(), lr=args.learning_rate, momentum=args.mom
    )

    class ViTNetGraph(flow.nn.Graph):
        def __init__(self):
            super().__init__()
            self.model = model
            self.cross_entropy = of_cross_entropy
            self.add_optimizer(of_sgd)
            self.train_data_loader = train_data_loader

        def build(self):
            image, label = self.train_data_loader()
            image = image.to("cuda")
            label = label.to("cuda")
            logits = self.model(image)
            loss = self.cross_entropy(logits, label)
            loss.backward()
            return loss

    vit_graph = ViTNetGraph()

    class ViTEvalGraph(flow.nn.Graph):
        def __init__(self):
            super().__init__()
            self.model = model
            self.val_data_loader = val_data_loader

        def build(self):
            image, label = self.val_data_loader()
            image = image.to("cuda")
            with flow.no_grad():
                logits = self.model(image)
                predictions = logits.softmax()
            return predictions, label

    vit_eval_graph = ViTEvalGraph()

    of_losses = []
    of_accuracy = []
    all_samples = len(val_data_loader) * args.val_batch_size
    print_interval = 20

    for epoch in range(args.epochs):
        model.train()
        for b in range(len(train_data_loader)):
            # oneflow graph train
            start_t = time.time()
            loss = vit_graph()
            end_t = time.time()
            if b % print_interval == 0:
                # FIX: renamed ambiguous local `l` -> `loss_value` (PEP 8 E741).
                loss_value = loss.numpy()
                of_losses.append(loss_value)
                print(
                    "epoch {} train iter {} oneflow loss {}, train time : {}".format(
                        epoch, b, loss_value, end_t - start_t
                    )
                )

        print("epoch %d train done, start validation" % epoch)
        model.eval()
        correct_of = 0.0
        for b in range(len(val_data_loader)):
            start_t = time.time()
            predictions, label = vit_eval_graph()
            of_predictions = predictions.numpy()
            clsidxs = np.argmax(of_predictions, axis=1)
            label_nd = label.numpy()
            for i in range(args.val_batch_size):
                if clsidxs[i] == label_nd[i]:
                    correct_of += 1
            end_t = time.time()

        top1 = correct_of / all_samples
        of_accuracy.append(top1)
        print("epoch %d, oneflow top1 val acc: %f" % (epoch, top1))
        flow.save(
            model.state_dict(),
            os.path.join(
                args.save_checkpoint_path,
                # FIX: reuse `top1` instead of recomputing correct_of / all_samples.
                "epoch_%d_val_acc_%f" % (epoch, top1),
            ),
        )

    _dump_floats("graph/losses.txt", of_losses)
    _dump_floats("graph/accuracy.txt", of_accuracy)