def run_task(data_directory, task_id):
    """
    Parse data, build model, and run training and testing for a single task.

    :param data_directory: Path to train and test data.
    :param task_id: Task to evaluate
    """
    print("Train and test for task %d ..." % task_id)

    # Parse data
    train_files = glob.glob('%s/qa%d_*_train.txt' % (data_directory, task_id))
    test_files = glob.glob('%s/qa%d_*_test.txt' % (data_directory, task_id))

    dictionary = {"nil": 0}
    # Story shape:     (SENTENCE_SIZE, STORY_SIZE, NUM_STORIES)
    # Questions shape: (14 (see parser.py), NUM_SAMPLES)
    # QStory shape:    (SENTENCE_SIZE, NUM_SAMPLES)
    train_story, train_questions, train_qstory = parse_babi_task(train_files, dictionary, False)
    test_story, test_questions, test_qstory = parse_babi_task(test_files, dictionary, False)

    general_config = BabiConfig(train_story, train_questions, dictionary)

    memory, model, loss = build_model(general_config)

    if general_config.linear_start:
        train_linear_start(train_story, train_questions, train_qstory,
                           memory, model, loss, general_config)
    else:
        train(train_story, train_questions, train_qstory,
              memory, model, loss, general_config)

    test(test_story, test_questions, test_qstory, memory, model, loss, general_config)
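# A minimal usage sketch (not part of the original script): run_task can be
# looped over the 20 bAbI tasks. The data directory below is a hypothetical
# example path; the actual location depends on how the dataset was unpacked.
if __name__ == "__main__":
    data_dir = "data/tasks_1-20_v1-2/en"  # hypothetical path to the bAbI data
    for task_id in range(1, 21):
        run_task(data_dir, task_id)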
def main(args):
    start_t = time.time()
    model = build_model(args)
    end_t = time.time()
    print("init time : {}".format(end_t - start_t))

    start_t = time.time()
    pretrain_models = flow.load(args.model_path)
    model.load_state_dict(pretrain_models)
    end_t = time.time()
    print("load params time : {}".format(end_t - start_t))

    model.eval()
    model.to("cuda")

    start_t = time.time()
    image = load_image(args.image_path, image_size=(args.image_size, args.image_size))
    image = flow.Tensor(image, device=flow.device("cuda"))
    predictions = model(image).softmax()
    predictions = predictions.numpy()
    end_t = time.time()
    print("infer time : {}".format(end_t - start_t))

    clsidx = np.argmax(predictions)
    print("predict prob: %f, class name: %s" % (np.max(predictions), clsidx_2_labels[clsidx]))
def train(train_cfg):
    with open(train_cfg, "r") as f:
        train_cfg = json.load(f)
    # UserDataApi = UserDataApi()
    feature_cfg = train_cfg["feature_cfg"]
    model_name = train_cfg["model"]
    model_cfg = train_cfg["model_cfg"]

    # stock_list = jq.get_index_stocks('000300.XSHG')  # jq.get_industry_stocks('I64')
    stock_list = jq.get_all_securities().index.tolist()
    # stock_list = ['300014.XSHE']
    print(stock_list)

    # Note: the feature config path read from train_cfg above is overridden
    # by this hard-coded path.
    feature_cfg = "./config/feature_create_cfg.json"
    with open(feature_cfg, "r") as f:
        feature_cfg = json.load(f)

    info = create_feature(feature_cfg, stock_list, if_dump=True, load_feature=False)
    feature = info["feature"]
    label = info["label"]

    label_index = 0
    clip_index = info["label_clip_index"][:, label_index]
    label = label[:, label_index]
    feature = feature[clip_index]
    label = label[clip_index]

    model = build_model(model_name, model_cfg)
    model.fit(feature, label)
    model.dump()
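# A minimal sketch of the config file this train() expects, inferred from the
# keys it reads ("feature_cfg", "model", "model_cfg"). The model name and its
# hyper-parameters below are hypothetical placeholders, not values from the
# original project.
import json

example_train_cfg = {
    "feature_cfg": "./config/feature_create_cfg.json",
    "model": "lightgbm",              # hypothetical model name
    "model_cfg": {"num_leaves": 31},  # hypothetical model hyper-parameters
}

with open("./config/train_cfg.json", "w") as f:
    json.dump(example_train_cfg, f, indent=2)

# train("./config/train_cfg.json")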
def train(self, sess, num_gpus):
    tf.set_random_seed(cfg.seed)
    set_log(cfg.job_dir)
    job_env = Prework(cfg)
    job_env.make_env()
    assert cfg.input_style == 0

    model, handlers = build_model(cfg, job_env, num_gpus)
    handle, train_iterator, valid_iterator, train_num, valid_num = handlers
    valid_loss_checker = checker(cfg)

    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=5, name='train_saver')
    best_saver = tf.train.Saver(tf.global_variables(), name="best_saver")
    checkpoint_path = os.path.join(job_env.job_dir, 'model.ckpt')
    train_summary_writer = tf.summary.FileWriter(job_env.train_event_dir)
    valid_summary_writer = tf.summary.FileWriter(job_env.valid_event_dir)
    train_handle = sess.run(train_iterator.string_handle())

    if cfg.resume:
        load_path = load_model(cfg, job_env, saver, sess)
        logging.info("Loading the existing model: %s" % load_path)

    epoch, train_cursor = 0, 0
    group_size = num_gpus * cfg.batch_size
    train_num_batches = train_num // group_size
    valid_num_batches = valid_num // group_size

    while True:
        try:
            start_time = time.time()
            loss, _, i_global, i_merge = sess.run(
                [model.loss, model.train_op, model.global_step, model.merged],
                feed_dict={handle: train_handle,
                           model.learning_rate: valid_loss_checker.learning_rate,
                           model.fw_dropout_keep: cfg.fw_dropout_keep,
                           model.recur_dropout_keep: cfg.recur_dropout_keep})
            iter_time = time.time() - start_time

            if i_global % cfg.train_log_freq == 0:
                report_train(epoch, i_global, train_cursor, train_num_batches,
                             loss, iter_time, i_merge, train_summary_writer)
            if i_global % cfg.valid_freq == 0:
                report_valid(sess, i_global, handle, valid_iterator, valid_num_batches,
                             model, best_saver, valid_loss_checker, job_env,
                             valid_summary_writer)
                if valid_loss_checker.should_stop():
                    break
            if i_global % cfg.save_freq == 0:
                saver.save(sess, checkpoint_path, global_step=i_global)

            train_cursor += 1
            if train_cursor == train_num_batches:
                train_cursor = 0
                epoch += 1
        except tf.errors.OutOfRangeError:
            break
    sess.close()
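# A minimal invocation sketch (assumed, not taken from the original code):
# train() is a method, so it would be called on a trainer instance together
# with a TF1 session. "Trainer" below is a hypothetical name for whatever
# class owns the method above.
num_gpus = 1
sess_config = tf.ConfigProto(allow_soft_placement=True)
sess_config.gpu_options.allow_growth = True
with tf.Session(config=sess_config) as sess:
    trainer = Trainer()  # hypothetical owner of the train() method above
    trainer.train(sess, num_gpus)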
def main(args):
    start_t = time.time()
    model = build_model(args)
    end_t = time.time()
    print("init time : {}".format(end_t - start_t))

    start_t = time.time()
    pretrain_models = flow.load(args.model_path)
    model.load_state_dict(pretrain_models)
    end_t = time.time()
    print("load params time : {}".format(end_t - start_t))

    model.eval()
    model.to("cuda")

    class ViTEvalGraph(flow.nn.Graph):
        def __init__(self):
            super().__init__()
            self.model = model

        def build(self, image):
            with flow.no_grad():
                predictions = self.model(image)
            return predictions

    vit_eval_graph = ViTEvalGraph()

    start_t = time.time()
    image = load_image(args.image_path, image_size=(args.image_size, args.image_size))
    image = flow.Tensor(image, device=flow.device("cuda"))
    predictions = vit_eval_graph(image).softmax()
    predictions = predictions.numpy()
    end_t = time.time()
    print("infer time : {}".format(end_t - start_t))

    clsidx = np.argmax(predictions)
    print("predict prob: %f, class name: %s" % (np.max(predictions), clsidx_2_labels[clsidx]))
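# A possible command-line entry point for the graph-mode inference script
# above, sketched from the attributes main() reads off `args` (model_path,
# image_path, image_size). Flag names and defaults are assumptions, and
# build_model() may read additional fields from args that are not shown here.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="ViT single-image inference")
    parser.add_argument("--model_path", type=str, required=True,
                        help="directory of pretrained OneFlow weights")
    parser.add_argument("--image_path", type=str, required=True,
                        help="path to the input image")
    parser.add_argument("--image_size", type=int, default=224,
                        help="input resolution fed to the model")
    main(parser.parse_args())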
def train(args, cfg):
    # Load data
    train_loader, test_loader = build_data.build_data_loader(args, cfg)
    print("------------load dataset success !---------------")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # net = VGG16_Finetuning.VGG16_Finetuning(cfg.DATASETS.NUM_CLASSES).to(device)
    net = build_model(args, cfg, device)
    print("-------------load model success !----------------")

    # Choose the optimizer
    if "SGD" == cfg.TRAIN_SET.LR_POLOCY:
        optimizer = optim.SGD(net.parameters(), lr=cfg.TRAIN_SET.BASE_LR, momentum=0.9)
    if "Adam" == cfg.TRAIN_SET.LR_POLOCY:
        pass
        # optimizer = optim.Adam(net.parameters(), lr=cfg['lr'])

    schedule = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[50, 70, 100, 200], gamma=0.1)
    criterion = nn.CrossEntropyLoss().to(device)

    if not os.path.exists(cfg.WORK_SAVE.MODEL_PATH):
        os.mkdir(cfg.WORK_SAVE.MODEL_PATH)
    if not os.path.exists(cfg.WORK_SAVE.MODEL_PATH + "/" + cfg.MODEL.NAME):
        os.mkdir(cfg.WORK_SAVE.MODEL_PATH + "/" + cfg.MODEL.NAME)

    # Create the SummaryWriter
    writer = SummaryWriter(cfg.WORK_SAVE.MODEL_PATH + "/" + cfg.MODEL.NAME + "/logs")
    writer.add_graph(net, torch.rand([1, 3, cfg.MODEL.INPUT_SIZE, cfg.MODEL.INPUT_SIZE]).to(device))

    start_epoch = 0
    if args.RESUME:
        checkpoint = torch.load(args.path_checkpoint)        # checkpoint path
        net.load_state_dict(checkpoint['net'])               # restore model weights
        optimizer.load_state_dict(checkpoint['optimizer'])   # restore optimizer state
        start_epoch = checkpoint['epoch']                    # epoch to resume from
        schedule.load_state_dict(checkpoint['schedule'])     # restore LR scheduler state
    else:
        if cfg.MODEL.PRE_TRAIN:
            net.load_state_dict(torch.load(cfg.MODEL.PRETRAIN_MODEL))

    for epoch in range(start_epoch + 1, args.epoch):
        running_loss = 0.0
        total = 0.0
        correct = 0.0
        for i, data in enumerate(tqdm(train_loader)):
            img, label = data
            # Move inputs to the target device
            img = img.to(device)
            label = label.to(device)
            # Variable() has been a no-op since PyTorch 0.4 and could be dropped
            img, label = Variable(img), Variable(label)

            # Forward pass, loss computation and backprop
            optimizer.zero_grad()             # clear gradients
            outputs = net(img)                # inference
            loss = criterion(outputs, label)  # compute loss
            loss.backward()                   # backpropagate gradients
            optimizer.step()                  # update weights

            # Accumulate accuracy
            _, predicted = torch.max(outputs.data, 1)
            total += label.size(0)
            correct += torch.sum(predicted == label.data).cpu().numpy()
            running_loss += loss.item()

            if i % 100 == 99:
                loss_avg = running_loss / 100
                print("Time: {:.19s} Training:Epoch[{:0>3}/{:0>3}] Iteration[{:0>3}/{:0>3}] Loss:{:.4f} Acc:{}"
                      .format(str(datetime.now()), epoch, args.epoch, i + 1,
                              len(train_loader), loss_avg, correct / total))

                # Log weight histograms and the running loss
                for name, param in net.named_parameters():
                    if 'bn' not in name:
                        writer.add_histogram(name, param, 0)
                writer.add_scalar('loss', running_loss, i)
                running_loss = running_loss * 0.5

        # Feature-map and kernel visualization could be added here
        schedule.step()

        # Save the model
        if (epoch + 1) % 50 == 0:
            print("epoch: ", epoch)
            print('learning rate: ', optimizer.state_dict()['param_groups'][0]['lr'])
            checkpoint = {
                "net": net.state_dict(),
                "optimizer": optimizer.state_dict(),
                "epoch": epoch,
                "schedule": schedule.state_dict(),
            }
            torch.save(checkpoint, cfg.WORK_SAVE.MODEL_PATH + "/" + cfg.MODEL.NAME + '/' + str(epoch) + ".pth")
            # Evaluate the saved checkpoint
            test(cfg.WORK_SAVE.MODEL_PATH + "/" + cfg.MODEL.NAME + '/' + str(epoch) + ".pth",
                 cfg.DATASETS.VAL_TXT)
def main(args):
    # path setup
    training_results_path = os.path.join(args.results, args.tag)
    os.makedirs(training_results_path, exist_ok=True)

    # build dataloader
    train_data_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode="train",
        dataset_size=9469,
        batch_size=args.train_batch_size,
        image_size=args.image_size,
    )
    val_data_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode="val",
        dataset_size=3925,
        batch_size=args.val_batch_size,
        image_size=args.image_size,
    )

    # oneflow init
    start_t = time.time()
    model = build_model(args)
    if args.load_checkpoint != "":
        print("load_checkpoint >>>>>>>>> ", args.load_checkpoint)
        model.load_state_dict(flow.load(args.load_checkpoint))
    end_t = time.time()
    print("init time : {}".format(end_t - start_t))

    of_cross_entropy = flow.nn.CrossEntropyLoss()

    model.to("cuda")
    of_cross_entropy.to("cuda")

    of_sgd = flow.optim.SGD(
        model.parameters(), lr=args.learning_rate, momentum=args.mom
    )

    class ViTNetGraph(flow.nn.Graph):
        def __init__(self):
            super().__init__()
            self.model = model
            self.cross_entropy = of_cross_entropy
            self.add_optimizer(of_sgd)
            self.train_data_loader = train_data_loader

        def build(self):
            image, label = self.train_data_loader()
            image = image.to("cuda")
            label = label.to("cuda")
            logits = self.model(image)
            loss = self.cross_entropy(logits, label)
            loss.backward()
            return loss

    vit_graph = ViTNetGraph()

    class ViTEvalGraph(flow.nn.Graph):
        def __init__(self):
            super().__init__()
            self.model = model
            self.val_data_loader = val_data_loader

        def build(self):
            image, label = self.val_data_loader()
            image = image.to("cuda")
            with flow.no_grad():
                logits = self.model(image)
                predictions = logits.softmax()
            return predictions, label

    vit_eval_graph = ViTEvalGraph()

    of_losses = []
    of_accuracy = []
    all_samples = len(val_data_loader) * args.val_batch_size
    print_interval = 20

    for epoch in range(args.epochs):
        model.train()

        for b in range(len(train_data_loader)):
            # oneflow graph train
            start_t = time.time()
            loss = vit_graph()
            end_t = time.time()
            if b % print_interval == 0:
                l = loss.numpy()
                of_losses.append(l)
                print(
                    "epoch {} train iter {} oneflow loss {}, train time : {}".format(
                        epoch, b, l, end_t - start_t
                    )
                )

        print("epoch %d train done, start validation" % epoch)

        model.eval()
        correct_of = 0.0
        for b in range(len(val_data_loader)):
            start_t = time.time()
            predictions, label = vit_eval_graph()
            of_predictions = predictions.numpy()
            clsidxs = np.argmax(of_predictions, axis=1)
            label_nd = label.numpy()
            for i in range(args.val_batch_size):
                if clsidxs[i] == label_nd[i]:
                    correct_of += 1
            end_t = time.time()

        top1 = correct_of / all_samples
        of_accuracy.append(top1)
        print("epoch %d, oneflow top1 val acc: %f" % (epoch, top1))

        flow.save(
            model.state_dict(),
            os.path.join(
                args.save_checkpoint_path,
                "epoch_%d_val_acc_%f" % (epoch, correct_of / all_samples),
            ),
        )

    os.makedirs("graph", exist_ok=True)  # make sure the log directory exists before writing
    writer = open("graph/losses.txt", "w")
    for o in of_losses:
        writer.write("%f\n" % o)
    writer.close()

    writer = open("graph/accuracy.txt", "w")
    for o in of_accuracy:
        writer.write("%f\n" % o)
    writer.close()
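# A small follow-up sketch (not part of the original script): the loss and
# accuracy logs written above are plain one-value-per-line text files, so they
# can be plotted after training. matplotlib is an assumed extra dependency.
import matplotlib.pyplot as plt

with open("graph/losses.txt") as f:
    losses = [float(line) for line in f if line.strip()]
with open("graph/accuracy.txt") as f:
    accuracy = [float(line) for line in f if line.strip()]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(losses)
ax1.set_xlabel("logged iteration")
ax1.set_ylabel("train loss")
ax2.plot(accuracy)
ax2.set_xlabel("epoch")
ax2.set_ylabel("top-1 val acc")
fig.savefig("graph/curves.png")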
def train(cfg, output_dir=""):
    # logger = logging.getLogger("ModelZoo.trainer")
    # build model
    set_random_seed(cfg.RNG_SEED)
    model, loss_fn, metric_fn = build_model(cfg)
    logger.info("Build model:\n{}".format(str(model)))
    model = nn.DataParallel(model).cuda()

    # build optimizer
    optimizer = build_optimizer(cfg, model)

    # build lr scheduler
    scheduler = build_scheduler(cfg, optimizer)

    # build checkpointer
    checkpointer = Checkpointer(model,
                                optimizer=optimizer,
                                scheduler=scheduler,
                                save_dir=output_dir,
                                logger=logger)

    checkpoint_data = checkpointer.load(cfg.GLOBAL.TRAIN.WEIGHT, resume=cfg.AUTO_RESUME)
    ckpt_period = cfg.GLOBAL.TRAIN.CHECKPOINT_PERIOD

    # build data loader
    train_data_loader = build_data_loader(cfg, cfg.GLOBAL.DATASET, mode="train")
    val_period = cfg.GLOBAL.VAL.VAL_PERIOD
    # val_data_loader = build_data_loader(cfg, mode="val") if val_period > 0 else None

    # build tensorboard logger (optionally by comment)
    tensorboard_logger = TensorboardLogger(output_dir)

    # train
    max_epoch = cfg.GLOBAL.MAX_EPOCH
    start_epoch = checkpoint_data.get("epoch", 0)
    # best_metric_name = "best_{}".format(cfg.TRAIN.VAL_METRIC)
    # best_metric = checkpoint_data.get(best_metric_name, None)
    logger.info("Start training from epoch {}".format(start_epoch))
    for epoch in range(start_epoch, max_epoch):
        cur_epoch = epoch + 1
        scheduler.step()
        start_time = time.time()
        train_meters = train_model(
            model,
            loss_fn,
            metric_fn,
            data_loader=train_data_loader,
            optimizer=optimizer,
            curr_epoch=epoch,
            tensorboard_logger=tensorboard_logger,
            log_period=cfg.GLOBAL.TRAIN.LOG_PERIOD,
            output_dir=output_dir,
        )
        epoch_time = time.time() - start_time
        logger.info("Epoch[{}]-Train {} total_time: {:.2f}s".format(
            cur_epoch, train_meters.summary_str, epoch_time))

        # checkpoint
        if cur_epoch % ckpt_period == 0 or cur_epoch == max_epoch:
            checkpoint_data["epoch"] = cur_epoch
            # checkpoint_data[best_metric_name] = best_metric
            checkpointer.save("model_{:03d}".format(cur_epoch), **checkpoint_data)

        '''
        # validate
        if val_period < 1:
            continue
        if cur_epoch % val_period == 0 or cur_epoch == max_epoch:
            val_meters = validate_model(
                model,
                loss_fn,
                metric_fn,
                image_scales=cfg.MODEL.VAL.IMG_SCALES,
                inter_scales=cfg.MODEL.VAL.INTER_SCALES,
                isFlow=(cur_epoch > cfg.SCHEDULER.INIT_EPOCH),
                data_loader=val_data_loader,
                curr_epoch=epoch,
                tensorboard_logger=tensorboard_logger,
                log_period=cfg.TEST.LOG_PERIOD,
                output_dir=output_dir,
            )
            logger.info("Epoch[{}]-Val {}".format(cur_epoch, val_meters.summary_str))

            # best validation
            cur_metric = val_meters.meters[cfg.TRAIN.VAL_METRIC].global_avg
            if best_metric is None or cur_metric > best_metric:
                best_metric = cur_metric
                checkpoint_data["epoch"] = cur_epoch
                checkpoint_data[best_metric_name] = best_metric
                checkpointer.save("model_best", **checkpoint_data)
        '''

    logger.info("Train Finish!")
    # logger.info("Best val-{} = {}".format(cfg.TRAIN.VAL_METRIC, best_metric))
    return model
def main(args):
    train_data_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode="train",
        dataset_size=9469,
        batch_size=args.train_batch_size,
        image_size=args.image_size,
    )
    val_data_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode="val",
        dataset_size=3925,
        batch_size=args.val_batch_size,
        image_size=args.image_size,
    )

    # oneflow init
    start_t = time.time()
    model = build_model(args)
    if args.load_checkpoint != "":
        print("load_checkpoint >>>>>>>>> ", args.load_checkpoint)
        checkpoint = flow.load(args.load_checkpoint)
        model.load_state_dict(checkpoint)
    end_t = time.time()
    print("init time : {}".format(end_t - start_t))

    of_cross_entropy = flow.nn.CrossEntropyLoss()

    model.to("cuda")
    of_cross_entropy.to("cuda")

    of_sgd = flow.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.mom)

    of_losses = []
    all_samples = len(val_data_loader) * args.val_batch_size
    print_interval = 20

    for epoch in range(args.epochs):
        model.train()

        for b in range(len(train_data_loader)):
            image, label = train_data_loader()

            # oneflow train
            start_t = time.time()
            image = image.to("cuda")
            label = label.to("cuda")
            logits = model(image)
            loss = of_cross_entropy(logits, label)
            loss.backward()
            of_sgd.step()
            of_sgd.zero_grad()
            end_t = time.time()
            if b % print_interval == 0:
                l = loss.numpy()
                of_losses.append(l)
                print(
                    "epoch {} train iter {} oneflow loss {}, train time : {}".format(
                        epoch, b, l, end_t - start_t))

        print("epoch %d train done, start validation" % epoch)

        model.eval()
        correct_of = 0.0
        for b in range(len(val_data_loader)):
            image, label = val_data_loader()
            start_t = time.time()
            image = image.to("cuda")
            with flow.no_grad():
                logits = model(image)
                predictions = logits.softmax()
            of_predictions = predictions.numpy()
            clsidxs = np.argmax(of_predictions, axis=1)
            label_nd = label.numpy()
            for i in range(args.val_batch_size):
                if clsidxs[i] == label_nd[i]:
                    correct_of += 1
            end_t = time.time()

        print("epoch %d, oneflow top1 val acc: %f" % (epoch, correct_of / all_samples))
        flow.save(
            model.state_dict(),
            os.path.join(
                args.save_checkpoint_path,
                "epoch_%d_val_acc_%f" % (epoch, correct_of / all_samples),
            ),
        )

    writer = open("of_losses.txt", "w")
    for o in of_losses:
        writer.write("%f\n" % o)
    writer.close()
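# A possible command-line entry point for the eager-mode training script
# above, sketched from the attributes main() reads off `args`. Flag defaults
# are assumptions, and build_model() may expect additional model-specific
# fields that are not shown here.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="ViT OFRecord training (eager mode)")
    parser.add_argument("--ofrecord_path", type=str, required=True)
    parser.add_argument("--train_batch_size", type=int, default=32)
    parser.add_argument("--val_batch_size", type=int, default=32)
    parser.add_argument("--image_size", type=int, default=224)
    parser.add_argument("--learning_rate", type=float, default=0.01)
    parser.add_argument("--mom", type=float, default=0.9)
    parser.add_argument("--epochs", type=int, default=20)
    parser.add_argument("--load_checkpoint", type=str, default="")
    parser.add_argument("--save_checkpoint_path", type=str, default="./checkpoints")
    main(parser.parse_args())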