def main():
    """CLI entry point: parse args, configure logging, build/load the model,
    and dispatch to train() or test() on the LIAR dataset.
    """
    args = parse()
    # Logger name defaults to an ISO timestamp when not supplied.
    if args.logger_name is not None:
        logger_name = args.logger_name
    else:
        logger_name = datetime.datetime.now().isoformat()
    if not args.dont_log:
        # NOTE(review): return value discarded — this getLogger call has no
        # visible effect here; basicConfig below is what configures logging.
        logging.getLogger(logger_name)
        # Strip all non-digit characters so the log filename is digits-only
        # (e.g. timestamp "2021-01-02T03:04" -> "202101020304.log").
        logger_filename = re.subn('\\D+', '', logger_name)[0] + '.log'
        logging.basicConfig(filename=logger_filename, level=logging.DEBUG)
    # save_file=None disables checkpointing inside train().
    save_file = None if args.dont_save else args.save_file
    root = args.liar_dataset_dir
    tokenizer, model = pretrained(model=args.pretrained_model, weights=args.pretrained_weights, freeze=not args.dont_freeze)
    # Optionally resume weights from a previous run.
    if not args.dont_load_model_from_file and os.path.exists(args.save_file):
        model.load_state_dict(torch.load(args.save_file))
    device = torch.device("cuda:0" if torch.cuda.is_available() and not args.no_gpu else "cpu")
    print('using', device)
    model = model.to(device)
    # Wrap in DataParallel only when multiple GPUs are present and requested.
    if torch.cuda.device_count() > 1 and args.model_parallel:
        model = nn.DataParallel(model)
    if args.test:
        test(root, model, tokenizer, batch_size=args.batch_size, device=device)
    else:
        train(root, model, tokenizer, epochs=args.epochs, batch_size=args.batch_size, save_file=save_file, device=device)
def demo_test(args):
    """Run a single evaluation pass: build the model/criterion/dataset from
    ``args`` and delegate to test().
    """
    # Optionally overlay a YAML/doc config on top of the CLI args.
    if args.doc:
        args = config_loader(args.doc, args)
    # config
    # model_config(args, save=False)  # print model configuration of evaluation
    # set cuda
    torch.cuda.set_device(args.gpu_id)
    # model
    model = model_builder(args.model_name, args.scale, **args.model_args).cuda()
    # criterion
    criterion = criterion_builder(args.criterion)
    # dataset
    test_set = AxisDataSet(args.test_path, args.target_path)
    test_loader = DataLoader(test_set,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             # pin_memory=True,
                             pin_memory=False,
                             )
    # test
    test(model, test_loader, criterion, args)
def run_test(type):
    """Run the test routine, optionally restoring the 0810 checkpoint.

    type == 1 loads './weight_0810/model46301.ckpt' onto the first test GPU;
    any other value runs with no restored weights.
    """
    params = Params()
    # NOTE: './weight_one_hot/model3.ckpt' is best
    load_model = None
    if type == 1:
        # Map every storage in the checkpoint directly onto the test GPU.
        load_model = torch.load(
            './weight_0810/model46301.ckpt',
            map_location=lambda storage, loc: storage.cuda(params.gpu_ids_test[0]))
    test(0, params, load_model, None, None, None, evaluation=False)
def eval(*args, **kwargs):
    """Print an evaluation banner, run test(), then print a completion footer.

    All positional/keyword arguments are accepted but ignored.
    """
    banner = "=" * 80
    print(banner)
    print("Eval model on [weekly_data_all_rm_duplicate.txt]")
    print("Model to eval: best_model.keras")
    print(banner)
    print("\n")
    test()  # the actual evaluation is delegated entirely to test()
    print(banner)
    print("Finish Eval.")
    print("Check results in folder [weekly_result_nonredundant_sep_iedbid].")
    print(banner)
    print("\n")
def test_ckp(ckp_name, setting):
    """Evaluate checkpoint ``ckp_name`` on SYSU under ``setting`` (e.g.
    'all_single'), averaging mAP/rank curves over ``settings.test_times``
    randomized gallery draws.

    Returns [ckp_name, mAP%, R1%, R5%, R10%, R20%].
    """
    sess = Session()
    sess.load_checkpoints(ckp_name)
    search_mode = setting.split('_')[0]  # 'all' or 'indoor'
    search_setting = setting.split('_')[1]  # 'single' or 'multi'
    transform_test = settings.test_transforms_list
    # Accumulators for averaging across repeated random gallery samples.
    results_ranks = np.zeros(50)
    results_map = np.zeros(1)
    for i in range(settings.test_times):
        # use_random=True: each repetition draws a fresh random gallery split.
        eval_test = SYSU_eval_datasets(data_folder=settings.data_folder, data_split='test', search_mode=search_mode, search_setting=search_setting, use_random=True)
        test_queryloader = DataLoader(
            Image_dataset(eval_test.query, transform=transform_test),
            batch_size=settings.val_batch_size,
            shuffle=False,
            num_workers=0,
            drop_last=False,
        )
        test_galleryloader = DataLoader(
            Image_dataset(eval_test.gallery, transform=transform_test),
            batch_size=settings.val_batch_size,
            shuffle=False,
            num_workers=0,
            drop_last=False,
        )
        # Two branches: shared generator + modality-specific embedders
        # (RGB for queries, IR for gallery — TODO confirm ordering in test()).
        test_ranks, test_mAP = test([
            nn.Sequential(sess.feature_generator, sess.feature_embedder_rgb),
            nn.Sequential(sess.feature_generator, sess.feature_embedder_ir)
        ], test_queryloader, test_galleryloader)
        results_ranks += test_ranks
        results_map += test_mAP
        logger.info(
            'Test no.{} for model {} in setting {}, Test mAP: {}, R1: {}, R5: {}, R10: {}, R20: {}'
            .format(i, ckp_name, setting, test_mAP * 100.0,
                    test_ranks[0] * 100.0, test_ranks[4] * 100.0,
                    test_ranks[9] * 100.0, test_ranks[19] * 100.0))
    # Average over all repetitions (rebinds the per-run locals).
    test_mAP = results_map / settings.test_times
    test_ranks = results_ranks / settings.test_times
    logger.info(
        'For model {} in setting {}, AVG test mAP: {}, R1: {}, R5: {}, R10: {}, R20: {}'
        .format(ckp_name, setting, test_mAP * 100.0,
                test_ranks[0] * 100.0, test_ranks[4] * 100.0,
                test_ranks[9] * 100.0, test_ranks[19] * 100.0))
    return [
        ckp_name, test_mAP * 100.0, test_ranks[0] * 100.0,
        test_ranks[4] * 100.0, test_ranks[9] * 100.0, test_ranks[19] * 100.0
    ]
def main():
    """Single-fold evaluation entry point: seed RNGs, build the resnet model,
    optionally restore weights, and run test() once (no training loop here).
    """
    global opt
    train_dataset = mnist_Dataset(num_of_cross=0,cross=1)
    if opt.manualSeed is None:
        opt.manualSeed = random.randint(1, 10000)
    if torch.cuda.is_available() and not opt.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with \"cuda: True\"")
        torch.manual_seed(opt.manualSeed)
    else:
        if int(opt.ngpu) == 1:
            print('so we use 1 gpu to training')
            print('setting gpu on gpuid {0}'.format(opt.gpu_id))
        if opt.cuda:
            os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id
            torch.cuda.manual_seed(opt.manualSeed)
            cudnn.benchmark = True
    #loss_rec = np.load('acc_train.npy')
    #acc_rec = np.load('acc.npy')
    # NOTE(review): train_loader is built but never consumed in this function.
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=opt.batchSize, shuffle=True, num_workers=int(opt.workers))
    # create model
    model = mnist_model.cat_and_dog_resnet()
    if opt.init_model != '':
        print('loading pretrained model from {0}'.format(opt.init_model))
        model.load_state_dict(torch.load(opt.init_model))
    if opt.cuda:
        print('shift model and criterion to GPU .. ')
        model = model.cuda()
        # criterion = criterion.cuda()
    acc = test(model,opt,0,Training =False,cross=1)
def main():
    """K-fold training entry point: for each fold, seed RNGs, build the resnet
    model, train for ``opt.max_epochs`` epochs, and record test/train accuracy
    into acc.npy / acc_train.npy after every epoch.
    """
    global opt
    # Row per fold, column per epoch (+1 for the pre-training baseline).
    loss_rec = np.zeros((opt.folds, 100))
    acc_rec = np.zeros((opt.folds, 100))
    #loss_rec = np.load('acc_train.npy')
    #acc_rec = np.load('acc.npy')
    for iteration in range(opt.folds):
        train_dataset = mnist_Dataset(num_of_cross=iteration)
        print('number of train samples is: {0}'.format(len(train_dataset)))
        print('finished loading data')
        if opt.manualSeed is None:
            opt.manualSeed = random.randint(1, 10000)
        if torch.cuda.is_available() and not opt.cuda:
            print(
                "WARNING: You have a CUDA device, so you should probably run with \"cuda: True\""
            )
            torch.manual_seed(opt.manualSeed)
        else:
            if int(opt.ngpu) == 1:
                print('so we use 1 gpu to training')
                print('setting gpu on gpuid {0}'.format(opt.gpu_id))
            if opt.cuda:
                os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id
                torch.cuda.manual_seed(opt.manualSeed)
                cudnn.benchmark = True
        print('Random Seed: {0}'.format(opt.manualSeed))
        # train data loader
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=opt.batchSize,
                                                   shuffle=True,
                                                   num_workers=int(
                                                       opt.workers))
        # create model
        model = mnist_model.cat_and_dog_resnet()
        if opt.init_model != '':
            print('loading pretrained model from {0}'.format(opt.init_model))
            model.load_state_dict(torch.load(opt.init_model))
        # Contrastive Loss
        #criterion = mnist_model.StableBCELoss()
        criterion = nn.CrossEntropyLoss()
        if opt.cuda:
            print('shift model and criterion to GPU .. ')
            model = model.cuda()
            criterion = criterion.cuda()
        # optimizer
        # optimizer = optim.SGD(model.parameters(), lr=opt.lr,
        #                       momentum=opt.momentum,
        #                       weight_decay=opt.weight_decay)
        optimizer = optim.Adam(model.parameters(), lr=opt.lr)
        # optimizer = optim.SGD(model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay, momentum=opt.momentum)
        # optimizer = optim.Adadelta(params=model.parameters(), lr=opt.lr)
        # adjust learning rate every lr_decay_epoch
        lambda_lr = lambda epoch: opt.lr_decay**(
            (epoch + 1) // opt.lr_decay_epoch)  # poly policy
        scheduler = LR_Policy(optimizer, lambda_lr)
        resume_epoch = 0
        # Record pre-training accuracy as the epoch-0 baseline.
        acc = test(model, opt, iteration)
        acc_rec[iteration][0] = acc
        acc = test(model, opt, iteration, Training=True)
        loss_rec[iteration][0] = acc
        for epoch in range(resume_epoch, opt.max_epochs):
            #################################
            # train for one epoch
            #################################
            #accuracy = test(model, opt, epoch)
            train(train_loader, model, criterion, optimizer, iteration, opt, epoch)
            scheduler.step()
            ##################################
            # save checkpoints
            ##################################
            # Persist accuracy curves after every epoch so a crash loses nothing.
            accuracy = test(model, opt, iteration)
            acc_rec[iteration][epoch + 1] = accuracy
            np.save('acc.npy', acc_rec)
            accuracy = test(model, opt, iteration, Training=True)
            loss_rec[iteration][epoch + 1] = accuracy
            np.save('acc_train.npy', loss_rec)
            # save model every opt.epoch_save epochs
            if ((epoch + 1) % opt.epoch_save) == 0:
                path_checkpoint = '{0}/{1}_{3}_epoch{2}.pth'.format(
                    opt.checkpoint_folder, opt.prefix, epoch + 1, iteration)
                utils.save_checkpoint(model.state_dict(), path_checkpoint)
# FIX: `type=bool` on an argparse flag is a well-known footgun — bool("False")
# is True, so any non-empty value (including "--eval False") enabled eval.
# `action='store_true'` gives the intended presence/absence flag semantics.
parser.add_argument('--eval', action='store_true', default=False,
                    help='evaluate the model')
parser.add_argument('--mc_level', type=int, default=-1,
                    help='label level to use; -1 means all')
args = parser.parse_args()

# Merge every config file under ./configs (relative to this script).
abs_cfg_dir = os.path.abspath(os.path.join(__file__, "../configs"))
config.merge_cfg_from_dir(abs_cfg_dir)
cfg = config.CONFIG
# Hierarchy matrix describing the label tree used by train/test.
HM = read_h_matrix_file_list(cfg.DATASET.DATA.H_MATRIX_LIST_FILE)
_init_()
# Log file is prefixed "eval" only when evaluating.
name_dict = {True: "eval", False: ""}
io = IOStream('checkpoints/' + args.exp_name + '/{}run.log'.format(name_dict[args.eval]))
args.cuda = torch.cuda.is_available()
torch.manual_seed(cfg.DEVICES.SEED)
if args.cuda:
    # Pin to a single configured GPU when only one is listed.
    if len(cfg.DEVICES.GPU_ID) == 1:
        torch.cuda.set_device(cfg.DEVICES.GPU_ID[0])
    io.cprint(
        'Using GPU : ' + str(torch.cuda.current_device()) + ' from ' +
        str(torch.cuda.device_count()) + ' devices')
    torch.cuda.manual_seed(cfg.DEVICES.SEED)
else:
    io.cprint('Using CPU')
if not args.eval:
    train(args, io, cfg, HM)
else:
    test(args, io, cfg, HM)
def main(args):
    """NMT driver: build vocabularies, loaders, encoder/decoder, then either
    train (trainIters) or load checkpoints and dump BLEU + example
    translations for the dev and train sets.
    """
    # Attention decoder requires a bidirectional encoder.
    if args.decoder_type == "attn":
        args.use_bi = True
    # Beam-search decoding in test mode only supports batch size 1.
    if (args.test_only == True) and (args.decode_method == "beam"):
        args.batch_size = 1
    if args.self_attn == True:
        args.encoder_hidden_size = 300
        args.decoder_hidden_size = 300
    source_words_to_load = 1000000
    target_words_to_load = 1000000
    input_lang, output_lang, train_pairs, train_max_length = prepareData(
        "train", args.language, "en", args.data_path,
        max_len_ratio=args.max_len_ratio, char=args.char_chinese)
    input_lang_dev, output_lang_dev, dev_pairs, _ = prepareData(
        'dev', args.language, 'en', path=args.data_path, max_len_ratio=1,
        char=args.char_chinese)
    # _, _, test_pairs, _ = prepareData('test', args.language, 'en', path=args.data_path)
    if args.use_pretrain_emb:
        # Pick source embeddings by language (zh char-level / zh fasttext / vi fasttext).
        if args.language == "zh":
            if args.char_chinese:
                source_embedding, source_notPretrained = load_char_embd(
                    args.emb_path + "sgns.literature.char", input_lang,
                    reload=args.reload_emb)
            else:
                file_check(args.emb_path + 'chinese_ft_300.txt')
                source_embedding, source_notPretrained = load_fasttext_embd(
                    args.emb_path + 'chinese_ft_300.txt', input_lang,
                    input_lang, source_words_to_load, reload=args.reload_emb)
        else:
            file_check(args.emb_path + 'vietnamese_ft_300.txt')
            source_embedding, source_notPretrained = load_fasttext_embd(
                args.emb_path + 'vietnamese_ft_300.txt', input_lang,
                input_lang, source_words_to_load, reload=args.reload_emb)
        file_check(args.emb_path + 'english_ft_300.txt')
        target_embedding, target_notPretrained = load_fasttext_embd(
            args.emb_path + 'english_ft_300.txt', output_lang, input_lang,
            target_words_to_load, reload=args.reload_emb)
        if args.tune_pretrain_emb:
            # Mark every embedding row as trainable.
            source_notPretrained[:] = 1
            target_notPretrained[:] = 1
    else:
        source_embedding = source_notPretrained = target_embedding = target_notPretrained = None
        # 0000000000
        # target_embedding = target_notPretrained = None
    params = {
        'batch_size': args.batch_size,
        'shuffle': True,
        'collate_fn': vocab_collate_func,
        'num_workers': 20
    }
    params2 = {
        'batch_size': args.batch_size,
        'shuffle': False,
        'collate_fn': vocab_collate_func,
        'num_workers': 20
    }
    train_set, dev_set = Dataset(train_pairs, input_lang, output_lang), Dataset(dev_pairs, input_lang, output_lang_dev)
    train_loader = torch.utils.data.DataLoader(train_set, **params)
    dev_loader = torch.utils.data.DataLoader(dev_set, **params2)
    print(len(train_loader), len(dev_loader))
    if args.self_attn:
        encoder = Encoder_SelfAttn(input_lang.n_words, EMB_DIM, args.dim_ff,
                                   args.selfattn_en_num, args.decoder_layers,
                                   args.decoder_hidden_size, source_embedding,
                                   source_notPretrained, args.device,
                                   args.attn_head).to(args.device)
    else:
        encoder = EncoderRNN(input_lang.n_words, EMB_DIM,
                             args.encoder_hidden_size, args.encoder_layers,
                             args.decoder_layers, args.decoder_hidden_size,
                             source_embedding, source_notPretrained,
                             args.rnn_type, args.use_bi, args.device, False,
                             args.attn_head).to(args.device)
    if args.transformer:
        decoder = Decoder_SelfAttn(output_lang.n_words, EMB_DIM, args.dim_ff,
                                   args.selfattn_de_num, target_embedding,
                                   target_notPretrained, args.device,
                                   args.attn_head).to(args.device)
    elif args.decoder_type == "basic":
        decoder = DecoderRNN(output_lang.n_words, EMB_DIM,
                             args.decoder_hidden_size, args.decoder_layers,
                             target_embedding, target_notPretrained,
                             args.rnn_type,
                             dropout_p=args.decoder_emb_dropout,
                             device=args.device).to(args.device)
    elif args.decoder_type == "attn":
        decoder = DecoderRNN_Attention(output_lang.n_words, EMB_DIM,
                                       args.decoder_hidden_size,
                                       args.decoder_layers, target_embedding,
                                       target_notPretrained, args.rnn_type,
                                       dropout_p=args.decoder_emb_dropout,
                                       device=args.device,
                                       method=args.attn_method).to(args.device)
    else:
        raise ValueError
    print(encoder, decoder)
    if not args.test_only:
        trainIters(encoder, decoder, train_loader, dev_loader, \
                   input_lang, output_lang, input_lang_dev, output_lang_dev,
                   train_max_length, args.epoch,
                   plot_every=args.plot_every, print_every=args.print_every,
                   weight_decay=args.weight_decay,
                   learning_rate=args.learning_rate, device=args.device,
                   teacher_forcing_ratio=args.teacher_forcing_ratio,
                   label=args.save_model_name, use_lr_scheduler = True,
                   gamma_en = 0.99, gamma_de = 0.99,
                   beam_width=args.beam_width, min_len=args.min_len,
                   n_best=args.n_best, decode_method=args.decode_method,
                   save_result_path = args.save_result_path,
                   save_model=args.save_model)
    else:
        # Restore checkpoints onto CPU regardless of where they were saved.
        encoder.load_state_dict(
            torch.load('encoder' + "-" + args.save_model_name + '.ckpt',
                       map_location=lambda storage, location: storage))
        decoder.load_state_dict(
            torch.load('decoder' + "-" + args.save_model_name + '.ckpt',
                       map_location=lambda storage, location: storage))
        bleu_score, decoded_list, target_list, attn_weight = test(
            encoder, decoder, dev_loader, input_lang, output_lang, input_lang,
            output_lang_dev, args.beam_width, args.min_len, args.n_best,
            train_max_length, args.decode_method, args.device)
        print("dev bleu: ", bleu_score)
        # `i` walks through decoded_list/target_list across all batches.
        i = 0
        with open("results/dev_examples_{}.txt".format(args.save_result_label),
                  "w+") as f:
            f.write("bleu: {}\n".format(bleu_score))
            for (source, target, source_len, target_len) in (dev_loader):
                # NOTE(review): the comprehension's loop variable `i` shadows
                # the outer example counter — verify this is intentional.
                source_list = [[
                    input_lang.index2word[k.item()] for k in source[i]
                ][:source_len[i] - 1] for i in range(len(source))]
                for s in source_list:
                    f.write("S: {}\n".format(" ".join(s)))
                    # NOTE(review): "T:" is written from decoded_list and "H:"
                    # from target_list — labels look swapped; confirm intent.
                    f.write("T: {}\n".format(decoded_list[i]))
                    f.write("H: {}\n".format(target_list[i]))
                    f.write("\n")
                    i += 1
        # =====================================================
        bleu_score, decoded_list, target_list, attn_weight = test(
            encoder, decoder, train_loader, input_lang, output_lang,
            input_lang, output_lang, args.beam_width, args.min_len,
            args.n_best, train_max_length, args.decode_method, args.device)
        print("train bleu: ", bleu_score)
        i = 0
        with open(
                "results/train_examples_{}.txt".format(args.save_result_label),
                "w+") as f:
            f.write("bleu: {}\n".format(bleu_score))
            for (source, target, source_len, target_len) in (train_loader):
                source_list = [[
                    input_lang.index2word[k.item()] for k in source[i]
                ][:source_len[i] - 1] for i in range(len(source))]
                for s in source_list:
                    f.write("S: {}\n".format(" ".join(s)))
                    f.write("T: {}\n".format(decoded_list[i]))
                    f.write("H: {}\n".format(target_list[i]))
                    f.write("\n")
                    i += 1
    return 0
def train():
    """Domain-adaptation training loop: alternates source classification
    (C1), a linear target classifier (C2) on frozen features, and a
    neighborhood-clustering loss against the lemniscate memory bank.
    Uses module-level G, C1, C2, loaders, optimizers, and conf.
    """
    criterion = nn.CrossEntropyLoss().cuda()
    print('train start!')
    data_iter_s = iter(source_loader)
    data_iter_t = iter(target_loader)
    data_iter_t_l = iter(target_labeled_loader)
    len_train_source = len(source_loader)
    len_train_target = len(target_loader)
    len_train_target_l = len(target_labeled_loader)
    for step in range(conf.train.min_step + 1):
        G.train()
        C1.train()
        C2.train()
        # Restart any loader that has been fully consumed.
        if step % len_train_target == 0:
            data_iter_t = iter(target_loader)
        if step % len_train_target_l == 0:
            data_iter_t_l = iter(target_labeled_loader)
        if step % len_train_source == 0:
            data_iter_s = iter(source_loader)
        data_t = next(data_iter_t)
        data_t_l = next(data_iter_t_l)
        data_s = next(data_iter_s)
        # Inverse-decay LR schedule applied manually each step.
        inv_lr_scheduler(param_lr_g, opt_g, step,
                         init_lr=conf.train.lr,
                         max_iter=conf.train.min_step)
        inv_lr_scheduler(param_lr_f, opt_c1, step,
                         init_lr=conf.train.lr,
                         max_iter=conf.train.min_step)
        img_s = data_s[0]
        label_s = data_s[1]
        img_t = data_t[0]
        index_t = data_t[2]  # dataset indices, used to address the memory bank
        img_s, label_s = Variable(img_s.cuda()), \
                         Variable(label_s.cuda())
        img_t = Variable(img_t.cuda())
        index_t = Variable(index_t.cuda())
        img_t_l = data_t_l[0].cuda()
        label_t_l = data_t_l[1].cuda()
        # Stop when a loader yields a short (final) batch.
        if len(img_t) < batch_size:
            break
        if len(img_s) < batch_size:
            break
        opt_g.zero_grad()
        opt_c1.zero_grad()
        ## Weight normalization
        C1.module.weight_norm()
        ## Source loss calculation
        feat = G(img_s)
        out_s = C1(feat)
        loss_s = criterion(out_s, label_s)
        #loss_s += criterion(C2(feat.detach()), label_s)
        feat_t = G(img_t)
        out_t = C1(feat_t)
        feat_t = F.normalize(feat_t)
        ## Train a linear classifier on top of feature extractor.
        ## We should not update feature extractor.
        G.eval()
        feat_t_l = G(img_t_l)
        G.train()
        out_t_l = C2(feat_t_l.detach())  # detach: no gradient into G
        loss_t_l = criterion(out_t_l, label_t_l)
        ### Calculate mini-batch x memory similarity
        feat_mat = lemniscate(feat_t, index_t)
        ### We do not use memory features present in mini-batch
        feat_mat[:, index_t] = -1 / conf.model.temp
        ### Calculate mini-batch x mini-batch similarity
        feat_mat2 = torch.matmul(feat_t, feat_t.t()) / conf.model.temp
        # Mask self-similarity on the diagonal.
        mask = torch.eye(feat_mat2.size(0),
                         feat_mat2.size(0)).bool().cuda()
        feat_mat2.masked_fill_(mask, -1 / conf.model.temp)
        loss_nc = conf.train.eta * entropy(torch.cat([feat_mat, feat_mat2], 1))
        # NOTE(review): loss_ent is computed but never added to the total loss.
        loss_ent = conf.train.eta * entropy_margin(out_t, conf.train.thr,
                                                   conf.train.margin)
        # `all` shadows the builtin; total loss for the AMP backward pass.
        all = loss_nc + loss_s + loss_t_l
        with amp.scale_loss(all, [opt_g, opt_c1]) as scaled_loss:
            scaled_loss.backward()
        opt_g.step()
        opt_c1.step()
        opt_g.zero_grad()
        opt_c1.zero_grad()
        # Push the new target features into the memory bank.
        lemniscate.update_weight(feat_t, index_t)
        if step % conf.train.log_interval == 0:
            print('Train [{}/{} ({:.2f}%)]\tLoss Source: {:.6f} '
                  'Loss NC: {:.6f} Loss LT: {:.6f}\t'.format(
                      step, conf.train.min_step,
                      100 * float(step / conf.train.min_step),
                      loss_s.item(), loss_nc.item(), loss_t_l.item()))
        if step > 0 and step % conf.test.test_interval == 0:
            test(step, dataset_test, filename, n_share, num_class, G, C1,
                 conf.train.thr)
            test_class_inc(step, dataset_test, filename, n_target, G, C2,
                           n_share)
            # Restore training mode after evaluation.
            G.train()
            C1.train()
            C2.train()
# lr = 0.0001, betas=(0.5, 0.999)) ############################# Hyper-parameters ################################ alpha = 1.0 beta = 1.0 gamma = 0.05 K = 5 nu = 1 ################################ Train ###################################### # Loss plot logger = Logger(2000, len(train_dataloader)) test_ranks, test_mAP = test(feature_generator, queryloader, galleryloader) train_ranks, train_mAP = test(feature_generator, queryloader_train, galleryloader_train) for epoch in range(0, 2000): print("Epoch ---------------", epoch + 1) for i, batch in enumerate(train_dataloader): #print("Batch number ",i) anchor_rgb, positive_rgb, negative_rgb, anchor_ir, positive_ir, \ negative_ir, anchor_label, modality_rgb, modality_ir = batch if torch.cuda.is_available(): anchor_rgb = anchor_rgb.cuda()
def train(config):
    """Conditional-GAN training: 90/10 split of the dataset, alternate
    discriminator/generator softplus-GAN updates plus L1 reconstruction,
    with periodic test evaluation and checkpointing.
    """
    gpu_manage(config)
    ### DATASET LOAD ###
    print('===> Loading datasets')
    dataset = Dataset(config)
    # 90% train / 10% held-out test split.
    train_size = int(0.9 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [train_size, test_size])
    training_data_loader = DataLoader(dataset=train_dataset,
                                      num_workers=config.threads,
                                      batch_size=config.batchsize,
                                      shuffle=True)
    test_data_loader = DataLoader(dataset=test_dataset,
                                  num_workers=config.threads,
                                  batch_size=config.test_batchsize,
                                  shuffle=False)
    ### MODELS LOAD ###
    print('===> Loading models')
    # NOTE(review): if gen_model is not 'unet' this only prints a message and
    # `gen` stays unbound — the next use would raise NameError.
    if config.gen_model == 'unet':
        gen = UNet(in_ch=config.in_ch, out_ch=config.out_ch,
                   gpu_ids=config.gpu_ids)
    else:
        print('The generator model does not exist')
    if config.gen_init is not None:
        param = torch.load(config.gen_init)
        gen.load_state_dict(param)
        print('load {} as pretrained model'.format(config.gen_init))
    dis = Discriminator(in_ch=config.in_ch, out_ch=config.out_ch,
                        gpu_ids=config.gpu_ids)
    if config.dis_init is not None:
        param = torch.load(config.dis_init)
        dis.load_state_dict(param)
        print('load {} as pretrained model'.format(config.dis_init))
    # setup optimizer
    opt_gen = optim.Adam(gen.parameters(), lr=config.lr,
                         betas=(config.beta1, 0.999), weight_decay=0.00001)
    opt_dis = optim.Adam(dis.parameters(), lr=config.lr,
                         betas=(config.beta1, 0.999), weight_decay=0.00001)
    # Reusable input/target buffers, resized per batch below.
    real_a = torch.FloatTensor(config.batchsize, config.in_ch, 256, 256)
    real_b = torch.FloatTensor(config.batchsize, config.out_ch, 256, 256)
    criterionL1 = nn.L1Loss()
    criterionMSE = nn.MSELoss()
    criterionSoftplus = nn.Softplus()
    if config.cuda:
        gen = gen.cuda(0)
        dis = dis.cuda(0)
        criterionL1 = criterionL1.cuda(0)
        criterionMSE = criterionMSE.cuda(0)
        criterionSoftplus = criterionSoftplus.cuda(0)
        real_a = real_a.cuda(0)
        real_b = real_b.cuda(0)
    real_a = Variable(real_a)
    real_b = Variable(real_b)
    logreport = LogReport(log_dir=config.out_dir)
    testreport = TestReport(log_dir=config.out_dir)
    # main
    for epoch in range(1, config.epoch + 1):
        for iteration, batch in enumerate(training_data_loader, 1):
            real_a_cpu, real_b_cpu = batch[0], batch[1]
            # Copy CPU batch into the (possibly GPU-resident) buffers.
            real_a.data.resize_(real_a_cpu.size()).copy_(real_a_cpu)
            real_b.data.resize_(real_b_cpu.size()).copy_(real_b_cpu)
            fake_b = gen.forward(real_a)
            ################
            ### Update D ###
            ################
            opt_dis.zero_grad()
            # train with fake (detach: no gradient into the generator)
            fake_ab = torch.cat((real_a, fake_b), 1)
            pred_fake = dis.forward(fake_ab.detach())
            batchsize, _, w, h = pred_fake.size()
            loss_d_fake = torch.sum(
                criterionSoftplus(pred_fake)) / batchsize / w / h
            # train with real
            real_ab = torch.cat((real_a, real_b), 1)
            pred_real = dis.forward(real_ab)
            loss_d_real = torch.sum(
                criterionSoftplus(-pred_real)) / batchsize / w / h
            # Combined loss
            loss_d = loss_d_fake + loss_d_real
            loss_d.backward()
            # D is stepped only every `minimax`-th epoch to slow it down.
            if epoch % config.minimax == 0:
                opt_dis.step()
            ################
            ### Update G ###
            ################
            opt_gen.zero_grad()
            # First, G(A) should fake the discriminator
            fake_ab = torch.cat((real_a, fake_b), 1)
            pred_fake = dis.forward(fake_ab)
            loss_g_gan = torch.sum(
                criterionSoftplus(-pred_fake)) / batchsize / w / h
            # Second, G(A) = B
            loss_g_l1 = criterionL1(fake_b, real_b) * config.lamb
            loss_g = loss_g_gan + loss_g_l1
            loss_g.backward()
            opt_gen.step()
            # log
            if iteration % 100 == 0:
                print(
                    "===> Epoch[{}]({}/{}): loss_d_fake: {:.4f} loss_d_real: {:.4f} loss_g_gan: {:.4f} loss_g_l1: {:.4f}"
                    .format(epoch, iteration, len(training_data_loader),
                            loss_d_fake.item(), loss_d_real.item(),
                            loss_g_gan.item(), loss_g_l1.item()))
            log = {}
            log['epoch'] = epoch
            log['iteration'] = len(training_data_loader) * (epoch - 1) + iteration
            log['gen/loss'] = loss_g.item()
            log['dis/loss'] = loss_d.item()
            logreport(log)
        # Per-epoch evaluation on the held-out split (no gradients).
        with torch.no_grad():
            log_test = test(config, test_data_loader, gen, criterionMSE, epoch)
            testreport(log_test)
        if epoch % config.snapshot_interval == 0:
            checkpoint(config, epoch, gen, dis)
        logreport.save_lossgraph()
        testreport.save_lossgraph()
def cluster(approach, datapath):
    """Run a clustering approach on unlabeled data and echo the report path."""
    approach_params = params[approach]
    report_path = test(datapath, approach, approach_params)
    message = 'Report compiled at {0}.'.format(report_path)
    c.echo(message)
def train(g, d, train_loader, neg_loader, epoches, g_optim, d_optim, neg_lens):
    """Adversarially train generator ``g`` against discriminator ``d``.

    Each epoch first updates the discriminator on (real, generated, negative)
    attribute/user-embedding pairs, then updates the generator, and finally
    evaluates generated users on the held-out test attributes.

    Relies on module-level ``device``, ``batch_size``, ``data_loader``,
    ``eval`` and ``tqdm``.
    """
    g = g.to(device)
    d = d.to(device)
    time.sleep(0.1)  # let pending tqdm/print output flush before/after the banner
    print("start training on {}".format(device))
    time.sleep(0.1)
    bce_loss = torch.nn.BCELoss()
    for e in tqdm(range(epoches)):
        start_time = time.time()
        idx = 0
        d_loss = 0.0
        neg_iter = iter(neg_loader)  # idiomatic replacement for neg_loader.__iter__()
        # ---- train discriminator d ----
        for _, _, real_attr, real_user_emb in train_loader:
            # Stop once we have consumed all available negative samples.
            if idx > neg_lens:
                break
            # BUG FIX: Python 3 iterators have no .next() method — the original
            # `neg_iter.next()` raises AttributeError. Use the builtin next().
            _, _, neg_attr, neg_user_emb = next(neg_iter)
            # positive attributes and user embeddings
            real_attr = real_attr.to(device)
            real_user_emb = real_user_emb.to(device)
            # negative attributes and user embeddings
            neg_attr = neg_attr.to(device)
            neg_user_emb = neg_user_emb.to(device)
            # generator produces a fake user embedding from real attributes
            fake_user_emb = g(real_attr)
            fake_user_emb = fake_user_emb.to(device)
            # discriminator scores real / fake / negative pairs
            d_real, d_logit_real = d(real_attr, real_user_emb)
            d_fake, d_logit_fake = d(real_attr, fake_user_emb)
            d_neg, d_logit_neg = d(neg_attr, neg_user_emb)
            # discriminator loss: real -> 1, fake -> 0, negative -> 0
            d_optim.zero_grad()
            d_loss_real = bce_loss(d_real, torch.ones_like(d_real))
            d_loss_fake = bce_loss(d_fake, torch.zeros_like(d_fake))
            d_loss_neg = bce_loss(d_neg, torch.zeros_like(d_neg))
            d_loss = torch.mean(d_loss_real + d_loss_fake + d_loss_neg)
            d_loss.backward()
            d_optim.step()
            idx += batch_size
        # ---- train generator g ----
        g_loss = 0.0
        for uid, mid, attr, user_emb in train_loader:
            g_optim.zero_grad()
            attr = attr.to(device)
            # generate a fake user embedding
            fake_user_emb = g(attr)
            fake_user_emb = fake_user_emb.to(device)
            # generator wants the discriminator to label its output as real
            d_fake, d_logit_fake = d(attr, fake_user_emb)
            g_loss = bce_loss(d_fake, torch.ones_like(d_fake))
            g_loss.backward()
            g_optim.step()
        end_time = time.time()
        print("\nepoch:{}: time:{:.2f}, d_loss:{:.3f}, g_loss:{:.3f}".format(
            e + 1, end_time - start_time, d_loss, g_loss))
        # evaluate generated users for the held-out test attributes
        test_item, test_attribute = data_loader.load_test_data()
        test_item = torch.tensor(test_item).to(device)
        test_attribute = torch.tensor(test_attribute, dtype=torch.long).to(device)
        fake_user = g(test_attribute)
        eval.test(fake_user.cpu().detach().numpy())
        time.sleep(0.1)
# Target perplexity threshold below which a trained model is kept.
target = parser.parse_args().target
if __name__ == '__main__':
    config = Config('config.yaml')
    if not os.path.exists(config.checkpoint_dir):
        os.makedirs(config.checkpoint_dir)
    word2idx, train_data, valid_data, test_data = load_data(config)
    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    config.nwords = len(word2idx)
    # NOTE(review): "vacab" typo lives in a runtime string; left untouched.
    print("vacab size is %d" % config.nwords)
    # Retrain from scratch with fresh random seeds until a run beats the
    # target test perplexity, then save that model and stop.
    while True:
        random.seed(time.time())
        config.srand = random.randint(0, 100000)
        # Seed numpy, python and paddle identically for reproducibility.
        np.random.seed(config.srand)
        random.seed(config.srand)
        paddle.seed(config.srand)
        model = MemN2N(config)
        train(model, train_data, valid_data, config)
        test_ppl = test(model, test_data, config)
        if test_ppl < target:
            model_path = os.path.join(
                config.checkpoint_dir,
                config.model_name + "_" + str(config.srand) + "_good")
            paddle.save(model.state_dict(), model_path)
            break
def run_train_val(ckp_name='ckp_latest'):
    """Main SYSU train loop: resume from ``ckp_name``, iterate triplet
    batches until the last scheduled step, periodically validating and
    checkpointing through the Session object.
    """
    sess = Session()
    sess.load_checkpoints(ckp_name)
    sess.tensorboard('train_stats')
    sess.tensorboard('val_stats')
    ######################## Get Datasets & Dataloaders ###########################
    train_dataset = SYSU_triplet_dataset(
        data_folder=settings.data_folder,
        transforms_list=settings.transforms_list)

    def get_train_dataloader():
        # Fresh iterator over a newly-shuffled triplet loader.
        return iter(
            DataLoader(SYSU_triplet_dataset(
                data_folder=settings.data_folder,
                transforms_list=settings.transforms_list),
                       batch_size=settings.train_batch_size,
                       shuffle=True,
                       num_workers=settings.num_workers,
                       drop_last=True))

    train_dataloader = get_train_dataloader()
    eval_val = SYSU_eval_datasets(data_folder=settings.data_folder,
                                  data_split='val')
    transform_test = settings.test_transforms_list
    val_queryloader = DataLoader(
        Image_dataset(eval_val.query, transform=transform_test),
        batch_size=settings.val_batch_size,
        shuffle=False,
        num_workers=0,
        drop_last=False,
    )
    val_galleryloader = DataLoader(
        Image_dataset(eval_val.gallery, transform=transform_test),
        batch_size=settings.val_batch_size,
        shuffle=False,
        num_workers=0,
        drop_last=False,
    )
    while sess.step < settings.iter_sche[-1]:
        sess.sche_G.step()
        sess.feature_generator.train()
        sess.feature_embedder_rgb.train()
        sess.feature_embedder_ir.train()
        sess.id_classifier.train()
        try:
            batch_t = next(train_dataloader)
        except StopIteration:
            # Loader exhausted: start a new pass (epoch) over the data.
            train_dataloader = get_train_dataloader()
            batch_t = next(train_dataloader)
            sess.epoch_count += 1
        sess.inf_batch(batch_t)
        if sess.step % int(settings.latest_steps) == 0:
            # Keep a backup in case the latest checkpoint write is interrupted.
            sess.save_checkpoints('ckp_latest')
            sess.save_checkpoints('ckp_latest_backup')
        if sess.step % settings.val_step == 0:
            # Switch to eval mode for validation retrieval metrics.
            sess.feature_generator.eval()
            sess.feature_embedder_rgb.eval()
            sess.feature_embedder_ir.eval()
            sess.id_classifier.eval()
            test_ranks, test_mAP = test([
                nn.Sequential(sess.feature_generator,
                              sess.feature_embedder_rgb),
                nn.Sequential(sess.feature_generator,
                              sess.feature_embedder_ir)
            ], val_queryloader, val_galleryloader)
            sess.write('val_stats',
                       {'test_mAP_percentage': test_mAP*100.0, \
                        'test_rank-1_accuracy_percentage':test_ranks[0]*100.0,\
                        'test_rank-5_accuracy_percentage':test_ranks[4]*100.0,\
                        'test_rank-10_accuracy_percentage':test_ranks[9]*100.0,\
                        'test_rank-20_accuracy_percentage':test_ranks[19]*100.0
                        })
        if sess.step % sess.save_steps == 0:
            sess.save_checkpoints('ckp_step_%d' % sess.step)
            logger.info('save model as ckp_step_%d' % sess.step)
        sess.step += 1
def train(hyp, opt, device, tb_writer=None):
    """Train a YOLO-style detector (single-GPU, DP, or DDP).

    Parameters
    ----------
    hyp : dict
        Hyperparameters (lr0, lrf, momentum, weight_decay, anchors, ...).
    opt : argparse.Namespace
        Run options (epochs, batch sizes, weights path, cfg, global_rank, ...).
    device : torch.device
        Target device; ``device.type != 'cpu'`` enables AMP/DP/DDP paths.
    tb_writer : SummaryWriter, optional
        TensorBoard writer; also determines the logging directory.

    Returns
    -------
    tuple
        Last evaluation results (P, R, mAP, F1, val GIoU/obj/cls losses).
    """
    logger.info(f'Hyperparameters {hyp}')
    log_dir = Path(tb_writer.log_dir) if tb_writer else Path(
        opt.logdir) / 'evolve'  # logging directory
    wdir = log_dir / 'weights'  # weights directory
    os.makedirs(wdir, exist_ok=True)
    last = wdir / 'last.pt'
    best = wdir / 'best.pt'
    results_file = str(log_dir / 'results.txt')
    epochs, batch_size, total_batch_size, weights, rank = \
        opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank

    # Save run settings
    with open(log_dir / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(log_dir / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    cuda = device.type != 'cpu'
    init_seeds(2 + rank)  # rank-dependent seed so DDP workers differ
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # data dict
    with torch_distributed_zero_first(rank):
        check_dataset(data_dict)  # check
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc, names = (1, ['item']) if opt.single_cls else (int(
        data_dict['nc']), data_dict['names'])  # number classes, names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (
        len(names), nc, opt.data)  # check

    # Model
    pretrained = weights.endswith('.pt')
    if pretrained:
        with torch_distributed_zero_first(rank):
            attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        if hyp.get('anchors'):
            ckpt['model'].yaml['anchors'] = round(
                hyp['anchors'])  # force autoanchor
        model = Model(opt.cfg or ckpt['model'].yaml, ch=3,
                      nc=nc).to(device)  # create
        exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else [
        ]  # exclude keys
        state_dict = ckpt['model'].float().state_dict()  # to FP32
        # Keep only keys whose shapes match the freshly-built model.
        state_dict = intersect_dicts(state_dict,
                                     model.state_dict(),
                                     exclude=exclude)  # intersect
        model.load_state_dict(state_dict, strict=False)  # load
        logger.info(
            'Transferred %g/%g items from %s' %
            (len(state_dict), len(model.state_dict()), weights))  # report
    else:
        model = Model(opt.cfg, ch=3, nc=nc).to(device)  # create

    # Freeze
    freeze = ['model.%s.' % x
              for x in range(5)]  # parameter names to freeze (full or partial)
    if any(freeze):
        for k, v in model.named_parameters():
            if any(x in k for x in freeze):
                print('freezing %s' % k)
                v.requires_grad = False

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / total_batch_size),
                     1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs  # scale weight_decay
    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_parameters():
        # NOTE(review): this re-enables grad on everything, undoing the
        # freeze loop above — confirm whether freezing is meant to stick.
        v.requires_grad = True
        if '.bias' in k:
            pg2.append(v)  # biases
        elif '.weight' in k and '.bn' not in k:
            pg1.append(v)  # apply weight decay
        else:
            pg0.append(v)  # all else

    if opt.adam:
        optimizer = optim.Adam(pg0,
                               lr=hyp['lr0'],
                               betas=(hyp['momentum'],
                                      0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0,
                              lr=hyp['lr0'],
                              momentum=hyp['momentum'],
                              nesterov=True)
    optimizer.add_param_group({
        'params': pg1,
        'weight_decay': hyp['weight_decay']
    })  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' %
                (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
    lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp[
        'lrf']) + hyp['lrf']  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # Results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # Epochs
        start_epoch = ckpt['epoch'] + 1
        if opt.resume:
            assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (
                weights, epochs)
            shutil.copytree(wdir, wdir.parent /
                            f'weights_backup_epoch{start_epoch - 1}'
                            )  # save previous weights
        if epochs < start_epoch:
            logger.info(
                '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.'
                % (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, state_dict

    # Image sizes
    gs = int(max(model.stride))  # grid size (max stride)
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size
                         ]  # verify imgsz are gs-multiples

    # DP mode
    if cuda and rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        logger.info('Using SyncBatchNorm()')

    # Exponential moving average
    ema = ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if cuda and rank != -1:
        model = DDP(model,
                    device_ids=[opt.local_rank],
                    output_device=opt.local_rank)

    # Trainloader
    dataloader, dataset = create_dataloader(train_path,
                                            imgsz,
                                            batch_size,
                                            gs,
                                            opt,
                                            hyp=hyp,
                                            augment=True,
                                            cache=opt.cache_images,
                                            rect=opt.rect,
                                            rank=rank,
                                            world_size=opt.world_size,
                                            workers=opt.workers)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (
        mlc, nc, opt.data, nc - 1)

    # Process 0
    if rank in [-1, 0]:
        ema.updates = start_epoch * nb // accumulate  # set EMA updates
        testloader = create_dataloader(test_path,
                                       imgsz_test,
                                       total_batch_size,
                                       gs,
                                       opt,
                                       hyp=hyp,
                                       augment=False,
                                       cache=opt.cache_images,
                                       rect=True,
                                       rank=-1,
                                       world_size=opt.world_size,
                                       workers=opt.workers)[0]  # testloader

        if not opt.resume:
            labels = np.concatenate(dataset.labels, 0)
            c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            plot_labels(labels, save_dir=log_dir)
            if tb_writer:
                # tb_writer.add_hparams(hyp, {})  # causes duplicate https://github.com/ultralytics/yolov5/pull/384
                tb_writer.add_histogram('classes', c, 0)

            # Anchors
            if not opt.noautoanchor:
                check_anchors(dataset,
                              model=model,
                              thr=hyp['anchor_t'],
                              imgsz=imgsz)

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(
        device)  # attach class weights
    model.names = names

    # Start training
    t0 = time.time()
    nw = max(3 * nb,
             1e3)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    maps = np.zeros(nc)  # mAP per class
    results = (
        0, 0, 0, 0, 0, 0, 0
    )  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)
    logger.info(
        'Image sizes %g train, %g test\nUsing %g dataloader workers\nLogging results to %s\n'
        'Starting training for %g epochs...' %
        (imgsz, imgsz_test, dataloader.num_workers, log_dir, epochs))
    for epoch in range(
            start_epoch, epochs
    ):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if opt.image_weights:
            # Generate indices
            if rank in [-1, 0]:
                cw = model.class_weights.cpu().numpy() * (
                    1 - maps)**2  # class weights
                iw = labels_to_image_weights(dataset.labels,
                                             nc=nc,
                                             class_weights=cw)  # image weights
                dataset.indices = random.choices(
                    range(dataset.n), weights=iw,
                    k=dataset.n)  # rand weighted idx
            # Broadcast if DDP
            if rank != -1:
                indices = (torch.tensor(dataset.indices)
                           if rank == 0 else torch.zeros(dataset.n)).int()
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        logger.info(
            ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls',
                                   'total', 'targets', 'img_size'))
        if rank in [-1, 0]:
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (
                imgs, targets, paths, _
        ) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float(
            ) / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(
                    1,
                    np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(
                        ni, xi,
                        [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi,
                                                  [0.9, hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.8,
                                      imgsz * 1.2 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]
                          ]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs,
                                         size=ns,
                                         mode='bilinear',
                                         align_corners=False)

            # Forward
            with amp.autocast(enabled=cuda):
                pred = model(imgs)  # forward
                loss, loss_items = compute_loss(
                    pred, targets.to(device),
                    model)  # loss scaled by batch_size
                if rank != -1:
                    loss *= opt.world_size  # gradient averaged between devices in DDP mode

            # Backward
            scaler.scale(loss).backward()

            # Optimize
            if ni % accumulate == 0:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1
                                                    )  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9
                                 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' %
                                                   (epoch, epochs - 1), mem,
                                                   *mloss, targets.shape[0],
                                                   imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if ni < 3:
                    f = str(log_dir / ('train_batch%g.jpg' % ni))  # filename
                    result = plot_images(images=imgs,
                                         targets=targets,
                                         paths=paths,
                                         fname=f)
                    if tb_writer and result is not None:
                        tb_writer.add_image(f,
                                            result,
                                            dataformats='HWC',
                                            global_step=epoch)
                        # tb_writer.add_graph(model, imgs)  # add model to tensorboard

            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for tensorboard
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema:
                ema.update_attr(
                    model,
                    include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                if final_epoch:  # replot predictions
                    [
                        os.remove(x) for x in glob.glob(
                            str(log_dir / 'test_batch*_pred.jpg'))
                        if os.path.exists(x)
                    ]
                results, maps, times = eval.test(opt.data,
                                                 batch_size=total_batch_size,
                                                 imgsz=imgsz_test,
                                                 model=ema.ema,
                                                 single_cls=opt.single_cls,
                                                 dataloader=testloader,
                                                 save_dir=log_dir)

            # Write
            with open(results_file, 'a') as f:
                f.write(s + '%10.4g' * 7 % results +
                        '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system('gsutil cp %s gs://%s/results/results%s.txt' %
                          (results_file, opt.bucket, opt.name))

            # Tensorboard
            if tb_writer:
                tags = [
                    'train/giou_loss', 'train/obj_loss',
                    'train/cls_loss',  # train loss
                    'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5',
                    'metrics/mAP_0.5:0.95',
                    'val/giou_loss', 'val/obj_loss',
                    'val/cls_loss',  # val loss
                    'x/lr0', 'x/lr1', 'x/lr2'
                ]  # params
                for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
                    tb_writer.add_scalar(tag, x, epoch)

            # Update best mAP
            fi = fitness(np.array(results).reshape(
                1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {
                        'epoch': epoch,
                        'best_fitness': best_fitness,
                        'training_results': f.read(),
                        'model': ema.ema,
                        'optimizer':
                        None if final_epoch else optimizer.state_dict()
                    }

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        n = opt.name if opt.name.isnumeric() else ''
        fresults, flast, fbest = log_dir / f'results{n}.txt', wdir / f'last{n}.pt', wdir / f'best{n}.pt'
        for f1, f2 in zip([wdir / 'last.pt', wdir / 'best.pt', results_file],
                          [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                if str(f2).endswith('.pt'):  # is *.pt
                    strip_optimizer(f2)  # strip optimizer
                    os.system(
                        'gsutil cp %s gs://%s/weights' %
                        (f2, opt.bucket)) if opt.bucket else None  # upload

        # Finish
        if not opt.evolve:
            plot_results(save_dir=log_dir)  # save as results.png
        logger.info('%g epochs completed in %.3f hours.\n' %
                    (epoch - start_epoch + 1, (time.time() - t0) / 3600))

    dist.destroy_process_group() if rank not in [-1, 0] else None
    torch.cuda.empty_cache()
    return results
def main():
    """Parse CLI options, build the MNIST loaders, train and evaluate.

    Fixes relative to the original:
    - ``--test-batch-size`` was parsed but never used (the test loader reused
      the training batch size); the test loader now honours it.
    - the test loader was shuffled whenever CUDA was available; evaluation
      data is now never shuffled, and training data is always shuffled
      (previously only on CUDA).
    """
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument(
        "--batch-size",
        type=int,
        default=1,
        metavar="N",
        help="input batch size for training (default: 1)",
    )
    parser.add_argument(
        "--test-batch-size",
        type=int,
        default=1,
        metavar="N",
        help="input batch size for testing (default: 1)",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=3,
        metavar="N",
        help="number of epochs to train (default: 14)",
    )
    parser.add_argument(
        "--lr",
        type=float,
        default=1.0,
        metavar="LR",
        help="learning rate (default: 1.0)",
    )
    parser.add_argument(
        "--gamma",
        type=float,
        default=0.7,
        metavar="M",
        help="Learning rate step gamma (default: 0.7)",
    )
    parser.add_argument("--no-cuda",
                        action="store_true",
                        default=False,
                        help="disables CUDA training")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        default=False,
        help="quickly check a single pass",
    )
    parser.add_argument("--seed",
                        type=int,
                        default=1,
                        metavar="S",
                        help="random seed (default: 1)")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=100,
        metavar="N",
        help="how many batches to wait before logging training status",
    )
    parser.add_argument(
        "--save-model",
        action="store_true",
        default=False,
        help="For Saving the current Model",
    )
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    # Separate loader settings: only the training data is shuffled, and the
    # test loader gets its own batch size.
    train_kwargs = {"batch_size": args.batch_size, "shuffle": True}
    test_kwargs = {"batch_size": args.test_batch_size, "shuffle": False}
    if use_cuda:
        cuda_kwargs = {"num_workers": 1, "pin_memory": True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])
    dataset1 = datasets.MNIST("../data",
                              train=True,
                              download=True,
                              transform=transform)
    dataset2 = datasets.MNIST("../data", train=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)
    # NOTE(review): --lr, --gamma, --dry-run and --log-interval are forwarded
    # via `args`; presumably consumed inside train() — confirm there.
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, epoch)
        test(model, device, test_loader)

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
def cross_validate(model, crit, opt):
    """Evaluate `model` on the 'val' split using the shared test() routine.

    Prints the validation-set size, then delegates scoring to test();
    the returned per-sequence outputs are unpacked but not used here.
    """
    val_data = VideoDataset(opt, 'val')
    print(len(val_data))
    vocab = val_data.get_vocab()
    _, _, seq_probs, seq_preds, labels, masks = test(model, crit, val_data,
                                                     vocab, opt)
def main(seed, k_fold, batch_size, num_epoch, continue_my_model,
         continue_my_model_train_path, learning_rate, num_instance, delta_v,
         delta_d, p_var, p_dist, p_reg, p_seg, p_disc, p_cla, is_pseudo_mask,
         is_pre, is_pre_path):
    """K-fold train/validate/test of MyNetworks3 on the narrow/wide dataset.

    Fixes relative to the original:
    - uses the ``num_epoch`` / ``is_pseudo_mask`` *parameters* consistently
      instead of silently reading a module-level ``args`` object;
    - the test-result row used key ``'Precision'`` while the frame declared
      column ``'precision'``, creating two mismatched columns — unified;
    - ``DataFrame.append`` (removed in pandas 2.0) replaced by ``pd.concat``
      with identical ``ignore_index=True`` semantics.
    """
    # 1. Datasets.
    transform = ImgMaskTransform(img_size=(128, 256))
    train_dataset = Metric_Learning_ImageFolder(root=image_folder +
                                                '/data_train',
                                                transform=transform)
    # Validation reads the same 'data_train' directory on purpose:
    # k_fold_loader carves train/val folds out of the shared index list below.
    val_dataset = Metric_Learning_ImageFolder(root=image_folder +
                                              '/data_train',
                                              transform=transform)

    # e.g. {'narrow1': 0, 'narrow2': 1, 'narrow3': 2, 'narrow4': 3, 'wide': 4}
    narrow_list = train_dataset.class_to_idx
    cla_dict = {idx: name for name, idx in narrow_list.items()}
    cla = list(narrow_list)  # class names, in class_to_idx order

    # Persist the index -> class-name mapping for later inference.
    with open('class_indices.json', 'w') as json_file:
        json_file.write(json.dumps(cla_dict, indent=4))

    indices = list(range(len(train_dataset)))
    print(len(train_dataset))
    np.random.seed(seed)  # deterministic fold assignment
    np.random.shuffle(indices)

    for i in range(k_fold):
        print('\n', '*' + '-' * 10, 'F{}'.format(i + 1), '-' * 10 + '*')

        # 2. Per-fold result tables.
        train_result = pd.DataFrame(columns=('loss', 'accurate'))
        val_result = pd.DataFrame(columns=('loss', 'accurate', 'recall',
                                           'precision', 'AUC', 'F1'))
        test_result = pd.DataFrame(columns=('loss', 'accurate', 'recall',
                                            'precision', 'AUC', 'F1'))

        train_len, train_loader, validation_loader = k_fold_loader(
            i, int(len(train_dataset) * 1 / k_fold), indices, train_dataset,
            val_dataset, batch_size)
        test_data = StandarImageFolder(
            root=os.path.join(image_folder, 'data_test'),
            transform=augumentation.liner_classifier_test_transform)
        test_loader = DataLoader(test_data,
                                 batch_size=batch_size,
                                 shuffle=False,
                                 num_workers=16)

        # 3. Model: ResNet-50 backbone with the earliest stages frozen.
        net = MyNetworks3(n_instance=5,
                          n_classes=5,
                          embed_dim=2,
                          branch_size=1024,
                          deep_features_size=2048,
                          backend='resnet50',
                          pretrained=is_pre,
                          model_path=is_pre_path).cuda()
        for frozen_module in (net.extractors.conv1, net.extractors.bn1,
                              net.extractors.layer1, net.extractors.layer2):
            for param in frozen_module.parameters():
                param.requires_grad = False

        if continue_my_model:
            print('continue my model.')
            missing_keys, unexpected_keys = net.load_state_dict(
                torch.load(continue_my_model_train_path), strict=False)

        # Show which parameters remain trainable.
        for name, param in net.named_parameters():
            if param.requires_grad:
                print(name)

        # 4. Optimizer: SGD with Nesterov momentum and weight decay.
        optimizer = optim.SGD(
            net.parameters(),
            lr=learning_rate,
            momentum=0.9,
            dampening=0,
            weight_decay=0.0005,
            nesterov=True,
        )

        # 5. Losses: discriminative (instance), cross-entropy (class),
        # soft dice (segmentation).
        disc_criterion = DiscriminativeLoss_wizaron(num_instance=num_instance,
                                                    delta_v=delta_v,
                                                    delta_d=delta_d,
                                                    norm=2,
                                                    scale_var=p_var,
                                                    scale_dist=p_dist,
                                                    scale_reg=p_reg,
                                                    usegpu=True).cuda()
        cla_criterion = nn.CrossEntropyLoss().cuda()
        seg_criterion = SoftDiceLoss().cuda()

        results = {
            'train_loss': [],
            'train_acc@1': [],
            'train_acc@2': [],
            'train_seg_loss': [],
            'train_var_loss': [],
            'train_dis_loss': [],
            'train_reg_loss': [],
            'val_loss': [],
            'val_acc@1': [],
            'val_acc@2': [],
            'test_loss': [],
            'test_acc@1': [],
            'test_acc@2': []
        }

        # 6. Training loop.
        best_acc, best_recall, best_precision, best_auc, best_f1 = 0.0, 0.0, 0.0, 0.0, 0.0
        lr_epoch = []
        for epoch in range(1, num_epoch + 1):
            print('\nF{} | Epoch [{}/{}]'.format(i + 1, epoch, num_epoch))

            # 6a. Train (part-loss list is [seg, var, dist, reg]).
            lr, train_loss, train_acc_1, train_acc_2, train_part_loss_list = train(
                num_epoch=num_epoch,
                per_epoch=epoch - 1,
                is_pseudo_mask=is_pseudo_mask,
                net=net,
                train_dataset=train_dataset,
                data_loader=train_loader,
                train_optimizer=optimizer,
                lr=learning_rate,
                disc_loss=disc_criterion,
                seg_loss=seg_criterion,
                cla_loss=cla_criterion,
                p_seg=p_seg,
                p_discriminative=p_disc,
                p_cla=p_cla,
                save_pre=save_pre_img)
            lr_epoch += lr
            results['train_loss'].append(train_loss)
            results['train_acc@1'].append(train_acc_1)
            results['train_acc@2'].append(train_acc_2)
            results['train_seg_loss'].append(train_part_loss_list[0])
            results['train_var_loss'].append(train_part_loss_list[1])
            results['train_dis_loss'].append(train_part_loss_list[2])
            results['train_reg_loss'].append(train_part_loss_list[3])
            train_result = pd.concat([
                train_result,
                pd.DataFrame({
                    'loss': [train_loss],
                    'accurate': [train_acc_1]
                })
            ],
                                     ignore_index=True)

            # 6b. Validate.
            val_loss, val_acc_1, val_acc_2, val_pred_probs, val_pred_labels, val_gt_labels = val_test(
                per_epoch=epoch,
                is_pseudo_mask=is_pseudo_mask,
                val_dataset=val_dataset,
                net=net,
                data_loader=validation_loader,
                disc_loss=disc_criterion,
                seg_loss=seg_criterion,
                cla_loss=cla_criterion,
                p_seg=p_seg,
                p_discriminative=p_disc,
                p_cla=p_cla,
                is_val=True)
            results['val_loss'].append(val_loss)
            results['val_acc@1'].append(val_acc_1)
            results['val_acc@2'].append(val_acc_2)
            val_acc, val_recall, val_precision, val_auc, val_f1 = metrics_score(
                val_gt_labels, val_pred_labels)
            val_result = pd.concat([
                val_result,
                pd.DataFrame({
                    'loss': [val_loss],
                    'accurate': [val_acc],
                    'recall': [val_recall],
                    'precision': [val_precision],
                    'AUC': [val_auc],
                    'F1': [val_f1]
                })
            ],
                                   ignore_index=True)

            # 6c. Test.
            test_loss, test_acc_1, test_acc_2, test_pred_probs, test_pred_labels, test_gt_labels = test(
                net=net, data_loader=test_loader, criterion=cla_criterion)
            results['test_loss'].append(test_loss)
            results['test_acc@1'].append(test_acc_1)
            results['test_acc@2'].append(test_acc_2)
            test_acc, test_recall, test_precision, test_auc, test_f1 = metrics_score(
                test_gt_labels, test_pred_labels)
            test_result = pd.concat([
                test_result,
                pd.DataFrame({
                    'loss': [test_loss],
                    'accurate': [test_acc],
                    'recall': [test_recall],
                    'precision': [test_precision],
                    'AUC': [test_auc],
                    'F1': [test_f1]
                })
            ],
                                    ignore_index=True)

            # 6d. Persist running statistics and loss/accuracy curves.
            data_frame = pd.DataFrame(data=results, index=range(1, epoch + 1))
            data_frame.to_csv(os.path.join(
                save_dir,
                'final_linear_statistics_' + 'K' + str(i + 1) + '.csv'),
                              index_label='epoch')
            total_curve = plot_loss_acc(data_frame)
            plt.savefig(
                os.path.join(
                    save_dir,
                    'final_linear_statistics_' + 'K' + str(i + 1) + '.png'))

            # 6e. Track best validation accuracy; snapshot the model and the
            # corresponding test predictions when it improves.
            if val_acc_1 > best_acc:
                best_acc = val_acc_1
                save_best(i, net, val_gt_labels, val_pred_labels,
                          val_pred_probs, test_gt_labels, test_pred_labels,
                          cla, test_pred_probs, save_best_dir, save_dir)
                torch.save(
                    net.state_dict(),
                    os.path.join(
                        save_best_dir,
                        'model/K' + str(i + 1) + 'EP' + str(epoch) + '.pth'))
                print(
                    '[Best]\nVal: acc:{} | recalll:{} | precision:{} | auc:{} | f1:{}'
                    .format(val_acc, val_recall, val_precision, val_auc,
                            val_f1))
                print(
                    'Test: acc:{} | recalll:{} | precision:{} | auc:{} | f1:{}'
                    .format(test_acc, test_recall, test_precision, test_auc,
                            test_f1))

            if epoch % 100 == 0:
                # Periodic checkpoint of intermediate results.
                save_intermediate(net, save_intermediate_dir, epoch, i,
                                  val_gt_labels, val_pred_labels,
                                  val_pred_probs, cla, test_pred_probs,
                                  test_gt_labels, test_pred_labels,
                                  train_result, val_result, test_result)
                print('save epoch {}!'.format(epoch))

        # Final-epoch results for this fold.
        train_result.to_csv(
            os.path.join(save_csv_dir, 'Train' + 'K' + str(i + 1) + '.csv'))
        val_result.to_csv(
            os.path.join(save_csv_dir, 'Val' + 'K' + str(i + 1) + '.csv'))
        test_result.to_csv(
            os.path.join(save_csv_dir, 'Test' + 'K' + str(i + 1) + '.csv'))
        save_k_final_results(net, save_dir, save_display_dir, i, lr_epoch,
                             val_gt_labels, val_pred_labels, val_pred_probs,
                             cla, test_pred_probs, test_gt_labels,
                             test_pred_labels)
        plot_part_loss_AucF1(data_frame, val_result, test_result)
        plt.savefig(
            os.path.join(save_dir,
                         'PartLoss_AUCF1' + 'K' + str(i + 1) + '.png'))
        # NOTE(review): the original stops after the first fold; kept to
        # preserve behaviour — confirm whether all k folds should run.
        break
def eval_reward(args, shared_model, writer_dir=None):
    """
    For evaluation: repeatedly sync the local modules with the shared ones,
    run one evaluation episode via test(), and log the outcome.

    Arguments:
    - args: run configuration (seed, env_name, framework, dim, lstm_dim,
      num_episodes, eval_every, ...); mutated below (args.local, args.d)
    - shared_model: 4-tuple (encoder, decoder, dynamics module, reward module)
      shared with the training processes
    - writer_dir: the tensorboard summary writer directory
      (note: can't get it working directly with the SummaryWriter object)
    """
    writer = SummaryWriter(log_dir=os.path.join(
        writer_dir, 'eval')) if writer_dir is not None else None

    # current episode stats
    episode_reward = episode_value_mse = episode_td_error = episode_pg_loss = episode_length = 0

    # global stats
    i_episode = 0
    # NOTE(review): total_episode is never incremented below, so the while
    # loop never terminates on its own — confirm whether test()/log mutate it
    # elsewhere or this is a bug.
    total_episode = total_steps = 0
    num_goals_achieved = 0

    # initialize the env and models
    torch.manual_seed(args.seed)
    env = create_env(args.env_name, framework=args.framework, args=args)
    set_seed(args.seed, env, args.framework)
    shared_enc, shared_dec, shared_d_module, shared_r_module = shared_model
    enc = Encoder(env.observation_space.shape[0],
                  args.dim,
                  use_conv=args.use_conv)
    dec = Decoder(env.observation_space.shape[0],
                  args.dim,
                  use_conv=args.use_conv)
    d_module = D_Module(env.action_space.shape[0], args.dim, args.discrete)
    r_module = R_Module(env.action_space.shape[0],
                        args.dim,
                        discrete=args.discrete,
                        baseline=False,
                        state_space=env.observation_space.shape[0])
    all_params = chain(enc.parameters(), dec.parameters(),
                       d_module.parameters(), r_module.parameters())
    if args.from_checkpoint is not None:
        model_state, _ = torch.load(args.from_checkpoint)
        # NOTE(review): `model` is not defined in this function — taking this
        # branch raises NameError; presumably the state was meant to be split
        # across enc/dec/d_module/r_module. Confirm against the checkpoint format.
        model.load_state_dict(model_state)

    # set the model to evaluation mode
    enc.eval()
    dec.eval()
    d_module.eval()
    r_module.eval()

    # reset the state
    state = env.reset()
    state = Variable(torch.from_numpy(state).float())
    start = time.time()

    while total_episode < args.num_episodes:
        # Sync with the shared model
        r_module.load_state_dict(shared_r_module.state_dict())
        d_module.load_state_dict(shared_d_module.state_dict())
        enc.load_state_dict(shared_enc.state_dict())
        dec.load_state_dict(shared_dec.state_dict())

        # reset stuff (LSTM cell/hidden states for the dynamics module)
        cd_p = Variable(torch.zeros(1, args.lstm_dim))
        hd_p = Variable(torch.zeros(1, args.lstm_dim))

        # for the reward (LSTM cell/hidden states for the reward module)
        cr_p = Variable(torch.zeros(1, args.lstm_dim))
        hr_p = Variable(torch.zeros(1, args.lstm_dim))

        i_episode += 1
        episode_length = 0
        episode_reward = 0
        args.local = True
        args.d = 0
        # NOTE(review): `args` is passed for three consecutive parameters —
        # verify against test()'s signature that this is intentional.
        succ, _, episode_reward, episode_length = test(1, args, args, args,
                                                       d_module, r_module,
                                                       enc)
        log("Eval: succ {:.2f}, reward {:.2f}, length {:.2f}".format(
            succ, episode_reward, episode_length))

        # Episode has ended, write the summaries here
        if writer_dir is not None:
            # current episode stats
            writer.add_scalar('eval/episode_reward', episode_reward, i_episode)
            writer.add_scalar('eval/episode_length', episode_length, i_episode)
            writer.add_scalar('eval/success', succ, i_episode)

        time.sleep(args.eval_every)
        print("sleep")
def test_ap(self, net, epoch):
    """Evaluate `net` on every configured test dataset and log each AP."""
    batch = self.batch_size
    for ds in self.test_datasets:
        average_precision, _ = test(net, ds, batch_size=batch)
        self.writer.log_ap(epoch, average_precision, ds.name())
def train(hyp, tb_writer, opt, device):
    """Train a detection model with the given hyperparameters.

    Args:
        hyp: dict of training hyperparameters (lr0, momentum, weight_decay, ...).
        tb_writer: tensorboard SummaryWriter or None (None => 'runs/evolution').
        opt: parsed command-line options (epochs, batch sizes, paths, DDP rank, ...).
        device: torch.device to train on.

    Returns:
        The final `results` tuple (P, R, mAP, F1, val GIoU/obj/cls losses).

    Fixes vs. previous revision:
      * `torch.from_tensor` (nonexistent API) replaced with `torch.tensor`
        in the DDP image-weights broadcast path.
      * Checkpoint save now unwraps with `hasattr(ema.ema, 'module')`,
        consistent with the eval call above (the ModelEMA wrapper itself
        never has `.module`).
    """
    print(f'Hyperparameters {hyp}')
    log_dir = tb_writer.log_dir if tb_writer else 'runs/evolution'  # run directory
    wdir = str(Path(log_dir) / 'weights') + os.sep  # weights directory
    os.makedirs(wdir, exist_ok=True)
    last = wdir + 'last.pt'
    best = wdir + 'best.pt'
    results_file = log_dir + os.sep + 'results.txt'
    epochs, batch_size, total_batch_size, weights, rank = \
        opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.local_rank
    # TODO: Init DDP logging. Only the first process is allowed to log.
    # Since I see lots of print here, the logging configuration is skipped here. We may see repeated outputs.

    # Save run settings so the run is reproducible from its log directory.
    with open(Path(log_dir) / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(Path(log_dir) / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    init_seeds(2 + rank)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # model dict
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc, names = (1, ['item']) if opt.single_cls else (int(
        data_dict['nc']), data_dict['names'])  # number classes, names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (
        len(names), nc, opt.data)  # check

    # Remove previous results
    if rank in [-1, 0]:
        for f in glob.glob('*_batch*.jpg') + glob.glob(results_file):
            os.remove(f)

    # Create model
    model = Model(opt.cfg, nc=nc).to(device)

    # Image sizes
    gs = int(max(model.stride))  # grid size (max stride)
    imgsz, imgsz_test = [check_img_size(x, gs)
                         for x in opt.img_size]  # verify imgsz are gs-multiples

    # Optimizer
    nbs = 64  # nominal batch size
    # default DDP implementation is slow for accumulation according to: https://pytorch.org/docs/stable/notes/ddp.html
    # all-reduce operation is carried out during loss.backward().
    # Thus, there would be redundant all-reduce communications in a accumulation procedure,
    # which means, the result is still right but the training speed gets slower.
    # TODO: If acceleration is needed, there is an implementation of allreduce_post_accumulation
    # in https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/run_pretraining.py
    accumulate = max(round(nbs / total_batch_size), 1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs  # scale weight_decay

    # Partition parameters: pg0 = everything else, pg1 = conv weights
    # (weight-decayed), pg2 = biases (no decay).
    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_parameters():
        if v.requires_grad:
            if '.bias' in k:
                pg2.append(v)  # biases
            elif '.weight' in k and '.bn' not in k:
                pg1.append(v)  # apply weight decay
            else:
                pg0.append(v)  # all else

    if hyp['optimizer'] == 'adam':
        # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
        optimizer = optim.Adam(pg0, lr=hyp['lr0'],
                               betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0, lr=hyp['lr0'],
                              momentum=hyp['momentum'], nesterov=True)
    optimizer.add_param_group({
        'params': pg1,
        'weight_decay': hyp['weight_decay']
    })  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    print('Optimizer groups: %g .bias, %g conv.weight, %g other' %
          (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Load Model
    with torch_distributed_zero_first(rank):
        google_utils.attempt_download(weights)
    start_epoch, best_fitness = 0, 0.0
    if weights.endswith('.pt'):  # pytorch format
        ckpt = torch.load(weights, map_location=device)  # load checkpoint

        # load model — keep only keys that exist in the current model with
        # matching shapes, and drop anchor tensors.
        try:
            exclude = ['anchor']  # exclude keys
            ckpt['model'] = {
                k: v
                for k, v in ckpt['model'].float().state_dict().items()
                if k in model.state_dict() and not any(x in k for x in exclude)
                and model.state_dict()[k].shape == v.shape
            }
            model.load_state_dict(ckpt['model'], strict=False)
            print('Transferred %g/%g items from %s' %
                  (len(ckpt['model']), len(model.state_dict()), weights))
        except KeyError as e:
            s = "%s is not compatible with %s. This may be due to model differences or %s may be out of date. " \
                "Please delete or update %s and try again, or use --weights '' to train from scratch." \
                % (weights, opt.cfg, weights, weights)
            raise KeyError(s) from e

        # load optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # load results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # epochs
        start_epoch = ckpt['epoch'] + 1
        if epochs < start_epoch:
            print(
                '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.'
                % (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt

    # Mixed precision training https://github.com/NVIDIA/apex
    if mixed_precision:
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level='O1', verbosity=0)

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    lf = lambda x: ((
        (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.8 + 0.2  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # DP mode
    if device.type != 'cpu' and rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and device.type != 'cpu' and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        print('Using SyncBatchNorm()')

    # Exponential moving average — only rank 0 (or single-process) keeps one.
    ema = torch_utils.ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if device.type != 'cpu' and rank != -1:
        model = DDP(model, device_ids=[rank], output_device=rank)

    # Trainloader
    dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs,
                                            opt, hyp=hyp, augment=True,
                                            cache=opt.cache_images,
                                            rect=opt.rect, local_rank=rank,
                                            world_size=opt.world_size)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (
        mlc, nc, opt.data, nc - 1)

    # Testloader
    if rank in [-1, 0]:
        # local_rank is set to -1. Because only the first process is expected to do evaluation.
        testloader = create_dataloader(test_path, imgsz_test, total_batch_size,
                                       gs, opt, hyp=hyp, augment=False,
                                       cache=opt.cache_images, rect=True,
                                       local_rank=-1,
                                       world_size=opt.world_size)[0]

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(
        device)  # attach class weights
    model.names = names

    # Class frequency
    if rank in [-1, 0]:
        labels = np.concatenate(dataset.labels, 0)
        c = torch.tensor(labels[:, 0])  # classes
        # cf = torch.bincount(c.long(), minlength=nc) + 1.
        # model._initialize_biases(cf.to(device))
        plot_labels(labels, save_dir=log_dir)
        # if tb_writer:
        #     tb_writer.add_hparams(hyp, {})  # causes duplicate https://github.com/ultralytics/yolov5/pull/384
        #     tb_writer.add_histogram('classes', c, 0)

        # Check anchors
        if not opt.noautoanchor:
            check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)

    # Start training
    t0 = time.time()
    nw = max(3 * nb, 1e3)  # number of warmup iterations, max(3 epochs, 1k iterations)
    maps = np.zeros(nc)  # mAP per class
    results = (
        0, 0, 0, 0, 0, 0, 0
    )  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    scheduler.last_epoch = start_epoch - 1  # do not move
    if rank in [0, -1]:
        print('Image sizes %g train, %g test' % (imgsz, imgsz_test))
        print('Using %g dataloader workers' % dataloader.num_workers)
        print('Starting training for %g epochs...' % epochs)
    # torch.autograd.set_detect_anomaly(True)
    for epoch in range(
            start_epoch, epochs
    ):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        # When in DDP mode, the generated indices will be broadcasted to synchronize dataset.
        if dataset.image_weights:
            # Generate indices.
            if rank in [-1, 0]:
                w = model.class_weights.cpu().numpy() * (
                    1 - maps)**2  # class weights
                image_weights = labels_to_image_weights(dataset.labels, nc=nc,
                                                        class_weights=w)
                dataset.indices = random.choices(
                    range(dataset.n), weights=image_weights,
                    k=dataset.n)  # rand weighted idx
            # Broadcast.
            if rank != -1:
                indices = torch.zeros([dataset.n], dtype=torch.int)
                if rank == 0:
                    # FIX: torch.from_tensor does not exist; torch.tensor
                    # copies the python list into the broadcast buffer.
                    indices[:] = torch.tensor(dataset.indices, dtype=torch.int)
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        if rank in [-1, 0]:
            print(('\n' + '%10s' * 8) %
                  ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total',
                   'targets', 'img_size'))
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (
                imgs, targets, paths, _
        ) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float(
            ) / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(
                    1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(
                        ni, xi,
                        [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi,
                                                  [0.9, hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.5,
                                      imgsz * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]
                          ]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs, size=ns, mode='bilinear',
                                         align_corners=False)

            # Forward
            pred = model(imgs)

            # Loss
            loss, loss_items = compute_loss(pred, targets.to(device),
                                            model)  # scaled by batch_size
            if rank != -1:
                loss *= opt.world_size  # gradient averaged between devices in DDP mode
            if not torch.isfinite(loss):
                print('WARNING: non-finite loss, ending training ', loss_items)
                return results

            # Backward
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Optimize
            if ni % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()
                if ema is not None:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9
                                 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' %
                                                   (epoch, epochs - 1), mem,
                                                   *mloss, targets.shape[0],
                                                   imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if ni < 3:
                    f = str(Path(log_dir) /
                            ('train_batch%g.jpg' % ni))  # filename
                    result = plot_images(images=imgs, targets=targets,
                                         paths=paths, fname=f)
                    if tb_writer and result is not None:
                        tb_writer.add_image(f, result, dataformats='HWC',
                                            global_step=epoch)
                        # tb_writer.add_graph(model, imgs)  # add model to tensorboard
            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        scheduler.step()

        # Only the first process in DDP mode is allowed to log or save checkpoints.
        if rank in [-1, 0]:
            # mAP
            if ema is not None:
                ema.update_attr(
                    model,
                    include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                results, maps, times = eval.test(
                    opt.data,
                    batch_size=total_batch_size,
                    imgsz=imgsz_test,
                    save_json=final_epoch
                    and opt.data.endswith(os.sep + 'coco.yaml'),
                    model=ema.ema.module
                    if hasattr(ema.ema, 'module') else ema.ema,
                    single_cls=opt.single_cls,
                    dataloader=testloader,
                    save_dir=log_dir)

            # Write
            with open(results_file, 'a') as f:
                f.write(
                    s + '%10.4g' * 7 % results +
                    '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system('gsutil cp %s gs://%s/results/results%s.txt' %
                          (results_file, opt.bucket, opt.name))

            # Tensorboard
            if tb_writer:
                tags = [
                    'train/giou_loss', 'train/obj_loss', 'train/cls_loss',
                    'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5',
                    'metrics/mAP_0.5:0.95', 'val/giou_loss', 'val/obj_loss',
                    'val/cls_loss'
                ]
                for x, tag in zip(list(mloss[:-1]) + list(results), tags):
                    tb_writer.add_scalar(tag, x, epoch)

            # Update best mAP
            fi = fitness(np.array(results).reshape(
                1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {
                        'epoch': epoch,
                        'best_fitness': best_fitness,
                        'training_results': f.read(),
                        # FIX: unwrap the EMA model (not the ModelEMA wrapper,
                        # which never has `.module`) — consistent with the
                        # eval.test call above.
                        'model': ema.ema.module
                        if hasattr(ema.ema, 'module') else ema.ema,
                        'optimizer': None
                        if final_epoch else optimizer.state_dict()
                    }

                # Save last, best and delete
                torch.save(ckpt, last)
                if (best_fitness == fi) and not final_epoch:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name
        fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n
        for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'],
                          [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                ispt = f2.endswith('.pt')  # is *.pt
                strip_optimizer(f2) if ispt else None  # strip optimizer
                os.system('gsutil cp %s gs://%s/weights' % (
                    f2, opt.bucket)) if opt.bucket and ispt else None  # upload

        # Finish
        if not opt.evolve:
            plot_results(save_dir=log_dir)  # save as results.png
        print('%g epochs completed in %.3f hours.\n' %
              (epoch - start_epoch + 1, (time.time() - t0) / 3600))

    dist.destroy_process_group() if rank not in [-1, 0] else None
    torch.cuda.empty_cache()
    return results
def main():
    """Train a DeepChrome model on one train/val/test split, early-stop on
    validation AUROC, then evaluate the best checkpoint on the test set and
    write the metrics to disk as JSON."""
    print(pprint.pformat(vars(args)))

    ###### Bookkeeping
    if os.path.exists(test_results_fname(args)):
        # Results already exist — ask before clobbering them.
        answer = None
        while answer not in {"yes", "no", "y", "n"}:
            answer = input(
                f"{args.save} already exists. Overwrite contents? [y/n]: ")
            if answer in ("no", "n"):
                print("Exiting")
                exit()
            if answer in ("yes", "y"):
                break
    else:
        os.makedirs(args.save, exist_ok=True)

    # Save command to file
    with open(command_fname(args), 'w') as f:
        f.write(pprint.pformat(vars(args)))

    ###### Dataloading
    dset_train = DeepChromeDataset(dataroot=args.globstr_train,
                                   num_procs=args.dset_workers)
    print(f"Training set has {len(dset_train)} samples.")
    dset_val = DeepChromeDataset(dataroot=args.globstr_val,
                                 num_procs=args.dset_workers)
    print(f"Validation set has {len(dset_val)} samples.")
    dset_test = DeepChromeDataset(dataroot=args.globstr_test,
                                  num_procs=args.dset_workers)
    print(f"Test set has {len(dset_test)} samples.")

    def _make_loader(dset):
        # All three splits share identical DataLoader settings.
        return torch.utils.data.DataLoader(
            dset,
            batch_size=args.batch_size,
            num_workers=args.dloader_workers,
            shuffle=True,
            pin_memory=True,
        )

    train_loader = _make_loader(dset_train)
    val_loader = _make_loader(dset_val)
    test_loader = _make_loader(dset_test)

    ###### Setup Model
    if args.arch == 'DeepChrome':
        model = DeepChromeModel()
    elif args.arch == 'DeepChromeFC':
        model = DeepChromeFCModel()
    else:
        raise NotImplementedError()
    if not args.no_gpu:
        model = model.cuda()

    ###### Optimization
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                weight_decay=args.wd,
                                momentum=args.momentum)

    def cosine_annealing(step, total_steps, lr_max, lr_min):
        # Smoothly decay from lr_max to lr_min over total_steps.
        return lr_min + (lr_max - lr_min) * 0.5 * (
            1 + np.cos(step / total_steps * np.pi))

    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda step: cosine_annealing(
            step,
            args.epochs * len(train_loader),
            1,  # since lr_lambda computes multiplicative factor
            1e-6 / args.lr))

    ###### Logging
    with open(train_log_fname(args), 'w') as f:
        f.write("epoch,train_loss,val_loss,val_acc,val_auroc\n")

    ###### Train!
    print("Beginning training...")
    best_val_auroc = 0
    best_epoch = None
    epochs_since_best = 0
    for epoch in range(args.epochs):
        train_loss = train_one_epoch(epoch, model, train_loader, optimizer,
                                     scheduler)
        val_auroc, val_acc, val_loss = test(model, val_loader, args.no_gpu)

        ###### Logging
        print(
            'Epoch {0:3d} | Train Loss {1:.6f} | Val Loss {2:.6f} | Val AUROC {3:.6f} | Val Accuracy {4:.6f}'
            .format(
                (epoch + 1),
                train_loss,
                val_loss,
                val_auroc,
                val_acc,
            ))
        with open(train_log_fname(args), 'a') as f:
            f.write(f"{epoch},{train_loss},{val_loss},{val_acc},{val_auroc}\n")

        if val_auroc > best_val_auroc:
            best_epoch = epoch
            best_val_auroc = val_auroc
            epochs_since_best = 0
            # Save the model iff this is the best epoch so far
            checkpoint_payload = {
                "model.state_dict": model.state_dict(),
                "optimizer.state_dict": optimizer.state_dict(),
                "epoch": epoch,
            }
            torch.save(checkpoint_payload, checkpoint_fname(args, epoch))
        else:
            epochs_since_best += 1
            if epochs_since_best > args.patience:
                print("Early stopping")
                break

    print(f"Doing final testing")
    print("Loading {0}".format(checkpoint_fname(args, best_epoch)))
    model.load_state_dict(
        torch.load(checkpoint_fname(args, best_epoch))['model.state_dict'])

    # Do final testing
    test_auroc, test_acc, test_loss = test(model, test_loader, args.no_gpu)
    with open(test_results_fname(args), 'w', encoding='utf-8') as f:
        data = {
            "test_auroc": test_auroc,
            "test_acc": test_acc,
            "test_loss": test_loss
        }
        print(pprint.pformat(data))
        json.dump(data, f, ensure_ascii=False, indent=4)

    print(f"Finished successfully. See {args.save}")
def train(config):
    """Adversarially train the Generator/Discriminator pair described by
    `config`, validating each epoch and snapshotting periodically.

    Uses legacy pre-0.4 idioms (Variable wrappers, in-place
    `.data.resize_().copy_()` into pre-allocated buffers), so the exact
    statement order matters.
    """
    gpu_manage(config)

    ### DATASET LOAD ###
    print('===> Loading datasets')

    dataset = TrainDataset(config)
    print('dataset:', len(dataset))
    # Random train/validation split by fraction config.validation_size.
    train_size = int((1 - config.validation_size) * len(dataset))
    validation_size = len(dataset) - train_size
    train_dataset, validation_dataset = torch.utils.data.random_split(
        dataset, [train_size, validation_size])
    print('train dataset:', len(train_dataset))
    print('validation dataset:', len(validation_dataset))
    training_data_loader = DataLoader(dataset=train_dataset,
                                      num_workers=config.threads,
                                      batch_size=config.batchsize,
                                      shuffle=True)
    validation_data_loader = DataLoader(dataset=validation_dataset,
                                        num_workers=config.threads,
                                        batch_size=config.validation_batchsize,
                                        shuffle=False)

    ### MODELS LOAD ###
    print('===> Loading models')

    gen = Generator(gpu_ids=config.gpu_ids)

    # Optionally warm-start either network from a saved state dict.
    if config.gen_init is not None:
        param = torch.load(config.gen_init)
        gen.load_state_dict(param)
        print('load {} as pretrained model'.format(config.gen_init))

    dis = Discriminator(in_ch=config.in_ch,
                        out_ch=config.out_ch,
                        gpu_ids=config.gpu_ids)

    if config.dis_init is not None:
        param = torch.load(config.dis_init)
        dis.load_state_dict(param)
        print('load {} as pretrained model'.format(config.dis_init))

    # setup optimizer
    opt_gen = optim.Adam(gen.parameters(),
                         lr=config.lr,
                         betas=(config.beta1, 0.999),
                         weight_decay=0.00001)
    opt_dis = optim.Adam(dis.parameters(),
                         lr=config.lr,
                         betas=(config.beta1, 0.999),
                         weight_decay=0.00001)

    # Pre-allocated input buffers, resized/copied into every iteration:
    # real_a = input image, real_b = target image, M = attention target map.
    real_a = torch.FloatTensor(config.batchsize, config.in_ch, config.width,
                               config.height)
    real_b = torch.FloatTensor(config.batchsize, config.out_ch, config.width,
                               config.height)
    M = torch.FloatTensor(config.batchsize, config.width, config.height)

    criterionL1 = nn.L1Loss()
    criterionMSE = nn.MSELoss()
    # Softplus-based adversarial loss (non-saturating GAN formulation).
    criterionSoftplus = nn.Softplus()

    if config.cuda:
        gen = gen.cuda()
        dis = dis.cuda()
        criterionL1 = criterionL1.cuda()
        criterionMSE = criterionMSE.cuda()
        criterionSoftplus = criterionSoftplus.cuda()
        real_a = real_a.cuda()
        real_b = real_b.cuda()
        M = M.cuda()

    real_a = Variable(real_a)
    real_b = Variable(real_b)

    logreport = LogReport(log_dir=config.out_dir)
    validationreport = TestReport(log_dir=config.out_dir)

    print('===> begin')
    start_time = time.time()
    # main
    for epoch in range(1, config.epoch + 1):
        epoch_start_time = time.time()
        for iteration, batch in enumerate(training_data_loader, 1):
            # Copy the CPU batch into the (possibly CUDA) persistent buffers.
            real_a_cpu, real_b_cpu, M_cpu = batch[0], batch[1], batch[2]
            real_a.data.resize_(real_a_cpu.size()).copy_(real_a_cpu)
            real_b.data.resize_(real_b_cpu.size()).copy_(real_b_cpu)
            M.data.resize_(M_cpu.size()).copy_(M_cpu)
            # Generator produces an attention map and the translated image.
            att, fake_b = gen.forward(real_a)

            ################
            ### Update D ###
            ################
            opt_dis.zero_grad()

            # train with fake — detach so G gets no gradient from D's update.
            fake_ab = torch.cat((real_a, fake_b), 1)
            pred_fake = dis.forward(fake_ab.detach())
            batchsize, _, w, h = pred_fake.size()

            # Per-pixel softplus loss, averaged over batch and spatial dims.
            loss_d_fake = torch.sum(
                criterionSoftplus(pred_fake)) / batchsize / w / h

            # train with real
            real_ab = torch.cat((real_a, real_b), 1)
            pred_real = dis.forward(real_ab)
            loss_d_real = torch.sum(
                criterionSoftplus(-pred_real)) / batchsize / w / h

            # Combined loss
            loss_d = loss_d_fake + loss_d_real

            loss_d.backward()

            # D only steps every config.minimax-th epoch (slows D vs G).
            if epoch % config.minimax == 0:
                opt_dis.step()

            ################
            ### Update G ###
            ################
            opt_gen.zero_grad()

            # First, G(A) should fake the discriminator
            fake_ab = torch.cat((real_a, fake_b), 1)
            pred_fake = dis.forward(fake_ab)
            loss_g_gan = torch.sum(
                criterionSoftplus(-pred_fake)) / batchsize / w / h

            # Second, G(A) = B
            loss_g_l1 = criterionL1(fake_b, real_b) * config.lamb
            # Attention map should match the supervision mask M.
            loss_g_att = criterionMSE(att[:, 0, :, :], M)
            loss_g = loss_g_gan + loss_g_l1 + loss_g_att

            loss_g.backward()

            opt_gen.step()

            # log
            if iteration % 10 == 0:
                print(
                    "===> Epoch[{}]({}/{}): loss_d_fake: {:.4f} loss_d_real: {:.4f} loss_g_gan: {:.4f} loss_g_l1: {:.4f}"
                    .format(epoch, iteration, len(training_data_loader),
                            loss_d_fake.item(), loss_d_real.item(),
                            loss_g_gan.item(), loss_g_l1.item()))

                log = {}
                log['epoch'] = epoch
                log['iteration'] = len(training_data_loader) * (epoch - 1) + iteration
                log['gen/loss'] = loss_g.item()
                log['dis/loss'] = loss_d.item()

                logreport(log)

        print('epoch', epoch, 'finished, use time',
              time.time() - epoch_start_time)
        # Validation pass (no gradients needed).
        with torch.no_grad():
            log_validation = test(config, validation_data_loader, gen,
                                  criterionMSE, epoch)
            validationreport(log_validation)
        print('validation finished')
        if epoch % config.snapshot_interval == 0:
            checkpoint(config, epoch, gen, dis)

        logreport.save_lossgraph()
        validationreport.save_lossgraph()
    print('training time:', time.time() - start_time)
def main():
    """Train one shared DeepChrome model while validating per-cell, keeping a
    per-cell best checkpoint (with per-cell early stopping via CellDataSet),
    then test each cell's best model and write all metrics to one JSON file.

    Fix vs. previous revision: the per-epoch summary print passed the
    averaged (auroc, acc, loss) values into format slots labeled
    Val Loss / Val AUROC / Val Accuracy, so every value appeared under the
    wrong label. Values are now passed in label order (loss, auroc, acc),
    matching the CSV written right below and the sibling `main` variant.
    """
    print(pprint.pformat(vars(args)))

    ###### Bookkeeping
    if os.path.exists(test_results_fname(args)):
        # Results already exist — ask before clobbering them.
        resp = None
        while resp not in {"yes", "no", "y", "n"}:
            resp = input(
                f"{args.save} already exists. Overwrite contents? [y/n]: ")
            if resp == "yes" or resp == "y":
                break
            elif resp == "no" or resp == "n":
                print("Exiting")
                exit()
    else:
        os.makedirs(args.save, exist_ok=True)

    # Save command to file
    with open(command_fname(args), 'w') as f:
        f.write(pprint.pformat(vars(args)))

    ###### Dataloading
    dset_train = DeepChromeDataset(dataroot=args.globstr_train,
                                   num_procs=args.dset_workers)
    print(f"Training set has {len(dset_train)} samples.")

    train_loader = torch.utils.data.DataLoader(
        dset_train,
        batch_size=args.batch_size,
        num_workers=args.dloader_workers,
        shuffle=True,
        pin_memory=True,
    )

    ###### Setup Model
    if args.arch == 'DeepChrome':
        model = DeepChromeModel()
    elif args.arch == 'DeepChromeFC':
        model = DeepChromeFCModel()
    else:
        raise NotImplementedError()
    if not args.no_gpu:
        model = model.cuda()

    ###### Optimization
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                weight_decay=args.wd,
                                momentum=args.momentum)

    def cosine_annealing(step, total_steps, lr_max, lr_min):
        # Smoothly decay from lr_max to lr_min over total_steps.
        return lr_min + (lr_max - lr_min) * 0.5 * (
            1 + np.cos(step / total_steps * np.pi))

    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda step: cosine_annealing(
            step,
            args.epochs * len(train_loader),
            1,  # since lr_lambda computes multiplicative factor
            1e-6 / args.lr))

    ###### Logging
    with open(train_log_fname(args), 'w') as f:
        f.write("epoch,train_loss,val_loss,val_acc,val_auroc\n")

    ###### Train!
    print("Beginning training...")
    best_epoch_auroc = 0
    best_epoch = None
    num_without_changing_best_val_auroc = 0

    # One CellDataSet per validation cell; each tracks its own best model
    # and early-stopping state (is_done).
    TestCells = []
    for cell in args.globstr_val_cell_ids:
        TestCells.append(
            CellDataSet(cell, checkpoint_fname(args, cell), args.batch_size,
                        args.dset_workers, args.dloader_workers))

    for epoch in range(args.epochs):
        # Stop entirely once every cell has early-stopped.
        if all(map(lambda cell: cell.is_done, TestCells)):
            break

        # Train 1 epoch
        train_loss = train_one_epoch(epoch, model, train_loader, optimizer,
                                     scheduler)

        # Validate each still-active cell; average metrics for the log line.
        total_val_auroc = 0
        total_val_acc = 0
        total_val_loss = 0
        num_cells = 0
        for cell in TestCells:
            if cell.is_done:
                continue
            num_cells += 1
            val_auroc, val_acc, val_loss = test(model, cell.val_loader,
                                                args.no_gpu)
            total_val_auroc += val_auroc
            total_val_acc += val_acc
            total_val_loss += val_loss
            cell.add_valid_auroc(val_auroc, epoch, model.state_dict(),
                                 optimizer.state_dict(), args.patience)

        ###### Logging
        # FIX: pass values in label order (loss, auroc, acc).
        print(
            'Epoch {0:3d} | Train Loss {1:.6f} | Val Loss {2:.6f} | Val AUROC {3:.6f} | Val Accuracy {4:.6f}'
            .format(
                epoch,
                train_loss,
                total_val_loss / num_cells,
                total_val_auroc / num_cells,
                total_val_acc / num_cells,
            ))
        with open(train_log_fname(args), 'a') as f:
            f.write(
                f"{epoch},{train_loss},{total_val_loss / num_cells},{total_val_acc / num_cells},{total_val_auroc / num_cells}\n"
            )

    # Save the stragglers — cells that never hit their patience limit.
    for cell in TestCells:
        if not cell.is_done:
            print(f"{cell.cell_id} was a straggler :(")
            cell._save_model_to_disk()

    # Test on all cells using each cell's own best model snapshot.
    all_save_data = dict()
    for cell in TestCells:
        model.load_state_dict(cell.best_model)
        print(f"Doing final testing on cell {cell.cell_id}")

        # Do final testing
        test_auroc, test_acc, test_loss = test(model, cell.test_loader,
                                               args.no_gpu)
        data = {
            "test_auroc": test_auroc,
            "test_acc": test_acc,
            "test_loss": test_loss
        }
        all_save_data[cell.cell_id] = data

    with open(test_results_fname(args), 'w', encoding='utf-8') as f:
        print(pprint.pformat(all_save_data))
        json.dump(all_save_data, f, ensure_ascii=False, indent=4)

    print(f"Finished successfully. See {args.save}")
def trainIters(encoder, decoder, train_loader, dev_loader,
               input_lang, output_lang, input_lang_dev, output_lang_dev,
               max_word_len, n_iters, plot_every=100, print_every=1,
               weight_decay=0, learning_rate=0.01, device=DEVICE,
               teacher_forcing_ratio=0.5, label="", use_lr_scheduler=True,
               gamma_en=0.9, gamma_de=0.9, beam_width=3, min_len=1, n_best=1,
               decode_method="beam", save_result_path='', save_model=False):
    """Run the seq2seq training loop for `n_iters` epochs.

    Each epoch trains over `train_loader`, then (every `print_every` epochs)
    evaluates BLEU on `dev_loader`, appends loss/BLEU to per-label text
    files, optionally checkpoints on improvement, and early-stops after 15
    evaluations without improvement.

    Returns:
        0 on early stop; None when all epochs complete.

    Fixes vs. previous revision:
      * `fail_cnt` is now initialized before the epoch loop — previously it
        was only set inside the improvement branch, so a first evaluation
        that did not beat `cur_best` (e.g. BLEU 0.0) raised
        UnboundLocalError on `fail_cnt += 1`.
      * The loss/BLEU log files are closed on the early-stop path too.
    """
    start = time.time()
    num_steps = len(train_loader)
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    cur_best = 0
    fail_cnt = 0  # consecutive evaluations without a new best BLEU

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate,
                                   weight_decay=weight_decay)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate,
                                   weight_decay=weight_decay)
    scheduler_encoder = ExponentialLR(encoder_optimizer, gamma_en,
                                      last_epoch=-1)
    scheduler_decoder = ExponentialLR(decoder_optimizer, gamma_de,
                                      last_epoch=-1)
    criterion = nn.NLLLoss()

    loss_file = open(save_result_path + '/%s-loss.txt' % label, 'w+')
    bleu_file = open(save_result_path + '/%s-bleu.txt' % label, 'w+')

    for epoch in range(1, n_iters + 1):
        # Decay both learning rates once per epoch when enabled.
        if use_lr_scheduler:
            scheduler_encoder.step()
            scheduler_decoder.step()

        for i, (data1, data2, len1, len2) in enumerate(train_loader):
            encoder.train()
            decoder.train()
            source, target, source_len, target_len = \
                data1.to(device), data2.to(device), len1.to(device), len2.to(device)
            loss = train(source, target, source_len, target_len, encoder,
                         decoder, encoder_optimizer, decoder_optimizer,
                         criterion, device=device,
                         teacher_forcing_ratio=teacher_forcing_ratio)
            print_loss_total += loss
            plot_loss_total += loss

            if i != 0 and (i % plot_every == 0):
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0

        if epoch != 0 and (epoch % print_every == 0):
            print_loss_avg = print_loss_total / len(train_loader)
            print_loss_total = 0
            print("testing..")
            bleu_score, _, _, _ = test(encoder, decoder, dev_loader,
                                       input_lang, output_lang,
                                       input_lang_dev, output_lang_dev,
                                       beam_width, min_len, n_best,
                                       max_word_len, decode_method, device)
            print('%s epoch:(%d %d%%) step[%d %d] Average_Loss %.4f, Bleu Score %.3f'
                  % (timeSince(start, epoch / n_iters), epoch,
                     epoch / n_iters * 100, i, num_steps, print_loss_avg,
                     bleu_score))
            loss_file.write("%s\n" % print_loss_avg)
            bleu_file.write("%s\n" % bleu_score)

            if (bleu_score > cur_best):
                print("found best! save model...")
                fail_cnt = 0
                if save_model:
                    torch.save(encoder.state_dict(),
                               'encoder' + "-" + label + '.ckpt')
                    torch.save(decoder.state_dict(),
                               'decoder' + "-" + label + '.ckpt')
                    print("model saved")
                cur_best = bleu_score
            else:
                fail_cnt += 1
                if fail_cnt == 15:
                    print("No improvement for 15 epochs. Halt!")
                    # Close logs before the early return so no data is lost.
                    loss_file.close()
                    bleu_file.close()
                    return 0

        torch.cuda.empty_cache()

    loss_file.close()
    bleu_file.close()
def cluster(approach, datapath): """ Run a clustering approach on unlabeled data. """ report_path = test(datapath, approach, params[approach]) c.echo('Report compiled at {0}.'.format(report_path))
def train(loader, model, crit, optimizer, lr_scheduler, opt, rl_crit=None,
          opt_test=None, test_dataset=None):
    """Train a video-captioning model, optionally switching to self-critical
    RL training after epoch `opt["self_crit_after"]`.

    Args:
        loader: training DataLoader yielding dicts with 'fc_feats',
            'labels', 'masks' (and unused 'clip_num' — see commented code).
        model: captioning model; called with (feats, labels, 'train') in
            XE mode, or (feats, mode='inference', opt=opt) in RL mode.
        crit: cross-entropy style caption criterion.
        optimizer / lr_scheduler: optimizer and its per-epoch scheduler.
        opt: dict of training options (epochs, grad_clip, checkpoint paths...).
        rl_crit: reward-weighted criterion, used only after self-crit kicks in.
        opt_test / test_dataset: forwarded to `test` at checkpoint epochs.
    """
    model.train()
    loss_avg = averager()
    #model = nn.DataParallel(model)
    writer = SummaryWriter()
    for epoch in range(opt["epochs"]):
        # NOTE(review): scheduler stepped before optimizer.step(); PyTorch
        # >= 1.1 recommends stepping the scheduler after the epoch's
        # optimizer steps — confirm intended LR schedule alignment.
        lr_scheduler.step()

        iteration = 0
        # If start self crit training
        if opt["self_crit_after"] != -1 and epoch >= opt["self_crit_after"]:
            sc_flag = True
            init_cider_scorer(opt["cached_tokens"])
        else:
            sc_flag = False

        for data in loader:
            torch.cuda.synchronize()
            fc_feats = data['fc_feats'].cuda()
            labels = data['labels'].cuda()
            masks = data['masks'].cuda()
            # clip_nums = data['clip_num']
            # sorted_clip_nums, indices = torch.sort(clip_nums, descending=True)
            # _, desorted_indices = torch.sort(indices, descending=False)
            # fc_feats = fc_feats[indices]
            # pack = rnn.pack_padded_sequence(fc_feats, sorted_clip_nums, batch_first=True)

            optimizer.zero_grad()
            if not sc_flag:
                # Cross-entropy phase: teacher-forced next-token prediction;
                # labels/masks are shifted by one to drop the BOS position.
                seq_probs, _ = model(fc_feats, labels, 'train')
                loss = crit(seq_probs, labels[:, 1:], masks[:, 1:])
            else:
                # Self-critical phase: sample captions, score vs. greedy
                # baseline, and weight log-probs by the reward.
                seq_probs, seq_preds = model(fc_feats, mode='inference',
                                             opt=opt)
                reward = get_self_critical_reward(model, fc_feats, data,
                                                  seq_preds)
                print(reward.shape)
                loss = rl_crit(seq_probs, seq_preds,
                               torch.from_numpy(reward).float().cuda())
            # NOTE(review): loss_avg.add(loss) receives the tensor itself
            # (not loss.item()) — whether it detaches internally depends on
            # the averager implementation; verify to avoid retaining graphs.
            loss_avg.add(loss)
            loss.backward()
            clip_grad_value_(model.parameters(), opt['grad_clip'])
            optimizer.step()
            # train_loss = loss.item()
            torch.cuda.synchronize()
            iteration += 1

            # if not sc_flag:
            #     print("iter %d (epoch %d), train_loss = %.6f" %
            #           (iteration, epoch, train_loss))
            # else:
            #     print("iter %d (epoch %d), avg_reward = %.6f" %
            #           (iteration, epoch, np.mean(reward[:, 0])))

        print("[epoch %d]->train_loss = %.6f" % (epoch, loss_avg.val()))
        # NOTE(review): tag contains typo 'epcho' and no global_step is
        # passed, so tensorboard cannot place points on the epoch axis —
        # fixing either changes the logged output, so only flagged here.
        writer.add_scalar('scalar/train_loss_epcho', loss_avg.val())

        if epoch % opt["save_checkpoint_every"] == 0:
            # Evaluate, then restore train mode (test() presumably calls
            # model.eval() — confirm).
            test(model, crit, test_dataset, test_dataset.get_vocab(),
                 opt_test, writer)
            model.train()
            model_path = os.path.join(opt["checkpoint_path"],
                                      'model_%d.pth' % (epoch))
            model_info_path = os.path.join(opt["checkpoint_path"],
                                           'model_score.txt')
            torch.save(model.state_dict(), model_path)
            print("model saved to %s" % (model_path))
            with open(model_info_path, 'a') as f:
                f.write("model_%d, loss: %.6f\n" % (epoch, loss_avg.val()))

        # Reset the running average so each epoch's reported loss is fresh.
        loss_avg.reset()