class Logger(object):
    def __init__(self, opt):
        self.exp_name = opt['name']
        self.use_tb_logger = opt['use_tb_logger']
        self.opt = opt['logger']
        self.log_dir = opt['path']['log']
        # loss log file
        self.loss_log_path = os.path.join(self.log_dir, 'loss_log.txt')
        with open(self.loss_log_path, 'a') as log_file:
            log_file.write('=============== Time: ' + get_timestamp() + ' =============\n')
            log_file.write('================ Training Losses ================\n')
        # val results log file
        self.val_log_path = os.path.join(self.log_dir, 'val_log.txt')
        with open(self.val_log_path, 'a') as log_file:
            log_file.write('================ Time: ' + get_timestamp() + ' ===============\n')
            log_file.write('================ Validation Results ================\n')
        if self.use_tb_logger and 'debug' not in self.exp_name:
            from tensorboard_logger import Logger as TensorboardLogger
            self.tb_logger = TensorboardLogger('../tb_logger/' + self.exp_name)

    def print_format_results(self, mode, rlt):
        epoch = rlt.pop('epoch')
        iters = rlt.pop('iters')
        time = rlt.pop('time')
        model = rlt.pop('model')
        if 'lr' in rlt:
            lr = rlt.pop('lr')
            message = '<epoch:{:3d}, iter:{:8,d}, time:{:.2f}, lr:{:.1e}> '.format(
                epoch, iters, time, lr)
        else:
            message = '<epoch:{:3d}, iter:{:8,d}, time:{:.2f}> '.format(
                epoch, iters, time)
        for label, value in rlt.items():
            if mode == 'train':
                message += '{:s}: {:.2e} '.format(label, value)
            elif mode == 'val':
                message += '{:s}: {:.4e} '.format(label, value)
            # tensorboard logger
            if self.use_tb_logger and 'debug' not in self.exp_name:
                self.tb_logger.log_value(label, value, iters)
        # print in console
        print(message)
        # write in log file
        if mode == 'train':
            with open(self.loss_log_path, 'a') as log_file:
                log_file.write(message + '\n')
        elif mode == 'val':
            with open(self.val_log_path, 'a') as log_file:
                log_file.write(message + '\n')

    def log_message(self, rlt):
        iters = rlt.pop('iters')
        for label, value in rlt.items():
            self.tb_logger.log_value(label, value, iters)
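# A minimal usage sketch for the Logger above. The option values are
# illustrative (not from the original project); note that the log directory
# must already exist, since __init__ opens files without creating it, and a
# 'debug' substring in the experiment name disables the tensorboard logger.
opt = {
    'name': 'sr_experiment',
    'use_tb_logger': True,
    'logger': {},
    'path': {'log': './experiments/sr_experiment'},
}
logger = Logger(opt)
logger.print_format_results('train', {
    'epoch': 1, 'iters': 100, 'time': 0.25, 'model': 'sr',
    'lr': 1e-4, 'loss_pixel': 3.2e-3,
})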
def test_smoke_logger(tmpdir):
    logger = Logger(str(tmpdir), flush_secs=0.1)
    for step in range(10):
        logger.log_value('v1', step * 1.5, step)
        logger.log_value('v2', step**1.5 - 2)
    time.sleep(0.5)
    tf_log, = tmpdir.listdir()
    assert tf_log.basename.startswith('events.out.tfevents.')
def test_dummy():
    logger = Logger(None, is_dummy=True)
    for step in range(3):
        logger.log_value('A v/1', step, step)
        logger.log_value('A v/2', step * 2, step)
    assert dict(logger.dummy_log) == {
        'A_v/1': [(0, 0), (1, 1), (2, 2)],
        'A_v/2': [(0, 0), (1, 2), (2, 4)],
    }
def test_unique():
    logger = Logger(None, is_dummy=True)
    for step in range(1, 3):
        # names that normalize to the same valid name
        logger.log_value('A v/1', step, step)
        logger.log_value('A\tv/1', step * 2, step)
        logger.log_value('A v/1', step * 3, step)
    assert dict(logger.dummy_log) == {
        'A_v/1': [(1, 1), (2, 2)],
        'A_v/1/1': [(1, 2), (2, 4)],
        'A_v/1/2': [(1, 3), (2, 6)],
    }
def test_serialization(tmpdir):
    logger = Logger(str(tmpdir), flush_secs=0.1, dummy_time=256.5)
    logger.log_value('v/1', 1.5, 1)
    logger.log_value('v/22', 16.0, 2)
    time.sleep(0.5)
    tf_log, = tmpdir.listdir()
    assert tf_log.read_binary() == (
        # step = 0, initial record
        b'\x18\x00\x00\x00\x00\x00\x00\x00\xa3\x7fK"\t\x00\x00\x00\x00\x00\x08p@\x1a\rbrain.Event:2\xbc\x98!+'
        # v/1
        b'\x19\x00\x00\x00\x00\x00\x00\x00\x8b\xf1\x08(\t\x00\x00\x00\x00\x00\x08p@\x10\x01*\x0c\n\n\n\x03v/1\x15\x00\x00\xc0?,\xec\xc0\x87'
        # v/22
        b'\x1a\x00\x00\x00\x00\x00\x00\x00\x12\x9b\xd8-\t\x00\x00\x00\x00\x00\x08p@\x10\x02*\r\n\x0b\n\x04v/22\x15\x00\x00\x80A\x8f\xa3\xb6\x88'
    )
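# A note on the expected bytes above (my reading of the TFRecord framing,
# not something the test itself asserts): each record is an 8-byte
# little-endian payload length, a 4-byte masked CRC of that length, the
# serialized tf.Event protobuf, and a 4-byte masked CRC of the payload.
# For example, the first record starts with b'\x18' = 24, the length of the
# initial 'brain.Event:2' event payload (1 tag byte + 8-byte wall time
# 256.5 + 2 tag/length bytes + the 13-byte string).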
class Visualizer():
    def __init__(self, log_dir='runs/', **kwargs):
        self.tenbd = Logger(log_dir, flush_secs=10)
        self.index = {}
        self.log_text = ''

    def plot(self, name, y):
        x = self.index.get(name, 0)
        self.tenbd.log_value(name, y, x)
        self.index[name] = x + 1

    def plotMany(self, data):
        for k, v in data.items():  # dict.iteritems() is Python 2 only
            self.plot(k, v)
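# A minimal usage sketch for the Visualizer above (assumes tensorboard_logger
# is installed and its Logger is in scope; 'runs/demo' is an arbitrary path).
# plot() keeps a per-name step counter, so callers never pass a step.
vis = Visualizer(log_dir='runs/demo')
for loss in (0.9, 0.5, 0.23):
    vis.plot('loss', loss)                   # logged at steps 0, 1, 2
vis.plotMany({'acc': 0.8, 'lr': 1e-3})       # one point per metric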
class Visualizer():
    '''
    Wraps visdom and tensorboard_logger to make logging losses more convenient.
    '''

    def __init__(self, env='default', log_dir='runs/BiGRU', **kwargs):
        # self.vis = visdom.Visdom(env=env, **kwargs)
        self.tenbd = Logger(log_dir, flush_secs=2)
        # x-coordinate (step) kept per series, e.g. {'img': 2, 'loss': 12}
        self.index = {}
        # accumulated log text
        self.log_text = ''

    # def reinit(self, env='default', **kwargs):
    #     '''
    #     Reconfigure visdom.
    #     '''
    #     self.vis = visdom.Visdom(env=env, **kwargs)
    #     return vis

    def plot(self, name, y):
        '''
        self.plot('loss', 0.23)
        '''
        x = self.index.get(name, 0)
        # self.vis.line(Y=np.array([y]),
        #               X=np.array([x]),
        #               win=name,
        #               opts=dict(title=name),
        #               update=None if x == 0 else 'append')
        self.tenbd.log_value(name, y, x)
        self.index[name] = x + 1

    def plotMany(self, data):
        '''
        Plot several series at once.
        '''
        for k, v in data.items():  # dict.iteritems() is Python 2 only
            self.plot(k, v)

    def log(self, info, win='log_text'):
        '''
class Logger:
    """
    Deals with writing tensorboard summaries,
    and logs metric history to a pickle file.
    """

    def __init__(self, outdir):
        self.outdir = outdir
        self.tf_logger = TFLogger(os.path.join(outdir, 'run'), flush_secs=2)
        self.metric_history: Dict = defaultdict(list)

    def log_metrics(self, phase, metrics, global_step):
        """
        Logs scalar values as tf summaries. Skips true_mean: it stays
        constant, so it is not useful as a graph.
        """
        for name, value in metrics.items():
            if name != "true_mean":
                self.tf_logger.log_value(f"{phase} {name}", value, global_step)
        # save a plain pickle object for easy matplotlib plots of
        # performance over epochs
        self.metric_history[phase].append(metrics)
        with open(os.path.join(self.outdir, "metric_history.pkl"), "wb") as metric_file:
            pickle.dump(self.metric_history, metric_file)
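# A minimal usage sketch for the metrics Logger above (assumes TFLogger is
# tensorboard_logger's Logger imported under that name, and that the output
# directory already exists; the metric names are illustrative):
logger = Logger('output/run1')
logger.log_metrics('train', {'loss': 0.42, 'acc': 0.91, 'true_mean': 0.5},
                   global_step=10)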
# -*- coding: utf-8 -*-
# @TIME        : 2021/3/26 12:37
# @AUTHOR      : Xu Bai
# @FILE        : 5-3-1.TensorBoard.py
# @DESCRIPTION :
from tensorboard_logger import Logger

# view with: tensorboard --logdir experimient_cnn
# Build the Logger object; logdir is the path for the log files,
# flush_secs is the flush/sync interval in seconds.
logger = Logger(logdir='experimient_cnn', flush_secs=2)
for ii in range(100):
    logger.log_value('loss', 10 - ii * .5, step=ii)
    logger.log_value('accuracy', ii * .5 / 10)
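# For reference, tensorboard_logger also exposes a module-level API backed by
# a single default logger per process (a sketch based on the library's
# README; treat the exact signatures as an assumption):
from tensorboard_logger import configure, log_value

configure('experimient_cnn', flush_secs=2)   # may only be called once
for ii in range(100):
    log_value('loss', 10 - ii * .5, step=ii)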
def train(args):
    # model
    model = getattr(models, config.model_name)()
    if args.ckpt and not args.resume:
        state = torch.load(args.ckpt, map_location='cpu')
        model.load_state_dict(state['state_dict'])
        print('train with pretrained weight val_f1', state['f1'])
    model = model.to(device)
    # data
    train_dataset = ECGDataset(data_path=config.train_data, train=True)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  num_workers=6)
    val_dataset = ECGDataset(data_path=config.train_data, train=False)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=config.batch_size,
                                num_workers=4)
    print("train_datasize", len(train_dataset), "val_datasize", len(val_dataset))
    # optimizer and loss
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    w = torch.tensor(train_dataset.wc, dtype=torch.float).to(device)
    criterion = utils.WeightedMultilabel(w)
    # directory for saving models
    model_save_dir = '%s/%s_%s' % (config.ckpt, config.model_name,
                                   time.strftime("%Y%m%d%H%M"))
    if args.ex:
        model_save_dir += args.ex
    best_f1 = -1
    lr = config.lr
    start_epoch = 1
    stage = 1
    # resume training from the last checkpoint
    if args.resume:
        if os.path.exists(args.ckpt):  # directory holding the weights
            model_save_dir = args.ckpt
            current_w = torch.load(os.path.join(args.ckpt, config.current_w))
            best_w = torch.load(os.path.join(model_save_dir, config.best_w))
            best_f1 = best_w['f1']  # the checkpoint stores the F1 under 'f1'
            start_epoch = current_w['epoch'] + 1
            lr = current_w['lr']
            stage = current_w['stage']
            model.load_state_dict(current_w['state_dict'])
            # if the checkpoint happens to fall on a stage boundary
            if start_epoch - 1 in config.stage_epoch:
                stage += 1
                lr /= config.lr_decay
                utils.adjust_learning_rate(optimizer, lr)
                model.load_state_dict(best_w['state_dict'])
            print("=> loaded checkpoint (epoch {})".format(start_epoch - 1))
    logger = Logger(logdir=model_save_dir, flush_secs=2)
    # =========> start training <=========
    for epoch in range(start_epoch, config.max_epoch + 1):
        since = time.time()
        train_loss, train_f1 = train_epoch(model, optimizer, criterion,
                                           train_dataloader, show_interval=100)
        val_loss, val_f1 = val_epoch(model, criterion, val_dataloader)
        print('#epoch:%02d stage:%d train_loss:%.3e train_f1:%.3f val_loss:%0.3e val_f1:%.3f time:%s\n'
              % (epoch, stage, train_loss, train_f1, val_loss, val_f1,
                 utils.print_time_cost(since)))
        logger.log_value('train_loss', train_loss, step=epoch)
        logger.log_value('train_f1', train_f1, step=epoch)
        logger.log_value('val_loss', val_loss, step=epoch)
        logger.log_value('val_f1', val_f1, step=epoch)
        state = {
            "state_dict": model.state_dict(),
            "epoch": epoch,
            "loss": val_loss,
            'f1': val_f1,
            'lr': lr,
            'stage': stage
        }
        save_ckpt(state, best_f1 < val_f1, model_save_dir)
        best_f1 = max(best_f1, val_f1)
        if epoch in config.stage_epoch:
            stage += 1
            lr /= config.lr_decay
            best_w = os.path.join(model_save_dir, config.best_w)
            model.load_state_dict(torch.load(best_w)['state_dict'])
            print("*" * 10, "step into stage%02d lr %.3e" % (stage, lr))
            utils.adjust_learning_rate(optimizer, lr)
def main():
    DIR = args.DIR
    embedding_file = args.embedding_dir

    best_network_file = "./model/network_model_pretrain.best.top"
    print("Read model from", best_network_file, file=sys.stderr)
    best_network_model = torch.load(best_network_file)

    embedding_matrix = numpy.load(embedding_file)
    "Building torch model"
    worker = network.Network(nnargs["pair_feature_dimention"],
                             nnargs["mention_feature_dimention"],
                             nnargs["word_embedding_dimention"],
                             nnargs["span_dimention"], 1000,
                             nnargs["embedding_size"],
                             nnargs["embedding_dimention"],
                             embedding_matrix).cuda()
    net_copy(worker, best_network_model)

    best_network_file = "./model/network_model_pretrain.best.top"
    print("Read model from", best_network_file, file=sys.stderr)
    best_network_model = torch.load(best_network_file)

    manager = network.Network(nnargs["pair_feature_dimention"],
                              nnargs["mention_feature_dimention"],
                              nnargs["word_embedding_dimention"],
                              nnargs["span_dimention"], 1000,
                              nnargs["embedding_size"],
                              nnargs["embedding_dimention"],
                              embedding_matrix).cuda()
    net_copy(manager, best_network_model)

    reduced = ""
    if args.reduced == 1:
        reduced = "_reduced"

    print("prepare data for train ...", file=sys.stderr)
    #train_docs_iter = DataReader.DataGnerater("train"+reduced)
    train_docs_iter = DataReader.DataGnerater("dev" + reduced)
    print("prepare data for dev and test ...", file=sys.stderr)
    dev_docs_iter = DataReader.DataGnerater("dev" + reduced)
    test_docs_iter = DataReader.DataGnerater("test" + reduced)

    print("Performance after pretraining...")
    print("DEV")
    metric = performance.performance(dev_docs_iter, worker, manager)
    print("Average:", metric["average"])
    print("TEST")
    metric = performance.performance(test_docs_iter, worker, manager)
    print("Average:", metric["average"])
    print("***")
    print()
    sys.stdout.flush()

    lr = nnargs["lr"]
    top_k = nnargs["top_k"]

    model_save_dir = "./model/reinforce/"
    utils.mkdir(model_save_dir)

    score_softmax = nn.Softmax()

    optimizer_manager = optim.RMSprop(manager.parameters(), lr=lr, eps=1e-6)
    optimizer_worker = optim.RMSprop(worker.parameters(), lr=lr, eps=1e-6)

    MAX_AVE = 2048

    for echo in range(nnargs["epoch"]):

        start_time = timeit.default_timer()
        print("Pretrain Epoch:", echo)

        reward_log = Logger(Tensorboard + args.tb + "/acl2018/%d/reward/" % echo,
                            flush_secs=3)
        entropy_log_manager = Logger(Tensorboard + args.tb + "/acl2018/%d/entropy/manager" % echo,
                                     flush_secs=3)
        entropy_log_worker = Logger(Tensorboard + args.tb + "/acl2018/%d/entropy/worker" % echo,
                                    flush_secs=3)

        #train_docs = utils.load_pickle(args.DOCUMENT + 'train_docs.pkl')
        train_docs = utils.load_pickle(args.DOCUMENT + 'dev_docs.pkl')
        docs_by_id = {doc.did: doc for doc in train_docs}

        ave_reward = []
        ave_manager_entropy = []
        ave_worker_entropy = []

        print("Link docs ...", file=sys.stderr)

        tmp_data = []
        cluster_info = {0: [0]}
        cluster_list = [0]
        current_new_cluster = 1
        predict_action_embedding = []
        choose_action = []
        mid = 1
        step = 0
        statistic = {
            "worker_hits": 0,
            "manager_hits": 0,
            "total": 0,
            "manager_predict_last": 0,
            "worker_predict_last": 0
        }

        for data in train_docs_iter.rl_case_generater(shuffle=True):
            rl = data["rl"]

            scores_manager, representations_manager = get_score_representations(
                manager, data)

            for s, e in zip(rl["starts"], rl["ends"]):
                action_embeddings = representations_manager[s:e]
                probs = F.softmax(torch.transpose(scores_manager[s:e], 0, 1))
                m = Categorical(probs)
                this_action = m.sample()
                index = this_action.data.cpu().numpy()[0]
                if index == (e - s - 1):
                    should_cluster = current_new_cluster
                    cluster_info[should_cluster] = []
                    current_new_cluster += 1
                else:
                    should_cluster = cluster_list[index]
                choose_action.append(index)
                cluster_info[should_cluster].append(mid)
                cluster_list.append(should_cluster)
                mid += 1

                cluster_indexs = torch.cuda.LongTensor(cluster_info[should_cluster])
                action_embedding_predict = torch.mean(
                    action_embeddings[cluster_indexs], 0, keepdim=True)
                predict_action_embedding.append(action_embedding_predict)

            tmp_data.append(data)

            if rl["end"] == True:
                inside_index = 0
                manager_path = []
                worker_path = []
                doc = docs_by_id[rl["did"]]
                for data in tmp_data:
                    rl = data["rl"]
                    pair_target = data["pair_target"]
                    anaphoricity_target = 1 - data["anaphoricity_target"]
                    target = numpy.concatenate(
                        (pair_target, anaphoricity_target))[rl["reindex"]]

                    scores_worker, representations_worker = get_score_representations(
                        worker, data)

                    for s, e in zip(rl["starts"], rl["ends"]):
                        action_embeddings = representations_worker[s:e]
                        score = score_softmax(
                            torch.transpose(scores_worker[s:e], 0, 1)).data.cpu().numpy()[0]
                        action_embedding_choose = predict_action_embedding[inside_index]
                        similarities = torch.sum(
                            torch.abs(action_embeddings - action_embedding_choose), 1)
                        similarities = similarities.data.cpu().numpy()

                        action_probabilities = []
                        action_list = []
                        action_candidates = heapq.nlargest(top_k, -similarities)
                        for action in action_candidates:
                            action_index = numpy.argwhere(similarities == -action)[0][0]
                            action_probabilities.append(score[action_index])
                            action_list.append(action_index)

                        manager_action = choose_action[inside_index]
                        if manager_action not in action_list:
                            action_list.append(manager_action)
                            action_probabilities.append(score[manager_action])

                        this_target = target[s:e]

                        manager_action = choose_action[inside_index]
                        sample_action = utils.sample_action(
                            numpy.array(action_probabilities))
                        worker_action = action_list[sample_action]

                        if this_target[worker_action] == 1:
                            statistic["worker_hits"] += 1
                        if this_target[manager_action] == 1:
                            statistic["manager_hits"] += 1
                        if worker_action == (e - s - 1):
                            statistic["worker_predict_last"] += 1
                        if manager_action == (e - s - 1):
                            statistic["manager_predict_last"] += 1
                        statistic["total"] += 1

                        inside_index += 1

                        #link = manager_action
                        link = worker_action
                        m1, m2 = rl['ids'][s + link]
                        doc.link(m1, m2)

                        manager_path.append(manager_action)
                        worker_path.append(worker_action)

                reward = doc.get_f1()

                for data in tmp_data:
                    for s, e in zip(rl["starts"], rl["ends"]):
                        ids = rl['ids'][s:e]
                        ana = ids[0, 1]
                        old_ant = doc.ana_to_ant[ana]
                        doc.unlink(ana)
                        costs = rl['costs'][s:e]
                        for ant_ind in range(e - s):
                            costs[ant_ind] = doc.link(ids[ant_ind, 0],
                                                      ana,
                                                      hypothetical=True,
                                                      beta=1)
                        doc.link(old_ant, ana)
                        #costs = autograd.Variable(torch.from_numpy(costs).type(torch.cuda.FloatTensor))

                inside_index = 0
                worker_entropy = 0.0
                for data in tmp_data:
                    new_step = step
                    # worker
                    scores_worker, representations_worker = get_score_representations(
                        worker, data, dropout=nnargs["dropout_rate"])
                    optimizer_worker.zero_grad()
                    worker_loss = None
                    for s, e in zip(rl["starts"], rl["ends"]):
                        costs = rl['costs'][s:e]
                        costs = autograd.Variable(
                            torch.from_numpy(costs).type(torch.cuda.FloatTensor))
                        action = worker_path[inside_index]
                        score = F.softmax(torch.transpose(scores_worker[s:e], 0, 1))
                        if not score.size()[1] == costs.size()[0]:
                            continue
                        score = torch.squeeze(score)
                        baseline = torch.sum(costs * score)
                        this_cost = torch.log(score[action]) * -1.0 * (reward - baseline)
                        if worker_loss is None:
                            worker_loss = this_cost
                        else:
                            worker_loss += this_cost
                        worker_entropy += torch.sum(
                            score * torch.log(score + 1e-7)).data.cpu().numpy()[0]
                        #+ 0.001*torch.sum(score*torch.log(score+1e-7))
                        inside_index += 1
                    worker_loss.backward()
                    torch.nn.utils.clip_grad_norm(worker.parameters(), nnargs["clip"])
                    optimizer_worker.step()

                    ave_worker_entropy.append(worker_entropy)
                    if len(ave_worker_entropy) >= MAX_AVE:
                        ave_worker_entropy = ave_worker_entropy[1:]
                    entropy_log_worker.log_value(
                        'entropy',
                        float(sum(ave_worker_entropy)) / float(len(ave_worker_entropy)),
                        new_step)
                    new_step += 1

                inside_index = 0
                manager_entropy = 0.0
                for data in tmp_data:
                    new_step = step
                    rl = data["rl"]

                    ave_reward.append(reward)
                    if len(ave_reward) >= MAX_AVE:
                        ave_reward = ave_reward[1:]
                    reward_log.log_value(
                        'reward',
                        float(sum(ave_reward)) / float(len(ave_reward)),
                        new_step)

                    scores_manager, representations_manager = get_score_representations(
                        manager, data, dropout=nnargs["dropout_rate"])
                    optimizer_manager.zero_grad()
                    manager_loss = None
                    for s, e in zip(rl["starts"], rl["ends"]):
                        score = F.softmax(torch.transpose(scores_manager[s:e], 0, 1))
                        costs = rl['costs'][s:e]
                        costs = autograd.Variable(
                            torch.from_numpy(costs).type(torch.cuda.FloatTensor))
                        if not score.size()[1] == costs.size()[0]:
                            continue
                        action = manager_path[inside_index]
                        score = torch.squeeze(score)
                        baseline = torch.sum(costs * score)
                        this_cost = torch.log(score[action]) * -1.0 * (reward - baseline)
                        # + 0.001*torch.sum(score*torch.log(score+1e-7))
                        #this_cost = torch.sum(score*costs) + 0.001*torch.sum(score*torch.log(score+1e-7))
                        if manager_loss is None:
                            manager_loss = this_cost
                        else:
                            manager_loss += this_cost
                        manager_entropy += torch.sum(
                            score * torch.log(score + 1e-7)).data.cpu().numpy()[0]
                        inside_index += 1
                    manager_loss.backward()
                    torch.nn.utils.clip_grad_norm(manager.parameters(), nnargs["clip"])
                    optimizer_manager.step()

                    ave_manager_entropy.append(manager_entropy)
                    if len(ave_manager_entropy) >= MAX_AVE:
                        ave_manager_entropy = ave_manager_entropy[1:]
                    entropy_log_manager.log_value(
                        'entropy',
                        float(sum(ave_manager_entropy)) / float(len(ave_manager_entropy)),
                        new_step)
                    new_step += 1

                step = new_step

                tmp_data = []
                cluster_info = {0: [0]}
                cluster_list = [0]
                current_new_cluster = 1
                mid = 1
                predict_action_embedding = []
                choose_action = []

        end_time = timeit.default_timer()
        print("TRAINING Use %.3f seconds" % (end_time - start_time), file=sys.stderr)
        print("save model ...", file=sys.stderr)

        #print("Top k", top_k)
        print("Worker Hits", statistic["worker_hits"],
              "Manager Hits", statistic["manager_hits"],
              "Total", statistic["total"])
        print("Worker predict last", statistic["worker_predict_last"],
              "Manager predict last", statistic["manager_predict_last"])

        #torch.save(network_model, model_save_dir+"network_model_rl_worker.%d"%echo)
        #torch.save(ana_network, model_save_dir+"network_model_rl_manager.%d"%echo)

        print("DEV")
        metric = performance.performance(dev_docs_iter, worker, manager)
        print("Average:", metric["average"])
        print("DEV manager")
        metric = performance_manager.performance(dev_docs_iter, worker, manager)
        print("Average:", metric["average"])
        print("TEST")
        metric = performance.performance(test_docs_iter, worker, manager)
        print("Average:", metric["average"])
        print()

        sys.stdout.flush()
if not args.predict:
    if args.set == 'train':
        train_batch_idx = 0
        for epoch in range(args.epochs):
            for batch_idx, sample in enumerate(DataLoaderDict['train']):
                model.zero_grad()
                model.hidden = model.init_hidden(args.batch_size)
                x = sample['data'].transpose(0, 1)
                y = sample['label'].transpose(0, 1)
                if use_gpu:
                    x, y = x.cuda(), y.cuda()
                pred = model(x)
                loss = loss_function(pred.transpose(1, 2), y)
                loss.backward()
                optimizer.step()
                logger.log_value('train_loss', loss, train_batch_idx)
                # scheduler.step()
                pred = pred.argmax(2)
                correct = y.eq(pred.long()).sum()
                # Indexing a tensor returns a tensor, so use tolist()
                # to get a plain int.
                acc = 100 * correct.tolist() / pred.nelement()
                logger.log_value('train_accuracy', acc, train_batch_idx)
                print('Train:[{}|{}]\tloss: {:.4f}\taccuracy: {:.4f}'.format(
                    epoch, batch_idx, loss, acc))
                if train_batch_idx % args.test_every == 0:
                    '''
                    Training will periodically test on the val set, or on
                    the test set if no validation set exists.
                    '''
                    if len(DataLoaderDict['val']) == 1:
class Logger(object):
    def __init__(self, opt, tb_logger_suffix=''):
        self.exp_name = opt['name']
        self.use_tb_logger = opt['use_tb_logger']
        self.opt = opt['logger']
        self.log_dir = opt['path']['log']
        if not os.path.isdir(self.log_dir):
            os.mkdir(self.log_dir)
        # loss log file
        self.loss_log_path = os.path.join(self.log_dir, 'loss_log.txt')
        with open(self.loss_log_path, 'a') as log_file:
            log_file.write('=============== Time: ' + get_timestamp() + ' =============\n')
            log_file.write('================ Training Losses ================\n')
        # val results log file
        self.val_log_path = os.path.join(self.log_dir, 'val_log.txt')
        with open(self.val_log_path, 'a') as log_file:
            log_file.write('================ Time: ' + get_timestamp() + ' ===============\n')
            log_file.write('================ Validation Results ================\n')
        if self.use_tb_logger:  # and 'debug' not in self.exp_name:
            from tensorboard_logger import Logger as TensorboardLogger
            logger_dir_num = 0
            tb_logger_dir = self.log_dir.replace('experiments', 'logs')
            if not os.path.isdir(tb_logger_dir):
                os.mkdir(tb_logger_dir)
            existing_dirs = sorted(
                [dir.split('_')[0] for dir in os.listdir(tb_logger_dir)
                 if os.path.isdir(os.path.join(tb_logger_dir, dir))],
                key=lambda x: int(x.split('_')[0]))
            if len(existing_dirs) > 0:
                logger_dir_num = int(existing_dirs[-1]) + 1
            self.tb_logger = TensorboardLogger(
                os.path.join(tb_logger_dir, str(logger_dir_num) + tb_logger_suffix))

    def print_format_results(self, mode, rlt, dont_print=False, keys_ignore_list=[]):
        epoch = rlt.pop('epoch')
        iters = rlt.pop('iters')
        time = rlt.pop('time')
        model = rlt.pop('model')
        if 'lr' in rlt:
            lr = rlt.pop('lr')
            message = '<epoch:{:3d}, iter:{:8,d}, time:{:.2f}, lr:{:.1e}> '.format(
                epoch, iters, time, lr)
        else:
            message = '<epoch:{:3d}, iter:{:8,d}, time:{:.2f}> '.format(
                epoch, iters, time)
        for label, value in rlt.items():
            if label in keys_ignore_list or '_baseline' in label:
                continue
            if mode == 'train':
                message += '{:s}: {:.4e} '.format(label, value)
            elif mode == 'val':
                message += '{:s}: {:.4e} '.format(label, value)
            # tensorboard logger
            if self.use_tb_logger:  # and 'debug' not in self.exp_name:
                self.tb_logger.log_value(label, value, iters)
        # print in console
        if not dont_print:
            print(message)
        # write in log file
        if mode == 'train':
            with open(self.loss_log_path, 'a') as log_file:
                log_file.write(message + '\n')
        elif mode == 'val':
            with open(self.val_log_path, 'a') as log_file:
                log_file.write(message + '\n')
def train(args):
    # model
    print(args.model_name)
    config.train_data = config.train_data + str(args.fold) + '.pth'
    # config.train_data = config.train_data + 'trainsfer_' + str(args.fold) + '.pth'
    config.model_name = args.model_name
    model = getattr(models, config.model_name)()
    model = model.to(device)
    # data
    if args.model_kind == 1:
        import dataset2
        train_dataset = dataset2.ECGDataset(data_path=config.train_data,
                                            train=True, transform=True)
        train_dataloader = DataLoader(train_dataset,
                                      collate_fn=my_collate_fn,
                                      batch_size=config.batch_size,
                                      shuffle=True, num_workers=6)
    else:
        train_dataset = ECGDataset(data_path=config.train_data,
                                   train=True, transform=True)
        train_dataloader = DataLoader(train_dataset,
                                      #collate_fn=my_collate_fn,
                                      batch_size=config.batch_size,
                                      shuffle=True, num_workers=6)
    val_dataset = ECGDataset(data_path=config.train_data, train=False)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=config.batch_size,
                                num_workers=6)
    print("train_datasize", len(train_dataset), "val_datasize", len(val_dataset))
    # optimizer and loss
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    # optimizer = optim.RMSprop(model.parameters(), lr=config.lr)
    w = torch.tensor(train_dataset.wc, dtype=torch.float).to(device)
    if args.model_kind == 1:
        criterion = utils.WeightedMultilabel(w)
        print(1)
    else:
        criterion = utils2.WeightedMultilabel(w)
    # criterion = utils.My_loss(w)
    # directory for saving models
    model_save_dir = '%s/%s' % (config.ckpt + str(args.model_kind),
                                config.model_name + '_' + str(args.fold))
    args.ckpt = model_save_dir
    # if args.ex: model_save_dir += args.ex
    best_f1 = -1
    lr = config.lr
    start_epoch = 1
    stage = 1
    # resume training from the last checkpoint
    if args.resume:
        if os.path.exists(args.ckpt):  # directory holding the weights
            # model_save_dir = args.ckpt
            current_w = torch.load(os.path.join(args.ckpt, config.current_w))
            best_w = torch.load(os.path.join(model_save_dir, config.best_w))
            best_f1 = best_w['best_f']
            start_epoch = current_w['epoch'] + 1
            lr = current_w['lr']
            stage = current_w['stage']
            model.load_state_dict(current_w['state_dict'])
            # if the checkpoint happens to fall on a stage boundary
            if start_epoch - 1 in config.stage_epoch:
                stage += 1
                lr /= config.lr_decay
                utils.adjust_learning_rate(optimizer, lr)
                model.load_state_dict(best_w['state_dict'])
            print("=> loaded checkpoint (epoch {})".format(start_epoch - 1))
    else:
        path = '%s/%s' % (config.ckpt, config.model_name + '_transfer')
        print(path)
        current_w = torch.load(os.path.join(path, config.best_w))
        model.load_state_dict(current_w['state_dict'])
    logger = Logger(logdir=model_save_dir, flush_secs=2)
    # =========> start training <=========
    val_loss = 10
    val_f1 = -1
    state = {}
    for epoch in range(start_epoch, config.max_epoch + 1):
        since = time.time()
        train_loss, train_f1, best_f1 = train_epoch(model, optimizer, criterion,
                                                    train_dataloader, epoch, lr,
                                                    best_f1, val_dataloader,
                                                    model_save_dir, state, 0)
        # if epoch % 2 == 1:
        val_loss, val_f1, _, _ = val_epoch(model, criterion, val_dataloader)
        print('#epoch:%02d stage:%d train_loss:%.3e train_f1:%.3f val_loss:%0.3e val_f1:%.3f time:%s'
              % (epoch, stage, train_loss, train_f1, val_loss, val_f1,
                 utils.print_time_cost(since)))
        logger.log_value('train_loss', train_loss, step=epoch)
        logger.log_value('train_f1', train_f1, step=epoch)
        logger.log_value('val_loss', val_loss, step=epoch)
        logger.log_value('val_f1', val_f1, step=epoch)
        state = {
            "state_dict": model.state_dict(),
            "epoch": epoch,
            "loss": val_loss,
            'f1': val_f1,
            'lr': lr,
            'stage': stage,
            "best_f": best_f1
        }
        if best_f1 < val_f1:
            save_ckpt(state, best_f1 < val_f1, model_save_dir)
            print('save best')
        else:
            save_ckpt(state, False, model_save_dir)
        best_f1 = max(best_f1, val_f1)
        if epoch in config.stage_epoch:
            stage += 1
            lr /= config.lr_decay
            # best_w = os.path.join(model_save_dir, config.best_w)
            # model.load_state_dict(torch.load(best_w)['state_dict'])
            print("*" * 10, "step into stage%02d lr %.3e" % (stage, lr))
            utils.adjust_learning_rate(optimizer, lr)
    # Dumping of state was done before epoch callback, so do that now (model is loaded)
    baseline.epoch_callback(model, epoch)
    print("Resuming after epoch {}".format(epoch))
    epoch_start = epoch + 1
    step = 0

# Evaluate on held-out set
val_dataset = TSP.make_dataset(filename=opts.val_dataset,
                               batch_size=opts.batch_size,
                               num_samples=opts.val_size,
                               neighbors=opts.neighbors,
                               knn_strat=opts.knn_strat,
                               supervised=True)
avg_reward, avg_opt_gap = validate(model, val_dataset, problem, opts)
tb_logger.log_value('val_ft/avg_reward', avg_reward, step)
tb_logger.log_value('val_ft/opt_gap', avg_opt_gap, step)

if opts.ft_strategy == "active":
    # Active search: finetune on the test set
    train_dataset = baseline.wrap_dataset(val_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=opts.batch_size,
                                  shuffle=True,
                                  num_workers=opts.num_workers)

elif opts.ft_strategy == "fixed":
    # Fixed finetuning: finetune on a fixed training set
    train_dataset = baseline.wrap_dataset(
        problem.make_dataset(min_size=opts.min_size, max_size=opts.max_size,
class PlotLogger:
    def __init__(self, path):
        self.plot_instance = Logger(path)

    def log_value(self, name, value, step):
        self.plot_instance.log_value(name, value, step)
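# A minimal usage sketch for the PlotLogger wrapper above ('runs/exp1' is an
# arbitrary example path; Logger is tensorboard_logger's Logger):
from tensorboard_logger import Logger

plot_logger = PlotLogger('runs/exp1')
for step, loss in enumerate([1.0, 0.6, 0.4]):
    plot_logger.log_value('loss', loss, step)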
G_BA.load_state_dict(torch.load(G_BA_file))

networks = [D_B, D_A, G_AB, G_BA]

optimizer_D_B = optim.Adam(D_B.parameters(), lr=lr_1)
optimizer_D_A = optim.Adam(D_A.parameters(), lr=lr_1)
optimizer_G_AB = optim.Adam(G_AB.parameters(), lr=lr_1)
optimizer_G_BA = optim.Adam(G_BA.parameters(), lr=lr_1)
optimizers = [optimizer_G_AB, optimizer_G_BA, optimizer_D_B, optimizer_D_A]

pool_A = ImagePool(pool_size=50)
pool_B = ImagePool(pool_size=50)
pools = [pool_A, pool_B]

for epoch in range(starting_epoch, nb_epochs + 1):
    losses = train(epoch, loader, networks, optimizers, pools,
                   max_steps=max_steps, verbose=False)
    train_logger.log_value('D_B loss', losses[0], epoch)
    train_logger.log_value('D_A loss', losses[1], epoch)
    train_logger.log_value('G_AB loss', losses[2], epoch)
    train_logger.log_value('G_BA loss', losses[3], epoch)
    total_loss = sum(losses)
    print("\nLoss at epoch n.{} : {}".format(epoch, total_loss))

    D_B_file = 'models/' + experiment_name + '/D_B_' + str(epoch) + '.pth'
    D_A_file = 'models/' + experiment_name + '/D_A_' + str(epoch) + '.pth'
    G_AB_file = 'models/' + experiment_name + '/G_AB_' + str(epoch) + '.pth'
    G_BA_file = 'models/' + experiment_name + '/G_BA_' + str(epoch) + '.pth'
    torch.save(D_B.state_dict(), D_B_file)
    torch.save(D_A.state_dict(), D_A_file)
    torch.save(G_AB.state_dict(), G_AB_file)
    torch.save(G_BA.state_dict(), G_BA_file)
class TrainNetwork:
    def __init__(self, batch_size=64, image_size=64, load_model=None,
                 iterations_start=0, seed=None, epochs=10000,
                 lr_decay_iter=250000, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=1e-3, weighting_factor=0.5, regression=False,
                 loss='MultiLabelSoftMarginLoss',
                 network='ColorfulImageColorization', convert_on_gpu=False,
                 do_not_log=False):
        # hyperparameters
        self.batch_size = batch_size
        self.image_size = image_size
        self.iterations = iterations_start
        self.seed = np.random.randint(0, 10000) if seed is None else seed
        self.epochs = epochs
        self.lr_decay_iter = lr_decay_iter
        self.lr0 = lr
        self.betas = betas
        self.eps = eps
        self.weight_decay = weight_decay
        self.weighting_factor = weighting_factor
        self.lr = self.calc_learning_rate()
        self.regression = regression
        self.loss = loss
        self.model_name = '{date:%Y_%m_%d__%H_%M_%S}_{net}_{loss}_{dataset}'.format(
            date=datetime.datetime.now(), net=network, loss=self.loss,
            dataset=dataset)
        torch.manual_seed(self.seed)
        self.convert_on_gpu = convert_on_gpu

        # tensorboard
        if not do_not_log:
            self.logger = Logger(logs_path + self.model_name)
            self.log_hyperparameter()

        # model, loss, optimizer
        assert network in ['DeepKoalarization', 'CheapConvNet',
                           'ColorfulImageColorization', 'DeepKoalarizationNorm']
        if self.regression:
            out_channels = 2
        else:
            out_channels = int((256 / grid_size)**2)
        if network == 'DeepKoalarization':
            self.model = DeepKoalarization(
                out_channels=out_channels,
                to_rgb=(self.loss == 'PerceptualLoss'))
        elif network == 'DeepKoalarizationNorm':
            self.model = DeepKoalarizationNorm(
                out_channels=out_channels,
                to_rgb=(self.loss == 'PerceptualLoss'))
        elif network == 'CheapConvNet':
            assert self.loss != 'PerceptualLoss'
            self.model = CheapConvNet(out_channels=out_channels)
        elif network == 'ColorfulImageColorization':
            self.model = ColorfulImageColorization(
                out_channels=out_channels,
                to_rgb=(self.loss == 'PerceptualLoss'))

        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr,
                                    betas=self.betas, eps=self.eps,
                                    weight_decay=self.weight_decay)

        assert loss in ['PerceptualLoss', 'MSELoss', 'MultiLabelSoftMarginLoss',
                        'BCEWithLogitsLoss', 'MultinomialCrossEntropyLoss']
        if loss == 'PerceptualLoss':
            self.loss_fn = PerceptualLoss()
        elif loss == 'MSELoss':
            assert self.regression
            self.loss_fn = nn.MSELoss()
        elif loss == 'MultiLabelSoftMarginLoss':
            assert not self.regression
            w = torch.load(images_path + 'classification_weights_{}_{}.pth'.format(
                grid_size, self.weighting_factor))
            self.loss_fn = nn.MultiLabelSoftMarginLoss(weight=w.view(-1, 1, 1))
        elif loss == 'BCEWithLogitsLoss':
            assert not self.regression
            w = torch.load(images_path + 'classification_weights_{}_{}.pth'.format(
                grid_size, self.weighting_factor))
            self.loss_fn = nn.BCEWithLogitsLoss(weight=w.view(-1, 1, 1))
        elif loss == 'MultinomialCrossEntropyLoss':
            assert not self.regression
            w = torch.load(images_path + 'classification_weights_{}_{}.pth'.format(
                grid_size, self.weighting_factor))
            self.loss_fn = MultinomialCrossEntropyLoss(weights=w)

        # load model
        if load_model is not None:
            # Load pre-learned AlexNet with changed number of output classes
            state_dict = torch.load(trained_models_path + load_model,
                                    map_location='cpu')
            self.model.load_state_dict(state_dict['model'])
            self.optimizer.load_state_dict(state_dict['optimizer'])
            self.adjust_learning_rate()

        # Use cuda if available
        self.cuda = torch.cuda.is_available()
        if self.cuda:
            self.model.cuda()
            self.loss_fn.cuda()
            if load_model is not None:
                for state in self.optimizer.state.values():
                    for k, v in state.items():
                        if torch.is_tensor(v):
                            state[k] = v.cuda()

        # Load dataset
        kwargs = {'num_workers': 8, 'pin_memory': True} if self.cuda else {}
        self.train_loader = torch.utils.data.DataLoader(
            ColorizationDataset(images_path, train=True,
                                size=(self.image_size, self.image_size),
                                target_rgb=(loss == 'PerceptualLoss'),
                                convert_to_categorical=(not self.regression),
                                do_not_convert=convert_on_gpu),
            batch_size=batch_size, shuffle=True, **kwargs)
        kwargs = {'num_workers': 1, 'pin_memory': True} if self.cuda else {}
        self.test_loader = torch.utils.data.DataLoader(
            ColorizationDataset(images_path, train=False,
                                size=(self.image_size, self.image_size),
                                target_rgb=(loss == 'PerceptualLoss'),
                                convert_to_categorical=(not self.regression),
                                do_not_convert=convert_on_gpu),
            batch_size=8, drop_last=True, shuffle=True, **kwargs)
        self.test_iterator = iter(self.test_loader)

    def calc_learning_rate(self):
        """
        Reduce the learning rate by a factor of 0.1 every lr_decay_iter
        iterations.
        :return: the decayed learning rate
        """
        lr = self.lr0 * (0.1**(self.iterations // self.lr_decay_iter))
        return lr

    def adjust_learning_rate(self):
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = self.lr

    def reduce_learning_rate(self):
        lr = self.calc_learning_rate()
        if abs(lr - self.lr) > 1e-7:
            self.lr = lr
            self.adjust_learning_rate()

    def train(self):
        """
        Train the model for self.epochs epochs, saving snapshots as
        .pth files along the way.
        :return: None
        """
        self.model.train()
        for epoch in range(1, self.epochs + 1):
            train_loss_epoche = 0
            batch_start_time = time.perf_counter()
            batch_idx = 0
            for batch_idx, (data, target) in enumerate(self.train_loader):
                self.reduce_learning_rate()
                train_loss_epoche += self.train_one_iter(data, target, epoch, batch_idx)
                self.iterations += 1
                print('Batch ' + str(batch_idx + 1) + ' took ' +
                      str(time.perf_counter() - batch_start_time) + ' seconds')
                batch_start_time = time.perf_counter()
            # Print information about current epoch
            train_loss_epoche /= (batch_idx + 1)
            print('Train Epoch: {} \tAverage loss: {:.6f}'.format(
                epoch, train_loss_epoche))

    def train_one_iter(self, data, target, epoch, batch_idx):
        if self.cuda:
            data = data.cuda()
            target = target.cuda()
        if self.convert_on_gpu:
            lab = pdc.rgb2lab(data.float() / 255)
            data, target = torch.split(lab, [1, 2], dim=1)
            if not self.regression:
                target = conversion_batch(target)
        # Optimize using backpropagation
        self.optimizer.zero_grad()
        output = self.model(data)
        loss = self.loss_fn(output, target)
        loss.backward()
        self.optimizer.step()
        # Print information about current step
        print('Train Epoch: {} [{}/{}]\tLoss: {:.6f}'.format(
            epoch, batch_idx + 1, len(self.train_loader), loss.item()))
        if self.iterations % 5 == 0:
            # log loss
            test_data, test_target = self.get_next_test_batch()
            self.model.eval()
            test_output = self.model(test_data)
            self.model.train()
            test_loss = self.loss_fn(test_output, test_target)
            self.log_scalars(self.iterations, loss, test_loss)
            if self.iterations % 50 == 0:
                # log images
                self.log_images(self.iterations, test_data, test_target,
                                test_output, data, target, output)
        if self.iterations % 1000 == 0 and self.iterations > 0:
            # Save snapshot
            model_name = self.model_name + '_iter{}'.format(self.iterations)
            torch.save({'model': self.model.state_dict(),
                        'optimizer': self.optimizer.state_dict()},
                       trained_models_path + '{}.pth'.format(model_name))
        if self.iterations % 600 == 0:
            # log parameter values and gradients
            self.log_values_gradients(self.iterations)
        return loss.item()

    def log_hyperparameter(self):
        info = {
            'batch_size': self.batch_size,
            'image_size': self.image_size,
            'seed': self.seed,
            'epochs': self.epochs,
            'learning_decay_iter': self.lr_decay_iter,
            'learning_rate_0': self.lr0,
            'betas[0]': self.betas[0],
            'betas[1]': self.betas[1],
            'eps_optimizer': self.eps,
            'weight_decay_optimizer': self.weight_decay,
            'weighting_factor': self.weighting_factor,
            'regression': self.regression,
            'convert_on_gpu': self.convert_on_gpu
        }
        for tag, value in info.items():
            self.logger.log_value(tag, value, 0)

    def log_scalars(self, step, train_loss, test_loss):
        # adapted from https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/04-utils/tensorboard/main.py
        # 1. Log scalar values (scalar summary)
        info = {
            'train_loss': train_loss.item(),
            'test_loss': test_loss.item(),
            'learning_rate': self.lr
        }
        for tag, value in info.items():
            self.logger.log_value(tag, value, step)

    def log_images(self, step, test_data, test_target, test_output,
                   train_data, train_target, train_output):
        # 3. Log test images (image summary)
        num_images = 1
        test_original = self.convert_to_images(test_data[:num_images],
                                               test_target[:num_images],
                                               is_target=True)
        test_colorized = self.convert_to_images(test_data[:num_images],
                                                test_output[:num_images],
                                                is_target=False)
        train_original = self.convert_to_images(train_data[:num_images],
                                                train_target[:num_images],
                                                is_target=True)
        train_colorized = self.convert_to_images(train_data[:num_images],
                                                 train_output[:num_images],
                                                 is_target=False)
        info = {
            'test colorized': test_colorized,
            'test original': test_original,
            'train colorized': train_colorized,
            'train original': train_original
        }
        for tag, images in info.items():
            self.logger.log_images(tag, images, step)

    def log_values_gradients(self, step):
        # adapted from https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/04-utils/tensorboard/main.py
        # 2. Log values and gradients of the parameters (histogram summary)
        for tag, value in self.model.named_parameters():
            if 'feature_extractor' not in tag:
                tag = tag.replace('.', '/')
                self.logger.log_histogram(tag, value.data.cpu().numpy(), step)
                self.logger.log_histogram(tag + '/grad',
                                          value.grad.data.cpu().numpy(), step)

    def convert_to_images(self, data, output_or_target, is_target, t=0.38):
        if self.loss == 'PerceptualLoss':
            return output_or_target.detach().cpu().permute(
                0, 2, 3, 1).numpy().astype(np.float64)
        else:
            return generate_images_numpy(data, output_or_target, is_target,
                                         regression=self.regression, t=t)

    def get_next_test_batch(self):
        try:
            data, target = next(self.test_iterator)
        except StopIteration:
            self.test_iterator = iter(self.test_loader)
            data, target = next(self.test_iterator)
        if self.cuda:
            data = data.cuda()
            target = target.cuda()
        if self.convert_on_gpu:
            lab = pdc.rgb2lab(data.float() / 255)
            data, target = torch.split(lab, [1, 2], dim=1)
            if not self.regression:
                target = conversion_batch(target)
        return data, target
def trainModel(model, trainData, validData, dataset, optim):
    logger = Logger(os.path.join(opt.save_path, 'tb'))
    iterations = 0
    print(model)
    model.train()

    # Define criterion of each GPU.
    criterion = NMTCriterion(dataset['dicts']['tgt'].size())

    start_time = time.time()
    for epoch in range(opt.start_epoch, opt.epochs + 1):
        print('')

        # (1) train for one epoch on the training set
        if opt.extra_shuffle and epoch > opt.curriculum:
            trainData.shuffle()

        # Shuffle mini batch order.
        batchOrder = torch.randperm(len(trainData))

        total_loss, total_words, total_num_correct = 0, 0, 0
        report_loss, report_tgt_words = 0, 0
        report_src_words, report_num_correct = 0, 0
        start = time.time()
        for i in range(len(trainData)):
            iterations += 1
            batchIdx = batchOrder[i] if epoch > opt.curriculum else i
            # Exclude original indices.
            batch = trainData[batchIdx][:-1]

            model.zero_grad()
            outputs = model(batch)
            # Exclude <s> from targets.
            targets = batch[1][1:]
            loss, gradOutput, num_correct = memoryEfficientLoss(
                outputs, targets, model.generator, criterion)

            outputs.backward(gradOutput)

            # Update the parameters.
            optim.step()

            num_words = targets.data.ne(onmt.Constants.PAD).sum()
            report_loss += loss
            report_num_correct += num_correct
            report_tgt_words += num_words
            report_src_words += batch[0][1].data.sum()
            total_loss += loss
            total_num_correct += num_correct
            total_words += num_words
            if iterations % opt.log_interval == -1 % opt.log_interval:
                print(("Epoch %d, %d/%d; acc: %.2f; ppl: %.2f; "
                       "%.0f src tok/s; %.0f tgt tok/s; %.0fs elapsed")
                      % (epoch, i + 1, len(trainData),
                         report_num_correct / report_tgt_words * 100.0,
                         math.exp(report_loss / report_tgt_words),
                         report_src_words / (time.time() - start),
                         report_tgt_words / (time.time() - start),
                         time.time() - start_time))
                # log to tensorboard
                logger.log_value("word_acc",
                                 float(report_num_correct) / float(report_tgt_words),
                                 step=iterations)
                logger.log_value("ppl",
                                 math.exp(report_loss / report_tgt_words),
                                 step=iterations)

                report_loss, report_tgt_words = 0, 0
                report_src_words, report_num_correct = 0, 0
                start = time.time()

        train_loss, train_acc = total_loss / total_words, total_num_correct / total_words
        train_ppl = math.exp(min(train_loss, 100))
        print('Train perplexity: %g' % train_ppl)
        print('Train word accuracy: %g' % (train_acc * 100))

        # (2) evaluate on the validation set
        valid_loss, valid_acc = eval(model, criterion, validData)
        valid_ppl = math.exp(min(valid_loss, 100))
        print('Validation perplexity: %g' % valid_ppl)
        print('Validation word accuracy: %g' % (valid_acc * 100))

        # (3) update the learning rate
        optim.updateLearningRate(valid_ppl, epoch)

        model_state_dict = (model.module.state_dict()
                            if len(opt.gpus) > 1 else model.state_dict())
        model_state_dict = {k: v for k, v in model_state_dict.items()
                            if 'generator' not in k}
        generator_state_dict = (model.generator.module.state_dict()
                                if len(opt.gpus) > 1
                                else model.generator.state_dict())
        # (4) drop a checkpoint
        checkpoint = {
            'model': model_state_dict,
            'generator': generator_state_dict,
            'dicts': dataset['dicts'],
            'opt': opt,
            'epoch': epoch,
            'optim': optim
        }
        if epoch % 5 == 0:
            torch.save(checkpoint,
                       os.path.join(opt.save_path,
                                    'm_%d_acc_%.2f.pt' % (epoch, 100.0 * valid_acc)))
class Logger(object):
    def __init__(self, log_dir, label, titles, append_steps=1):
        """
        log_dir      : str, directory where all the logs will be written.
        label        : str, root filename for the logs. It shouldn't contain
                       an extension, such as .txt.
        titles       : list, title for each log attribute.
        append_steps : int, running averages are appended every
                       append_steps steps.
        """
        self.log_dir = log_dir
        self.label = label
        self.titles = titles
        self.append_steps = append_steps
        self.logs = {}  # all title-log pairs that will be traced for this instance
        self.meters = {}
        for t in titles:
            self.logs[t] = []
            self.meters[t] = AverageMeter()

        if not os.path.exists(self.log_dir):
            os.makedirs(self.log_dir)

        self.tb_logger = TBLogger(self.log_dir)
        self.f_txt = open(os.path.join(self.log_dir, '{}.txt'.format(self.label)), 'w')

    def flush(self):
        self.save_as_arrays()
        self.save_as_figures()

    def close(self):
        self.flush()
        self.f_txt.close()

    def update(self, values, step):
        """
        Adds a new log value for each title and updates the corresponding
        average meters. If step is a multiple of append_steps, self.append
        is called with the running averages.
        values : list, must be of the same size as self.titles.
        step   : int, a step number
        """
        assert len(self.titles) == len(values)
        for t, v in zip(self.titles, values):
            self.meters[t].update(v, 1)

        if step % self.append_steps == 0:
            values = [m.avg for m in self.meters.values()]
            self.append(values, step)

    def append(self, values, step):
        """
        Adds a new log value for each title.
        values : list, must be of the same size as self.titles.
        step   : int, a step number
        """
        assert len(self.titles) == len(values)
        step_log = OrderedDict()
        step_log['step'] = str(step)
        step_log['time'] = datetime.datetime.now().strftime("%y-%m-%d %H:%M:%S")
        for t, v in zip(self.titles, values):
            self.logs[t].append(v)
            step_log[t] = v
            self.tb_logger.log_value(t, v, step)
        json.dump(step_log, self.f_txt, indent=4)
        self.f_txt.write('\n')
        self.f_txt.flush()

    def save_as_arrays(self):
        """
        Converts all logs to numpy arrays and saves them into self.log_dir.
        """
        arrays = {}
        for t, v in self.logs.items():
            v = np.array(v)
            arrays[t] = v
        np.savez(os.path.join(self.log_dir, '{}.npz'.format(self.label)), **arrays)

    def save_as_figures(self):
        """
        First converts all logs to numpy arrays, then plots them with
        matplotlib and saves the figures into self.log_dir.
        """
        for t, v in self.logs.items():
            v = np.array(v)
            fig = plt.figure(dpi=400)
            ax = fig.add_subplot(111)
            ax.plot(v)
            ax.set_title(t)
            ax.grid(True)
            fig.savefig(
                os.path.join(self.log_dir, '{}_{}.png'.format(self.label, t)),
                bbox_inches='tight')
            plt.close()
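# A minimal usage sketch for the Logger above (assumes AverageMeter and
# TBLogger are importable from the surrounding project; 'logs/exp' and the
# metric names are arbitrary examples):
logger = Logger('logs/exp', label='train', titles=['loss', 'acc'], append_steps=10)
for step in range(1, 101):
    loss, acc = 1.0 / step, step / 100.0   # stand-in metric values
    logger.update([loss, acc], step)       # appends running averages every 10 steps
logger.close()                             # flushes the .npz arrays and .png figures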
def train_cv(input_directory, output_directory):
    # model
    # directory for saving models
    model_save_dir = '%s/%s_%s' % (config.ckpt, config.model_name + "_cv",
                                   time.strftime("%Y%m%d%H%M"))
    #'%s/%s_%s' % (config.ckpt, args.model_name+"_cv", time.strftime("%Y%m%d%H%M"))
    for fold in range(config.kfold):
        print("***************************fold : {}***********************".format(fold))
        model = getattr(models, config.model_name)(fold=fold)
        # if args.ckpt and not args.resume:
        #     state = torch.load(args.ckpt, map_location='cpu')
        #     model.load_state_dict(state['state_dict'])
        #     print('train with pretrained weight val_f1', state['f1'])
        num_ftrs = model.fc.in_features
        model.fc = nn.Linear(num_ftrs, config.num_classes)
        # 2019/11/11
        # save dense/fc weight for pretrain 55 classes
        # model = MyModel()
        # num_ftrs = model.classifier.out_features
        # model.fc = nn.Linear(55, config.num_classes)
        model = model.to(device)
        # data
        train_dataset = ECGDataset(data_path=config.train_data_cv.format(fold),
                                   data_dir=input_directory, train=True)
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=config.batch_size,
                                      shuffle=True, drop_last=True,
                                      num_workers=6)
        val_dataset = ECGDataset(data_path=config.train_data_cv.format(fold),
                                 data_dir=input_directory, train=False)
        val_dataloader = DataLoader(val_dataset,
                                    batch_size=config.batch_size,
                                    drop_last=True, num_workers=4)
        print("fold_{}_train_datasize".format(fold), len(train_dataset),
              "fold_{}_val_datasize".format(fold), len(val_dataset))
        # optimizer and loss
        optimizer = radam.RAdam(model.parameters(), lr=config.lr)
        # optim.Adam(model.parameters(), lr=config.lr)
        w = torch.tensor(train_dataset.wc, dtype=torch.float).to(device)
        criterion = utils.WeightedMultilabel(w)  # utils.FocalLoss()
        # scheduler must be defined: scheduler.step(val_cm) is called below
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max',
                                                         verbose=True,
                                                         factor=0.1,
                                                         patience=5,
                                                         min_lr=1e-06,
                                                         eps=1e-08)
        # if args.ex: model_save_dir += args.ex
        # best_f1 = -1
        # lr = config.lr
        # start_epoch = 1
        # stage = 1
        best_f1 = -1
        best_cm = -1
        lr = config.lr
        start_epoch = 1
        stage = 1
        epoch_cum = 0  # counts epochs without improvement; used below
        # resume training from the last checkpoint
        # if args.resume:
        #     if os.path.exists(args.ckpt):  # directory holding the weights
        #         model_save_dir = args.ckpt
        #         current_w = torch.load(os.path.join(args.ckpt, config.current_w))
        #         best_w = torch.load(os.path.join(model_save_dir, config.best_w))
        #         best_f1 = best_w['loss']
        #         start_epoch = current_w['epoch'] + 1
        #         lr = current_w['lr']
        #         stage = current_w['stage']
        #         model.load_state_dict(current_w['state_dict'])
        #         # if the checkpoint happens to fall on a stage boundary
        #         if start_epoch - 1 in config.stage_epoch:
        #             stage += 1
        #             lr /= config.lr_decay
        #             utils.adjust_learning_rate(optimizer, lr)
        #             model.load_state_dict(best_w['state_dict'])
        #         print("=> loaded checkpoint (epoch {})".format(start_epoch - 1))
        logger = Logger(logdir=model_save_dir, flush_secs=2)
        # =========> start training <=========
        for epoch in range(start_epoch, config.max_epoch + 1):
            since = time.time()
            train_loss, train_acc, train_f1, train_f2, train_g2, train_cm = train_epoch(
                model, optimizer, criterion, train_dataloader, show_interval=100)
            val_loss, val_acc, val_f1, val_f2, val_g2, val_cm = val_epoch(
                model, criterion, val_dataloader)
            # train_loss, train_f1 = train_beat_epoch(model, optimizer, criterion, train_dataloader, show_interval=100)
            # val_loss, val_f1 = val_beat_epoch(model, criterion, val_dataloader)
            print('#epoch:%02d, stage:%d, train_loss:%.3e, train_acc:%.3f, '
                  'train_f1:%.3f, train_f2:%.3f, train_g2:%.3f, train_cm:%.3f,\n'
                  'val_loss:%0.3e, val_acc:%.3f, val_f1:%.3f, val_f2:%.3f, '
                  'val_g2:%.3f, val_cm:%.3f, time:%s\n'
                  % (epoch, stage, train_loss, train_acc, train_f1, train_f2,
                     train_g2, train_cm, val_loss, val_acc, val_f1, val_f2,
                     val_g2, val_cm, utils.print_time_cost(since)))
            logger.log_value('fold{}_train_loss'.format(fold), train_loss, step=epoch)
            logger.log_value('fold{}_train_f1'.format(fold), train_f1, step=epoch)
            logger.log_value('fold{}_val_loss'.format(fold), val_loss, step=epoch)
            logger.log_value('fold{}_val_f1'.format(fold), val_f1, step=epoch)
            state = {
                "state_dict": model.state_dict(),
                "epoch": epoch,
                "loss": val_loss,
                'f1': val_f1,
                'lr': lr,
                'stage': stage
            }
            save_ckpt_cv(state, best_cm < val_cm, model_save_dir, fold, output_directory)
            best_cm = max(best_cm, val_cm)
            scheduler.step(val_cm)
            # scheduler.step()
            if val_cm < best_cm:
                epoch_cum += 1
            else:
                epoch_cum = 0
            # save_ckpt_cv(state, best_f1 < val_f1, model_save_dir, fold)
            # best_f1 = max(best_f1, val_f1)
            # if val_f1 < best_f1:
            #     epoch_cum += 1
            # else:
            #     epoch_cum = 0
            # if epoch in config.stage_epoch:
            # if epoch_cum == 5:
            #     stage += 1
            #     lr /= config.lr_decay
            #     if lr < 1e-6:
            #         lr = 1e-6
            #     print("*" * 20, "step into stage%02d lr %.3e" % (stage, lr))
            #     best_w = os.path.join(model_save_dir, config.best_w_cv.format(fold))
            #     model.load_state_dict(torch.load(best_w)['state_dict'])
            #     print("*" * 10, "step into stage%02d lr %.3e" % (stage, lr))
            #     utils.adjust_learning_rate(optimizer, lr)
            # elif epoch_cum >= 12:
            #     print("*" * 20, "step into stage%02d lr %.3e" % (stage, lr))
            #     break
            if epoch_cum >= 12:
                print("*" * 20, "step into stage%02d lr %.3e" % (stage, lr))
                break
def train(input_directory, output_directory):
    # model
    model = getattr(models, config.model_name)()
    # if args.ckpt and not args.resume:
    #     state = torch.load(args.ckpt, map_location='cpu')
    #     model.load_state_dict(state['state_dict'])
    #     print('train with pretrained weight val_f1', state['f1'])
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, config.num_classes)
    model = model.to(device)
    # data
    train_dataset = ECGDataset(data_path=config.train_data,
                               data_dir=input_directory, train=True)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config.batch_size,
                                  shuffle=True, num_workers=6)
    val_dataset = ECGDataset(data_path=config.train_data,
                             data_dir=input_directory, train=False)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=config.batch_size,
                                num_workers=4)
    print("train_datasize", len(train_dataset), "val_datasize", len(val_dataset))
    # optimizer and loss
    #optimizer = optim.Adam(model.parameters(), lr=config.lr)
    optimizer = radam.RAdam(model.parameters(), lr=config.lr, weight_decay=1e-4)  #config.lr
    #optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, dampening=0, weight_decay=1e-4, nesterov=False)
    w = torch.tensor(train_dataset.wc, dtype=torch.float).to(device)
    criterion = utils.WeightedMultilabel(w)  # utils.FocalLoss()
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max',
                                                     verbose=True, factor=0.1,
                                                     patience=5, min_lr=1e-06,
                                                     eps=1e-08)
    # CosineAnnealingLR / CosineAnnealingWithRestartsLR
    #scheduler = pytorchtools.CosineAnnealingWithRestartsLR(optimizer, T_max=30, T_mult=1.2, eta_min=1e-6)
    # optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, nesterov=True)
    # scheduler = pytorchtools.CosineAnnealingLR_with_Restart(optimizer, T_max=12, T_mult=1, model=model, out_dir='./snapshot', take_snapshot=True, eta_min=1e-9)
    # directory for saving models
    model_save_dir = '%s/%s_%s' % (config.ckpt, config.model_name,
                                   time.strftime("%Y%m%d%H%M"))
    # if args.ex: model_save_dir += args.ex
    best_f1 = -1
    best_cm = -1
    lr = config.lr
    start_epoch = 1
    stage = 1
    epoch_cum = 0  # counts epochs without improvement; used below
    # resume training from the last checkpoint
    # if args.resume:
    #     if os.path.exists(args.ckpt):  # directory holding the weights
    #         model_save_dir = args.ckpt
    #         current_w = torch.load(os.path.join(args.ckpt, config.current_w))
    #         best_w = torch.load(os.path.join(model_save_dir, config.best_w))
    #         best_f1 = best_w['loss']
    #         start_epoch = current_w['epoch'] + 1
    #         lr = current_w['lr']
    #         stage = current_w['stage']
    #         model.load_state_dict(current_w['state_dict'])
    #         # if the checkpoint happens to fall on a stage boundary
    #         if start_epoch - 1 in config.stage_epoch:
    #             stage += 1
    #             lr /= config.lr_decay
    #             utils.adjust_learning_rate(optimizer, lr)
    #             model.load_state_dict(best_w['state_dict'])
    #         print("=> loaded checkpoint (epoch {})".format(start_epoch - 1))
    logger = Logger(logdir=model_save_dir, flush_secs=2)
    # =========> start training <=========
    for epoch in range(start_epoch, config.max_epoch + 1):
        since = time.time()
        train_loss, train_acc, train_f1, train_f2, train_g2, train_cm = train_epoch(
            model, optimizer, criterion, train_dataloader, show_interval=100)
        val_loss, val_acc, val_f1, val_f2, val_g2, val_cm = val_epoch(
            model, criterion, val_dataloader)
        # train_loss, train_f1 = train_beat_epoch(model, optimizer, criterion, train_dataloader, show_interval=100)
        # val_loss, val_f1 = val_beat_epoch(model, criterion, val_dataloader)
        print('#epoch:%02d, stage:%d, train_loss:%.3e, train_acc:%.3f, '
              'train_f1:%.3f, train_f2:%.3f, train_g2:%.3f, train_cm:%.3f,\n'
              'val_loss:%0.3e, val_acc:%.3f, val_f1:%.3f, val_f2:%.3f, '
              'val_g2:%.3f, val_cm:%.3f, time:%s\n'
              % (epoch, stage, train_loss, train_acc, train_f1, train_f2,
                 train_g2, train_cm, val_loss, val_acc, val_f1, val_f2,
                 val_g2, val_cm, utils.print_time_cost(since)))
        logger.log_value('train_loss', train_loss, step=epoch)
        logger.log_value('train_f1', train_f1, step=epoch)
        logger.log_value('val_loss', val_loss, step=epoch)
        logger.log_value('val_f1', val_f1, step=epoch)
        state = {
            "state_dict": model.state_dict(),
            "epoch": epoch,
            "loss": val_loss,
            'f1': val_f1,
            'lr': lr,
            'stage': stage
        }
        save_ckpt(state, best_cm < val_cm, model_save_dir, output_directory)
        best_cm = max(best_cm, val_cm)
        scheduler.step(val_cm)
        # scheduler.step()
        if val_cm < best_cm:
            epoch_cum += 1
        else:
            epoch_cum = 0
        # if epoch in config.stage_epoch:
        # if epoch_cum == 5:
        #     stage += 1
        #     lr /= config.lr_decay
        #     if lr < 1e-6:
        #         lr = 1e-6
        #     print("*" * 20, "step into stage%02d lr %.3e" % (stage, lr))
        #     best_w = os.path.join(model_save_dir, config.best_w)
        #     model.load_state_dict(torch.load(best_w)['state_dict'])
        #     print("*" * 10, "step into stage%02d lr %.3e" % (stage, lr))
        #     utils.adjust_learning_rate(optimizer, lr)
        # elif epoch_cum >= 12:
        #     print("*" * 20, "step into stage%02d lr %.3e" % (stage, lr))
        #     break
        if epoch_cum >= 12:
            print("*" * 20, "step into stage%02d lr %.3e" % (stage, lr))
            break
import os

# NOTE: get_timestamp is assumed to be provided by the surrounding project's
# util module.


class Logger(object):
    def __init__(self, opt):
        self.exp_name = opt['name']
        self.use_tb_logger = opt['use_tb_logger']
        self.opt = opt['logger']
        self.log_dir = opt['path']['log']
        # loss log file
        self.loss_log_path = os.path.join(self.log_dir, 'loss_log.txt')
        with open(self.loss_log_path, "a") as log_file:
            log_file.write('=============== Time: ' + get_timestamp() + ' =============\n')
            log_file.write('================ Training Losses ================\n')
        # val results log file
        self.val_log_path = os.path.join(self.log_dir, 'val_log.txt')
        with open(self.val_log_path, "a") as log_file:
            log_file.write('================ Time: ' + get_timestamp() + ' ===============\n')
            log_file.write('================ Validation Results ================\n')
        if self.use_tb_logger and 'debug' not in self.exp_name:
            from tensorboard_logger import Logger as TensorboardLogger
            self.tb_logger = TensorboardLogger('../tb_logger/' + self.exp_name)

    def print_format_results(self, mode, rlt):
        epoch = rlt.pop('epoch')
        iters = rlt.pop('iters')
        time = rlt.pop('time')
        model = rlt.pop('model')  # popped so it is not formatted as a metric below
        if 'lr' in rlt:
            lr = rlt.pop('lr')
            message = '<epoch:{:3d}, iter:{:8,d}, time:{:.2f}, lr:{:.1e}> '.format(
                epoch, iters, time, lr)
        else:
            message = '<epoch:{:3d}, iter:{:8,d}, time:{:.2f}> '.format(
                epoch, iters, time)
        for label, value in rlt.items():
            message += '%s: %.2e ' % (label, value)
            # tensorboard logger
            if self.use_tb_logger and 'debug' not in self.exp_name:
                self.tb_logger.log_value(label, value, iters)
        # print in console
        print(message)
        # write in log file
        if mode == 'train':
            with open(self.loss_log_path, "a") as log_file:
                log_file.write('%s\n' % message)
        elif mode == 'val':
            with open(self.val_log_path, "a") as log_file:
                log_file.write('%s\n' % message)
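# A minimal usage sketch for the Logger above, assuming get_timestamp is
# importable where the class is defined. The opt layout is inferred from
# __init__; every key and value below ('debug_example', './logs',
# 'loss_pixel', ...) is illustrative rather than taken from the original
# project:

os.makedirs('./logs', exist_ok=True)
opt = {
    'name': 'debug_example',   # 'debug' in the name disables the tensorboard logger
    'use_tb_logger': True,
    'logger': {},
    'path': {'log': './logs'},
}
logger = Logger(opt)
logger.print_format_results('train', {
    'epoch': 1, 'iters': 1000, 'time': 0.42, 'model': 'sr',
    'lr': 1e-4, 'loss_pixel': 2.3e-3,
})
# -> <epoch:  1, iter:   1,000, time:0.42, lr:1.0e-04> loss_pixel: 2.30e-03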
def main(args, net=None):
    global oldassignment

    datadir = get_data_dir(args.db)
    outputdir = get_output_dir(args.db)

    logger = None
    if args.tensorboard:
        # a folder must be created for storing the logs
        logging_dir = os.path.join(outputdir, 'runs', 'DCC')
        if not os.path.exists(logging_dir):
            os.makedirs(logging_dir)
        logging_dir = os.path.join(logging_dir, '%s' % (args.id))
        if args.clean_log:
            remove_files_in_dir(logging_dir)
        logger = Logger(logging_dir)

    use_cuda = torch.cuda.is_available()

    # Set the seed for reproducing the results
    random.seed(args.manualSeed)
    np.random.seed(args.manualSeed)
    torch.manual_seed(args.manualSeed)
    if use_cuda:
        torch.cuda.manual_seed_all(args.manualSeed)
        torch.backends.cudnn.enabled = True
        cudnn.benchmark = True

    startepoch = 0
    kwargs = {'num_workers': 5, 'pin_memory': True} if use_cuda else {}

    # setting up dataset-specific objects
    trainset = DCCPT_data(root=datadir, train=True, h5=args.h5)
    testset = DCCPT_data(root=datadir, train=False, h5=args.h5)
    numeval = len(trainset) + len(testset)

    # extracting training data from the pretrained.mat file
    data, labels, pairs, Z, sampweight = makeDCCinp(args)

    # For simplicity, a placeholder is created for each dataset and model
    load_pretraining = True if net is None else False
    if net is None:
        net = dp.load_predefined_extract_net(args)

    # reshaping data for some datasets
    if args.db == 'cmnist':
        data = data.reshape((-1, 1, 28, 28))
    elif args.db == 'ccoil100':
        data = data.reshape((-1, 3, 128, 128))
    elif args.db == 'cytf':
        data = data.reshape((-1, 3, 55, 55))
    elif args.db == 'cyale':
        data = data.reshape((-1, 1, 168, 192))

    totalset = torch.utils.data.ConcatDataset([trainset, testset])

    # computing and initializing the hyperparams
    _sigma1, _sigma2, _lambda, _delta, _delta1, _delta2, lmdb, lmdb_data = computeHyperParams(pairs, Z)
    oldassignment = np.zeros(len(pairs))
    stopping_threshold = int(math.ceil(cfg.STOPPING_CRITERION * float(len(pairs))))

    # Create dataset and random batch sampler for the finetuning stage
    trainset = DCCFT_data(pairs, data, sampweight)
    batch_sampler = DCCSampler(trainset, shuffle=True, batch_size=args.batchsize)

    # copying model params from the pretrained (SDAE) weights file
    if load_pretraining:
        load_weights(args, outputdir, net)

    # creating objects for the loss functions; the U's are initialized to Z here
    # criterion1 corresponds to the reconstruction loss
    criterion1 = DCCWeightedELoss(size_average=True)
    # criterion2 corresponds to the sum of the pairwise and data loss terms
    criterion2 = DCCLoss(Z.shape[0], Z.shape[1], Z, size_average=True)

    if use_cuda:
        net.cuda()
        criterion1 = criterion1.cuda()
        criterion2 = criterion2.cuda()

    # setting up the data loaders for the training and testing phases
    trainloader = torch.utils.data.DataLoader(trainset, batch_sampler=batch_sampler, **kwargs)
    testloader = torch.utils.data.DataLoader(totalset, batch_size=args.batchsize, shuffle=False, **kwargs)

    # setting up the optimizer - the bias params should have twice the learning rate of the weight params
    bias_params = filter(lambda x: ('bias' in x[0]), net.named_parameters())
    bias_params = list(map(lambda x: x[1], bias_params))
    nonbias_params = filter(lambda x: ('bias' not in x[0]), net.named_parameters())
    nonbias_params = list(map(lambda x: x[1], nonbias_params))

    optimizer = optim.Adam([{'params': bias_params, 'lr': 2 * args.lr},
                            {'params': nonbias_params},
                            {'params': criterion2.parameters(), 'lr': args.lr},
                            ], lr=args.lr, betas=(0.99, 0.999))

    # this is needed for WARM START (resuming from a checkpoint)
    if args.resume:
        filename = outputdir + '/FTcheckpoint_%d.pth.tar' % args.level
        if os.path.isfile(filename):
            print("==> loading checkpoint '{}'".format(filename))
            checkpoint = torch.load(filename)
            net.load_state_dict(checkpoint['state_dict'])
            criterion2.load_state_dict(checkpoint['criterion_state_dict'])
            startepoch = checkpoint['epoch']
            optimizer.load_state_dict(checkpoint['optimizer'])
            _sigma1 = checkpoint['sigma1']
            _sigma2 = checkpoint['sigma2']
            _lambda = checkpoint['lambda']
            _delta = checkpoint['delta']
            _delta1 = checkpoint['delta1']
            _delta2 = checkpoint['delta2']
        else:
            print("==> no checkpoint found at '{}'".format(filename))
            raise ValueError

    # This is the actual algorithm
    flag = 0
    for epoch in range(startepoch, args.nepoch):
        if logger:
            logger.log_value('sigma1', _sigma1, epoch)
            logger.log_value('sigma2', _sigma2, epoch)
            logger.log_value('lambda', _lambda, epoch)
        train(trainloader, net, optimizer, criterion1, criterion2, epoch, use_cuda, _sigma1, _sigma2, _lambda, logger)
        Z, U, change_in_assign, assignment = test(testloader, net, criterion2, epoch, use_cuda, _delta, pairs, numeval, flag, logger)

        if flag:
            # As long as the change in label assignment is below the threshold, DCC continues to run.
            # Note: this condition is always met in the very first epoch after the flag is set.
            # That false trigger is compensated for by checking the condition more than once.
            if change_in_assign > stopping_threshold:
                flag += 1
            if flag == 4:
                break

        if (epoch + 1) % args.M == 0:
            _sigma1 = max(_delta1, _sigma1 / 2)
            _sigma2 = max(_delta2, _sigma2 / 2)
            if _sigma2 == _delta2 and flag == 0:
                # start checking the stopping criterion
                flag = 1

            # Save checkpoint
            index = (epoch // args.M) * args.M
            save_checkpoint({'epoch': epoch + 1,
                             'state_dict': net.state_dict(),
                             'criterion_state_dict': criterion2.state_dict(),
                             'optimizer': optimizer.state_dict(),
                             'sigma1': _sigma1,
                             'sigma2': _sigma2,
                             'lambda': _lambda,
                             'delta': _delta,
                             'delta1': _delta1,
                             'delta2': _delta2,
                             }, index, filename=outputdir)

    output = {'Z': Z, 'U': U, 'gtlabels': labels, 'w': pairs, 'cluster': assignment}
    sio.savemat(os.path.join(outputdir, 'features'), output)
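# The optimizer above gives bias parameters twice the base learning rate via
# per-group settings. A self-contained sketch of the same grouping on a toy
# module (the torch/nn/optim usage is standard; the Linear model and base_lr
# are made up for illustration):

import torch.nn as nn
import torch.optim as optim

toy = nn.Linear(4, 2)
bias_params = [p for n, p in toy.named_parameters() if 'bias' in n]
weight_params = [p for n, p in toy.named_parameters() if 'bias' not in n]
base_lr = 1e-3
opt = optim.Adam([
    {'params': bias_params, 'lr': 2 * base_lr},   # bias at twice the base lr
    {'params': weight_params},                    # inherits the default lr
], lr=base_lr, betas=(0.99, 0.999))
print([group['lr'] for group in opt.param_groups])  # [0.002, 0.001]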
avg_loss = 0.0
for batch_idx, (data, target) in enumerate(train_loader):
    if cuda:
        data, target = data.cuda(), target.cuda()
    optimizer.zero_grad()
    out = model(data)
    loss = loss_fun(out, target)
    loss.backward()
    optimizer.step()
    # exponential moving average of the batch loss (decay 0.9)
    avg_loss = 0.9 * avg_loss + 0.1 * loss.item()
log_train.log_value('loss', avg_loss, epoch)
print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch, avg_loss))
torch.save(model.state_dict(), 'colornet_params.pkl')
if epoch % 50 == 0:
    torch.save(model.state_dict(), './param_backup/colornet_params_' + str(epoch) + '.pkl')

with torch.no_grad():
    avg_loss = 0.0
    for batch_idx, (data, target) in enumerate(test_loader):
        if cuda:
            data, target = data.cuda(), target.cuda()
        out = model(data)
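# The running loss above is an exponential moving average,
# avg = 0.9 * avg + 0.1 * new, which starts biased toward its 0.0
# initialization. A tiny standalone illustration (the input values are
# made up):

def ema(values, decay=0.9):
    avg, smoothed = 0.0, []
    for v in values:
        avg = decay * avg + (1 - decay) * v
        smoothed.append(round(avg, 4))
    return smoothed

print(ema([1.0, 0.5, 2.0, 0.7, 0.6]))
# -> [0.1, 0.14, 0.326, 0.3634, 0.3871] (note the slow warm-up from 0.0)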