def data_sender(id, name, *args):
    context = zmq.Context()
    sender = context.socket(zmq.PUSH)
    sender.connect('ipc://@{}'.format(name))
    print('start data provider {}-{}'.format(name, id))
    while True:
        data_iter = dataset.train_dataset(id + 1)
        for msg in data_iter:
            # print(id)
            sender.send(dumps([id, msg]))
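# The consuming end of this pipe is not shown here. Below is a minimal sketch
# of a matching receiver, assuming the samples are serialized with pickle.dumps
# above and that `name` is the same endpoint name passed to data_sender; the
# data_receiver function itself is illustrative, not part of the original code.
import zmq
from pickle import loads

def data_receiver(name):
    context = zmq.Context()
    receiver = context.socket(zmq.PULL)
    # Bind side of the PUSH/PULL pipe; every data_sender connects to this address.
    receiver.bind('ipc://@{}'.format(name))
    while True:
        # ZeroMQ fair-queues messages coming from multiple senders.
        worker_id, msg = loads(receiver.recv())
        yield worker_id, msg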
def train(params):
    total_nr_iters = config.train_base_iters
    batch_per_gpu = config.train_batch_per_gpu
    base_lr = config.base_lr
    line = 'network.base_lr.{}.train_iter.{}'.format(base_lr, total_nr_iters)
    print(line)

    # set model save path and log save path
    saveDir = config.model_dir
    misc_utils.ensure_dir(saveDir)
    fpath = os.path.join(config.output_dir, line + '.log')
    fid_log = open(fpath, 'a')

    # set data input pipe
    program_name = config.program_name

    # check gpus
    torch.set_default_tensor_type('torch.FloatTensor')
    if not torch.cuda.is_available():
        print('No GPU exists!')
        return
    else:
        num_gpus = torch.cuda.device_count()

    train_iter = total_nr_iters // (num_gpus * batch_per_gpu)
    train_lr_decay = np.array(config.lr_decay) // (num_gpus * batch_per_gpu)
    train_dump_interval = config.model_dump_interval // (num_gpus * batch_per_gpu)
    train_lr = base_lr * num_gpus
    bt_size = num_gpus * batch_per_gpu
    line = ('Num of GPUs:{}, learning rate:{:.5f}, batch size:{}, '
            'train_iter:{}, decay_iter:{}, dump_interval:{}').format(
                num_gpus, train_lr, bt_size, train_iter, train_lr_decay,
                train_dump_interval)
    print(line)

    print("Building network.")
    net = network.Network()
    # Moves all model parameters and buffers to the GPU.
    net.cuda()
    if params.resume_weights:
        model_file = os.path.join(saveDir,
                                  'dump-{}.pth'.format(params.resume_weights))
        check_point = torch.load(model_file)
        net.load_state_dict(check_point['state_dict'])
    net = nn.DataParallel(net)

    # set the optimizer, use momentum and weight_decay
    optimizer = optim.SGD(net.parameters(), lr=train_lr,
                          momentum=config.momentum,
                          weight_decay=config.weight_decay)

    # build the training data iterator
    training_data = train_dataset()
    net.train()

    if params.progressbar:
        tqdm.monitor_interval = 0
        pbar = tqdm(total=train_iter, leave=False, ascii=True)

    # when resuming, fast-forward the iteration counter and the lr schedule
    dump_num = 1
    start_iter = 0
    if params.resume_weights:
        start_iter = int(params.resume_weights) * train_dump_interval
        if start_iter >= train_lr_decay[0]:
            optimizer.param_groups[0]['lr'] = train_lr / 10
        if start_iter >= train_lr_decay[1]:
            optimizer.param_groups[0]['lr'] = train_lr / 100
        dump_num = int(params.resume_weights) + 1

    for step in range(start_iter, train_iter):
        # warm up: linearly ramp the lr from 0.1 * train_lr to train_lr
        if step < config.warm_iter:
            alpha = step / config.warm_iter
            lr_new = 0.1 * train_lr + 0.9 * alpha * train_lr
            optimizer.param_groups[0]['lr'] = lr_new
        elif step == config.warm_iter:
            optimizer.param_groups[0]['lr'] = train_lr
        if step == train_lr_decay[0]:
            optimizer.param_groups[0]['lr'] = train_lr / 10
        elif step == train_lr_decay[1]:
            optimizer.param_groups[0]['lr'] = train_lr / 100

        # get training data
        images, gt_boxes, img_info = process(training_data, num_gpus)

        optimizer.zero_grad()
        # forward
        outputs = net(images, img_info, gt_boxes)
        # collect the loss
        total_loss = sum([outputs[key].mean() for key in outputs.keys()])
        total_loss.backward()
        optimizer.step()
        if params.progressbar:
            pbar.update(1)

        # statistics
        if step % config.log_dump_interval == 0:
            statistic_total_loss = total_loss.cpu().data.numpy()
            line = 'Iter {}: lr:{:.5f}, loss is {:.4f}.'.format(
                step, optimizer.param_groups[0]['lr'], statistic_total_loss)
            print(outputs)
            print(line)
            fid_log.write(line + '\n')
            fid_log.flush()

        # save the model
        if (step + 1) % train_dump_interval == 0:
            fpath = os.path.join(saveDir, 'dump-{}.pth'.format(dump_num))
            dump_num += 1
            model = dict(epoch=step,
                         state_dict=net.module.state_dict(),
                         optimizer=optimizer.state_dict())
            torch.save(model, fpath)

    if params.progressbar:
        pbar.close()
    fid_log.close()
def worker(rank, gpu_num, args):
    # use sublinear memory optimization
    os.environ["MGB_COMP_GRAPH_OPT"] = \
        "enable_sublinear_memory_opt=1;seq_opt.enable_seq_comp_node_opt=0"
    os.environ["MGB_SUBLINEAR_MEMORY_GENETIC_NR_ITER"] = '10'
    os.environ['MGB_CUDA_RESERVE_MEMORY'] = '1'

    # establish the server if this is the master process
    dist_port = args.port
    if rank == 0:
        dist.Server(port=dist_port)
    if gpu_num > 1:
        dist.init_process_group(
            master_ip="localhost",
            port=dist_port,
            world_size=gpu_num,
            rank=rank,
            device=rank,
        )
        logger.info("Init process group for gpu%d done", rank)

    model = network.Network()
    params = model.parameters(requires_grad=True)
    model.train()

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=allreduce_cb,
    )

    opt = optim.SGD(
        params,
        lr=cfg.basic_lr * gpu_num * cfg.batch_per_gpu,
        momentum=cfg.momentum,
        weight_decay=cfg.weight_decay,
    )

    if cfg.pretrain_weight is not None:
        # drop the classification head of the pretrained backbone
        weights = mge.load(cfg.pretrain_weight)
        del weights['fc.weight']
        del weights['fc.bias']
        model.resnet50.load_state_dict(weights)

    start_epoch = 0
    if args.resume_weights is not None:
        assert osp.exists(args.resume_weights)
        model_file = args.resume_weights
        model_dict = mge.load(model_file)
        start_epoch, weights = model_dict['epoch'] + 1, model_dict['state_dict']
        model.load_state_dict(weights, strict=False)

    logger.info("Prepare dataset")
    train_loader = dataset.train_dataset(rank)

    logger.info("Training...")
    for epoch_id in range(start_epoch, cfg.max_epoch):
        # step the lr according to the decay schedule
        for param_group in opt.param_groups:
            param_group["lr"] = (cfg.basic_lr * gpu_num * cfg.batch_per_gpu *
                                 (cfg.lr_decay_rate ** bisect.bisect_right(
                                     cfg.lr_decay_sates, epoch_id)))

        max_steps = cfg.nr_images_epoch // (cfg.batch_per_gpu * gpu_num)
        train_one_epoch(model, gm, train_loader, opt, max_steps, rank,
                        epoch_id, gpu_num)
        if rank == 0:
            # only the master process writes checkpoints
            save_path = osp.join(cfg.model_dir,
                                 'epoch-{}.pkl'.format(epoch_id + 1))
            state_dict = model.state_dict()
            names = [k for k, _ in state_dict.items()]
            for name in names:
                if name.startswith('inputs.'):
                    del state_dict[name]
            mge.save(
                {"epoch": epoch_id, "state_dict": state_dict},
                save_path,
            )
            logger.info("dump weights to %s", save_path)
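# Nothing above shows how worker() is launched. A minimal sketch of a launcher
# that starts one process per GPU; args.num_gpus is a hypothetical field
# standing in for however the original script determines the world size, and
# the run() function is illustrative, not part of the original code.
import multiprocessing as mp

def run(args):
    gpu_num = args.num_gpus  # hypothetical field for the desired number of workers
    if gpu_num <= 1:
        worker(0, 1, args)
        return
    # The 'spawn' start method is generally required when child processes use GPUs.
    ctx = mp.get_context('spawn')
    procs = []
    for rank in range(gpu_num):
        p = ctx.Process(target=worker, args=(rank, gpu_num, args))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()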
)  # Binary Cross-Entropy Loss with sigmoid attached in front.

"""
==============================================
                Begin Training
==============================================
"""
lr = args.lr
for epoch in range(args.start_epoch, args.epochs):
    # I will assume the graphs are shuffled somehow when calling train_dataset().
    batch_index = 0   # Counts how many distinct (conjecture, statement) pairs are currently stored.
    batch_number = 0  # Number of batches iterated.
    conjecture_state_batch = []
    label_batch = []
    for datapoint in train_dataset():  # Collect datapoints for inter-graph batching.
        if (epoch == args.start_epoch) and (batch_number < args.start_batch - 1):
            # If starting from a saved <start_epoch> and <start_batch>, skip over prior datapoints.
            batch_index += 1
            if batch_index < args.batch_size:
                continue
            else:
                batch_index = 0
                batch_number += 1
                continue
        conjecture_graph = datapoint.conjecture
        statement_graph = datapoint.statement
pre_trained = False
torch.cuda.set_device(0)

try:
    os.makedirs(out_file)
    os.makedirs(out_file + '/model/')
except OSError:
    pass

manual_seed = random.randint(1, 10000)
random.seed(manual_seed)
torch.manual_seed(manual_seed)
cudnn.benchmark = True

train_datatset_ = train_dataset(data_path, size_w, size_h, flip, time_series)
val_datatset_ = train_dataset(val_path, size_w, size_h, 0, time_series)


def weights_init(m):
    # Custom initialization: N(0, 0.02) for Conv weights, N(1, 0.02) for
    # BatchNorm scales, zeros for both biases.
    class_name = m.__class__.__name__
    if class_name.find('Conv') != -1:
        m.weight.data.normal_(0.0, 0.02)
        m.bias.data.fill_(0)
    elif class_name.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)
num_GPU = 1
index = 100
torch.cuda.set_device(0)

import os

try:
    os.makedirs(out_file)
except OSError:
    pass

manual_seed = random.randint(1, 10000)
random.seed(manual_seed)
torch.manual_seed(manual_seed)
cudnn.benchmark = True

train_datatset_ = train_dataset(train_path, size_w, size_h, flip, band, batch_size)
val_datatset_ = train_dataset(val_path, size_w, size_h, 0, band)


def weights_init(m):
    class_name = m.__class__.__name__
    if class_name.find('Conv') != -1:
        m.weight.data.normal_(0.0, 0.02)
        m.bias.data.fill_(0)
    elif class_name.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)
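# weights_init (defined in both snippets above) is meant to be passed to
# nn.Module.apply, which walks every submodule. A self-contained usage sketch
# follows; the real scripts build their own networks, so the small Sequential
# below is only illustrative.
import torch.nn as nn

_demo_net = nn.Sequential(
    nn.Conv2d(3, 16, kernel_size=3, padding=1),
    nn.BatchNorm2d(16),
    nn.ReLU(inplace=True),
)
# Conv weights ~ N(0, 0.02), BatchNorm scales ~ N(1, 0.02), biases zeroed.
_demo_net.apply(weights_init)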
def train(args):
    if type(config.train_source) == list:
        training_data = multi_train_dataset(args)
    else:
        training_data = train_dataset(args)
    number_of_training_instances = training_data.__next__()
    val_data = eval_dataset(args)
    number_of_val_instances = val_data.__next__()

    total_nr_iters = args.epochs * number_of_training_instances
    batch_per_gpu = config.train_batch_per_gpu
    base_lr = config.base_lr
    line = 'network.base_lr.{}.train_iter.{}'.format(base_lr, total_nr_iters)
    print(line)

    # set model save path and log save path
    saveDir = config.model_dir
    misc_utils.ensure_dir(saveDir)

    # set data input pipe
    program_name = config.program_name

    # check gpus
    torch.set_default_tensor_type('torch.FloatTensor')
    if not torch.cuda.is_available():
        print('No GPU exists!')
        return
    else:
        num_gpus = torch.cuda.device_count()

    train_iter = total_nr_iters // (num_gpus * batch_per_gpu)
    print('[-]', num_gpus, batch_per_gpu, total_nr_iters)
    new_decay = (np.array(config.lr_decay) / 450000) * total_nr_iters
    train_lr_decay = new_decay // (num_gpus * batch_per_gpu)
    train_dump_interval = number_of_training_instances // (num_gpus * batch_per_gpu)
    train_lr = base_lr * num_gpus
    bt_size = num_gpus * batch_per_gpu
    line = ('Num of GPUs:{}, learning rate:{:.5f}, batch size:{}, '
            'train_iter:{}, decay_iter:{}, dump_interval:{}').format(
                num_gpus, train_lr, bt_size, train_iter, train_lr_decay,
                train_dump_interval)
    print(line)

    print("[-]Building network.")
    net = network.Network(args)
    net.cuda()

    best = 10e10
    epoch = 0
    if args.resume:
        print("Load base model from:",
              os.path.join(args.save_dir, args.output_name, 'dump_last.pth'))
        check_point = torch.load(
            os.path.join(args.save_dir, args.output_name, 'dump_last.pth'))
        net.load_state_dict(check_point['state_dict'])
        start_iter = check_point['step']
        if 'val_loss' in check_point:
            best = check_point['val_loss']
        epoch = start_iter // train_dump_interval + 1
    elif args.base_model:
        print("Load base model from:", args.base_model)
        check_point = torch.load(args.base_model)
        net.load_state_dict(check_point['state_dict'], strict=False)
        start_iter = 0
    else:
        start_iter = 0

    net = nn.DataParallel(net)
    # set the optimizer, use momentum and weight_decay
    optimizer = optim.SGD(net.parameters(), lr=train_lr,
                          momentum=config.momentum,
                          weight_decay=config.weight_decay)
    # fast-forward the lr schedule when resuming
    if start_iter >= train_lr_decay[0]:
        optimizer.param_groups[0]['lr'] = train_lr / 10
    if start_iter >= train_lr_decay[1]:
        optimizer.param_groups[0]['lr'] = train_lr / 100

    net.train()
    logger = Logger(args)
    iter_tqdm = None
    val_tqdm = None

    for step in range(start_iter, train_iter):
        # warm up: linearly ramp the lr from 0.1 * train_lr to train_lr
        if step < config.warm_iter:
            alpha = step / config.warm_iter
            lr_new = 0.1 * train_lr + 0.9 * alpha * train_lr
            optimizer.param_groups[0]['lr'] = lr_new
        elif step == config.warm_iter:
            optimizer.param_groups[0]['lr'] = train_lr
        if step == train_lr_decay[0]:
            optimizer.param_groups[0]['lr'] = train_lr / 10
        elif step == train_lr_decay[1]:
            optimizer.param_groups[0]['lr'] = train_lr / 100

        # get training data
        images, gt_boxes, img_info, done_an_epoch, extra = process(
            args, training_data, num_gpus)
        if done_an_epoch:
            epoch += 1

        optimizer.zero_grad()
        # forward
        outputs = net(images, img_info, gt_boxes, extra=extra)
        # collect the loss
        total_loss = sum([outputs[key].mean() for key in outputs.keys()])
        total_loss.backward()
        optimizer.step()

        # statistics
        statistic_total_loss = total_loss.cpu().data.numpy()
        line = '[*]Epoch:{} iter<{}> lr:{:.5f}, loss:{:.4f}'.format(
            epoch, step, optimizer.param_groups[0]['lr'],
            float(statistic_total_loss))
        if step % config.log_dump_interval == 0:
            logger.scalar_summary('lr', optimizer.param_groups[0]['lr'], step)
            for k, v in outputs.items():
                v = float(np.mean(v.cpu().data.numpy()))
                logger.scalar_summary(k, v, step)
                line += ', ' + k + ':{:.4}'.format(v)
            logger.scalar_summary('total_loss', float(statistic_total_loss), step)
        else:
            for k, v in outputs.items():
                v = float(np.mean(v.cpu().data.numpy()))
                line += ', ' + k + ':{:.4}'.format(v)

        if iter_tqdm is None:
            iter_tqdm = tqdm(total=train_iter, desc='Iteration')
            iter_tqdm.update(start_iter)
        iter_tqdm.set_description("[-] " + line)
        iter_tqdm.refresh()

        # save checkpoints at epoch boundaries
        if done_an_epoch:
            if args.save_per_epoch > 0:
                if (epoch + 1) % args.save_per_epoch == 0:
                    fpath = os.path.join(saveDir, 'dump_{}.pth'.format(epoch))
                    print('[.] Saving:', fpath)
                    model = dict(epoch=epoch, step=step,
                                 state_dict=net.module.state_dict(),
                                 optimizer=optimizer.state_dict())
                    torch.save(model, fpath)
            fpath = os.path.join(saveDir, 'dump_last.pth')
            print('[.] Saving:', fpath)
            model = dict(epoch=epoch, step=step,
                         state_dict=net.module.state_dict(),
                         optimizer=optimizer.state_dict())
            torch.save(model, fpath)
            net.train()

        iter_tqdm.update(1)

    if iter_tqdm is not None:
        iter_tqdm.close()
    fpath = os.path.join(saveDir, 'dump_last.pth')
    print('[.] Saving:', fpath)
    model = dict(step=step, state_dict=net.module.state_dict(),
                 optimizer=optimizer.state_dict())
    torch.save(model, fpath)
def load_weights(name, model):
    path = '/models/{}/weights.h5'.format(name)
    if os.path.exists(path):
        # Run one small batch through the model so all layers are built
        # before the saved weights are restored.
        images, _ = next(iter(train_dataset().batch(10)))
        model(images)
        model.load_weights(path)
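# No matching save helper appears above. A minimal counterpart sketch that
# follows the same '/models/<name>/weights.h5' layout; this helper is an
# assumption, not part of the original code.
import os

def save_weights(name, model):
    path = '/models/{}/weights.h5'.format(name)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    model.save_weights(path)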