class Trainer(object):
    def __init__(self, use_cuda, load_model, model_folder, train_directory,
                 validation_directory, builder, loss_fn, args, multi_gpu=True):
        self.use_cuda = use_cuda
        self.load_model = load_model
        self.model_folder = model_folder
        self.validation_directory = validation_directory
        self.train_directory = train_directory
        self.args = args
        self.builder = builder
        self.loss_fn = loss_fn
        self.logdir = join(model_folder, 'logs')
        self.writer = SummaryWriter(self.logdir)
        self.logger = Logger(self.args.log_file)
        self.itr = 0

        # Create the model, optionally spread across all visible GPUs.
        self.model = self.create_model()
        if multi_gpu:
            self.model = torch.nn.DataParallel(
                self.model, device_ids=range(torch.cuda.device_count()))

        # Build validation set.
        validation_builder = builder(self.args.n_views, validation_directory,
                                     IMAGE_SIZE, self.args,
                                     sample_size=SAMPLE_SIZE)
        validation_set = [validation_builder.build_set() for _ in range(VAL_SEQS)]
        validation_set = ConcatDataset(validation_set)
        self.len_validation_set = len(validation_set)
        del validation_builder
        self.validation_loader = DataLoader(
            validation_set,
            batch_size=8,
            shuffle=False,
            pin_memory=self.use_cuda,
        )
        self.validation_calls = 0

        # Build the training set in a background process; the queue holds
        # one pre-built dataset at a time.
        self.triplet_builder = builder(self.args.n_views, train_directory,
                                       IMAGE_SIZE, self.args,
                                       sample_size=SAMPLE_SIZE)
        self.training_queue = multiprocessing.Queue(1)
        dataset_builder_process = multiprocessing.Process(
            target=self.build_set,
            args=(self.training_queue, self.triplet_builder, self.logger),
            daemon=True)
        dataset_builder_process.start()

        # Model-specific setup.
        self.optimizer = optim.SGD(self.model.parameters(),
                                   lr=self.args.lr_start, momentum=0.9)
        # Lower the learning rate when the validation loss plateaus, instead
        # of at fixed milestones (0.1, 0.01, 0.001).
        self.learning_rate_scheduler = lr_scheduler.ReduceLROnPlateau(
            self.optimizer, 'min')
        # self.criterion = nn.CrossEntropyLoss()

    def train(self):
        trn_losses_ = []
        val_losses_ = []
        val_acc_ = []
        trn_acc_ = []

        for epoch in range(self.args.start_epoch,
                           self.args.start_epoch + self.args.epochs):
            print("=" * 20)
            self.logger.info("Starting epoch: {0}".format(epoch))

            dataset = self.training_queue.get()
            data_loader = DataLoader(
                dataset=dataset,
                batch_size=self.args.minibatch_size,
                # batch_size(epoch, self.args.max_minibatch_size),
                shuffle=True,
                pin_memory=self.use_cuda,
            )

            train_embedding_features_buffer = []
            train_images_buffer = []
            correct = 0

            # Single pass over the freshly built dataset.
            for _ in range(0, 1):
                losses = []
                for minibatch in data_loader:
                    anchor_frames = minibatch[0].cuda() if self.use_cuda else minibatch[0]
                    # Ground truth is loaded as a 3x3 rotation matrix.
                    anchor_rots = minibatch[1].cuda() if self.use_cuda else minibatch[1]

                    loss, a_pred = self.loss_fn(self.model, anchor_frames,
                                                anchor_rots)
                    losses.append(loss.data.cpu().numpy())

                    # apply() maps the conversion over the batch of rotation
                    # matrices; a prediction within 0.1 (per-sample L2 error)
                    # of the reparameterized target counts as correct.
                    anchor_euler = euler_XYZ_to_reparam(
                        apply(rotationMatrixToEulerAngles, anchor_rots))
                    correct += (torch.norm(a_pred - anchor_euler, 2, dim=1)
                                < 0.1).data.cpu().numpy().sum()

                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                    # Collect embeddings for TensorBoard's projector.
                    train_embedding_features_buffer.append(
                        apply(rotationMatrixToEulerAngles, anchor_rots))
                    train_images_buffer.append(anchor_frames)

            print("logging to {}".format(self.logdir))
            self.writer.add_scalar('data/train_loss', np.mean(losses), self.itr)
            # Normalize by the number of samples, not the number of batches.
            self.writer.add_scalar('data/train_correct',
                                   correct / len(dataset), self.itr)
            self.itr += 1
            trn_losses_.append(np.mean(losses))
            self.logger.info('train loss: {0}'.format(np.mean(losses)))
            self.logger.info("Training score correct {correct}/{total}".format(
                correct=correct, total=len(dataset)))
            trn_acc_.append(correct)

            self.writer.add_image('frame_1', minibatch[0][0], self.itr)

            # Log a subset of the collected embeddings.
            features = torch.cat(train_embedding_features_buffer[:30]).squeeze_()
            images = torch.cat(train_images_buffer[:30]).squeeze_()
            self.writer.add_embedding(features, label_img=images,
                                      global_step=epoch)

            if epoch % 1 == 0:
                loss, correct = self.validate()
                self.learning_rate_scheduler.step(loss)
                val_losses_.append(loss)
                val_acc_.append(correct)

            if epoch % self.args.save_every == 0 and epoch != 0:
                self.logger.info('Saving model.')
                self.save_model(self.model,
                                self.model_filename(self.args.model_name, epoch),
                                join(self.model_folder, 'weight_files'))

            print("logging to {}".format(self.logdir))
            plot_mean(trn_losses_, self.model_folder, 'train_loss')
            plot_mean(val_losses_, self.model_folder, 'validation_loss')
            plot_mean(trn_acc_, self.model_folder, 'train_acc')
            plot_mean(val_acc_, self.model_folder, 'validation_accuracy')

    def validate(self):
        # Run the model on validation data and log the results.
        correct = 0
        losses = []
        for minibatch in self.validation_loader:
            anchor_frames = minibatch[0].cuda() if self.use_cuda else minibatch[0]
            anchor_rots = minibatch[1].cuda() if self.use_cuda else minibatch[1]

            loss, a_pred = self.loss_fn(self.model, anchor_frames, anchor_rots)
            losses.append(loss.data.cpu().numpy())
            anchor_euler = euler_XYZ_to_reparam(
                apply(rotationMatrixToEulerAngles, anchor_rots))
            correct += (torch.norm(a_pred - anchor_euler, 2, dim=1)
                        < 0.1).data.cpu().numpy().sum()

        self.writer.add_scalar('data/valid_loss', np.mean(losses),
                               self.validation_calls)
        self.writer.add_scalar('data/validation_correct',
                               correct / self.len_validation_set,
                               self.validation_calls)
        self.validation_calls += 1
        loss = np.mean(losses)
        self.logger.info("Validation score correct {correct}/{total}".format(
            correct=correct, total=self.len_validation_set))
        self.logger.info('val loss: {0}'.format(loss))
        return loss, correct

    def model_filename(self, model_name, epoch):
        return "{model_name}-epoch-{epoch}.pk".format(model_name=model_name,
                                                      epoch=epoch)

    def save_model(self, model, filename, model_folder):
        ensure_folder(model_folder)
        model_path = os.path.join(model_folder, filename)
        torch.save(model.state_dict(), model_path)

    def build_set(self, queue, triplet_builder, log):
        while True:
            datasets = []
            for i in range(TRAIN_SEQS_PER_EPOCH):
                dataset = triplet_builder.build_set()
                datasets.append(dataset)
            dataset = ConcatDataset(datasets)
            # log.info('Created {0} triplets'.format(len(dataset)))
            queue.put(dataset)

    def create_model(self):
        model = define_model(pretrained=True)
        # model = PosNet()
        if self.load_model:
            model_path = os.path.join(self.model_folder, self.load_model)
            # map_location allows loading models trained on CUDA onto the CPU.
            model.load_state_dict(
                torch.load(model_path,
                           map_location=lambda storage, loc: storage))
        if self.use_cuda:
            model = model.cuda()
        return model

    def batch_size(self, epoch, max_size):
        # Exponential schedule: 2**(epoch // 100), clamped to [2, max_size].
        exponent = epoch // 100
        return min(max(2 ** exponent, 2), max_size)
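# The loss_fn used by the Trainer above is injected by the caller and is not
# defined in this file. A minimal, hypothetical sketch of a compatible loss
# (assuming the same euler_XYZ_to_reparam / rotationMatrixToEulerAngles /
# apply helpers the training loop already uses): MSE between the predicted
# and ground-truth reparameterized Euler pose, returned alongside the
# prediction so the caller can score accuracy.
def example_euler_loss_fn(model, frames, rots):
    import torch.nn.functional as F
    a_pred = model(frames)  # predicted reparameterized Euler angles
    target = euler_XYZ_to_reparam(apply(rotationMatrixToEulerAngles, rots))
    return F.mse_loss(a_pred, target), a_pred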
    parser.add_argument('--num_layers', type=int, default=1,
                        help='number of layers in lstm')
    # parser.add_argument('--num_epochs', type=int, default=5)
    # parser.add_argument('--batch_size', type=int, default=128)
    # parser.add_argument('--num_workers', type=int, default=2)
    # parser.add_argument('--learning_rate', type=float, default=0.001)
    return parser.parse_args()


args = get_args()
print(args)
logger = Logger(args.log_file)


def batch_size(epoch, max_size):
    exponent = epoch // 100
    return min(max(2 ** exponent, 2), max_size)


validation_builder = builder(args.n_views, args.validation_directory,
                             IMAGE_SIZE, args,
                             sample_size=int(SAMPLE_SIZE / 2.0))
validation_set = [validation_builder.build_set() for _ in range(5)]
validation_set = ConcatDataset(validation_set)
del validation_builder
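# Illustration of the batch_size schedule above: 2**(epoch // 100) clamped to
# [2, max_size], i.e. the batch size doubles every 100 epochs once past the
# floor of 2.
assert [batch_size(e, 64) for e in (0, 100, 200, 300, 400)] == [2, 2, 4, 8, 16]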
class Trainer(object):
    def __init__(self, use_cuda, load_model, model_folder, train_directory,
                 validation_directory, builder, args, multi_gpu=True):
        self.use_cuda = use_cuda
        self.load_model = load_model
        self.model_folder = model_folder
        self.validation_directory = validation_directory
        self.train_directory = train_directory
        self.args = args
        self.builder = builder
        self.logdir = join(model_folder, 'logs')
        self.writer = SummaryWriter(self.logdir)
        self.logger = Logger(self.args.log_file)
        self.itr = 0

        # Create the model, optionally spread across all visible GPUs.
        self.model = self.create_model()
        if multi_gpu:
            self.model = torch.nn.DataParallel(
                self.model, device_ids=range(torch.cuda.device_count()))

        # Snapshot this script and its dependencies into the log directory
        # so each run is reproducible.
        script_dir = os.path.dirname(os.path.realpath(__file__))
        copy2(os.path.splitext(os.path.realpath(__file__))[0] + '.py', self.logdir)
        copy2(join(script_dir, 'models', 'pose_predictor_euler.py'), self.logdir)
        copy2(join(script_dir, 'utils', 'builder_utils.py'), self.logdir)
        copy2(join(script_dir, 'utils', 'builders.py'), self.logdir)

        # Build validation set.
        validation_builder = builder(self.args.n_views, validation_directory,
                                     IMAGE_SIZE, self.args,
                                     sample_size=SAMPLE_SIZE)
        validation_set = [validation_builder.build_set() for _ in range(6)]
        validation_set = ConcatDataset(validation_set)
        self.len_validation_set = len(validation_set)
        del validation_builder
        self.validation_loader = DataLoader(
            validation_set,
            batch_size=16,
            shuffle=False,
            pin_memory=self.use_cuda,
        )
        self.validation_calls = 0

        # Build the training set in a background process; the queue holds
        # one pre-built dataset at a time.
        self.triplet_builder = builder(self.args.n_views, train_directory,
                                       IMAGE_SIZE, self.args,
                                       sample_size=SAMPLE_SIZE)
        self.training_queue = multiprocessing.Queue(1)
        dataset_builder_process = multiprocessing.Process(
            target=self.build_set,
            args=(self.training_queue, self.triplet_builder, self.logger),
            daemon=True)
        dataset_builder_process.start()

        # Model-specific setup.
        self.optimizer = optim.SGD(self.model.parameters(),
                                   lr=self.args.lr_start, momentum=0.9)
        # Lower the learning rate when the validation loss plateaus, instead
        # of at fixed milestones (0.1, 0.01, 0.001).
        self.learning_rate_scheduler = lr_scheduler.ReduceLROnPlateau(
            self.optimizer, 'min')
        # self.criterion = nn.CrossEntropyLoss()

    def train(self):
        trn_losses_ = []
        val_losses_ = []
        val_acc_margin_ = []
        val_acc_no_margin_ = []

        for epoch in range(self.args.start_epoch,
                           self.args.start_epoch + self.args.epochs):
            print("=" * 20)
            self.logger.info("Starting epoch: {0}".format(epoch))

            dataset = self.training_queue.get()
            data_loader = DataLoader(
                dataset=dataset,
                batch_size=self.args.minibatch_size,
                # batch_size(epoch, self.args.max_minibatch_size),
                shuffle=True,
                pin_memory=self.use_cuda,
            )

            train_embedding_features_buffer = []
            train_images_buffer = []

            for _ in range(0, ITERATE_OVER_TRIPLETS):
                losses = []
                for minibatch, _ in data_loader:
                    frames = minibatch.cuda() if self.use_cuda else minibatch
                    # Each sample is an (anchor, positive, negative) triplet.
                    anchor_frames = frames[:, 0, :, :, :]
                    positive_frames = frames[:, 1, :, :, :]
                    negative_frames = frames[:, 2, :, :, :]

                    anchor_output, unnormalized, _ = self.model(anchor_frames)
                    positive_output, _, _ = self.model(positive_frames)
                    negative_output, _, _ = self.model(negative_frames)

                    d_positive = distance(anchor_output, positive_output)
                    d_negative = distance(anchor_output, negative_output)
                    # Standard triplet hinge loss with margin.
                    loss_triplet = torch.clamp(
                        self.args.margin + d_positive - d_negative,
                        min=0.0).mean()
                    loss = loss_triplet
                    losses.append(loss.data.cpu().numpy())

                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                    # Collect embeddings for TensorBoard's projector.
                    train_embedding_features_buffer.append(anchor_output)
                    train_images_buffer.append(anchor_frames)

            print("logging to {}".format(self.logdir))
            self.writer.add_scalar('data/train_triplet_loss', np.mean(losses),
                                   self.itr)
            self.itr += 1
            trn_losses_.append(np.mean(losses))
            self.logger.info('train loss: {0}'.format(np.mean(losses)))
            self.writer.add_image('frame_anchor', minibatch[0][0], 0)
            self.writer.add_image('frame_positive', minibatch[0][1], 1)
            self.writer.add_image('frame_negative', minibatch[0][2], 2)

            # Log the collected embeddings.
            features = torch.cat(train_embedding_features_buffer).squeeze_()
            images = torch.cat(train_images_buffer).squeeze_()
            self.writer.add_embedding(features, label_img=images,
                                      global_step=epoch)

            if epoch % 1 == 0:
                acc_margin, acc_no_margin, loss = self.validate()
                self.learning_rate_scheduler.step(loss)
                val_losses_.append(loss)
                val_acc_margin_.append(acc_margin)
                val_acc_no_margin_.append(acc_no_margin)

            if epoch % self.args.save_every == 0 and epoch != 0:
                self.logger.info('Saving model.')
                self.save_model(self.model,
                                self.model_filename(self.args.model_name, epoch),
                                join(self.model_folder, 'weight_files'))

            print("logging to {}".format(self.logdir))
            plot_mean(trn_losses_, self.model_folder, 'train_loss')
            plot_mean(val_losses_, self.model_folder, 'validation_loss')
            plot_mean(val_acc_margin_, self.model_folder,
                      'validation_accuracy_margin')
            plot_mean(val_acc_no_margin_, self.model_folder,
                      'validation_accuracy_no_margin')

    def validate(self):
        # Run the model on validation data and log the results.
        correct_with_margin = 0
        correct_without_margin = 0
        losses = []
        for minibatch, _ in self.validation_loader:
            frames = minibatch.cuda() if self.use_cuda else minibatch
            anchor_frames = frames[:, 0, :, :, :]
            positive_frames = frames[:, 1, :, :, :]
            negative_frames = frames[:, 2, :, :, :]

            anchor_output, unnormalized, _ = self.model(anchor_frames)
            positive_output, _, _ = self.model(positive_frames)
            negative_output, _, _ = self.model(negative_frames)

            d_positive = distance(anchor_output, positive_output)
            d_negative = distance(anchor_output, negative_output)
            assert d_positive.size()[0] == minibatch.size()[0]

            correct_with_margin += ((d_positive + self.args.margin)
                                    < d_negative).data.cpu().numpy().sum()
            correct_without_margin += (d_positive
                                       < d_negative).data.cpu().numpy().sum()

            loss_triplet = torch.clamp(
                self.args.margin + d_positive - d_negative, min=0.0).mean()
            loss = loss_triplet
            losses.append(loss.data.cpu().numpy())

        self.writer.add_scalar('data/validation_loss', np.mean(losses),
                               self.validation_calls)
        self.writer.add_scalar('data/validation_correct_with_margin',
                               correct_with_margin / self.len_validation_set,
                               self.validation_calls)
        self.writer.add_scalar('data/validation_correct_without_margin',
                               correct_without_margin / self.len_validation_set,
                               self.validation_calls)
        self.validation_calls += 1
        loss = np.mean(losses)
        self.logger.info('val loss: {0}'.format(loss))
        message = ("Validation score correct with margin {with_margin}/{total} "
                   "and without margin {without_margin}/{total}").format(
                       with_margin=correct_with_margin,
                       without_margin=correct_without_margin,
                       total=self.len_validation_set)
        self.logger.info(message)
        return correct_with_margin, correct_without_margin, loss

    def model_filename(self, model_name, epoch):
        return "{model_name}-epoch-{epoch}.pk".format(model_name=model_name,
                                                      epoch=epoch)

    def save_model(self, model, filename, model_folder):
        ensure_folder(model_folder)
        model_path = os.path.join(model_folder, filename)
        torch.save(model.state_dict(), model_path)

    def build_set(self, queue, triplet_builder, log):
        while True:
            datasets = []
            for i in range(3):
                dataset = triplet_builder.build_set()
                datasets.append(dataset)
            dataset = ConcatDataset(datasets)
            # log.info('Created {0} triplets'.format(len(dataset)))
            queue.put(dataset)

    def create_model(self):
        model = define_model(pretrained=True)
        # model = PosNet()
        if self.load_model:
            model_path = os.path.join(self.model_folder, self.load_model)
            # map_location allows loading models trained on CUDA onto the CPU.
            model.load_state_dict(
                torch.load(model_path,
                           map_location=lambda storage, loc: storage))
        if self.use_cuda:
            model = model.cuda()
        return model

    def batch_size(self, epoch, max_size):
        # Exponential schedule: 2**(epoch // 100), clamped to [2, max_size].
        exponent = epoch // 100
        return min(max(2 ** exponent, 2), max_size)
    def __init__(self, use_cuda, load_model, model_folder, train_directory,
                 validation_directory, builder, loss_fn, args, multi_gpu=True):
        self.use_cuda = use_cuda
        self.load_model = load_model
        self.model_folder = model_folder
        self.validation_directory = validation_directory
        self.train_directory = train_directory
        self.args = args
        self.builder = builder
        self.loss_fn = loss_fn
        self.logdir = join(model_folder, 'logs')
        self.writer = SummaryWriter(self.logdir)
        self.logger = Logger(self.args.log_file)
        self.itr = 0

        # Create the model, optionally spread across all visible GPUs.
        self.model = self.create_model()
        if multi_gpu:
            self.model = torch.nn.DataParallel(
                self.model, device_ids=range(torch.cuda.device_count()))

        # Build validation set (toRot=True: labels are 3x3 rotation matrices).
        validation_builder = builder(self.args.n_views, validation_directory,
                                     IMAGE_SIZE, self.args, toRot=True,
                                     sample_size=SAMPLE_SIZE)
        validation_set = [validation_builder.build_set() for _ in range(VAL_SEQS)]
        validation_set = ConcatDataset(validation_set)
        self.len_validation_set = len(validation_set)
        del validation_builder
        self.validation_loader = DataLoader(
            validation_set,
            batch_size=8,
            shuffle=False,
            pin_memory=self.use_cuda,
        )
        self.validation_calls = 0

        # Build the training set in a background process; the queue holds
        # one pre-built dataset at a time.
        self.triplet_builder = builder(self.args.n_views, train_directory,
                                       IMAGE_SIZE, self.args, toRot=True,
                                       sample_size=SAMPLE_SIZE)
        self.training_queue = multiprocessing.Queue(1)
        dataset_builder_process = multiprocessing.Process(
            target=self.build_set,
            args=(self.training_queue, self.triplet_builder, self.logger),
            daemon=True)
        dataset_builder_process.start()

        # Model-specific setup.
        # self.optimizer = optim.SGD(self.model.parameters(),
        #                            lr=self.args.lr_start, momentum=0.9)
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001,
                                    betas=(0.9, 0.999), eps=1e-08)
        # Lower the learning rate when the validation loss plateaus, instead
        # of at fixed milestones (0.1, 0.01, 0.001).
        self.learning_rate_scheduler = lr_scheduler.ReduceLROnPlateau(
            self.optimizer, 'min')
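# rotationMatrixToEulerAngles comes from the repo's utils; for reference, a
# standard XYZ Euler decomposition of a single 3x3 rotation matrix
# (hypothetical stand-in, not necessarily the repo's implementation):
import math

def rotation_matrix_to_euler_xyz(R):
    sy = math.sqrt(R[0, 0] ** 2 + R[1, 0] ** 2)
    if sy > 1e-6:
        x = math.atan2(R[2, 1], R[2, 2])
        y = math.atan2(-R[2, 0], sy)
        z = math.atan2(R[1, 0], R[0, 0])
    else:  # gimbal lock: pitch is close to +/-90 degrees
        x = math.atan2(-R[1, 2], R[1, 1])
        y = math.atan2(-R[2, 0], sy)
        z = 0.0
    return np.array([x, y, z])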