def __init__(self, args):
    """Set up the force-predictor / neural-simulator pair and the two-loss config."""
    super(SeperateFPAndNS, self).__init__(args)
    self.loss_function = args.loss
    self.number_of_cp = args.number_of_cp
    self.gpu_ids = args.gpu_ids
    self.all_obj_names = args.object_list

    # Two-loss training scheme: None means both losses stay active; True/False
    # picks a single loss when the other's weight is effectively zero.
    self.joint_two_losses = args.joint_two_losses
    if args.loss1_w < 0.00001:
        self.loss1_or_loss2 = False  # update loss2 only
    elif args.loss2_w < 0.00001:
        self.loss1_or_loss2 = True  # update loss1 only
    else:
        self.loss1_or_loss2 = None
    self.ns_optim, self.fp_optim = None, None

    # Gradient-visualisation hooks for debugging.
    self.vis_grad = args.vis_grad
    self.grad_vis = None
    self.train_res = args.train_res or self.vis_grad

    # The two sub-networks this wrapper combines.
    self.fp = ForcePredictor(args)
    self.ns = NeuralForceSimulator(args=args)

    # Per-object keypoint templates (marked "deprecating" in the original).
    self.all_objects_keypoint_tensor = get_all_objects_keypoint_tensors(args.data)
    if args.gpu_ids != -1:
        for obj_name, keypoints in self.all_objects_keypoint_tensor.items():
            self.all_objects_keypoint_tensor[obj_name] = keypoints.cuda()
def __init__(self, args):
    """Build the neural force simulator: ResNet image encoder, object embed,
    LSTM encoder, and a per-step NS layer (LSTM- or MLP-with-image variant)."""
    super(NeuralForceSimulator, self).__init__()
    self.clean_force = False

    # Simulator configuration.
    self.only_first_img_feature = True
    self.vis_grad = args.vis_grad
    self.train_res = args.train_res or self.vis_grad
    self.hidden_size = 512
    self.image_feature_dim = 512
    self.num_layers = 3
    self.sequence_length = args.sequence_length
    self.object_feature_size = 512
    self.environment = args.instance_environment
    self.number_of_cp = args.number_of_cp

    # ResNet-18 backbone; the classifier head is unused. Kept in eval mode
    # unless the residual backbone itself is being trained.
    self.feature_extractor = resnet18(pretrained=args.pretrain)
    del self.feature_extractor.fc
    if not self.train_res:
        self.feature_extractor.eval()

    self.use_lstm = args.lstm
    self.norm_position = False
    ns_layer_cls = NSLSTM if self.use_lstm else NSWithImageFeature
    self.one_ns_layer = ns_layer_cls(
        hidden_size=self.hidden_size,
        layer_norm=False,
        image_feature_dim=self.image_feature_dim,
        norm_position=self.norm_position)

    self.image_embed = combine_block_w_do(512, 64, args.dropout_ratio)
    # 3 + 4: object position (xyz) plus orientation quaternion.
    object_embed_dims = torch.Tensor([3 + 4, 100, self.object_feature_size])
    self.input_object_embed = input_embedding_net(
        object_embed_dims.long().tolist(), dropout=args.dropout_ratio)
    self.lstm_encoder = nn.LSTM(
        input_size=self.hidden_size + 64 * 7 * 7,
        hidden_size=self.hidden_size,
        batch_first=True,
        num_layers=self.num_layers)

    assert self.number_of_cp == 5  # for five fingers

    self.all_objects_keypoint_tensor = get_all_objects_keypoint_tensors(args.data)
    if args.gpu_ids != -1:
        for obj_name, keypoints in self.all_objects_keypoint_tensor.items():
            self.all_objects_keypoint_tensor[obj_name] = keypoints.cuda()

    # Mixing weights for neural / physics / ground-truth force contributions,
    # normalised so they sum to one.
    self.ns_ratio, self.phy_ratio, self.gt_ratio = 1, 1, 0
    ratio_sum = self.ns_ratio + self.phy_ratio + self.gt_ratio
    self.ns_ratio /= ratio_sum
    self.phy_ratio /= ratio_sum
    self.gt_ratio /= ratio_sum
def __init__(self, args):
    """Model that predicts the object's initial pose and then per-step forces
    via an LSTM encoder/decoder over image and state embeddings."""
    super(PredictInitPoseAndForce, self).__init__(args)
    self.environment_layer = EnvWHumanCpFiniteDiffFast
    self.loss_function = args.loss
    self.relu = nn.LeakyReLU()
    self.number_of_cp = args.number_of_cp
    self.environment = args.instance_environment
    self.sequence_length = args.sequence_length
    self.gpu_ids = args.gpu_ids

    # Frozen ResNet-18 backbone; classifier head removed.
    self.feature_extractor = resnet18(pretrained=args.pretrain)
    del self.feature_extractor.fc
    self.feature_extractor.eval()

    self.image_feature_size = 512
    self.object_feature_size = 512
    self.hidden_size = 512
    self.num_layers = 3
    self.input_feature_size = self.object_feature_size
    self.cp_feature_size = self.number_of_cp * 3

    self.image_embed = combine_block_w_do(512, 64, args.dropout_ratio)
    # (2 + 3) * 10 inputs -> 7-D pose (xyz + quaternion); presumably ten
    # keypoints with 2-D image and 3-D world coordinates — confirm upstream.
    init_pose_dims = torch.Tensor([(2 + 3) * 10, 100, 3 + 4])
    self.predict_initial_pose = input_embedding_net(
        init_pose_dims.long().tolist(), dropout=args.dropout_ratio)
    object_embed_dims = torch.Tensor([3 + 4, 100, self.object_feature_size])
    self.input_object_embed = input_embedding_net(
        object_embed_dims.long().tolist(), dropout=args.dropout_ratio)
    state_embed_dims = torch.Tensor(
        [EnvState.total_size + self.cp_feature_size, 100, self.object_feature_size])
    self.state_embed = input_embedding_net(
        state_embed_dims.long().tolist(), dropout=args.dropout_ratio)

    self.lstm_encoder = nn.LSTM(
        input_size=self.hidden_size + 64 * 7 * 7,
        hidden_size=self.hidden_size,
        batch_first=True,
        num_layers=self.num_layers)
    self.lstm_decoder = nn.LSTMCell(
        input_size=self.hidden_size * 2, hidden_size=self.hidden_size)
    force_decoder_dims = torch.Tensor(
        [self.hidden_size, 100, (3) * self.number_of_cp])
    self.forces_directions_decoder = input_embedding_net(
        force_decoder_dims.long().tolist(), dropout=args.dropout_ratio)

    assert args.batch_size == 1, 'have not been implemented yet, because of the environment'
    assert self.number_of_cp == 5  # for five fingers

    self.all_objects_keypoint_tensor = get_all_objects_keypoint_tensors(args.data)
    if args.gpu_ids != -1:
        for obj_name, keypoints in self.all_objects_keypoint_tensor.items():
            self.all_objects_keypoint_tensor[obj_name] = keypoints.cuda()
        # Move the module-level default image size to the GPU as well.
        global DEFAULT_IMAGE_SIZE
        DEFAULT_IMAGE_SIZE = DEFAULT_IMAGE_SIZE.cuda()
def __init__(self, args):
    """Contact-point-only model: predicts the five contact points, no forces."""
    super(NoForceOnlyCPModel, self).__init__(args)
    self.environment_layer = EnvWHumanCpFiniteDiffFast
    self.loss_function = args.loss
    self.number_of_cp = args.number_of_cp
    self.environment = args.instance_environment
    self.sequence_length = args.sequence_length
    self.gpu_ids = args.gpu_ids

    # Selectable frozen backbone; classifier head removed.
    backbones = {'resnet18': resnet18, 'resnet50': resnet50}
    self.feature_extractor = backbones[args.feature_extractor](
        pretrained=args.pretrain)
    del self.feature_extractor.fc
    self.feature_extractor.eval()
    # ResNet-18 ends in 512 channels, ResNet-50 in 2048.
    self.image_feature_size = 512 if args.feature_extractor == 'resnet18' else 2048

    self.object_feature_size = 512
    self.hidden_size = 512
    self.num_layers = 3
    self.input_feature_size = self.object_feature_size
    self.cp_feature_size = self.number_of_cp * 3

    self.image_embed = combine_block_w_do(
        self.image_feature_size, 64, args.dropout_ratio)
    # 3 + 4: object position (xyz) plus orientation quaternion.
    object_embed_dims = torch.Tensor([3 + 4, 100, self.object_feature_size])
    self.input_object_embed = input_embedding_net(
        object_embed_dims.long().tolist(), dropout=args.dropout_ratio)
    self.lstm_encoder = nn.LSTM(
        input_size=64 * 7 * 7 + 512,
        hidden_size=self.hidden_size,
        batch_first=True,
        num_layers=self.num_layers)
    cp_decoder_dims = torch.Tensor(
        [self.hidden_size, 100, (3) * self.number_of_cp])
    self.contact_point_decoder = input_embedding_net(
        cp_decoder_dims.long().tolist(), dropout=args.dropout_ratio)

    assert self.number_of_cp == 5  # for five fingers

    self.all_objects_keypoint_tensor = get_all_objects_keypoint_tensors(args.data)
    if args.gpu_ids != -1:
        for obj_name, keypoints in self.all_objects_keypoint_tensor.items():
            self.all_objects_keypoint_tensor[obj_name] = keypoints.cuda()
def __init__(self, args):
    """Baseline that regresses per-finger forces directly from image features
    and embedded contact points, with no physics environment in the loop."""
    super(BaselineRegressForce, self).__init__(args)
    self.environment_layer = EnvWHumanCpFiniteDiffFast
    self.loss_function = args.loss
    self.relu = nn.LeakyReLU()
    self.number_of_cp = args.number_of_cp
    self.environment = args.instance_environment
    self.sequence_length = args.sequence_length
    self.gpu_ids = args.gpu_ids

    # Frozen ResNet-18 backbone; classifier head removed.
    self.feature_extractor = resnet18(pretrained=args.pretrain)
    del self.feature_extractor.fc
    self.feature_extractor.eval()

    self.image_feature_size = 512
    self.object_feature_size = 512
    self.hidden_size = 512
    self.num_layers = 3
    self.input_feature_size = self.object_feature_size
    self.cp_feature_size = self.number_of_cp * 3

    self.image_embed = combine_block_w_do(512, 64, args.dropout_ratio)
    object_embed_dims = torch.Tensor([3 + 4, 100, self.object_feature_size])
    self.input_object_embed = input_embedding_net(
        object_embed_dims.long().tolist(), dropout=args.dropout_ratio)
    # 3 * 5: xyz for each of the five contact points.
    cp_embed_dims = torch.Tensor([3 * 5, 100, self.object_feature_size])
    self.contact_point_embed = input_embedding_net(
        cp_embed_dims.long().tolist(), dropout=args.dropout_ratio)
    self.lstm_encoder = nn.LSTM(
        input_size=64 * 7 * 7 + 512,
        hidden_size=self.hidden_size,
        batch_first=True,
        num_layers=self.num_layers)
    force_decoder_dims = torch.Tensor(
        [self.hidden_size * 2, 100, (3) * self.number_of_cp])
    self.force_decoder = input_embedding_net(
        force_decoder_dims.long().tolist(), dropout=args.dropout_ratio)

    # Training runs with real batches (no gradient accumulation); evaluation
    # processes one sample at a time.
    assert (args.mode == 'train' and args.batch_size > 1 and args.break_batch == 1) \
        or (args.mode != 'train' and args.batch_size == 1)
    self.train_mode = (args.mode == 'train')

    assert self.number_of_cp == 5  # for five fingers

    self.all_objects_keypoint_tensor = get_all_objects_keypoint_tensors(args.data)
    if args.gpu_ids != -1:
        for obj_name, keypoints in self.all_objects_keypoint_tensor.items():
            self.all_objects_keypoint_tensor[obj_name] = keypoints.cuda()

    # At test time additionally report object keypoint / rotation / position
    # metrics on the class-level metric list.
    if not self.train_mode:
        BaselineRegressForce.metric += [
            metrics.ObjKeypointMetric,
            metrics.ObjRotationMetric,
            metrics.ObjPositionMetric,
        ]
def __init__(self, args):
    """Learning-free baseline: no network beyond a placeholder linear layer.

    NOTE(review): the dummy layer presumably exists only so the module has
    at least one parameter for the optimizer — confirm against the trainer.
    """
    super(NoModelGTForceBaseline, self).__init__(args)
    self.environment_layer = EnvWHumanCpFiniteDiffFast
    self.loss_function = args.loss
    self.number_of_cp = args.number_of_cp
    self.environment = args.instance_environment
    self.sequence_length = args.sequence_length
    self.gpu_ids = args.gpu_ids
    self.dummy_layer = nn.Linear(10, 10)
    self.this_loss_func = self.loss(args)
    self.all_objects_keypoint_tensor = get_all_objects_keypoint_tensors(args.data)
    if args.gpu_ids != -1:
        for obj_name, keypoints in self.all_objects_keypoint_tensor.items():
            self.all_objects_keypoint_tensor[obj_name] = keypoints.cuda()
def __init__(self, args):
    """Heatmap-style contact-point model using the batched CP gradient layer."""
    super(BatchCPHeatmapModel, self).__init__(args)
    self.use_syn = args.use_syn
    self.ori_w, self.ori_h = 1920, 1080  # original frame resolution
    self.environment_layer = BatchCPGradientLayer
    self.loss_function = args.loss
    self.relu = nn.LeakyReLU()
    self.number_of_cp = args.number_of_cp
    self.environment = args.instance_environment
    self.sequence_length = args.sequence_length
    self.gpu_ids = args.gpu_ids

    # Frozen ResNet-18 backbone; classifier head removed.
    self.feature_extractor = resnet18(pretrained=args.pretrain)
    del self.feature_extractor.fc
    self.feature_extractor.eval()

    self.image_feature_size = 512
    self.object_feature_size = 512
    self.hidden_size = 512
    self.num_layers = 3
    self.input_feature_size = self.object_feature_size
    self.cp_feature_size = self.number_of_cp * 3

    # With synthetic input the embedded feature plane has 1024 channels —
    # presumably two stacked 512-channel maps; confirm with the data loader.
    plane_dim = 1024 if self.use_syn else 512
    self.image_embed = combine_block_w_do(plane_dim, 64, args.dropout_ratio)
    self.contact_point_image_embed = combine_block_w_do(
        plane_dim, 64, args.dropout_ratio)

    object_embed_dims = torch.Tensor([3 + 4, 100, self.object_feature_size])
    self.input_object_embed = input_embedding_net(
        object_embed_dims.long().tolist(), dropout=args.dropout_ratio)
    self.contact_point_input_object_embed = input_embedding_net(
        object_embed_dims.long().tolist(), dropout=args.dropout_ratio)
    state_embed_dims = torch.Tensor(
        [EnvState.total_size + self.cp_feature_size, 100, self.object_feature_size])
    self.state_embed = input_embedding_net(
        state_embed_dims.long().tolist(), dropout=args.dropout_ratio)

    self.lstm_encoder = nn.LSTM(
        input_size=self.hidden_size + 64 * 7 * 7,
        hidden_size=self.hidden_size,
        batch_first=True,
        num_layers=self.num_layers)
    self.contact_point_encoder = nn.LSTM(
        input_size=self.hidden_size + 64 * 7 * 7,
        hidden_size=self.hidden_size,
        batch_first=True,
        num_layers=self.num_layers)
    cp_decoder_dims = torch.Tensor(
        [self.hidden_size, 100, (3) * self.number_of_cp])
    self.contact_point_decoder = input_embedding_net(
        cp_decoder_dims.long().tolist(), dropout=args.dropout_ratio)
    self.lstm_decoder = nn.LSTMCell(
        input_size=self.hidden_size * 2, hidden_size=self.hidden_size)
    force_decoder_dims = torch.Tensor(
        [self.hidden_size, 100, (3) * self.number_of_cp])
    self.forces_directions_decoder = input_embedding_net(
        force_decoder_dims.long().tolist(), dropout=args.dropout_ratio)

    assert args.batch_size == 1, 'we do not use more than 1 batch size, but accumulate gradients of several steps.'
    assert self.number_of_cp == 5  # for five fingers

    self.all_objects_keypoint_tensor = get_all_objects_keypoint_tensors(args.data)
    if args.gpu_ids != -1:
        for obj_name, keypoints in self.all_objects_keypoint_tensor.items():
            self.all_objects_keypoint_tensor[obj_name] = keypoints.cuda()
def __init__(self, args):
    """Force predictor: encodes frames plus object state and decodes the five
    contact points and per-finger force directions."""
    super(ForcePredictor, self).__init__()
    self.image_feature_size = 512
    self.object_feature_size = 512
    self.hidden_size = 512
    self.num_layers = 3
    self.sequence_length = args.sequence_length
    self.number_of_cp = args.number_of_cp
    self.use_gt_cp = args.use_gt_cp

    # Gradient-visualisation hooks for debugging.
    self.vis_grad = args.vis_grad
    self.train_res = args.train_res or self.vis_grad
    self.grad_value = None

    # ResNet-18 backbone; kept in eval mode unless the residual backbone is
    # being trained.
    self.feature_extractor = resnet18(pretrained=args.pretrain)
    del self.feature_extractor.fc
    if not self.train_res:
        self.feature_extractor.eval()

    self.input_feature_size = self.object_feature_size
    self.cp_feature_size = self.number_of_cp * 3

    self.image_embed = combine_block_w_do(512, 64, args.dropout_ratio)
    self.contact_point_image_embed = combine_block_w_do(
        512, 64, args.dropout_ratio)
    # 3 + 4: object position (xyz) plus orientation quaternion.
    object_embed_dims = torch.Tensor([3 + 4, 100, self.object_feature_size])
    self.input_object_embed = input_embedding_net(
        object_embed_dims.long().tolist(), dropout=args.dropout_ratio)
    self.contact_point_input_object_embed = input_embedding_net(
        object_embed_dims.long().tolist(), dropout=args.dropout_ratio)
    state_embed_dims = torch.Tensor(
        [NoGradEnvState.total_size + self.cp_feature_size, 100, self.object_feature_size])
    self.state_embed = input_embedding_net(
        state_embed_dims.long().tolist(), dropout=args.dropout_ratio)

    self.lstm_encoder = nn.LSTM(
        input_size=self.hidden_size + 64 * 7 * 7,
        hidden_size=self.hidden_size,
        batch_first=True,
        num_layers=self.num_layers)
    self.contact_point_encoder = nn.LSTM(
        input_size=self.hidden_size + 64 * 7 * 7,
        hidden_size=self.hidden_size,
        batch_first=True,
        num_layers=self.num_layers)
    cp_decoder_dims = torch.Tensor(
        [self.hidden_size, 100, (3) * self.number_of_cp])
    self.contact_point_decoder = input_embedding_net(
        cp_decoder_dims.long().tolist(), dropout=args.dropout_ratio)
    self.lstm_decoder = nn.LSTMCell(
        input_size=self.hidden_size * 2, hidden_size=self.hidden_size)
    force_decoder_dims = torch.Tensor(
        [self.hidden_size, 100, (3) * self.number_of_cp])
    self.forces_directions_decoder = input_embedding_net(
        force_decoder_dims.long().tolist(), dropout=args.dropout_ratio)

    assert args.batch_size == 1, 'have not been implemented yet, because of the environment'
    assert self.number_of_cp == 5  # for five fingers

    self.all_objects_keypoint_tensor = get_all_objects_keypoint_tensors(args.data)
    if args.gpu_ids != -1:
        for obj_name, keypoints in self.all_objects_keypoint_tensor.items():
            self.all_objects_keypoint_tensor[obj_name] = keypoints.cuda()
def __init__(self, args):
    """Joint model: a neural-simulator layer trained together with the full
    force-predictor network, under a configurable two-loss scheme."""
    super(JointNS, self).__init__(args)
    self.image_feature_size = 512
    self.object_feature_size = 512
    self.hidden_size = 512
    self.num_layers = 3
    self.loss_function = args.loss
    self.number_of_cp = args.number_of_cp
    self.environment = args.instance_environment
    self.sequence_length = args.sequence_length
    self.gpu_ids = args.gpu_ids
    self.all_obj_names = args.object_list
    self.use_gt_cp = args.use_gt_cp
    self.clean_force = True

    # Two-loss training scheme: None means both losses stay active; True/False
    # picks a single loss when the other's weight is effectively zero.
    self.joint_two_losses = args.joint_two_losses
    if args.loss1_w < 0.00001:
        self.loss1_or_loss2 = False  # update loss2 only
    elif args.loss2_w < 0.00001:
        self.loss1_or_loss2 = True  # update loss1 only
    else:
        self.loss1_or_loss2 = None
    self.loss1_optim, self.loss2_optim, self.joint_optim = None, None, None

    # Neural-force-simulator layer; image-conditioned variant by default.
    self.use_image_feature = True
    if self.use_image_feature:
        self.one_ns_layer = NSWithImageFeature(
            hidden_size=64, layer_norm=False, image_feature_dim=512)
    else:
        self.one_ns_layer = MLPNS(hidden_size=64, layer_norm=False)

    # Force-predictor networks.
    self.feature_extractor = resnet18(pretrained=args.pretrain)
    del self.feature_extractor.fc
    self.feature_extractor.eval()
    self.input_feature_size = self.object_feature_size
    self.cp_feature_size = self.number_of_cp * 3
    self.image_embed = combine_block_w_do(512, 64, args.dropout_ratio)
    self.contact_point_image_embed = combine_block_w_do(
        512, 64, args.dropout_ratio)
    object_embed_dims = torch.Tensor([3 + 4, 100, self.object_feature_size])
    self.input_object_embed = input_embedding_net(
        object_embed_dims.long().tolist(), dropout=args.dropout_ratio)
    self.contact_point_input_object_embed = input_embedding_net(
        object_embed_dims.long().tolist(), dropout=args.dropout_ratio)
    state_embed_dims = torch.Tensor(
        [NoGradEnvState.total_size + self.cp_feature_size, 100, self.object_feature_size])
    self.state_embed = input_embedding_net(
        state_embed_dims.long().tolist(), dropout=args.dropout_ratio)
    self.lstm_encoder = nn.LSTM(
        input_size=self.hidden_size + 64 * 7 * 7,
        hidden_size=self.hidden_size,
        batch_first=True,
        num_layers=self.num_layers)
    self.contact_point_encoder = nn.LSTM(
        input_size=self.hidden_size + 64 * 7 * 7,
        hidden_size=self.hidden_size,
        batch_first=True,
        num_layers=self.num_layers)
    cp_decoder_dims = torch.Tensor(
        [self.hidden_size, 100, (3) * self.number_of_cp])
    self.contact_point_decoder = input_embedding_net(
        cp_decoder_dims.long().tolist(), dropout=args.dropout_ratio)
    self.lstm_decoder = nn.LSTMCell(
        input_size=self.hidden_size * 2, hidden_size=self.hidden_size)
    force_decoder_dims = torch.Tensor(
        [self.hidden_size, 100, (3) * self.number_of_cp])
    self.forces_directions_decoder = input_embedding_net(
        force_decoder_dims.long().tolist(), dropout=args.dropout_ratio)

    assert args.batch_size == 1, 'have not been implemented yet, because of the environment'
    assert self.number_of_cp == 5  # for five fingers

    self.all_objects_keypoint_tensor = get_all_objects_keypoint_tensors(args.data)
    if args.gpu_ids != -1:
        for obj_name, keypoints in self.all_objects_keypoint_tensor.items():
            self.all_objects_keypoint_tensor[obj_name] = keypoints.cuda()

    # Modules belonging to the force-predictor side — NOTE(review): presumably
    # used to build its optimizer separately from the NS layer; confirm.
    self.force_predictor_modules = [
        self.feature_extractor,
        self.image_embed,
        self.contact_point_image_embed,
        self.input_object_embed,
        self.contact_point_input_object_embed,
        self.state_embed,
        self.lstm_encoder,
        self.contact_point_encoder,
        self.contact_point_decoder,
        self.forces_directions_decoder,
    ]

    # Gradient-visualisation hooks for debugging.
    self.vis_grad = args.vis_grad
    self.grad_vis = None
    self.train_res = args.train_res or self.vis_grad