def make_arg_parser():
    parser = argparse.ArgumentParser()
    ImageFeatures.add_args(parser)
    parser.add_argument("--load_scorer", type=str, default='')
    parser.add_argument("--load_follower", type=str, default='')
    parser.add_argument("--load_traj_encoder", type=str, default='')
    parser.add_argument(
        "--feedback_method",
        choices=["sample", "teacher", "sample1step", "sample2step",
                 "sample3step", "teacher+sample", "recover"],
        default="sample")
    parser.add_argument("--debug", action='store_true')
    parser.add_argument("--bidirectional", action='store_true')
    parser.add_argument("--transformer", action='store_true')
    parser.add_argument("--scorer", action='store_true')
    parser.add_argument("--coground", action='store_false')
    parser.add_argument("--prog_monitor", action='store_false')
    parser.add_argument("--dev_monitor", action='store_true')
    parser.add_argument("--bt_button", action='store_true')
    parser.add_argument("--soft_align", action='store_true')
    parser.add_argument("--n_iters", type=int, default=20000)
    parser.add_argument("--num_head", type=int, default=1)
    parser.add_argument("--use_pretraining", action='store_true')
    parser.add_argument("--grad", type=str, default='all')
    parser.add_argument("--pretrain_splits", nargs="+", default=[])
    parser.add_argument("--n_pretrain_iters", type=int, default=50000)
    parser.add_argument("--no_save", action='store_true')
    parser.add_argument("--use_glove", action='store_true')
    parser.add_argument("--attn_only_verb", action='store_true')
    parser.add_argument("--use_train_subset", action='store_true',
                        help="use a subset of the original train data for validation")
    parser.add_argument("--use_test_set", action='store_true')
    parser.add_argument("--seed", type=int, default=1)
    return parser
def make_arg_parser():
    parser = argparse.ArgumentParser()
    ImageFeatures.add_args(parser)
    parser.add_argument("--use_train_subset", action='store_true',
                        help="use a subset of the original train data for validation")
    parser.add_argument("--n_iters", type=int, default=20000)
    parser.add_argument("--no_save", action='store_true')
    return parser
def make_arg_parser():
    parser = argparse.ArgumentParser()
    ImageFeatures.add_args(parser)
    parser.add_argument("--use_train_subset", action='store_true',
                        help="use a subset of the original train data for validation")
    parser.add_argument("--n_iters", type=int, default=20000)
    parser.add_argument("--no_save", action='store_true')
    parser.add_argument("--result_dir", default=RESULT_DIR)
    parser.add_argument("--snapshot_dir", default=SNAPSHOT_DIR)
    parser.add_argument("--plot_dir", default=PLOT_DIR)
    return parser
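# How these parsers are typically driven (a minimal sketch; the entry-point
# name `train_val` and the use of parse_args() here are assumptions, not
# taken from the snippets above):
#
#     if __name__ == "__main__":
#         args = make_arg_parser().parse_args()
#         train_val(args)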
def make_env_and_models(args, train_vocab_path, train_splits, test_splits):
    setup(args.seed)
    image_features_list = ImageFeatures.from_args(args)
    if args.job is None:  # build the vocab only during training (job is None)
        vocab = build_vocab(train_splits)
        write_vocab(vocab, TRAIN_VOCAB)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=args.batch_size,
                         splits=train_splits, tokenizer=tok) \
        if len(train_splits) > 0 else None
    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=args.batch_size,
                         splits=[split], tokenizer=tok),
                Evaluation(split, args.instrType))
        for split in test_splits
    }
    agent = make_follower(args, vocab)
    agent.env = train_env
    if args.useObjLabelOrVis in ['label', 'both']:
        if train_env is not None:
            agent.pointer.wtoi = train_env.wtoi
        else:
            agent.pointer.wtoi = test_envs[test_splits[0]][0].wtoi
    return train_env, test_envs, agent
def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        test_instruction_limit=None):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size,
                         splits=train_splits, tokenizer=tok)
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    encoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=bidirectional))
    decoder = try_cuda(SpeakerDecoderLSTM(
        len(vocab), word_embedding_size, hidden_size, dropout_ratio,
        glove=glove))
    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size,
                         splits=[split], tokenizer=tok,
                         instruction_limit=test_instruction_limit),
                eval_speaker.SpeakerEvaluation(
                    [split], instructions_per_path=test_instruction_limit))
        for split in test_splits}
    return train_env, test_envs, encoder, decoder
def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        batch_size=BATCH_SIZE):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size,
                         splits=train_splits, tokenizer=tok)
    enc_hidden_size = hidden_size // 2 if args.bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    encoder = try_cuda(EncoderLSTM(
        len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
        dropout_ratio, bidirectional=args.bidirectional, glove=glove))
    decoder = try_cuda(AttnDecoderLSTM(
        action_embedding_size, hidden_size, dropout_ratio,
        feature_size=feature_size))
    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size,
                         splits=[split], tokenizer=tok),
                eval.Evaluation([split]))
        for split in test_splits}
    return train_env, test_envs, encoder, decoder
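# Sketch of how the returned environments and models are typically wired up
# downstream (the Seq2SeqAgent name and call signature are assumptions
# modeled on common follower training code, not taken from this snippet):
#
#     train_env, test_envs, encoder, decoder = make_env_and_models(
#         args, TRAIN_VOCAB, ['train'], ['val_seen', 'val_unseen'])
#     agent = Seq2SeqAgent(train_env, "", encoder, decoder)
#     agent.train(optimizers, args.n_iters, feedback=args.feedback_method)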
def make_arg_parser():
    parser = argparse.ArgumentParser()
    ImageFeatures.add_args(parser)
    parser.add_argument(
        "--feedback_method",
        choices=["sample", "teacher", "teacher+sample"], default="sample")
    parser.add_argument("--bidirectional", action='store_true')
    parser.add_argument("--n_iters", type=int, default=20000)
    parser.add_argument("--use_pretraining", action='store_true')
    parser.add_argument("--pretrain_splits", nargs="+", default=[])
    parser.add_argument("--n_pretrain_iters", type=int, default=50000)
    parser.add_argument("--no_save", action='store_true')
    parser.add_argument("--use_train_subset", action='store_true',
                        help="use a subset of the original train data for validation")
    parser.add_argument("--use_test_set", action='store_true')
    return parser
def make_more_train_env(args, train_vocab_path, train_splits):
    setup(args.seed)
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=args.batch_size,
                         splits=train_splits, tokenizer=tok)
    return train_env
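# Two-stage training flow this helper supports (a sketch grounded in the
# --use_pretraining / --pretrain_splits / --n_pretrain_iters flags defined
# above; the surrounding control flow is an assumption):
#
#     if args.use_pretraining:
#         pretrain_env = make_more_train_env(
#             args, train_vocab_path, args.pretrain_splits)
#         # run args.n_pretrain_iters on pretrain_env first,
#         # then train on the regular train_env for args.n_iters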
def eval_simple_agents(args):
    ''' Run simple baselines on each split. '''
    img_features = ImageFeatures.from_args(args)
    for split in ['train', 'val_seen', 'val_unseen', 'test']:
        env = R2RBatch(img_features, batch_size=1, splits=[split])
        ev = Evaluation([split])
        for agent_type in ['Stop', 'Shortest', 'Random']:
            outfile = '%s%s_%s_agent.json' % (
                train.RESULT_DIR, split, agent_type.lower())
            agent = BaseAgent.get_agent(agent_type)(env, outfile)
            agent.test()
            agent.write_results()
            score_summary, _ = ev.score_file(outfile)
            print('\n%s' % agent_type)
            pp.pprint(score_summary)
def make_env_and_models(args, train_vocab_path, train_splits, test_splits):
    setup(args.seed)
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=args.batch_size,
                         splits=train_splits, tokenizer=tok) \
        if len(train_splits) > 0 else None
    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=args.batch_size,
                         splits=[split], tokenizer=tok),
                eval.Evaluation([split]))
        for split in test_splits}
    agent = make_follower(args, vocab)
    agent.env = train_env
    return train_env, test_envs, agent
print("len(val_seen_routes): {}".format(len(val_seen_routes))) print("len(val_unseen_routes): {}".format(len(val_unseen_routes))) train_scans = get_scans(train_routes) unseen_scan_routes = [ inst for inst in val_seen_routes if inst['scan'] not in train_scans ] print("num instances in val_seen without scans in train: {}".format( len(unseen_scan_routes))) return train_subset_routes, train_routes, val_seen_routes, val_unseen_routes if __name__ == "__main__": image_features = ImageFeatures("none", None, None) BASE_PATH_TEMPLATE = "tasks/R2R/data/R2R_{}.json" def load(split_name): with open(BASE_PATH_TEMPLATE.format(split_name)) as f: return json.load(f) train = load("train") val_seen = load("val_seen") val_unseen = load("val_unseen") train_scans = get_scans(train) val_seen_scans = get_scans(val_seen) val_unseen_scans = get_scans(val_unseen)
import sys
sys.path.append('build')

import argparse
import json
import math

import numpy as np
import torch

import env
import MatterSim
import utils
from compatModel import compatModel
from env import ImageFeatures, R2RBatch
from model import (CompatLanEncoderLSTM, CompatVisEncoderLSTM, dotSimilarity,
                   EncoderLSTM, SpeakerEncoderLSTM)
from utils import read_vocab, Tokenizer, timeSince, try_cuda, vocab_pad_idx
from vocab import SUBTRAIN_VOCAB, TRAIN_VOCAB, TRAINVAL_VOCAB

parser = argparse.ArgumentParser()
ImageFeatures.add_args(parser)
args, _ = parser.parse_known_args()
image_features_list = ImageFeatures.from_args(args)

angle_inc = np.pi / 6.


def build_viewpoint_loc_embedding(viewIndex):
    """
    Position embedding:
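# build_viewpoint_loc_embedding above is cut off after its docstring. Below is
# a minimal sketch of the standard 36-view position embedding such functions
# compute in R2R code: for each of the 36 discretized views, sin/cos of the
# heading and elevation relative to `viewIndex`. The 128-D layout (32 dims per
# component) is an assumption, not recovered from this file.
def _viewpoint_loc_embedding_sketch(viewIndex):
    embedding = np.zeros((36, 128), np.float32)
    for absViewIndex in range(36):
        # views are arranged in 3 elevation rings of 12 headings each
        relViewIndex = (absViewIndex - viewIndex) % 12 + (absViewIndex // 12) * 12
        rel_heading = (relViewIndex % 12) * angle_inc
        rel_elevation = (relViewIndex // 12 - 1) * angle_inc
        embedding[absViewIndex, 0:32] = np.sin(rel_heading)
        embedding[absViewIndex, 32:64] = np.cos(rel_heading)
        embedding[absViewIndex, 64:96] = np.sin(rel_elevation)
        embedding[absViewIndex, 96:] = np.cos(rel_elevation)
    return embedding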
    'use_wordvec': False,
    'bert': False,
    'coground': True,
    'num_head': 1,
    'prog_monitor': True,
    'dev_monitor': False,
    'attn_only_verb': False,
    'soft_align': False,
    'scorer': None,
    'load_follower': 'tasks/R2R/experiments/pretrain_cgPm_pertraj/snapshots/follower_cg_pm_sample2step_imagenet_mean_pooled_1heads_train_iter_1900_val_unseen-success_rate=0.478',
    'language': 'en-OLD',
    'prefix': 'R2R',
})

image_features_list = ImageFeatures.from_args(args)
vocab = read_vocab(TRAIN_VOCAB, args.language)
tok = Tokenizer(vocab)
env = R2RBatch(image_features_list, batch_size=256,
               splits=['train', 'val_seen', 'val_unseen'], tokenizer=tok)
env.batch = env.data

from eval import Evaluation
test_envs = {
    split: (R2RBatch(image_features_list, batch_size=64, splits=[split],
                     tokenizer=tok),
            Evaluation([split]))
def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        test_instruction_limit=None):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size,
                         splits=train_splits, tokenizer=tok)
    train_env.data.extend(hardNeg_train)  # extend train data and shuffle
    random.shuffle(train_env.data)
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    # =========================================================================
    # visEncoder = try_cuda(CompatVisEncoderLSTM(
    #     action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
    #     bidirectional=bidirectional))
    # =========================================================================
    visEncoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=bidirectional))
    # =========================================================================
    # lanEncoder = try_cuda(CompatLanEncoderLSTM(
    #     len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
    #     dropout_ratio, bidirectional=True, glove=glove))
    # =========================================================================
    lanEncoder = try_cuda(EncoderLSTM(
        len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
        dropout_ratio, bidirectional=False, glove=glove))
    dotSim = try_cuda(dotSimilarity(batch_size, enc_hidden_size))
    # visEncoder.load_state_dict(torch.load('tasks/R2R/snapshots/release/speaker_final_release_enc'))
    # lanEncoder.load_state_dict(torch.load('tasks/R2R/snapshots/release/follower_final_release_enc'))
    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size,
                         splits=[split], tokenizer=tok,
                         instruction_limit=test_instruction_limit),
                eval_speaker.SpeakerEvaluation(
                    [split], instructions_per_path=test_instruction_limit))
        for split in test_splits
    }
    # test_envs['val_seen'][0].data.extend(hardNeg_val_seen)
    test_envs['val_unseen'][0].data.extend(hardNeg_val_unseen)
    test_envs['val_unseen'][0].data = test_envs['val_unseen'][0].data[3000:4000]
    return train_env, test_envs, visEncoder, lanEncoder, dotSim
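# Note: hardNeg_train and hardNeg_val_unseen above are assumed to be
# module-level lists of pre-mined hard-negative instances loaded elsewhere in
# this script; the [3000:4000] slice pins val_unseen evaluation to a fixed
# 1000-instance subset.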
def make_arg_parser():
    parser = argparse.ArgumentParser()
    ImageFeatures.add_args(parser)
    parser.add_argument("--load_scorer", type=str, default='')
    parser.add_argument("--load_follower", type=str, default='')
    parser.add_argument("--load_traj_encoder", type=str, default='')
    parser.add_argument(
        "--feedback_method",
        choices=["sample", "teacher", "sample1step", "sample2step",
                 "sample3step", "teacher+sample", "recover"],
        default="sample")
    parser.add_argument("--debug", action='store_true')
    parser.add_argument("--bidirectional", action='store_true')
    parser.add_argument("--transformer", action='store_true')
    parser.add_argument("--scorer", action='store_true')
    parser.add_argument("--coground", action='store_false')
    parser.add_argument("--prog_monitor", action='store_false')
    parser.add_argument("--dev_monitor", action='store_true')
    parser.add_argument("--bt_button", action='store_true')
    parser.add_argument("--soft_align", action='store_true')
    parser.add_argument("--n_iters", type=int, default=10900)
    parser.add_argument("--num_head", type=int, default=1)
    parser.add_argument("--use_pretraining", action='store_true')
    parser.add_argument("--grad", type=str, default='all')
    parser.add_argument("--pretrain_splits", nargs="+", default=[])
    parser.add_argument("--n_pretrain_iters", type=int, default=50000)
    parser.add_argument("--no_save", action='store_true')
    parser.add_argument("--use_glove", action='store_true')
    parser.add_argument("--WIDTH", type=int, default=640)
    parser.add_argument("--HEIGHT", type=int, default=480)
    parser.add_argument("--VFOV", type=int, default=60)
    parser.add_argument("--useStopFeat", type=int, default=1)
    parser.add_argument("--useObjLabelOrVis", type=str, default='both',
                        help="options: vis, label, both, none")
    parser.add_argument("--objFeatType", type=str, default='fc7',
                        help="options: pool5, fc7")
    parser.add_argument("--objVisFeatDim", type=int, default=2048)
    parser.add_argument("--objLanFeatDim", type=int, default=512)
    parser.add_argument("--objTopK", type=int, default=3)
    # NOTE: argparse's type=bool treats any non-empty string (including
    # "False") as True, so this flag only behaves as expected when omitted.
    parser.add_argument("--useDect", type=bool, default=False)
    parser.add_argument("--instrType", type=str, default='instructions',
                        help="options: instructions, instructions_l")
    parser.add_argument("--ObjEachViewVisFeatDir", type=str,
                        default='objEachViewVisFeatFc7Top3/')
    parser.add_argument("--ObjEachViewLanFeatDir", type=str,
                        default='objEachViewLanFeatTop3/')
    parser.add_argument("--labelGlovePath", type=str,  # for object label embedding
                        default='tasks/REVERIE/data/reverie4_reverie4.npy')
    parser.add_argument("--grdModelPrefix", type=str,  # for visual grounding
                        default='MAttNet3/output/reverie4_reverie4/mrcn_cmr_with_st')
    parser.add_argument("--matterportDir", type=str,
                        default='/home/qyk/dataset/Matterport/')
    parser.add_argument("--attn_only_verb", action='store_true')
    parser.add_argument("--use_train_subset", action='store_true',
                        help="use a subset of the original train data for validation")
    parser.add_argument("--use_test_set", action='store_true')
    parser.add_argument("--seed", type=int, default=1)
    return parser
def make_arg_parser():
    parser = argparse.ArgumentParser()
    ImageFeatures.add_args(parser)
    Refer360ImageFeatures.add_args(parser)
    parser.add_argument("--use_train_subset", action='store_true',
                        help="use a subset of the original train data for validation")
    parser.add_argument("--bidirectional", action='store_true')
    parser.add_argument("--word_embedding_size", type=int, default=300)
    # parser.add_argument("--hidden_size", type=int, default=512)
    parser.add_argument("--hidden_size", type=int, default=256)
    parser.add_argument("--learning_rate", type=float, default=0.0001)
    parser.add_argument("--weight_decay", type=float, default=0.0005)
    parser.add_argument("--dropout_ratio", type=float, default=0.5)
    parser.add_argument("--feedback_method",
                        choices=['teacher', 'sample'], default='teacher')
    parser.add_argument("--n_iters", type=int, default=100000)
    parser.add_argument("--log_every", type=int, default=5000)
    parser.add_argument("--save_every", type=int, default=5000)
    parser.add_argument("--max_input_length", type=int, default=80)
    parser.add_argument("--seed", type=int, default=10)
    parser.add_argument("--beam_size", type=int, default=1)
    parser.add_argument("--no_save", action='store_true')
    parser.add_argument("--prefix", type=str, default='R2R')
    parser.add_argument("--language", type=str, default='en-ALL')
    parser.add_argument('--wordvec_path', type=str,
                        default='tasks/R2R/data/train_glove')
    parser.add_argument('--wordvec_finetune', action='store_true')
    parser.add_argument("--error_margin", type=float, default=3.0)
    parser.add_argument("--use_intermediate", action='store_true')
    parser.add_argument('--use_reading', action='store_true')
    parser.add_argument('--use_raw', action='store_true')
    parser.add_argument("--add_asterix", action='store_true')
    # parser.add_argument("--env", type=str, default='r2r')
    parser.add_argument('--img_features_root', type=str,
                        default='./img_features')
    parser.add_argument('--cache_root', type=str,
                        default='/projects/vcirik/refer360/data/cached_data_15degrees/')
    parser.add_argument('--image_list_file', type=str,
                        default='/projects/vcirik/refer360/data/imagelist.txt')
    parser.add_argument('--refer360_root', type=str,
                        default='/projects/vcirik/refer360/data/continuous_grounding')
    parser.add_argument("--angle_inc", type=int, default=30)
    parser.add_argument('--use_gt_actions', action='store_true')
    parser.add_argument('--use_absolute_location_embeddings', action='store_true')
    parser.add_argument('--use_stop_embeddings', action='store_true')
    parser.add_argument('--use_timestep_embeddings', action='store_true')
    parser.add_argument('--use_visited_embeddings', type=str,
                        choices=['', 'ones', 'zeros', 'count', 'pe'], default='')
    parser.add_argument('--use_oracle_embeddings', action='store_true')
    parser.add_argument('--use_object_embeddings', action='store_true')
    parser.add_argument('--metrics', type=str, default='success',
                        help='Success metric, default=success')
    parser.add_argument('--deaf', action='store_true')
    parser.add_argument('--blind', action='store_true')
    parser.add_argument('--no_lookahead', action='store_true')
    parser.add_argument('--nextstep', action='store_true')
    parser.add_argument("--verbose", action='store_true')
    return parser
def train_val():
    ''' Train on the training set, and validate on seen and unseen splits. '''
    # Set which GPU to use
    device = torch.device('cuda', hparams.device_id)

    # Load hyperparameters from checkpoint (if it exists)
    if os.path.exists(hparams.load_path):
        print('Load model from %s' % hparams.load_path)
        ckpt = load(hparams.load_path, device)
        start_iter = ckpt['iter']
    else:
        if not hparams.forward_agent and not hparams.random_agent and \
                not hparams.shortest_agent:
            if hasattr(hparams, 'load_path') and \
                    hasattr(hparams, 'eval_only') and hparams.eval_only:
                sys.exit('load_path %s does not exist!' % hparams.load_path)
        ckpt = None
        start_iter = 0
    end_iter = hparams.n_iters

    if not hasattr(hparams, 'ask_baseline'):
        hparams.ask_baseline = None
    if not hasattr(hparams, 'instruction_baseline'):
        hparams.instruction_baseline = None

    # Set random seeds
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)
    np.random.seed(hparams.seed)
    random.seed(hparams.seed)

    # Load the vocab
    train_vocab_path = os.path.join(hparams.data_path, 'vocab.txt')
    if not os.path.exists(train_vocab_path):
        raise Exception('Vocab file not found at %s' % train_vocab_path)
    vocab = read_vocab([train_vocab_path])
    hparams.instr_padding_idx = vocab.index('<PAD>')

    tokenizer = Tokenizer(vocab=vocab, encoding_length=hparams.max_instr_len)
    if hparams.encoder_type == 'dic':
        tokenizer = BTokenizer(vocab=vocab,
                               encoding_length=hparams.max_instr_len)
    featurizer = ImageFeatures(hparams.img_features, device)
    simulator = Simulator(hparams)

    # Create train environment
    train_env = Batch(hparams, simulator, featurizer, tokenizer, split='train')

    # Create validation environments
    val_splits = ['val_seen', 'val_unseen']
    eval_mode = hasattr(hparams, 'eval_only') and hparams.eval_only
    if eval_mode:
        if 'val_seen' in hparams.load_path:
            val_splits = ['test_seen']
        elif 'val_unseen' in hparams.load_path:
            val_splits = ['test_unseen']
        else:
            val_splits = ['test_seen', 'test_unseen']
        end_iter = start_iter + 1

    if hparams.eval_on_val:
        val_splits = [x.replace('test_', 'val_') for x in val_splits]

    val_envs_tmp = {
        split: (
            Batch(hparams, simulator, featurizer, tokenizer, split=split),
            Evaluation(hparams, [split], hparams.data_path))
        for split in val_splits
    }

    val_envs = {}
    for key, value in val_envs_tmp.items():
        if '_seen' in key:
            val_envs[key + '_env_seen_anna'] = value
            val_envs[key + '_env_unseen_anna'] = value
        else:
            assert '_unseen' in key
            val_envs[key] = value

    # Build model and optimizer
    model = AgentModel(len(vocab), hparams, device).to(device)
    optimizer = optim.Adam(model.parameters(), lr=hparams.lr,
                           weight_decay=hparams.weight_decay)

    best_metrics = {env_name: -1 for env_name in val_envs.keys()}
    best_metrics['combined'] = -1

    # Load model parameters from checkpoint (if it exists)
    if ckpt is not None:
        model.load_state_dict(ckpt['model_state_dict'])
        optimizer.load_state_dict(ckpt['optim_state_dict'])
        best_metrics = ckpt['best_metrics']
        train_env.ix = ckpt['data_idx']

    if hparams.log_every == -1:
        hparams.log_every = round(len(train_env.data) /
                                  (hparams.batch_size * 100)) * 100

    print('')
    pprint(vars(hparams), width=1)
    print('')
    print(model)
    print('Number of parameters:',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    if hparams.random_agent or hparams.forward_agent or hparams.shortest_agent:
        assert eval_mode
        agent = SimpleAgent(hparams)
    else:
        agent = VerbalAskAgent(model, hparams, device)

    return train(train_env, val_envs, agent, model, optimizer, start_iter,
                 end_iter, best_metrics, eval_mode)
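# Checkpoint layout assumed by the load/save logic in train_val (inferred
# from the keys it reads; field values are placeholders):
#
#     ckpt = {
#         'iter': ...,              # iteration to resume from
#         'model_state_dict': ...,  # model weights
#         'optim_state_dict': ...,  # optimizer state
#         'best_metrics': ...,      # best validation scores per env
#         'data_idx': ...,          # train_env.ix batch cursor
#     }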