def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        test_instruction_limit=None):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size,
                         splits=train_splits, tokenizer=tok)

    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    encoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=bidirectional))
    decoder = try_cuda(SpeakerDecoderLSTM(
        len(vocab), word_embedding_size, hidden_size, dropout_ratio,
        glove=glove))

    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size,
                         splits=[split], tokenizer=tok,
                         instruction_limit=test_instruction_limit),
                eval_speaker.SpeakerEvaluation(
                    [split], instructions_per_path=test_instruction_limit))
        for split in test_splits}

    return train_env, test_envs, encoder, decoder
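# Illustrative usage sketch (not from the original file): how the returned
# objects might be wired into a training entry point. `train` and the split
# names passed here are hypothetical placeholders; TRAIN_VOCAB is the vocab
# path constant used elsewhere in this codebase.
#
# train_env, test_envs, encoder, decoder = make_env_and_models(
#     args, TRAIN_VOCAB, ['train'], ['val_seen', 'val_unseen'])
# train(args, train_env, test_envs, encoder, decoder)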
def make_speaker(args):
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    vocab = read_vocab(TRAIN_VOCAB)
    encoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=bidirectional))
    decoder = try_cuda(SpeakerDecoderLSTM(
        len(vocab), word_embedding_size, hidden_size, dropout_ratio,
        glove=glove))
    agent = Seq2SeqSpeaker(
        None, "", encoder, decoder, MAX_INSTRUCTION_LENGTH)
    return agent
def make_scorer(args):
    bidirectional = args.bidirectional
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    feature_size = FEATURE_SIZE
    traj_encoder = try_cuda(
        SpeakerEncoderLSTM(action_embedding_size, feature_size,
                           enc_hidden_size, dropout_ratio,
                           bidirectional=args.bidirectional))
    scorer_module = try_cuda(DotScorer(enc_hidden_size, enc_hidden_size))
    scorer = Scorer(scorer_module, traj_encoder)
    # Use != for string comparison; `is not ''` tests identity, not equality.
    if args.load_scorer != '':
        scorer.load(args.load_scorer)
        print(colorize('load scorer traj ' + args.load_scorer))
    elif args.load_traj_encoder != '':
        scorer.load_traj_encoder(args.load_traj_encoder)
        print(colorize('load traj encoder ' + args.load_traj_encoder))
    return scorer
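# Illustrative usage sketch (not from the original file): make_scorer() only
# reads the three fields below from its args namespace (plus module-level
# globals), so a SimpleNamespace is enough for a standalone call. The
# empty-string defaults skip checkpoint loading; any path supplied would be a
# hypothetical placeholder.
#
# from types import SimpleNamespace
# scorer_args = SimpleNamespace(bidirectional=False,
#                               load_scorer='',
#                               load_traj_encoder='')
# scorer = make_scorer(scorer_args)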
def make_speaker(args, action_embedding_size=-1, feature_size=-1):
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    wordvec = np.load(args.wordvec_path)
    vocab = read_vocab(TRAIN_VOCAB, args.language)
    encoder = try_cuda(
        SpeakerEncoderLSTM(action_embedding_size, feature_size,
                           enc_hidden_size, dropout_ratio,
                           bidirectional=bidirectional))
    decoder = try_cuda(
        SpeakerDecoderLSTM(len(vocab), word_embedding_size, hidden_size,
                           dropout_ratio,
                           wordvec=wordvec,
                           wordvec_finetune=args.wordvec_finetune))
    agent = Seq2SeqSpeaker(None, "", encoder, decoder,
                           MAX_INSTRUCTION_LENGTH)
    return agent
def make_speaker(args, action_embedding_size=-1, feature_size=-1):
    enc_hidden_size = args.hidden_size // 2 if args.bidirectional else args.hidden_size
    wordvec = np.load(args.wordvec_path)
    vocab = read_vocab(TRAIN_VOCAB, args.language)
    word_embedding_size = get_word_embedding_size(args)
    encoder = try_cuda(
        SpeakerEncoderLSTM(action_embedding_size, feature_size,
                           enc_hidden_size, args.dropout_ratio,
                           bidirectional=args.bidirectional))
    decoder = try_cuda(
        SpeakerDecoderLSTM(len(vocab), word_embedding_size, args.hidden_size,
                           args.dropout_ratio,
                           wordvec=wordvec,
                           wordvec_finetune=args.wordvec_finetune))
    agent = Seq2SeqSpeaker(None, "", encoder, decoder,
                           args.max_input_length)
    return agent
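# Illustrative usage sketch (not from the original file): this variant reads
# everything from the args namespace, so the fields below mirror what the
# function accesses directly (get_word_embedding_size(args) may read further
# fields not shown here). The numeric values, the 'en' language tag, and the
# word-vector path are hypothetical placeholders.
#
# from types import SimpleNamespace
# speaker_args = SimpleNamespace(hidden_size=512, bidirectional=False,
#                                dropout_ratio=0.5, language='en',
#                                wordvec_path='tasks/R2R/data/train_glove.npy',
#                                wordvec_finetune=False,
#                                max_input_length=80)
# speaker = make_speaker(speaker_args,
#                        action_embedding_size=2048 + 128,
#                        feature_size=2048 + 128)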
weight_decay = 0.0005
# weight_decay = 0.0001
FEATURE_SIZE = 2048 + 128
n_iters = 5000
log_every = 100
save_every = 100

vocab = read_vocab(TRAIN_VOCAB)
tok = Tokenizer(vocab=vocab)
glove = np.load(glove_path)
enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
feature_size = FEATURE_SIZE

visEncoder = try_cuda(SpeakerEncoderLSTM(
    action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
    bidirectional=bidirectional))
lanEncoder = try_cuda(EncoderLSTM(
    len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
    dropout_ratio, bidirectional=False, glove=glove))
dotSim = try_cuda(dotSimilarity(batch_size, enc_hidden_size))

agent = compatModel(None, "", visEncoder, lanEncoder, dotSim)
# agent.load('tasks/R2R/snapshots/release/speaker_final_release', map_location='cpu')
agent.load('tasks/R2R/compat/trained_1/compat_sample_imagenet_mean_pooled_train_iter_1000',
           map_location='cpu')

if __name__ == "__main__":
    traj = {
        'scan': '5q7pvUzZiYa',
        'path': ["7dc12a67ddfc4a4a849ce620db5b777b",
                 "0e84cf4dec784bc28b78a80bee35c550",
                 "a77784b955454209857d745976a1676d",
                 "67971a17c26f4e2ca117b4fca73507fe",
                 "8db06d3a0dd44508b3c078d60126ce19",
                 "43ac37dfa1db4a13a8a9df4e454eb016",
                 "4bd82c990a6548a994daa97c8f52db06",
                 "6d11ca4d41e04bb1a725c2223c36b2aa",
                 "29fb3c58b29348558d36a9f9440a1379",
                 "c23f26401359426982d11ca494ee739b",
                 "397403366d784caf804d741f32fd68b9",
                 "3c6a35e15ada4b649990d6568cce8bd9",
                 "55e4436f528c4bf09e4550079c572f7b",
                 "69fad7dd177847dbabf69e8fb7c00ddf",
                 "c629c7f1cf6f47a78c45a8ae9ff82247",
                 "21fca0d6192940e580587fe317440f56",
                 "4b85d61dd3a94e8a812affe78f3a322d",
                 "3c025b8e3d2040969cd00dd0e9f29b09"][:2],
        'heading': 0.0,
        'elevation_init': 0.0}
    encoded_instructions, _ = tok.encode_sentence('')
    encoded_instructions = torch.tensor([encoded_instructions], device='cpu')
    rdv_test = rdv(traj)
def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        test_instruction_limit=None):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size,
                         splits=train_splits, tokenizer=tok)
    train_env.data.extend(hardNeg_train)  # extend train data and shuffle
    random.shuffle(train_env.data)

    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    # =========================================================================
    # visEncoder = try_cuda(CompatVisEncoderLSTM(
    #     action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
    #     bidirectional=bidirectional))
    # =========================================================================
    visEncoder = try_cuda(
        SpeakerEncoderLSTM(action_embedding_size, feature_size,
                           enc_hidden_size, dropout_ratio,
                           bidirectional=bidirectional))
    # =========================================================================
    # lanEncoder = try_cuda(CompatLanEncoderLSTM(
    #     len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
    #     dropout_ratio, bidirectional=True, glove=glove))
    # =========================================================================
    lanEncoder = try_cuda(
        EncoderLSTM(len(vocab), word_embedding_size, enc_hidden_size,
                    vocab_pad_idx, dropout_ratio, bidirectional=False,
                    glove=glove))
    dotSim = try_cuda(dotSimilarity(batch_size, enc_hidden_size))
    # visEncoder.load_state_dict(torch.load('tasks/R2R/snapshots/release/speaker_final_release_enc'))
    # lanEncoder.load_state_dict(torch.load('tasks/R2R/snapshots/release/follower_final_release_enc'))

    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size,
                         splits=[split], tokenizer=tok,
                         instruction_limit=test_instruction_limit),
                eval_speaker.SpeakerEvaluation(
                    [split], instructions_per_path=test_instruction_limit))
        for split in test_splits}

    # test_envs['val_seen'][0].data.extend(hardNeg_val_seen)
    test_envs['val_unseen'][0].data.extend(hardNeg_val_unseen)
    test_envs['val_unseen'][0].data = test_envs['val_unseen'][0].data[3000:4000]
    return train_env, test_envs, visEncoder, lanEncoder, dotSim
def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        test_instruction_limit=None,
                        instructions_per_path=None):
    setup()
    if args.env == 'r2r':
        EnvBatch = R2RBatch
        ImgFeatures = ImageFeatures
    elif args.env == 'refer360':
        EnvBatch = Refer360Batch
        ImgFeatures = Refer360ImageFeatures
    else:
        raise NotImplementedError(
            'this {} environment is not implemented.'.format(args.env))

    image_features_list = ImgFeatures.from_args(args)
    feature_size = sum(
        [featurizer.feature_dim for featurizer in image_features_list]) + 128
    if args.use_visited_embeddings:
        feature_size += 64
    if args.use_oracle_embeddings:
        feature_size += 64
    action_embedding_size = feature_size

    vocab = read_vocab(train_vocab_path, args.language)
    tok = Tokenizer(vocab=vocab)
    train_env = EnvBatch(image_features_list, splits=train_splits,
                         tokenizer=tok, args=args)

    enc_hidden_size = args.hidden_size // 2 if args.bidirectional else args.hidden_size
    wordvec = np.load(args.wordvec_path)
    word_embedding_size = get_word_embedding_size(args)

    enc_hidden_size = 600  # refer360 >>>
    enc_hidden_size = 512  # refer360 >>>
    # enc_hidden_size = 512  # r2r >>>
    encoder = try_cuda(
        SpeakerEncoderLSTM(action_embedding_size, feature_size,
                           enc_hidden_size, args.dropout_ratio,
                           bidirectional=args.bidirectional))

    word_embedding_size = 300  # refer360 >>>>
    word_embedding_size = 300  # r2r >>>>
    hidden_size = 600  # refer360 >>>
    hidden_size = 512  # refer360 >>>
    # hidden_size = 512  # >>> r2r
    # hidden_size = args.hidden_size
    decoder = try_cuda(
        SpeakerDecoderLSTM(len(vocab), word_embedding_size, hidden_size,
                           args.dropout_ratio,
                           wordvec=wordvec,
                           wordvec_finetune=args.wordvec_finetune))

    test_envs = {}
    for split in test_splits:
        b = EnvBatch(image_features_list, splits=[split], tokenizer=tok,
                     args=args)
        e = eval_speaker.SpeakerEvaluation(
            [split], instructions_per_path=instructions_per_path, args=args)
        test_envs[split] = (b, e)

    # TODO
    # test_envs = {
    #     split: (BatchEnv(image_features_list, batch_size=batch_size,
    #                      splits=[split], tokenizer=tok,
    #                      instruction_limit=test_instruction_limit,
    #                      prefix=args.prefix),
    #             eval_speaker.SpeakerEvaluation(
    #                 [split], instructions_per_path=instructions_per_path))
    #     for split in test_splits}

    return train_env, test_envs, encoder, decoder
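# Illustrative sketch (not from the original file) of the feature-size
# arithmetic used above: per-featurizer dimensions are summed, a fixed 128-dim
# block is appended (commonly the orientation-feature slot in R2R-style code),
# and optional 64-dim visited/oracle embeddings are added. The example
# featurizer dimension is a hypothetical placeholder.
#
# featurizer_dims = [2048]                 # e.g. one mean-pooled CNN featurizer
# feature_size = sum(featurizer_dims) + 128
# if use_visited_embeddings:
#     feature_size += 64
# if use_oracle_embeddings:
#     feature_size += 64
# action_embedding_size = feature_size     # actions reuse the feature vector size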
glove_path = 'tasks/R2R/data/train_glove.npy'
action_embedding_size = 2048 + 128
hidden_size = 512
bidirectional = False
dropout_ratio = 0.5
feedback_method = 'sample'  # teacher or sample
learning_rate = 0.0001
weight_decay = 0.0005
feature_size = 2048 + 128

glove = np.load(glove_path)
vocab = read_vocab(TRAIN_VOCAB)
tok = Tokenizer(vocab=vocab)
encoder = try_cuda(
    SpeakerEncoderLSTM(action_embedding_size, feature_size, hidden_size,
                       dropout_ratio))
decoder = try_cuda(
    SpeakerDecoderLSTM(len(vocab), word_embedding_size, hidden_size,
                       dropout_ratio, glove=glove))
agent = Seq2SeqSpeaker(tok, "", encoder, decoder, MAX_INSTRUCTION_LENGTH)
agent.load('tasks/R2R/snapshots/release/speaker_final_release',
           map_location='cpu')

if __name__ == "__main__":
    traj = {
        'scan': '5q7pvUzZiYa',
        'path': [