def __init__(self, opts, features, img_spec, batch_size=64, seed=10,
             splits=None, tokenizer=None):
    """Load an R2R-style dataset and wrap it in a panoramic environment batch.

    Args:
        opts: experiment options namespace (stored on the instance).
        features: pre-extracted image features passed to PanoEnvBatch.
        img_spec: image specification passed to PanoEnvBatch.
        batch_size: number of items served per batch.
        seed: RNG seed used to shuffle the loaded data.
        splits: dataset split names; defaults to ["train"].
        tokenizer: optional tokenizer used to encode instruction strings.
    """
    # Fix: the original signature used a mutable default (splits=["train"]),
    # which is shared across calls; use a None sentinel instead.
    splits = ["train"] if splits is None else splits
    self.env = PanoEnvBatch(features, img_spec, batch_size=batch_size)
    self.data = []
    self.scans = []
    self.opts = opts
    print("Loading {} dataset".format(splits[0]))
    json_data = load_datasets(splits)
    total_length = len(json_data)
    # iteratively load data into system memory
    for i, item in enumerate(json_data):
        if not is_experiment() and i >= 20:
            break  # developing mode: load only a small amount of data
        # Split multiple instructions into separate entries
        for j, instr in enumerate(item["instructions"]):
            self.scans.append(item["scan"])
            new_item = dict(item)
            new_item["instr_id"] = "%s_%d" % (item["path_id"], j)
            new_item["instructions"] = instr
            if tokenizer:
                # 'instr_encoding' may already exist for synthetic instructions
                if "instr_encoding" not in item:
                    new_item["instr_encoding"] = tokenizer.encode_sentence(instr)
                else:
                    new_item["instr_encoding"] = item["instr_encoding"]
            self.data.append(new_item)
        print_progress(i + 1, total_length, prefix="Progress:",
                       suffix="Complete", bar_length=50)
    self.scans = set(self.scans)
    self.splits = splits
    self.seed = seed
    random.seed(self.seed)
    random.shuffle(self.data)
    self.ix = 0
    self.batch_size = batch_size
    self._load_nav_graphs()
    print("R2RBatch loaded with %d instructions, using splits: %s"
          % (len(self.data), ",".join(splits)))
def __init__(self, opts, features, img_spec, batch_size=64, seed=10,
             splits=None, tokenizer=None, configuration=False):
    """Load an R2R-style dataset, optionally splitting instructions into
    configurations and sorting by configuration count instead of shuffling.

    Args:
        opts: experiment options namespace (stored on the instance).
        features: pre-extracted image features passed to PanoEnvBatch.
        img_spec: image specification passed to PanoEnvBatch.
        batch_size: number of items served per batch.
        seed: RNG seed used to shuffle the data (non-configuration mode only).
        splits: dataset split names; defaults to ['train'].
        tokenizer: optional tokenizer used to encode instruction strings.
        configuration: when True, attach get_configurations(instr) to each
            entry and sort entries by configuration count (no shuffling).
    """
    # Fix: the original signature used a mutable default (splits=['train']);
    # use a None sentinel instead.
    splits = ['train'] if splits is None else splits
    self.env = PanoEnvBatch(features, img_spec, batch_size=batch_size)
    self.data = []
    self.scans = []
    self.opts = opts
    self.configuration = configuration
    print('Loading {} dataset'.format(splits[0]))
    json_data = load_datasets(splits)
    total_length = len(json_data)
    # iteratively load data into system memory
    for i, item in enumerate(json_data):
        if not is_experiment() and i >= 20:
            break  # developing mode: load only a small amount of data
        # Split multiple instructions into separate entries
        for j, instr in enumerate(item['instructions']):
            self.scans.append(item['scan'])
            new_item = dict(item)
            new_item['instr_id'] = '%s_%d' % (item['path_id'], j)
            new_item['instructions'] = instr
            if configuration:
                new_item['configurations'] = get_configurations(instr)
                # store (count, item) pairs so entries can be sorted below;
                # new_item is the same dict object, so the tokenizer fields
                # added afterwards are still visible through the tuple
                self.data.append((len(new_item['configurations']), new_item))
            else:
                self.data.append(new_item)
            if tokenizer:
                # 'instr_encoding' may already exist for synthetic instructions
                if 'instr_encoding' not in item:
                    new_item['instr_encoding'] = tokenizer.encode_sentence(instr)
                else:
                    new_item['instr_encoding'] = item['instr_encoding']
        print_progress(i + 1, total_length, prefix='Progress:',
                       suffix='Complete', bar_length=50)
    # Fix: the original only set self.seed on the non-configuration path,
    # leaving the attribute undefined in configuration mode; set it always.
    self.seed = seed
    if configuration:
        # order by configuration count; do not shuffle
        self.data.sort(key=lambda pair: pair[0])
        self.data = [pair[1] for pair in self.data]
    else:
        random.seed(self.seed)
        random.shuffle(self.data)
    self.scans = set(self.scans)
    self.splits = splits
    self.ix = 0
    self.batch_size = batch_size
    self._load_nav_graphs()
    print('R2RBatch loaded with %d instructions, using splits: %s'
          % (len(self.data), ",".join(splits)))
def download_files(accession, output_format, dest_dir, fetch_index, fetch_meta, aspera): accession_dir = os.path.join(dest_dir, accession) utils.create_dir(accession_dir) # download experiment xml is_experiment = utils.is_experiment(accession) if fetch_meta and is_experiment: download_meta(accession, accession_dir) if fetch_meta and utils.is_run(accession): download_experiment_meta(accession, accession_dir) # download data files search_url = utils.get_file_search_query(accession, aspera) temp_file = os.path.join(dest_dir, 'temp.txt') utils.download_report_from_portal(search_url, temp_file) f = open(temp_file) lines = f.readlines() f.close() os.remove(temp_file) for line in lines[1:]: data_accession, filelist, md5list, indexlist = utils.parse_file_search_result_line( line, accession, output_format) # create run directory if downloading all data for an experiment if is_experiment: run_dir = os.path.join(accession_dir, data_accession) utils.create_dir(run_dir) target_dir = run_dir else: target_dir = accession_dir # download run/analysis XML if fetch_meta: download_meta(data_accession, target_dir) if len(filelist) == 0: if output_format is None: print 'No files available for {0}'.format(data_accession) else: print 'No files of format {0} available for {1}'.format( output_format, data_accession) continue for i in range(len(filelist)): file_url = filelist[i] md5 = md5list[i] if file_url != '': download_file(file_url, target_dir, md5, aspera) if fetch_index: for index_file in indexlist: if index_file != '': download_file(index_file, target_dir, None, aspera) if utils.is_empty_dir(target_dir): print 'Deleting directory ' + os.path.basename(target_dir) os.rmdir(target_dir)
def __init__(self, opts, features, img_spec, batch_size=64, seed=10,
             splits=None, tokenizer=None):
    """Load an R2R-style dataset, optionally adding divided instruction
    encodings when opts.lang_embed requests them.

    Args:
        opts: experiment options namespace; opts.lang_embed and
            opts.divide_method select the instruction-division strategy.
        features: pre-extracted image features passed to PanoEnvBatch.
        img_spec: image specification passed to PanoEnvBatch.
        batch_size: number of items served per batch.
        seed: RNG seed used to shuffle the loaded data.
        splits: dataset split names; defaults to ['train'].
        tokenizer: optional tokenizer used to encode instruction strings.
    """
    # Fix: the original signature used a mutable default (splits=['train']);
    # use a None sentinel instead.
    splits = ['train'] if splits is None else splits
    self.env = PanoEnvBatch(features, img_spec, batch_size=batch_size)
    self.data = []
    self.scans = []
    self.opts = opts
    print('Loading {} dataset'.format(splits[0]))
    json_data = load_datasets(splits)
    total_length = len(json_data)
    # iteratively load data into system memory
    for i, item in enumerate(json_data):
        if not is_experiment() and i >= 20:
            break  # developing mode: load only a small amount of data
        # Split multiple instructions into separate entries
        for j, instr in enumerate(item['instructions']):
            self.scans.append(item['scan'])
            new_item = dict(item)
            new_item['instr_id'] = '%s_%d' % (item['path_id'], j)
            new_item['instructions'] = instr
            if tokenizer:
                # 'instr_encoding' may already exist for synthetic instructions
                if 'instr_encoding' not in item:
                    new_item['instr_encoding'] = tokenizer.encode_sentence(instr)
                else:
                    new_item['instr_encoding'] = item['instr_encoding']
                if 'divide' in opts.lang_embed:
                    # choose the instruction-division strategy by name
                    if opts.divide_method == 'kevin':
                        new_item['divid_instr_encoding'] = tokenizer.divide_instr_kevin(
                            instr, opts.max_sentence_segs)
                    else:
                        new_item['divid_instr_encoding'] = tokenizer.divide_instr_victor(
                            instr, opts.max_sentence_segs)
            self.data.append(new_item)
        print_progress(i + 1, total_length, prefix='Progress:',
                       suffix='Complete', bar_length=50)
    self.scans = set(self.scans)
    self.splits = splits
    self.seed = seed
    random.seed(self.seed)
    random.shuffle(self.data)
    self.ix = 0
    self.batch_size = batch_size
    self._load_nav_graphs()
    self._load_categories()
    print('R2RBatch loaded with %d instructions, using splits: %s'
          % (len(self.data), ",".join(splits)))
def download_files(accession, format, dest_dir, fetch_index, fetch_meta, aspera): if format is None: format = utils.SUBMITTED_FORMAT accession_dir = os.path.join(dest_dir, accession) utils.create_dir(accession_dir) # download experiment xml is_experiment = utils.is_experiment(accession) if fetch_meta and is_experiment: download_meta(accession, accession_dir) # TODO download experiment xml for run accession if fetch_meta and utils.is_run(accession): download_experiment_meta(accession, accession_dir) # download data files search_url = utils.get_file_search_query(accession, format, fetch_index, aspera) temp_file = os.path.join(dest_dir, 'temp.txt') utils.download_report_from_portal(search_url, temp_file) f = open(temp_file) lines = f.readlines() f.close() os.remove(temp_file) for line in lines[1:]: data_accession, filelist, md5list, indexlist = utils.parse_file_search_result_line(line, accession, format, fetch_index) # create run directory if downloading all data for an experiment if is_experiment: run_dir = os.path.join(accession_dir, data_accession) utils.create_dir(run_dir) target_dir = run_dir else: target_dir = accession_dir # download run/analysis XML if fetch_meta: download_meta(data_accession, target_dir) if len(filelist) == 0: print 'No files of format ' + format + ' for ' + data_accession continue for i in range(len(filelist)): file_url = filelist[i] md5 = md5list[i] if file_url != '': download_file_with_md5_check(file_url, target_dir, md5, aspera) for index_file in indexlist: if index_file != '': download_file(index_file, target_dir, aspera)
# NOTE(review): this leading loop appears to be a stray duplicate of the tail
# of download_files — indexlist/target_dir/aspera are not defined at module
# level here; confirm against the original file layout.
for index_file in indexlist:
    if index_file != '':
        download_file(index_file, target_dir, aspera)


# Script entry point (Python 2 module: bare `print` statements).
if __name__ == '__main__':
    # parse command-line arguments
    parser = set_parser()
    args = parser.parse_args()
    accession = args.accession
    format = args.format
    dest_dir = args.dest
    fetch_meta = args.meta
    fetch_index = args.index
    aspera = args.aspera
    # only INSDC run or experiment accessions are supported by this tool
    if not utils.is_run(accession) and not utils.is_experiment(accession):
        print 'Error: Invalid accession. An INSDC run or experiment accession must be provided'
        sys.exit(1)
    if not utils.is_available(accession):
        print 'Record does not exist or is not available for accession provided'
        sys.exit(1)
    try:
        download_files(accession, format, dest_dir, fetch_index, fetch_meta, aspera)
        print 'Completed'
    except Exception:
        # report the error via the project helper and exit non-zero
        utils.print_error()
        sys.exit(1)
def main(opts):
    """Train or evaluate a panoramic seq2seq navigation agent.

    Builds tokenizer/encoder/policy model from `opts`, optionally resumes
    from a checkpoint, then either runs test-submission evaluation,
    eval-only validation, or the full training loop with periodic
    validation and checkpointing.
    """
    # set manual_seed and build vocab
    print(opts, flush=True)
    setup(opts, opts.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Usando {device} :)")
    # create a batch training environment that will also preprocess text
    vocab = read_vocab(opts.train_vocab)
    tok = Tokenizer(opts.remove_punctuation == 1, opts.reversed == 1,
                    vocab=vocab, encoding_length=opts.max_cap_length)
    # create language instruction encoder
    encoder_kwargs = {
        'opts': opts,
        'vocab_size': len(vocab),
        'embedding_size': opts.word_embedding_size,
        'hidden_size': opts.rnn_hidden_size,
        'padding_idx': padding_idx,
        'dropout_ratio': opts.rnn_dropout,
        'bidirectional': opts.bidirectional == 1,
        'num_layers': opts.rnn_num_layers
    }
    print('Using {} as encoder ...'.format(opts.lang_embed))
    if 'lstm' in opts.lang_embed:
        encoder = EncoderRNN(**encoder_kwargs)
    else:
        raise ValueError('Unknown {} language embedding'.format(opts.lang_embed))
    print(encoder)
    # create policy model
    policy_model_kwargs = {
        'opts': opts,
        'img_fc_dim': opts.img_fc_dim,
        'img_fc_use_batchnorm': opts.img_fc_use_batchnorm == 1,
        'img_dropout': opts.img_dropout,
        'img_feat_input_dim': opts.img_feat_input_dim,
        'rnn_hidden_size': opts.rnn_hidden_size,
        'rnn_dropout': opts.rnn_dropout,
        'max_len': opts.max_cap_length,
        'max_navigable': opts.max_navigable
    }
    # select policy architecture by name
    if opts.arch == 'regretful':
        model = Regretful(**policy_model_kwargs)
    elif opts.arch == 'self-monitoring':
        model = SelfMonitoring(**policy_model_kwargs)
    elif opts.arch == 'speaker-baseline':
        model = SpeakerFollowerBaseline(**policy_model_kwargs)
    else:
        raise ValueError('Unknown {} model for seq2seq agent'.format(opts.arch))
    print(model)
    encoder = encoder.to(device)
    model = model.to(device)
    # one optimizer over both encoder and policy parameters
    params = list(encoder.parameters()) + list(model.parameters())
    optimizer = torch.optim.Adam(params, lr=opts.learning_rate)
    # optionally resume from a checkpoint
    if opts.resume:
        model, encoder, optimizer, best_success_rate = resume_training(opts, model, encoder, optimizer)
    # if a secondary exp name is specified, this is useful when resuming from a previous saved
    # experiment and save to another experiment, e.g., pre-trained on synthetic data and fine-tune on real data
    if opts.exp_name_secondary:
        opts.exp_name += opts.exp_name_secondary
    feature, img_spec = load_features(opts.img_feat_dir, opts.blind)
    if opts.test_submission:
        # one-shot evaluation on the held-out test split; no training
        assert opts.resume, 'The model was not resumed before running for submission.'
        test_env = ('test', (R2RPanoBatch(opts, feature, img_spec, batch_size=opts.batch_size,
                                          splits=['test'], tokenizer=tok),
                             Evaluation(['test'], opts)))
        agent_kwargs = {
            'opts': opts,
            'env': test_env[1][0],
            'results_path': "",
            'encoder': encoder,
            'model': model,
            'feedback': opts.feedback
        }
        agent = PanoSeq2SeqAgent(**agent_kwargs)
        # setup trainer
        trainer = PanoSeq2SeqTrainer(opts, agent, optimizer)
        epoch = opts.start_epoch - 1
        trainer.eval(epoch, test_env)
        return
    # set up R2R environments
    if not opts.train_data_augmentation:
        train_env = R2RPanoBatch(opts, feature, img_spec, batch_size=opts.batch_size,
                                 seed=opts.seed, splits=['train'], tokenizer=tok)
    else:
        # start training on synthetic data; switches to 'train' later (see loop below)
        train_env = R2RPanoBatch(opts, feature, img_spec, batch_size=opts.batch_size,
                                 seed=opts.seed, splits=['synthetic'], tokenizer=tok)
    val_craft_splits = ['craft_seen', 'craft_unseen']
    val_splits = ['val_seen', 'val_unseen']
    if opts.craft_eval:
        val_splits += val_craft_splits
    val_envs = {split: (R2RPanoBatch(opts, feature, img_spec, batch_size=opts.batch_size,
                                     splits=[split], tokenizer=tok),
                        Evaluation([split], opts))
                for split in val_splits}
    # create agent
    agent_kwargs = {
        'opts': opts,
        'env': train_env,
        'results_path': "",
        'encoder': encoder,
        'model': model,
        'feedback': opts.feedback
    }
    agent = PanoSeq2SeqAgent(**agent_kwargs)
    # setup trainer
    trainer = PanoSeq2SeqTrainer(opts, agent, optimizer, opts.train_iters_epoch)
    if opts.eval_only:
        # evaluate once on all validation splits and exit
        success_rate = []
        for val_env in val_envs.items():
            success_rate.append(trainer.eval(opts.start_epoch - 1, val_env, tb_logger=None))
        return
    # set up tensorboard logger
    tb_logger = set_tb_logger(opts.log_dir, opts.exp_name, opts.resume)
    sys.stdout.flush()
    # resume keeps the checkpointed best; otherwise start from 0.0
    best_success_rate = best_success_rate if opts.resume else 0.0
    for epoch in range(opts.start_epoch, opts.max_num_epochs + 1):
        trainer.train(epoch, train_env, tb_logger)
        if epoch % opts.eval_every_epochs == 0:
            success_rate = []
            for val_env in val_envs.items():
                success_rate.append(trainer.eval(epoch, val_env, tb_logger))
            # NOTE(review): index 1 assumes 'val_unseen' is the second entry of
            # val_splits (insertion order of val_envs) — confirm if splits change
            success_rate_compare = success_rate[1]
            if is_experiment():
                # remember best val_seen success rate and save checkpoint
                is_best = success_rate_compare >= best_success_rate
                best_success_rate = max(success_rate_compare, best_success_rate)
                print("--> Highest val_unseen success rate: {}".format(best_success_rate))
                sys.stdout.flush()
                # save the model if it is the best so far
                save_checkpoint({
                    'opts': opts,
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'encoder_state_dict': encoder.state_dict(),
                    'best_success_rate': best_success_rate,
                    'optimizer': optimizer.state_dict(),
                    'max_episode_len': opts.max_episode_len,
                }, is_best, checkpoint_dir=opts.checkpoint_dir, name=opts.exp_name)
        # after the configured number of augmentation epochs, switch from
        # synthetic data back to the real 'train' split
        if opts.train_data_augmentation and epoch == opts.epochs_data_augmentation:
            train_env = R2RPanoBatch(opts, feature, img_spec, batch_size=opts.batch_size,
                                     seed=opts.seed, splits=['train'], tokenizer=tok)
    print("--> Finished training")
# NOTE(review): fragment — the `try:` matching the `except` below, and the
# definitions of accession/output_format/dest_dir/etc., are outside this
# chunk; indentation reconstructed accordingly. Python 2 module.
    # Dispatch on accession type: WGS set, sequence, analysis, run/experiment,
    # or assembly, delegating to the matching downloader module.
    if utils.is_wgs_set(accession):
        if output_format is not None:
            sequenceGet.check_format(output_format)
        sequenceGet.download_wgs(dest_dir, accession, output_format)
    elif not utils.is_available(accession):
        sys.stderr.write('ERROR: Record does not exist or is not available for accession provided\n')
        sys.exit(1)
    elif utils.is_sequence(accession):
        if output_format is not None:
            sequenceGet.check_format(output_format)
        sequenceGet.download_sequence(dest_dir, accession, output_format, expanded)
    elif utils.is_analysis(accession):
        if output_format is not None:
            readGet.check_read_format(output_format)
        readGet.download_files(accession, output_format, dest_dir, fetch_index, fetch_meta, aspera)
    elif utils.is_run(accession) or utils.is_experiment(accession):
        if output_format is not None:
            readGet.check_read_format(output_format)
        readGet.download_files(accession, output_format, dest_dir, fetch_index, fetch_meta, aspera)
    elif utils.is_assembly(accession):
        if output_format is not None:
            assemblyGet.check_format(output_format)
        assemblyGet.download_assembly(dest_dir, accession, output_format, fetch_wgs, extract_wgs, expanded)
    else:
        sys.stderr.write('ERROR: Invalid accession provided\n')
        sys.exit(1)
    print 'Completed'
except Exception:
    # print the traceback, report via the project helper, exit non-zero
    traceback.print_exc()
    utils.print_error()
    sys.exit(1)
def main(opts):
    """Train or evaluate a panoramic seq2seq navigation agent.

    Builds tokenizer/encoder/policy model from `opts`, optionally resumes
    from a checkpoint, then either runs test-submission evaluation,
    eval-only/beam validation, or the full training loop with periodic
    validation and checkpointing.
    """
    # set manual_seed and build vocab
    setup(opts, opts.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # create a batch training environment that will also preprocess text
    vocab = read_vocab(opts.train_vocab)
    tok = Tokenizer(
        opts.remove_punctuation == 1,
        opts.reversed == 1,
        vocab=vocab,
        encoding_length=opts.max_cap_length,
    )
    # create language instruction encoder
    encoder_kwargs = {
        "opts": opts,
        "vocab_size": len(vocab),
        "embedding_size": opts.word_embedding_size,
        "hidden_size": opts.rnn_hidden_size,
        "padding_idx": padding_idx,
        "dropout_ratio": opts.rnn_dropout,
        "bidirectional": opts.bidirectional == 1,
        "num_layers": opts.rnn_num_layers,
    }
    print("Using {} as encoder ...".format(opts.lang_embed))
    if "lstm" in opts.lang_embed:
        encoder = EncoderRNN(**encoder_kwargs)
    else:
        raise ValueError("Unknown {} language embedding".format(
            opts.lang_embed))
    print(encoder)
    # create policy model
    policy_model_kwargs = {
        "opts": opts,
        "img_fc_dim": opts.img_fc_dim,
        "img_fc_use_batchnorm": opts.img_fc_use_batchnorm == 1,
        "img_dropout": opts.img_dropout,
        "img_feat_input_dim": opts.img_feat_input_dim,
        "rnn_hidden_size": opts.rnn_hidden_size,
        "rnn_dropout": opts.rnn_dropout,
        "max_len": opts.max_cap_length,
        "max_navigable": opts.max_navigable,
    }
    # select policy architecture by name
    if opts.arch == "self-monitoring":
        model = SelfMonitoring(**policy_model_kwargs)
    elif opts.arch == "speaker-baseline":
        model = SpeakerFollowerBaseline(**policy_model_kwargs)
    else:
        raise ValueError("Unknown {} model for seq2seq agent".format(
            opts.arch))
    print(model)
    encoder = encoder.to(device)
    model = model.to(device)
    # one optimizer over both encoder and policy parameters
    params = list(encoder.parameters()) + list(model.parameters())
    optimizer = torch.optim.Adam(params, lr=opts.learning_rate)
    # optionally resume from a checkpoint
    if opts.resume:
        model, encoder, optimizer, best_success_rate = resume_training(
            opts, model, encoder, optimizer)
    # if a secondary exp name is specified, this is useful when resuming from a previous saved
    # experiment and save to another experiment, e.g., pre-trained on synthetic data and fine-tune on real data
    if opts.exp_name_secondary:
        opts.exp_name += opts.exp_name_secondary
    feature, img_spec = load_features(opts.img_feat_dir)
    if opts.test_submission:
        # one-shot evaluation on the held-out test split; no training
        assert (opts.resume
                ), "The model was not resumed before running for submission."
        test_env = (
            "test",
            (
                R2RPanoBatch(
                    opts,
                    feature,
                    img_spec,
                    batch_size=opts.batch_size,
                    splits=["test"],
                    tokenizer=tok,
                ),
                Evaluation(["test"]),
            ),
        )
        agent_kwargs = {
            "opts": opts,
            "env": test_env[1][0],
            "results_path": "",
            "encoder": encoder,
            "model": model,
            "feedback": opts.feedback,
        }
        agent = PanoSeq2SeqAgent(**agent_kwargs)
        # setup trainer
        trainer = PanoSeq2SeqTrainer(opts, agent, optimizer)
        epoch = opts.start_epoch - 1
        trainer.eval(epoch, test_env)
        return
    # set up R2R environments
    if not opts.train_data_augmentation:
        train_env = R2RPanoBatch(
            opts,
            feature,
            img_spec,
            batch_size=opts.batch_size,
            seed=opts.seed,
            splits=["train"],
            tokenizer=tok,
        )
    else:
        # start training on synthetic data; switches to "train" later (see loop below)
        train_env = R2RPanoBatch(
            opts,
            feature,
            img_spec,
            batch_size=opts.batch_size,
            seed=opts.seed,
            splits=["synthetic"],
            tokenizer=tok,
        )
    val_envs = {
        split: (
            R2RPanoBatch(
                opts,
                feature,
                img_spec,
                batch_size=opts.batch_size,
                splits=[split],
                tokenizer=tok,
            ),
            Evaluation([split]),
        )
        for split in ["val_seen", "val_unseen"]
    }
    # create agent
    agent_kwargs = {
        "opts": opts,
        "env": train_env,
        "results_path": "",
        "encoder": encoder,
        "model": model,
        "feedback": opts.feedback,
    }
    agent = PanoSeq2SeqAgent(**agent_kwargs)
    # setup trainer
    trainer = PanoSeq2SeqTrainer(opts, agent, optimizer,
                                 opts.train_iters_epoch)
    if opts.eval_beam or opts.eval_only:
        # evaluate once on both validation splits and exit
        success_rate = []
        for val_env in val_envs.items():
            success_rate.append(
                trainer.eval(opts.start_epoch - 1, val_env, tb_logger=None))
        return
    # set up tensorboard logger
    tb_logger = set_tb_logger(opts.log_dir, opts.exp_name, opts.resume)
    # resume keeps the checkpointed best; otherwise start from 0.0
    best_success_rate = best_success_rate if opts.resume else 0.0
    for epoch in range(opts.start_epoch, opts.max_num_epochs + 1):
        trainer.train(epoch, train_env, tb_logger)
        if epoch % opts.eval_every_epochs == 0:
            success_rate = []
            for val_env in val_envs.items():
                success_rate.append(trainer.eval(epoch, val_env, tb_logger))
            # NOTE(review): index 1 assumes "val_unseen" is the second entry
            # of val_envs (dict insertion order) — confirm if splits change
            success_rate_compare = success_rate[1]
            if is_experiment():
                # remember best val_seen success rate and save checkpoint
                is_best = success_rate_compare >= best_success_rate
                best_success_rate = max(success_rate_compare,
                                        best_success_rate)
                print("--> Highest val_unseen success rate: {}".format(
                    best_success_rate))
                # save the model if it is the best so far
                save_checkpoint(
                    {
                        "opts": opts,
                        "epoch": epoch + 1,
                        "state_dict": model.state_dict(),
                        "encoder_state_dict": encoder.state_dict(),
                        "best_success_rate": best_success_rate,
                        "optimizer": optimizer.state_dict(),
                        "max_episode_len": opts.max_episode_len,
                    },
                    is_best,
                    checkpoint_dir=opts.checkpoint_dir,
                    name=opts.exp_name,
                )
        # after the configured number of augmentation epochs, switch from
        # synthetic data back to the real "train" split
        if (opts.train_data_augmentation
                and epoch == opts.epochs_data_augmentation):
            train_env = R2RPanoBatch(
                opts,
                feature,
                img_spec,
                batch_size=opts.batch_size,
                seed=opts.seed,
                splits=["train"],
                tokenizer=tok,
            )
    print("--> Finished training")