Exemplo n.º 1
0
    def __init__(
        self,
        opts,
        features,
        img_spec,
        batch_size=64,
        seed=10,
        splits=["train"],
        tokenizer=None,
    ):
        """Build a batched panoramic environment and load instruction data.

        Every dataset item is fanned out into one entry per instruction;
        when a tokenizer is supplied, the encoded instruction is attached
        under 'instr_encoding'. The loaded data is shuffled with *seed*.
        """
        self.env = PanoEnvBatch(features, img_spec, batch_size=batch_size)
        self.data = []
        self.scans = []
        self.opts = opts

        print("Loading {} dataset".format(splits[0]))

        json_data = load_datasets(splits)
        total_length = len(json_data)

        # iteratively load data into system memory
        for idx, item in enumerate(json_data):
            # developing mode: keep only a small sample of the data
            if not is_experiment() and idx >= 20:
                break

            # fan out: one entry per instruction of this path
            for instr_idx, instr in enumerate(item["instructions"]):
                self.scans.append(item["scan"])
                entry = dict(item)
                entry["instr_id"] = "%s_%d" % (item["path_id"], instr_idx)
                entry["instructions"] = instr
                if tokenizer:
                    # synthetic data may already carry 'instr_encoding'
                    if "instr_encoding" in item:
                        entry["instr_encoding"] = item["instr_encoding"]
                    else:
                        entry["instr_encoding"] = tokenizer.encode_sentence(instr)
                self.data.append(entry)
            print_progress(
                idx + 1,
                total_length,
                prefix="Progress:",
                suffix="Complete",
                bar_length=50,
            )

        self.scans = set(self.scans)
        self.splits = splits
        self.seed = seed
        random.seed(self.seed)
        random.shuffle(self.data)
        self.ix = 0
        self.batch_size = batch_size
        self._load_nav_graphs()
        print("R2RBatch loaded with %d instructions, using splits: %s" %
              (len(self.data), ",".join(splits)))
Exemplo n.º 2
0
    def __init__(self, opts, features, img_spec, batch_size=64, seed=10, splits=['train'], tokenizer=None, configuration=False):
        """Build a batched panoramic environment and load instruction data.

        With ``configuration`` set, each instruction is also decomposed via
        get_configurations() and entries are sorted by configuration count
        (no shuffling); otherwise entries are seeded-shuffled.
        """
        self.env = PanoEnvBatch(features, img_spec, batch_size=batch_size)
        self.data = []
        self.scans = []
        self.opts = opts
        self.configuration = configuration

        print('Loading {} dataset'.format(splits[0]))

        json_data = load_datasets(splits)
        total_length = len(json_data)

        # iteratively load data into system memory
        for idx, item in enumerate(json_data):
            # developing mode: keep only a small sample of the data
            if not is_experiment() and idx >= 20:
                break

            # fan out: one entry per instruction of this path
            for instr_idx, instr in enumerate(item['instructions']):
                self.scans.append(item['scan'])
                entry = dict(item)
                entry['instr_id'] = '%s_%d' % (item['path_id'], instr_idx)
                entry['instructions'] = instr
                if tokenizer:
                    # synthetic data may already carry 'instr_encoding'
                    if 'instr_encoding' in item:
                        entry['instr_encoding'] = item['instr_encoding']
                    else:
                        entry['instr_encoding'] = tokenizer.encode_sentence(instr)
                if configuration:
                    entry['configurations'] = get_configurations(instr)
                    self.data.append((len(entry['configurations']), entry))
                else:
                    self.data.append(entry)

            print_progress(idx + 1, total_length, prefix='Progress:',
                           suffix='Complete', bar_length=50)

        if configuration:
            # order by configuration count, then drop the sort key
            self.data.sort(key=lambda pair: pair[0])
            self.data = [pair[1] for pair in self.data]
        else:
            self.seed = seed
            random.seed(self.seed)
            random.shuffle(self.data)

        self.scans = set(self.scans)
        self.splits = splits
        self.ix = 0
        self.batch_size = batch_size
        self._load_nav_graphs()
        print('R2RBatch loaded with %d instructions, using splits: %s' % (len(self.data), ",".join(splits)))
Exemplo n.º 3
0
def download_files(accession, output_format, dest_dir, fetch_index, fetch_meta,
                   aspera):
    """Download all data files for an ENA *accession* into *dest_dir*.

    Creates ``dest_dir/accession`` (with one sub-directory per run when the
    accession is an experiment), optionally downloads XML metadata and index
    files, and removes any target directory that ends up empty.

    output_format may be None, in which case all available files reported by
    the portal are downloaded.
    """
    accession_dir = os.path.join(dest_dir, accession)
    utils.create_dir(accession_dir)
    # download experiment xml
    is_experiment = utils.is_experiment(accession)
    if fetch_meta and is_experiment:
        download_meta(accession, accession_dir)
    if fetch_meta and utils.is_run(accession):
        download_experiment_meta(accession, accession_dir)
    # download data files: first fetch the file report from the portal
    search_url = utils.get_file_search_query(accession, aspera)
    temp_file = os.path.join(dest_dir, 'temp.txt')
    utils.download_report_from_portal(search_url, temp_file)
    # context manager guarantees the handle is closed before the file is removed
    with open(temp_file) as f:
        lines = f.readlines()
    os.remove(temp_file)
    for line in lines[1:]:  # skip the report header line
        data_accession, filelist, md5list, indexlist = utils.parse_file_search_result_line(
            line, accession, output_format)
        # create run directory if downloading all data for an experiment
        if is_experiment:
            run_dir = os.path.join(accession_dir, data_accession)
            utils.create_dir(run_dir)
            target_dir = run_dir
        else:
            target_dir = accession_dir
        # download run/analysis XML
        if fetch_meta:
            download_meta(data_accession, target_dir)
        if len(filelist) == 0:
            # print() call form is valid under both Python 2 and Python 3
            if output_format is None:
                print('No files available for {0}'.format(data_accession))
            else:
                print('No files of format {0} available for {1}'.format(
                    output_format, data_accession))
            continue
        # each file URL is paired with its expected md5 checksum
        for file_url, md5 in zip(filelist, md5list):
            if file_url != '':
                download_file(file_url, target_dir, md5, aspera)
        if fetch_index:
            for index_file in indexlist:
                if index_file != '':
                    download_file(index_file, target_dir, None, aspera)
        if utils.is_empty_dir(target_dir):
            print('Deleting directory ' + os.path.basename(target_dir))
            os.rmdir(target_dir)
    def __init__(self, opts, features, img_spec, batch_size=64, seed=10, splits=['train'], tokenizer=None):
        """Build a batched panoramic environment and load instruction data.

        With a tokenizer, attaches 'instr_encoding' to each entry and — when
        opts.lang_embed requests 'divide' — a segmented
        'divid_instr_encoding' produced by the configured divide method.
        """
        self.env = PanoEnvBatch(features, img_spec, batch_size=batch_size)
        self.data = []
        self.scans = []
        self.opts = opts

        print('Loading {} dataset'.format(splits[0]))

        json_data = load_datasets(splits)
        total_length = len(json_data)

        # iteratively load data into system memory
        for idx, item in enumerate(json_data):
            # developing mode: stop after a small sample
            if not is_experiment() and idx >= 20:
                break

            # fan out: one entry per instruction of this path
            for instr_idx, instr in enumerate(item['instructions']):
                self.scans.append(item['scan'])
                entry = dict(item)
                entry['instr_id'] = '%s_%d' % (item['path_id'], instr_idx)
                entry['instructions'] = instr
                if tokenizer:
                    # synthetic data may already carry 'instr_encoding'
                    if 'instr_encoding' in item:
                        entry['instr_encoding'] = item['instr_encoding']
                    else:
                        entry['instr_encoding'] = tokenizer.encode_sentence(instr)
                    if 'divide' in opts.lang_embed:
                        divide = (tokenizer.divide_instr_kevin
                                  if opts.divide_method == 'kevin'
                                  else tokenizer.divide_instr_victor)
                        entry['divid_instr_encoding'] = divide(instr, opts.max_sentence_segs)
                self.data.append(entry)
            print_progress(idx + 1, total_length, prefix='Progress:',
                           suffix='Complete', bar_length=50)

        self.scans = set(self.scans)
        self.splits = splits
        self.seed = seed
        random.seed(self.seed)
        random.shuffle(self.data)
        self.ix = 0
        self.batch_size = batch_size
        self._load_nav_graphs()
        self._load_categories()
        print('R2RBatch loaded with %d instructions, using splits: %s' % (len(self.data), ",".join(splits)))
Exemplo n.º 5
0
def download_files(accession, format, dest_dir, fetch_index, fetch_meta, aspera):
    """Download data files (and optionally metadata/index) for *accession*.

    Falls back to utils.SUBMITTED_FORMAT when no format is given. One
    sub-directory per run is created when the accession is an experiment.

    NOTE: parameter ``format`` shadows the builtin; kept for caller
    compatibility (positional and keyword callers).
    """
    if format is None:
        format = utils.SUBMITTED_FORMAT
    accession_dir = os.path.join(dest_dir, accession)
    utils.create_dir(accession_dir)
    # download experiment xml
    is_experiment = utils.is_experiment(accession)
    if fetch_meta and is_experiment:
        download_meta(accession, accession_dir)
    # TODO download experiment xml for run accession
    if fetch_meta and utils.is_run(accession):
        download_experiment_meta(accession, accession_dir)
    # download data files: first fetch the file report from the portal
    search_url = utils.get_file_search_query(accession, format, fetch_index, aspera)
    temp_file = os.path.join(dest_dir, 'temp.txt')
    utils.download_report_from_portal(search_url, temp_file)
    # context manager guarantees the handle is closed before the file is removed
    with open(temp_file) as f:
        lines = f.readlines()
    os.remove(temp_file)
    for line in lines[1:]:  # skip the report header line
        data_accession, filelist, md5list, indexlist = utils.parse_file_search_result_line(line, accession,
                                                                                           format, fetch_index)
        # create run directory if downloading all data for an experiment
        if is_experiment:
            run_dir = os.path.join(accession_dir, data_accession)
            utils.create_dir(run_dir)
            target_dir = run_dir
        else:
            target_dir = accession_dir
        # download run/analysis XML
        if fetch_meta:
            download_meta(data_accession, target_dir)
        if len(filelist) == 0:
            # print() call form is valid under both Python 2 and Python 3
            print('No files of format ' + format + ' for ' + data_accession)
            continue
        # each file URL is paired with its expected md5 checksum
        for file_url, md5 in zip(filelist, md5list):
            if file_url != '':
                download_file_with_md5_check(file_url, target_dir, md5, aspera)
        for index_file in indexlist:
            if index_file != '':
                download_file(index_file, target_dir, aspera)
Exemplo n.º 6
0
        for index_file in indexlist:
            if index_file != '':
                download_file(index_file, target_dir, aspera)


if __name__ == '__main__':
    # Script entry point: parse CLI args and download all files for the
    # given INSDC run/experiment accession.
    parser = set_parser()
    args = parser.parse_args()

    accession = args.accession
    format = args.format
    dest_dir = args.dest
    fetch_meta = args.meta
    fetch_index = args.index
    aspera = args.aspera

    # only run/experiment accessions are supported by this entry point
    if not utils.is_run(accession) and not utils.is_experiment(accession):
        # print() call form is valid under both Python 2 and Python 3
        print('Error: Invalid accession. An INSDC run or experiment accession must be provided')
        sys.exit(1)

    if not utils.is_available(accession):
        print('Record does not exist or is not available for accession provided')
        sys.exit(1)

    try:
        download_files(accession, format, dest_dir, fetch_index, fetch_meta, aspera)
        print('Completed')
    except Exception:
        # report the failure via the project helper, then exit non-zero
        utils.print_error()
        sys.exit(1)
Exemplo n.º 7
0
def main(opts):
    """Train or evaluate a panoramic R2R seq2seq agent configured by *opts*.

    Pipeline: seed/setup, build tokenizer + RNN language encoder + policy
    model, optionally resume from a checkpoint, then either (a) produce
    test-submission results, (b) evaluate only, or (c) run the train/eval
    loop, checkpointing on the best val_unseen success rate.
    """
    # set manual_seed and build vocab
    print(opts, flush=True)

    setup(opts, opts.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print(f"Usando {device} :)")

    # create a batch training environment that will also preprocess text
    vocab = read_vocab(opts.train_vocab)
    tok = Tokenizer(opts.remove_punctuation == 1, opts.reversed == 1, vocab=vocab, encoding_length=opts.max_cap_length)

    # create language instruction encoder
    encoder_kwargs = {
        'opts': opts,
        'vocab_size': len(vocab),
        'embedding_size': opts.word_embedding_size,
        'hidden_size': opts.rnn_hidden_size,
        'padding_idx': padding_idx,
        'dropout_ratio': opts.rnn_dropout,
        'bidirectional': opts.bidirectional == 1,
        'num_layers': opts.rnn_num_layers
    }
    print('Using {} as encoder ...'.format(opts.lang_embed))
    if 'lstm' in opts.lang_embed:
        encoder = EncoderRNN(**encoder_kwargs)
    else:
        raise ValueError('Unknown {} language embedding'.format(opts.lang_embed))
    print(encoder)

    # create policy model
    policy_model_kwargs = {
        'opts':opts,
        'img_fc_dim': opts.img_fc_dim,
        'img_fc_use_batchnorm': opts.img_fc_use_batchnorm == 1,
        'img_dropout': opts.img_dropout,
        'img_feat_input_dim': opts.img_feat_input_dim,
        'rnn_hidden_size': opts.rnn_hidden_size,
        'rnn_dropout': opts.rnn_dropout,
        'max_len': opts.max_cap_length,
        'max_navigable': opts.max_navigable
    }

    if opts.arch == 'regretful':
        model = Regretful(**policy_model_kwargs)
    elif opts.arch == 'self-monitoring':
        model = SelfMonitoring(**policy_model_kwargs)
    elif opts.arch == 'speaker-baseline':
        model = SpeakerFollowerBaseline(**policy_model_kwargs)
    else:
        raise ValueError('Unknown {} model for seq2seq agent'.format(opts.arch))
    print(model)

    encoder = encoder.to(device)
    model = model.to(device)

    # one optimizer over encoder + policy parameters jointly
    params = list(encoder.parameters()) + list(model.parameters())
    optimizer = torch.optim.Adam(params, lr=opts.learning_rate)

    # optionally resume from a checkpoint
    # NOTE: best_success_rate is only bound here; later code reads it only
    # when opts.resume is set, so the name is never read unbound.
    if opts.resume:
        model, encoder, optimizer, best_success_rate = resume_training(opts, model, encoder, optimizer)

    # if a secondary exp name is specified, this is useful when resuming from a previous saved
    # experiment and save to another experiment, e.g., pre-trained on synthetic data and fine-tune on real data
    if opts.exp_name_secondary:
        opts.exp_name += opts.exp_name_secondary

    feature, img_spec = load_features(opts.img_feat_dir, opts.blind)

    if opts.test_submission:
        # held-out test split evaluation; requires a resumed checkpoint
        assert opts.resume, 'The model was not resumed before running for submission.'
        test_env = ('test', (R2RPanoBatch(opts, feature, img_spec, batch_size=opts.batch_size,
                                 splits=['test'], tokenizer=tok), Evaluation(['test'], opts)))
        agent_kwargs = {
            'opts': opts,
            'env': test_env[1][0],
            'results_path': "",
            'encoder': encoder,
            'model': model,
            'feedback': opts.feedback
        }
        agent = PanoSeq2SeqAgent(**agent_kwargs)
        # setup trainer
        trainer = PanoSeq2SeqTrainer(opts, agent, optimizer)
        epoch = opts.start_epoch - 1
        trainer.eval(epoch, test_env)
        return

    # set up R2R environments
    if not opts.train_data_augmentation:
        train_env = R2RPanoBatch(opts, feature, img_spec, batch_size=opts.batch_size, seed=opts.seed,
                                 splits=['train'], tokenizer=tok)
    else:
        # start on synthetic data; switched back to 'train' in the epoch
        # loop once epochs_data_augmentation is reached
        train_env = R2RPanoBatch(opts, feature, img_spec, batch_size=opts.batch_size, seed=opts.seed,
                                 splits=['synthetic'], tokenizer=tok)

    val_craft_splits = ['craft_seen', 'craft_unseen']
    val_splits = ['val_seen', 'val_unseen']
    if opts.craft_eval:
        val_splits += val_craft_splits
    # one (environment, evaluator) pair per validation split
    val_envs = {split: (R2RPanoBatch(opts, feature, img_spec, batch_size=opts.batch_size,
                                     splits=[split], tokenizer=tok), Evaluation([split], opts))
                for split in val_splits}
    # create agent
    agent_kwargs = {
        'opts': opts,
        'env': train_env,
        'results_path': "",
        'encoder': encoder,
        'model': model,
        'feedback': opts.feedback
    }
    agent = PanoSeq2SeqAgent(**agent_kwargs)

    # setup trainer
    trainer = PanoSeq2SeqTrainer(opts, agent, optimizer, opts.train_iters_epoch)

    if opts.eval_only:
        success_rate = []
        for val_env in val_envs.items():
            success_rate.append(trainer.eval(opts.start_epoch - 1, val_env, tb_logger=None))
        return

    # set up tensorboard logger
    tb_logger = set_tb_logger(opts.log_dir, opts.exp_name, opts.resume)
    sys.stdout.flush()
    best_success_rate = best_success_rate if opts.resume else 0.0
    for epoch in range(opts.start_epoch, opts.max_num_epochs + 1):
        trainer.train(epoch, train_env, tb_logger)

        if epoch % opts.eval_every_epochs == 0:
            success_rate = []
            for val_env in val_envs.items():
                success_rate.append(trainer.eval(epoch, val_env, tb_logger))

            # index 1 corresponds to 'val_unseen' (insertion order of
            # val_splits) — NOTE(review): verify if val_splits ever changes
            success_rate_compare = success_rate[1]

            if is_experiment():
                # remember best val_seen success rate and save checkpoint
                is_best = success_rate_compare >= best_success_rate
                best_success_rate = max(success_rate_compare, best_success_rate)
                print("--> Highest val_unseen success rate: {}".format(best_success_rate))
                sys.stdout.flush()

                # save the model if it is the best so far
                save_checkpoint({
                    'opts': opts,
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'encoder_state_dict': encoder.state_dict(),
                    'best_success_rate': best_success_rate,
                    'optimizer': optimizer.state_dict(),
                    'max_episode_len': opts.max_episode_len,
                }, is_best, checkpoint_dir=opts.checkpoint_dir, name=opts.exp_name)

        if opts.train_data_augmentation and epoch == opts.epochs_data_augmentation:
            # switch from synthetic augmentation data to the real train split
            train_env = R2RPanoBatch(opts, feature, img_spec, batch_size=opts.batch_size, seed=opts.seed,
                                     splits=['train'], tokenizer=tok)

    print("--> Finished training")
Exemplo n.º 8
0
     if utils.is_wgs_set(accession):
         if output_format is not None:
             sequenceGet.check_format(output_format)
         sequenceGet.download_wgs(dest_dir, accession, output_format)
     elif not utils.is_available(accession):
         sys.stderr.write('ERROR: Record does not exist or is not available for accession provided\n')
         sys.exit(1)
     elif utils.is_sequence(accession):
         if output_format is not None:
             sequenceGet.check_format(output_format)
         sequenceGet.download_sequence(dest_dir, accession, output_format, expanded)
     elif utils.is_analysis(accession):
         if output_format is not None:
             readGet.check_read_format(output_format)
         readGet.download_files(accession, output_format, dest_dir, fetch_index, fetch_meta, aspera)
     elif utils.is_run(accession) or utils.is_experiment(accession):
         if output_format is not None:
             readGet.check_read_format(output_format)
         readGet.download_files(accession, output_format, dest_dir, fetch_index, fetch_meta, aspera)
     elif utils.is_assembly(accession):
         if output_format is not None:
             assemblyGet.check_format(output_format)
         assemblyGet.download_assembly(dest_dir, accession, output_format, fetch_wgs, extract_wgs, expanded)
     else:
         sys.stderr.write('ERROR: Invalid accession provided\n')
         sys.exit(1)
     print 'Completed'
 except Exception:
     traceback.print_exc()
     utils.print_error()
     sys.exit(1)
Exemplo n.º 9
0
def main(opts):
    """Train or evaluate a panoramic R2R seq2seq agent configured by *opts*.

    Pipeline: seed/setup, build tokenizer + RNN language encoder + policy
    model, optionally resume from a checkpoint, then either (a) produce
    test-submission results, (b) evaluate only, or (c) run the train/eval
    loop, checkpointing on the best val_unseen success rate.
    """

    # set manual_seed and build vocab
    setup(opts, opts.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # create a batch training environment that will also preprocess text
    vocab = read_vocab(opts.train_vocab)
    tok = Tokenizer(
        opts.remove_punctuation == 1,
        opts.reversed == 1,
        vocab=vocab,
        encoding_length=opts.max_cap_length,
    )

    # create language instruction encoder
    encoder_kwargs = {
        "opts": opts,
        "vocab_size": len(vocab),
        "embedding_size": opts.word_embedding_size,
        "hidden_size": opts.rnn_hidden_size,
        "padding_idx": padding_idx,
        "dropout_ratio": opts.rnn_dropout,
        "bidirectional": opts.bidirectional == 1,
        "num_layers": opts.rnn_num_layers,
    }
    print("Using {} as encoder ...".format(opts.lang_embed))
    if "lstm" in opts.lang_embed:
        encoder = EncoderRNN(**encoder_kwargs)
    else:
        raise ValueError("Unknown {} language embedding".format(
            opts.lang_embed))
    print(encoder)

    # create policy model
    policy_model_kwargs = {
        "opts": opts,
        "img_fc_dim": opts.img_fc_dim,
        "img_fc_use_batchnorm": opts.img_fc_use_batchnorm == 1,
        "img_dropout": opts.img_dropout,
        "img_feat_input_dim": opts.img_feat_input_dim,
        "rnn_hidden_size": opts.rnn_hidden_size,
        "rnn_dropout": opts.rnn_dropout,
        "max_len": opts.max_cap_length,
        "max_navigable": opts.max_navigable,
    }

    if opts.arch == "self-monitoring":
        model = SelfMonitoring(**policy_model_kwargs)
    elif opts.arch == "speaker-baseline":
        model = SpeakerFollowerBaseline(**policy_model_kwargs)
    else:
        raise ValueError("Unknown {} model for seq2seq agent".format(
            opts.arch))
    print(model)

    encoder = encoder.to(device)
    model = model.to(device)

    # one optimizer over encoder + policy parameters jointly
    params = list(encoder.parameters()) + list(model.parameters())
    optimizer = torch.optim.Adam(params, lr=opts.learning_rate)

    # optionally resume from a checkpoint
    # NOTE: best_success_rate is only bound here; later code reads it only
    # when opts.resume is set, so the name is never read unbound.
    if opts.resume:
        model, encoder, optimizer, best_success_rate = resume_training(
            opts, model, encoder, optimizer)

    # if a secondary exp name is specified, this is useful when resuming from a previous saved
    # experiment and save to another experiment, e.g., pre-trained on synthetic data and fine-tune on real data
    if opts.exp_name_secondary:
        opts.exp_name += opts.exp_name_secondary

    feature, img_spec = load_features(opts.img_feat_dir)

    if opts.test_submission:
        # held-out test split evaluation; requires a resumed checkpoint
        assert (opts.resume
                ), "The model was not resumed before running for submission."
        test_env = (
            "test",
            (
                R2RPanoBatch(
                    opts,
                    feature,
                    img_spec,
                    batch_size=opts.batch_size,
                    splits=["test"],
                    tokenizer=tok,
                ),
                Evaluation(["test"]),
            ),
        )
        agent_kwargs = {
            "opts": opts,
            "env": test_env[1][0],
            "results_path": "",
            "encoder": encoder,
            "model": model,
            "feedback": opts.feedback,
        }
        agent = PanoSeq2SeqAgent(**agent_kwargs)
        # setup trainer
        trainer = PanoSeq2SeqTrainer(opts, agent, optimizer)
        epoch = opts.start_epoch - 1
        trainer.eval(epoch, test_env)
        return

    # set up R2R environments
    if not opts.train_data_augmentation:
        train_env = R2RPanoBatch(
            opts,
            feature,
            img_spec,
            batch_size=opts.batch_size,
            seed=opts.seed,
            splits=["train"],
            tokenizer=tok,
        )
    else:
        # start on synthetic data; switched back to 'train' in the epoch
        # loop once epochs_data_augmentation is reached
        train_env = R2RPanoBatch(
            opts,
            feature,
            img_spec,
            batch_size=opts.batch_size,
            seed=opts.seed,
            splits=["synthetic"],
            tokenizer=tok,
        )

    # one (environment, evaluator) pair per validation split
    val_envs = {
        split: (
            R2RPanoBatch(
                opts,
                feature,
                img_spec,
                batch_size=opts.batch_size,
                splits=[split],
                tokenizer=tok,
            ),
            Evaluation([split]),
        )
        for split in ["val_seen", "val_unseen"]
    }

    # create agent
    agent_kwargs = {
        "opts": opts,
        "env": train_env,
        "results_path": "",
        "encoder": encoder,
        "model": model,
        "feedback": opts.feedback,
    }
    agent = PanoSeq2SeqAgent(**agent_kwargs)

    # setup trainer
    trainer = PanoSeq2SeqTrainer(opts, agent, optimizer,
                                 opts.train_iters_epoch)

    if opts.eval_beam or opts.eval_only:
        success_rate = []
        for val_env in val_envs.items():
            success_rate.append(
                trainer.eval(opts.start_epoch - 1, val_env, tb_logger=None))
        return

    # set up tensorboard logger
    tb_logger = set_tb_logger(opts.log_dir, opts.exp_name, opts.resume)

    best_success_rate = best_success_rate if opts.resume else 0.0

    for epoch in range(opts.start_epoch, opts.max_num_epochs + 1):
        trainer.train(epoch, train_env, tb_logger)

        if epoch % opts.eval_every_epochs == 0:
            success_rate = []
            for val_env in val_envs.items():
                success_rate.append(trainer.eval(epoch, val_env, tb_logger))

            # index 1 corresponds to 'val_unseen' (dict insertion order)
            success_rate_compare = success_rate[1]

            if is_experiment():
                # remember best val_seen success rate and save checkpoint
                is_best = success_rate_compare >= best_success_rate
                best_success_rate = max(success_rate_compare,
                                        best_success_rate)
                print("--> Highest val_unseen success rate: {}".format(
                    best_success_rate))

                # save the model if it is the best so far
                save_checkpoint(
                    {
                        "opts": opts,
                        "epoch": epoch + 1,
                        "state_dict": model.state_dict(),
                        "encoder_state_dict": encoder.state_dict(),
                        "best_success_rate": best_success_rate,
                        "optimizer": optimizer.state_dict(),
                        "max_episode_len": opts.max_episode_len,
                    },
                    is_best,
                    checkpoint_dir=opts.checkpoint_dir,
                    name=opts.exp_name,
                )

        if (opts.train_data_augmentation
                and epoch == opts.epochs_data_augmentation):
            # switch from synthetic augmentation data to the real train split
            train_env = R2RPanoBatch(
                opts,
                feature,
                img_spec,
                batch_size=opts.batch_size,
                seed=opts.seed,
                splits=["train"],
                tokenizer=tok,
            )

    print("--> Finished training")