Example No. 1
def _main(config, config_idx):
    base_filename = config.name + '_cfg' + str(config_idx)
    logger = set_up_logger('logs/' + base_filename + '.log')
    title = '{}: {} ({}) config index {}'.format(__file__, config.name,
                                                 config.desc, config_idx)
    logger.info('START ' + title + '\n\n{}\n'.format(config))

    data = get_data(config)

    if config.device != 'cpu':
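        # Bind the requested GPU before Theano is imported anywhere else; the
        # assert below guards that the device selection can still take effect.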
        assert 'theano' not in sys.modules
        import theano.sandbox.cuda
        theano.sandbox.cuda.use(config.device)
    from model import get_model
    model = get_model(config, data)

    if not config.is_train:
        if config.tst_load_model_path and not model.load_if_exists(
                config.tst_load_model_path):
            raise AssertionError('Failed loading model weights from {}'.format(
                config.tst_load_model_path))
        ans_hats = _tst_epoch(config, model, data)
        write_test_predictions(ans_hats, config.tst_prd_json_path)
        logger.info('END ' + title)
        return

    # Training loop
    epoch_results = []
    max_em = -np.inf
    max_f1 = -np.inf
    np_rng = np.random.RandomState(config.seed // 2)
    for epoch in range(1, config.max_num_epochs + 1):
        trn_loss, trn_acc, trn_samples_per_sec = _trn_epoch(
            config, model, data, epoch, np_rng)
        dev_min_loss, dev_prx_loss, dev_max_acc, dev_prx_acc, dev_em, dev_f1 = _dev_epoch(
            config, model, data)
        if dev_em > max_em:
            model.save('models/' + base_filename + '_best_em.pkl')
            max_em = dev_em
        if dev_f1 > max_f1:
            model.save('models/' + base_filename + '_best_f1.pkl')
            max_f1 = dev_f1
        if epoch % 5 == 0:
            model.save('models/' + base_filename +
                       '_e{:03d}.pkl'.format(epoch))
        epoch_results.append(
            EpochResult(trn_loss, trn_acc, dev_min_loss, dev_prx_loss,
                        dev_max_acc, dev_prx_acc, dev_em, dev_f1))
        if config.plot:
            plot_epoch_results(epoch_results, 'logs/' + base_filename + '.png')
        logger.info(
            '\n\nFinished epoch {} for: (config index {}) (samples/sec: {:<.1f})\n{}\n\nResults:\n{}\n\n'
            .format(epoch, config_idx, trn_samples_per_sec,
                    config.format_compared(),
                    format_epoch_results(epoch_results)))
    logger.info('END ' + title)
Example No. 2
def _main(config, config_idx, train):
  base_filename = config.name + '_cfg' + str(config_idx)
  logger = set_up_logger('logs/' + base_filename + '.log')
  title = '{}: {} ({}) config index {}'.format(__file__, config.name, config.desc, config_idx)
  logger.info('START ' + title + '\n\n{}\n'.format(config))

  data = get_data(config, train)

  if config.device != 'cpu':
    assert 'theano' not in sys.modules 
    import theano.sandbox.cuda
    theano.sandbox.cuda.use(config.device)
  from model import get_model
  model = get_model(config, data)

  if not train:
    assert config.tst_load_model_path
    if not model.load(config.tst_load_model_path):
      raise AssertionError('Failed loading model weights from {}'.format(config.tst_load_model_path))
    ans_hats = _tst_epoch(config, model, data)
    write_test_predictions(ans_hats, config.pred_json_path)
    logger.info('END ' + title)
    return

  # Training loop
  epoch_results = []
  max_em = -np.inf
  max_f1 = -np.inf
  np_rng = np.random.RandomState(config.seed // 2)
  for epoch in range(1, config.max_num_epochs+1):
    trn_loss, trn_acc, trn_samples_per_sec = _trn_epoch(config, model, data, epoch, np_rng)
    dev_loss, dev_acc, dev_em, dev_f1 = _dev_epoch(config, model, data)
    if dev_em > max_em:
      model.save('models/' + base_filename + '_best_em.pkl')
      max_em = dev_em
    if dev_f1 > max_f1:
      model.save('models/' + base_filename + '_best_f1.pkl')
      max_f1 = dev_f1
    if config.save_freq and epoch % config.save_freq == 0:
      model.save('models/' + base_filename + '_e{:03d}.pkl'.format(epoch))
    epoch_results.append(
      EpochResult(trn_loss, trn_acc, dev_loss, dev_acc, dev_em, dev_f1))
    if config.plot:
      plot_epoch_results(epoch_results, 'logs/' + base_filename + '.png')
    logger.info('\n\nFinished epoch {} for: (config index {}) (samples/sec: {:<.1f})\n{}\n\nResults:\n{}\n\n'.format(
      epoch, config_idx, trn_samples_per_sec, config.format_compared(), format_epoch_results(epoch_results)))
  logger.info('END ' + title)
Example No. 3
    _print_title('Adding random embeddings for unknown words')

    glove_with_unks_word_emb_data = _add_extra_embeddings(
        [tokenized_trn_json_path, tokenized_dev_json_path],
        glove_word_emb_data, num_extra_embeddings)
    _write_word_emb_data(
        GLOVE_PREPROC_WITH_UNKS_PATH_PREFIX + ('.split' if split else ''),
        glove_with_unks_word_emb_data)

    if write_dummy_test:
        _print_title('Writing dummy test JSON')
        _write_dummy_tst_json(DEV_JSON_PATH, DUMMY_TST_JSON_PATH)


if __name__ == '__main__':
    logger = set_up_logger(log_filename=None, datetime=False)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--split',
        help=
        'whether to split unknown hyphenated words which have a known constituent token',
        action='store_true')
    parser.add_argument('--num_extra_embeddings',
                        help='number of extra random embeddings to produce',
                        type=int,
                        default=100000)
    parser.add_argument('--write_dummy_test',
                        help='whether to write a dummy test JSON file',
                        action='store_true')
    args = parser.parse_args()
    logger.info('\n' + str(args))
    _main(args.split, args.num_extra_embeddings, args.write_dummy_test)
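
A note on the argparse declarations above: a boolean switch uses action='store_true' and takes no value on the command line, while a numeric option such as --num_extra_embeddings needs type=int so that a count can actually be passed. A minimal, standalone sketch (standard-library argparse only, not part of the repository):

import argparse

parser = argparse.ArgumentParser()
# Value-less switch: absent -> False, present -> True.
parser.add_argument('--write_dummy_test', action='store_true',
                    help='whether to write a dummy test JSON file')
# Integer option: parsed with type=int, falls back to the default when omitted.
parser.add_argument('--num_extra_embeddings', type=int, default=100000,
                    help='number of extra random embeddings to produce')

args = parser.parse_args(['--num_extra_embeddings', '250000'])
assert args.num_extra_embeddings == 250000
assert args.write_dummy_test is False
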
Example No. 4
    _tokenize_json(DEV_JSON_PATH,
                   TOKENIZED_DEV_JSON_PATH,
                   GLOVE_STRS_PATH,
                   has_answers=True)

    print_title('Adding random embeddings for unknown words')
    glove_with_unks_word_emb_data = _add_extra_embeddings(
        [TOKENIZED_TRN_JSON_PATH, TOKENIZED_DEV_JSON_PATH],
        glove_word_emb_data, GLOVE_NUM_UNK_EMBEDDINGS)

    write_word_emb_data(GLOVE_PREPROC_WITH_UNKS_PATH_PREFIX,
                        glove_with_unks_word_emb_data)


if __name__ == '__main__':
    logger = set_up_logger(log_filename='logs/setup.log')
    logger.info('Setup started.')

    parser = argparse.ArgumentParser()
    parser.add_argument('command',
                        nargs='?',
                        help='"prepare-squad", "prepare-lm" or "lm-encode"')
    parser.add_argument('--device',
                        help='device e.g. cpu, gpu0, gpu1, ...',
                        default='cpu')
    parser.add_argument('--dataset', help='"train" or "dev"', default=None)
    parser.add_argument('--sequences',
                        help='"contexts" or "questions"',
                        default=None)
    parser.add_argument('--layer', help='"L1" or "L2" or "EMB"', default=None)
    parser.add_argument('--num_shards',
Example No. 5
def _gpu_answers(name, anss, max_ans_len):
    assert anss.dtype == np.int32
    assert anss.shape[1] == 2
    anss_val = np.array([_np_ans_word_idxs_to_ans_idx(ans_stt, ans_end, max_ans_len) for \
                         ans_stt, ans_end in anss], dtype=np.int32)
    ans_stts_val = anss[:, 0]
    ans_ends_val = anss[:, 1]

    gpu_anss = torch.from_numpy(anss_val)
    gpu_ans_stts = torch.from_numpy(ans_stts_val)
    gpu_ans_ends = torch.from_numpy(ans_ends_val)
    return gpu_anss, gpu_ans_stts, gpu_ans_ends

config = Config()
base_filename = config.name + '_cfg' + str(0)
logger = set_up_logger('logs/' + base_filename + '.log')
title = '{}: {}'.format(__file__, config.name)
logger.info('START ' + title + '\n\n{}\n'.format(config))
data = get_data(config, train=True)
emb_val = data.word_emb_data.word_emb  # (voc size, emb_dim)
first_known_word = data.word_emb_data.first_known_word
assert config.emb_dim == emb_val.shape[1]
assert first_known_word > 0
emb_val[:first_known_word] = 0
emb = torch.from_numpy(emb_val)

# Load all the data: the train and dev sets.
trn_ctxs, trn_ctx_masks, trn_ctx_lens, trn_qtns, trn_qtn_masks, trn_qtn_lens, trn_qtn_ctx_idxs, trn_anss, trn_ans_stts, trn_ans_ends = _gpu_dataset(
    'trn', data.trn, config)
dev_ctxs, dev_ctx_masks, dev_ctx_lens, dev_qtns, dev_qtn_masks, dev_qtn_lens, dev_qtn_ctx_idxs, dev_anss, dev_ans_stts, dev_ans_ends = _gpu_dataset(
    'dev', data.dev, config)
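
The helper _np_ans_word_idxs_to_ans_idx used by _gpu_answers is not shown in this example. In span-prediction models of this kind a common encoding, assumed here purely for illustration, flattens the (start, end) word-index pair into a single class index with the span length capped at max_ans_len:

def _np_ans_word_idxs_to_ans_idx(ans_stt, ans_end, max_ans_len):
    # Hypothetical reconstruction: one class per (start, length) combination,
    # assuming 0 <= ans_end - ans_stt < max_ans_len.
    assert 0 <= ans_end - ans_stt < max_ans_len
    return ans_stt * max_ans_len + (ans_end - ans_stt)

def _np_ans_idx_to_word_idxs(ans_idx, max_ans_len):
    # Inverse mapping, useful for decoding a predicted class back to a span.
    ans_stt = ans_idx // max_ans_len
    return ans_stt, ans_stt + ans_idx % max_ans_len

assert _np_ans_idx_to_word_idxs(_np_ans_word_idxs_to_ans_idx(3, 5, 30), 30) == (3, 5)
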
Example No. 6
  tokenized_trn_json_path = TRN_JSON_PATH.replace('.json', filename_suffix)
  tokenized_dev_json_path = DEV_JSON_PATH.replace('.json', filename_suffix)
  _tokenize_json(TRN_JSON_PATH, tokenized_trn_json_path, True, GLOVE_STRS_PATH, split)
  _tokenize_json(DEV_JSON_PATH, tokenized_dev_json_path, True, GLOVE_STRS_PATH, split)

  _print_title('Adding random embeddings for unknown words')

  glove_with_unks_word_emb_data = _add_extra_embeddings(
    [tokenized_trn_json_path, tokenized_dev_json_path], glove_word_emb_data, num_extra_embeddings)
  _write_word_emb_data(
    GLOVE_PREPROC_WITH_UNKS_PATH_PREFIX + ('.split' if split else ''), glove_with_unks_word_emb_data)

  if write_dummy_test:
    _print_title('Writing dummy test JSON')
    _write_dummy_tst_json(DEV_JSON_PATH, DUMMY_TST_JSON_PATH)


if __name__ == '__main__':
  logger = set_up_logger(log_filename=None, datetime=False)
  parser = argparse.ArgumentParser()
  parser.add_argument('--split',
    help='whether to split unknown hyphenated words which have a known constituent token', action='store_true')
  parser.add_argument('--num_extra_embeddings',
    help='number of extra random embeddings to produce', type=int, default=100000)
  parser.add_argument('--write_dummy_test',
    help='whether to write a dummy test JSON file', action='store_true')
  args = parser.parse_args()
  logger.info('\n' + str(args))
  _main(args.split, args.num_extra_embeddings, args.write_dummy_test)

Example No. 7
def _main(config, config_idx, train):
    base_filename = config.name + '_cfg' + str(config_idx)
    logger = set_up_logger('logs/' + base_filename + '.log')
    title = '{}: {} ({}) config index {}'.format(__file__, config.name,
                                                 config.desc, config_idx)
    logger.info('START ' + title + '\n\n{}\n'.format(config))

    data = get_data(config, train)

    if config.device != 'cpu':
        assert 'theano' not in sys.modules
        import theano.sandbox.cuda
        theano.sandbox.cuda.use(config.device)
    from model import get_model
    model = get_model(config, data)

    if not train:
        assert config.tst_load_model_path
        if not model.load(config.tst_load_model_path):
            raise AssertionError('Failed loading model weights from {}'.format(
                config.tst_load_model_path))
        ans_hats = _tst_epoch(config, model, data)
        write_test_predictions(ans_hats, config.pred_json_path)
        logger.info('END ' + title)
        return

    # Training loop
    epoch_results = []
    max_em = -np.inf
    max_f1 = -np.inf
    epochs_with_no_improvement = 0
    np_rng = np.random.RandomState(config.seed // 2)
    for epoch in range(1, config.max_num_epochs + 1):
        trn_loss, trn_acc, trn_samples_per_sec = _trn_epoch(
            config, model, data, epoch, np_rng)
        dev_loss, dev_acc, dev_em, dev_f1 = _dev_epoch(config, model, data)
        improved = False
        if dev_em > max_em:
            model.save('models/' + base_filename + '_best_em.pkl')
            max_em = dev_em
            improved = True
        if dev_f1 > max_f1:
            model.save('models/' + base_filename + '_best_f1.pkl')
            max_f1 = dev_f1
            improved = True
        if improved:
            # Best EM and/or F1 so far: reset the early-stopping counter.
            epochs_with_no_improvement = 0
        else:
            # Neither dev_em nor dev_f1 beat its running maximum; count this
            # epoch towards the early-stopping patience.
            epochs_with_no_improvement += 1
        if config.save_freq and epoch % config.save_freq == 0:
            model.save('models/' + base_filename +
                       '_e{:03d}.pkl'.format(epoch))
        epoch_results.append(
            EpochResult(trn_loss, trn_acc, dev_loss, dev_acc, dev_em, dev_f1))
        if config.plot:
            plot_epoch_results(epoch_results, 'logs/' + base_filename + '.png')
        logger.info(
            '\n\nFinished epoch {} for: (config index {}) (samples/sec: {:<.1f})\n{}\n\nResults:\n{}\n\n'
            .format(epoch, config_idx, trn_samples_per_sec,
                    config.format_compared(),
                    format_epoch_results(epoch_results)))
        # Check if we have to do early stopping.
        if epochs_with_no_improvement > config.patience:
            logger.info("Patience exceeded.")
            break
    logger.info('END ' + title)
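
The counter-based patience logic above is easy to express as a small reusable helper. The following is a minimal sketch (hypothetical class, not part of the repository) of the same kind of patience rule applied to a single dev metric:

class EarlyStopping(object):
    """Stop once the tracked dev metric has not improved for `patience` epochs."""

    def __init__(self, patience):
        self.patience = patience
        self.best = float('-inf')
        self.epochs_with_no_improvement = 0

    def step(self, metric):
        # Returns True when training should stop.
        if metric > self.best:
            self.best = metric
            self.epochs_with_no_improvement = 0
        else:
            self.epochs_with_no_improvement += 1
        return self.epochs_with_no_improvement > self.patience

stopper = EarlyStopping(patience=2)
for dev_f1 in [0.60, 0.62, 0.61, 0.61, 0.61]:
    if stopper.step(dev_f1):
        break  # stops after three consecutive epochs without improvement
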
Example No. 8
def _main(config):
    base_filename = config.name
    logger_filename = 'logs/' + base_filename + '.log'
    logger = set_up_logger(logger_filename)
    title = '{}: {} ({})'.format(__file__, config.name, config.desc)
    logger.info('START ' + title + '\n\n{}\n'.format(config))

    data = get_data(
        word_emb_data_path_prefix=GLOVE_PREPROC_WITH_UNKS_PATH_PREFIX,
        tokenized_trn_json_path=TOKENIZED_TRN_JSON_PATH,
        tokenized_dev_json_path=TOKENIZED_DEV_JSON_PATH,
        max_ans_len=MAX_ANS_LEN,
        max_ctx_len=MAX_CTX_LEN)

    if config.device != 'cpu':
        assert 'theano' not in sys.modules
        import theano.sandbox.cuda
        theano.sandbox.cuda.use(config.device)

    from model import get_model
    model = get_model(config, data)

    lm_data = get_lm_data(config.lm_layer) if config.mode == 'LM' else None

    # Training loop
    epoch_results = []
    max_em = -np.inf
    max_f1 = -np.inf
    np_rng = np.random.RandomState(config.seed // 2)
    for epoch in range(1, config.max_num_epochs + 1):
        trn_loss, trn_acc, trn_samples_per_sec, trn_num_all_samples, trn_num_valid_samples, \
          trn_mean_grad_norm, trn_max_grad_norm, trn_min_grad_norm, trn_num_unsafe_samples = \
            _trn_epoch(config, model, data, lm_data, epoch, np_rng)
        dev_loss, dev_acc, dev_em, dev_f1, dev_num_all_samples, dev_num_valid_samples = \
          _dev_epoch(config, model, data, lm_data)

        best_filename = base_filename
        if dev_em > max_em:
            model.save('models/' + best_filename + '_best_em.pkl')
            max_em = dev_em
        if dev_f1 > max_f1:
            model.save('models/' + best_filename + '_best_f1.pkl')
            max_f1 = dev_f1
        if config.save_freq and epoch % config.save_freq == 0:
            model.save('models/' + base_filename +
                       '_e{:03d}.pkl'.format(epoch))

        epoch_results.append(
            EpochResult(trn_loss, trn_acc, dev_loss, dev_acc, dev_em, dev_f1))
        if config.plot:
            plot_epoch_results(epoch_results, 'logs/' + base_filename + '.png')
        logger.info(
            ('\n\nEpc {} {}: (smp/sec: {:<.1f})'
             ' (trn: {}/{}) (dev: {}/{})'
             ' (grad: avg:{} max:{} min:{}) (low probability predictions:{})'
             '\n{}\n\nResults:\n{}\n\n').format(
                 epoch, config.name, trn_samples_per_sec,
                 trn_num_valid_samples, trn_num_all_samples,
                 dev_num_valid_samples, dev_num_all_samples,
                 trn_mean_grad_norm, trn_max_grad_norm, trn_min_grad_norm,
                 trn_num_unsafe_samples, config.format_compared(),
                 format_epoch_results(epoch_results)))

    logger.info('END ' + title)