    def test_read_creates_cache_file_when_not_present(self):
        snli_file = AllenNlpTestCase.FIXTURES_ROOT / "data" / "snli.jsonl"
        reader = SnliReader(cache_directory=self.cache_directory)
        cache_file = reader._get_cache_location_for_file_path(snli_file)
        assert not os.path.exists(cache_file)
        reader.read(snli_file)
        assert os.path.exists(cache_file)
Example #2
def main():
    args = parse_args()
    checkpoint_path = Path(args.checkpoint)
    checkpoint_dir = checkpoint_path.parent
    params_path = checkpoint_dir / 'params.json'
    vocab_dir = checkpoint_dir / 'vocab'

    params = Params.from_file(params_path)
    train_params, model_params = params.pop('train'), params.pop('model')

    tokenizer = WordTokenizer(
        start_tokens=['<s>'],
        end_tokens=['</s>'],
    )
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = SnliReader(tokenizer=tokenizer,
                                token_indexers={'tokens': token_indexer})

    valid_dataset = dataset_reader.read(train_params.pop('valid_dataset_path'))
    if not args.test_dataset:
        test_dataset_path = train_params.pop('test_dataset_path')
    else:
        test_dataset_path = args.test_dataset
    test_dataset = dataset_reader.read(test_dataset_path)
    if args.only_label:
        test_dataset = [
            d for d in test_dataset
            if d.fields['label'].label == args.only_label
        ]
    vocab = Vocabulary.from_files(vocab_dir)
    random.shuffle(valid_dataset)

    model_params['token_embedder']['pretrained_file'] = None
    model = SNLIModel(params=model_params, vocab=vocab)
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'),
                          strict=False)
    model.to(args.cuda_device)
    model.eval()

    torch.set_grad_enabled(False)

    iterator = BasicIterator(batch_size=32)
    iterator.index_with(vocab)

    for dataset in (valid_dataset, test_dataset):
        generator = iterator(dataset, shuffle=False, num_epochs=1)
        model.get_metrics(reset=True)
        for batch in tqdm(generator):
            batch = move_to_device(batch, cuda_device=args.cuda_device)
            model(premise=batch['premise'],
                  hypothesis=batch['hypothesis'],
                  label=batch['label'])
        metrics = model.get_metrics()
        pprint(metrics)
Example #3
    def test_read_uses_existing_cache_file_when_present(self):
        snli_file = AllenNlpTestCase.FIXTURES_ROOT / "data" / "snli.jsonl"
        snli_copy_file = str(snli_file) + ".copy"
        shutil.copyfile(snli_file, snli_copy_file)
        reader = SnliReader(cache_directory=self.cache_directory)

        # The first read will create the cache.
        instances = reader.read(snli_copy_file)
        # Now we _remove_ the data file, to be sure we're reading from the cache.
        os.remove(snli_copy_file)
        cached_instances = reader.read(snli_copy_file)
        # We should get the same instances both times.
        assert len(instances) == len(cached_instances)
        for instance, cached_instance in zip(instances, cached_instances):
            assert instance.fields == cached_instance.fields
Example #4
    def test_read_from_file(self):

        reader = SnliReader()
        dataset = reader.read('tests/fixtures/data/snli.jsonl')

        instance1 = {"premise": ["A", "person", "on", "a", "horse",
                                 "jumps", "over", "a", "broken", "down", "airplane", "."],
                     "hypothesis": ["A", "person", "is", "training",
                                    "his", "horse", "for", "a", "competition", "."],
                     "label": "neutral"}

        instance2 = {"premise": ["A", "person", "on", "a", "horse",
                                 "jumps", "over", "a", "broken", "down", "airplane", "."],
                     "hypothesis": ["A", "person", "is", "at", "a", "diner",
                                    ",", "ordering", "an", "omelette", "."],
                     "label": "contradiction"}
        instance3 = {"premise": ["A", "person", "on", "a", "horse",
                                 "jumps", "over", "a", "broken", "down", "airplane", "."],
                     "hypothesis": ["A", "person", "is", "outdoors", ",", "on", "a", "horse", "."],
                     "label": "entailment"}

        assert len(dataset.instances) == 3
        fields = dataset.instances[0].fields()
        assert fields["premise"].tokens() == instance1["premise"]
        assert fields["hypothesis"].tokens() == instance1["hypothesis"]
        assert fields["label"].label() == instance1["label"]
        fields = dataset.instances[1].fields()
        assert fields["premise"].tokens() == instance2["premise"]
        assert fields["hypothesis"].tokens() == instance2["hypothesis"]
        assert fields["label"].label() == instance2["label"]
        fields = dataset.instances[2].fields()
        assert fields["premise"].tokens() == instance3["premise"]
        assert fields["hypothesis"].tokens() == instance3["hypothesis"]
        assert fields["label"].label() == instance3["label"]
Example #5
    def test_cached_max_instances(self, lazy):
        snli_file = AllenNlpTestCase.FIXTURES_ROOT / "data" / "snli.jsonl"

        # The first read will create the cache if it's not there already.
        reader = SnliReader(cache_directory=self.cache_directory, lazy=lazy)
        instances = reader.read(snli_file)
        instance_count = sum(1 for _ in instances)
        assert instance_count > 2

        # The second read should only return two instances, even though it's from the cache.
        reader = SnliReader(cache_directory=self.cache_directory,
                            max_instances=2,
                            lazy=lazy)
        instances = reader.read(snli_file)
        instance_count = sum(1 for _ in instances)
        assert instance_count == 2
Example #6
    def test_read_from_file(self, lazy):
        reader = SnliReader(lazy=lazy)
        instances = reader.read('tests/fixtures/multinli_1.0_train.jsonl')
        instances = ensure_list(instances)

        instance0 = {"premise": ["Conceptually", "cream", "skimming", "has", "two", "basic", "dimensions", "-", "product", "and", "geography", "."],
                     "hypothesis": ["Product", "and", "geography", "are", "what", "make", "cream", "skimming", "work", "."],
                     "label": "neutral"}

        instance1 = {"premise": ["you", "know", "during", "the", "season", "and", "i", "guess", "at", "at", "your", "level", "uh", "you", "lose", "them", "to", "the", "next", "level", "if", "if", "they", "decide", "to", "recall", "the", "the", "parent", "team", "the", "Braves", "decide", "to", "call", "to", "recall", "a", "guy", "from", "triple", "A", "then", "a", "double", "A", "guy", "goes", "up", "to", "replace", "him", "and", "a", "single", "A", "guy", "goes", "up", "to", "replace", "him"],
                     "hypothesis": ["You", "lose", "the", "things", "to", "the", "following", "level", "if", "the", "people", "recall", "."],
                     "label": "entailment"}

        instance2 = {"premise": ["One", "of", "our", "number", "will", "carry", "out", "your", "instructions", "minutely", "."],
                     "hypothesis": ["A", "member", "of", "my", "team", "will", "execute", "your", "orders", "with", "immense", "precision", "."],
                     "label": "entailment"}

        assert len(instances) == 3
        def equals(fields, instance):
            assert [t.text for t in fields["premise"].tokens] == instance["premise"]
            assert [t.text for t in fields["hypothesis"].tokens] == instance["hypothesis"]
            assert fields["label"].label == instance["label"]

        equals(instances[0].fields, instance0)
        equals(instances[1].fields, instance1)
        equals(instances[2].fields, instance2)
Example #7
    def test_read_from_file(self, lazy):
        reader = SnliReader(lazy=lazy)
        instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'snli.jsonl')
        instances = ensure_list(instances)

        instance1 = {u"premise": [u"A", u"person", u"on", u"a", u"horse", u"jumps", u"over", u"a", u"broken",
                                 u"down", u"airplane", u"."],
                     u"hypothesis": [u"A", u"person", u"is", u"training", u"his", u"horse", u"for", u"a",
                                    u"competition", u"."],
                     u"label": u"neutral"}

        instance2 = {u"premise": [u"A", u"person", u"on", u"a", u"horse", u"jumps", u"over", u"a", u"broken",
                                 u"down", u"airplane", u"."],
                     u"hypothesis": [u"A", u"person", u"is", u"at", u"a", u"diner", u",", u"ordering", u"an",
                                    u"omelette", u"."],
                     u"label": u"contradiction"}
        instance3 = {u"premise": [u"A", u"person", u"on", u"a", u"horse", u"jumps", u"over", u"a", u"broken",
                                 u"down", u"airplane", u"."],
                     u"hypothesis": [u"A", u"person", u"is", u"outdoors", u",", u"on", u"a", u"horse", u"."],
                     u"label": u"entailment"}

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text for t in fields[u"premise"].tokens] == instance1[u"premise"]
        assert [t.text for t in fields[u"hypothesis"].tokens] == instance1[u"hypothesis"]
        assert fields[u"label"].label == instance1[u"label"]
        fields = instances[1].fields
        assert [t.text for t in fields[u"premise"].tokens] == instance2[u"premise"]
        assert [t.text for t in fields[u"hypothesis"].tokens] == instance2[u"hypothesis"]
        assert fields[u"label"].label == instance2[u"label"]
        fields = instances[2].fields
        assert [t.text for t in fields[u"premise"].tokens] == instance3[u"premise"]
        assert [t.text for t in fields[u"hypothesis"].tokens] == instance3[u"hypothesis"]
        assert fields[u"label"].label == instance3[u"label"]
Example #8
    def test_read_from_file(self, lazy):
        reader = SnliReader(lazy=lazy)
        instances = reader.read('tests/fixtures/data/snli.jsonl')
        instances = ensure_list(instances)

        instance1 = {"premise": ["A", "person", "on", "a", "horse", "jumps", "over", "a", "broken",
                                 "down", "airplane", "."],
                     "hypothesis": ["A", "person", "is", "training", "his", "horse", "for", "a",
                                    "competition", "."],
                     "label": "neutral"}

        instance2 = {"premise": ["A", "person", "on", "a", "horse", "jumps", "over", "a", "broken",
                                 "down", "airplane", "."],
                     "hypothesis": ["A", "person", "is", "at", "a", "diner", ",", "ordering", "an",
                                    "omelette", "."],
                     "label": "contradiction"}
        instance3 = {"premise": ["A", "person", "on", "a", "horse", "jumps", "over", "a", "broken",
                                 "down", "airplane", "."],
                     "hypothesis": ["A", "person", "is", "outdoors", ",", "on", "a", "horse", "."],
                     "label": "entailment"}

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text for t in fields["premise"].tokens] == instance1["premise"]
        assert [t.text for t in fields["hypothesis"].tokens] == instance1["hypothesis"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        assert [t.text for t in fields["premise"].tokens] == instance2["premise"]
        assert [t.text for t in fields["hypothesis"].tokens] == instance2["hypothesis"]
        assert fields["label"].label == instance2["label"]
        fields = instances[2].fields
        assert [t.text for t in fields["premise"].tokens] == instance3["premise"]
        assert [t.text for t in fields["hypothesis"].tokens] == instance3["hypothesis"]
        assert fields["label"].label == instance3["label"]
Example #9
    def test_caching_works_with_lazy_reading(self):
        snli_file = self.FIXTURES_ROOT / "data" / "snli.jsonl"
        snli_copy_file = str(snli_file) + ".copy"
        shutil.copyfile(snli_file, snli_copy_file)
        reader = SnliReader(lazy=True)
        reader.cache_data(self.cache_directory)
        cache_file = reader._get_cache_location_for_file_path(snli_copy_file)

        # The call to read() will give us an _iterator_.  We'll iterate over it multiple times,
        # and the caching behavior should change as we go.
        instances = reader.read(snli_copy_file)
        assert isinstance(instances, _LazyInstances)

        # The first iteration will create the cache
        assert not os.path.exists(cache_file)
        first_pass_instances = []
        for instance in instances:
            first_pass_instances.append(instance)
        assert os.path.exists(cache_file)

        # Now we _remove_ the data file, to be sure we're reading from the cache.
        os.remove(snli_copy_file)
        second_pass_instances = []
        for instance in instances:
            second_pass_instances.append(instance)

        # We should get the same instances both times.
        assert len(first_pass_instances) == len(second_pass_instances)
        for instance, cached_instance in zip(first_pass_instances,
                                             second_pass_instances):
            assert instance.fields == cached_instance.fields

        # And just to be super paranoid, in case the second pass somehow bypassed the cache
        # because of a bug in `_CachedLazyInstance` that's hard to detect, we'll read the
        # instances from the cache with a non-lazy iterator and make sure they're the same.
        reader = SnliReader(lazy=False)
        reader.cache_data(self.cache_directory)
        cached_instances = reader.read(snli_copy_file)
        assert len(first_pass_instances) == len(cached_instances)
        for instance, cached_instance in zip(first_pass_instances,
                                             cached_instances):
            assert instance.fields == cached_instance.fields
Example #10
def main():
    args = parse_args()
    checkpoint_path = Path(args.checkpoint)
    checkpoint_dir = checkpoint_path.parent
    params_path = checkpoint_dir / 'params.json'
    vocab_dir = checkpoint_dir / 'vocab'

    params = Params.from_file(params_path)
    train_params, model_params = params.pop('train'), params.pop('model')

    tokenizer = WordTokenizer(
        start_tokens=['<s>'],
        end_tokens=['</s>'],
    )
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = SnliReader(tokenizer=tokenizer,
                                token_indexers={'tokens': token_indexer})

    valid_dataset = dataset_reader.read(train_params.pop('valid_dataset_path'))
    vocab = Vocabulary.from_files(vocab_dir)
    random.shuffle(valid_dataset)

    model_params['token_embedder']['pretrained_file'] = None
    model = SNLIModel(params=model_params, vocab=vocab)
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'),
                          strict=False)
    model.eval()

    iterator = BasicIterator(batch_size=1)
    iterator.index_with(vocab)
    generator = iterator(valid_dataset)

    for i in range(10):
        batch = next(generator)
        label_token_to_index = vocab.get_token_to_index_vocabulary('labels')
        print('----')
        print(' '.join(
            model.convert_to_readable_text(batch['premise']['tokens'])[0]))
        for label, label_index in label_token_to_index.items():
            label_tensor = torch.tensor([label_index])
            enc_embs = model.embed(batch['premise']['tokens'])
            enc_mask = get_text_field_mask(batch['premise'])
            enc_hidden = model.encode(inputs=enc_embs,
                                      mask=enc_mask,
                                      drop_start_token=True)
            code, kld = model.sample_code_and_compute_kld(enc_hidden)
            generated = model.generate(code=code,
                                       label=label_tensor,
                                       max_length=enc_mask.sum(1) * 2,
                                       beam_size=10,
                                       lp_alpha=args.lp_alpha)
            text = model.convert_to_readable_text(generated[:, 0])[0]
            print(label)
            print(' '.join(text))
Example #11
def main():
    args = parse_args()
    checkpoint_path = Path(args.checkpoint)
    checkpoint_dir = checkpoint_path.parent
    params_path = checkpoint_dir / 'params.json'
    vocab_dir = checkpoint_dir / 'vocab'

    params = Params.from_file(params_path)
    train_params, model_params = params.pop('train'), params.pop('model')

    tokenizer = WordTokenizer(start_tokens=['<s>'], end_tokens=['</s>'],)
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = SnliReader(
        tokenizer=tokenizer, token_indexers={'tokens': token_indexer})

    valid_dataset = dataset_reader.read(
        train_params.pop('valid_dataset_path'))
    vocab = Vocabulary.from_files(vocab_dir)

    model_params['token_embedder']['pretrained_file'] = None
    model = SNLIModel(params=model_params, vocab=vocab)
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'),
                          strict=False)
    model.to(args.device)
    model.eval()

    iterator = BasicIterator(batch_size=args.batch_size)
    iterator.index_with(vocab)
    generator = iterator(valid_dataset, num_epochs=1, shuffle=False)
    label_index_to_token = vocab.get_index_to_token_vocabulary('labels')

    out_file = open(args.out, 'w')

    for batch in tqdm(generator):
        premise_tokens = batch['premise']['tokens']
        enc_embs = model.embed(premise_tokens.to(args.device))
        enc_mask = get_text_field_mask(batch['premise']).to(args.device)
        enc_hidden = model.encode(inputs=enc_embs, mask=enc_mask,
                                  drop_start_token=True)
        code, kld = model.sample_code_and_compute_kld(enc_hidden)
        pre_text = model.convert_to_readable_text(premise_tokens[:, 1:])
        label_tensor = batch['label'].to(args.device)
        generated = model.generate(
            code=code, label=label_tensor, max_length=25,
            beam_size=10, lp_alpha=args.lp_alpha)
        text = model.convert_to_readable_text(generated[:, 0])
        for pre_text_b, text_b, label_index_b in zip(pre_text, text, label_tensor):
            obj = {'sentence1': ' '.join(pre_text_b), 'sentence2': ' '.join(text_b),
                   'gold_label': label_index_to_token[label_index_b.item()]}
            out_file.write(json.dumps(obj))
            out_file.write('\n')

    out_file.close()
Example #12
    def test_combine_input_fields(self):
        reader = SnliReader(
            tokenizer=PretrainedTransformerTokenizer("bert-base-uncased"),
            combine_input_fields=True)
        instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" /
                                "snli.jsonl")
        instances = ensure_list(instances)

        instance1 = {
            "tokens": [
                "[CLS]",
                "a",
                "person",
                "on",
                "a",
                "horse",
                "jumps",
                "over",
                "a",
                "broken",
                "down",
                "airplane",
                ".",
                "[SEP]",
                "a",
                "person",
                "is",
                "training",
                "his",
                "horse",
                "for",
                "a",
                "competition",
                ".",
                "[SEP]",
            ],
            "label":
            "neutral",
        }

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == instance1["tokens"]
        assert fields["label"].label == instance1["label"]
Example #13
    def test_read_only_creates_cache_file_once(self):
        snli_file = AllenNlpTestCase.FIXTURES_ROOT / "data" / "snli.jsonl"
        reader = SnliReader(cache_directory=self.cache_directory)
        cache_file = reader._get_cache_location_for_file_path(snli_file)

        # The first read will create the cache.
        reader.read(snli_file)
        assert os.path.exists(cache_file)
        with open(cache_file, "r") as in_file:
            cache_contents = in_file.read()
        # The second and all subsequent reads should _use_ the cache, not modify it.  I looked
        # into checking file modification times, but this test will probably be faster than the
        # granularity of `os.path.getmtime()` (which only returns values in seconds).
        # (A standalone sketch of this caching setup follows the test.)
        reader.read(snli_file)
        reader.read(snli_file)
        reader.read(snli_file)
        reader.read(snli_file)
        with open(cache_file, "r") as in_file:
            final_cache_contents = in_file.read()
        assert cache_contents == final_cache_contents
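
The comments in the test above describe how the reader's instance cache behaves: the first read writes a cache file and every later read reuses it unchanged. Below is a minimal standalone sketch of that usage, relying only on the cache_directory constructor argument shown in these tests; the cache path and data file are placeholder assumptions.

# Sketch only (not from the original examples): enabling the instance cache outside a test.
# The cache directory and data path below are placeholder assumptions.
from allennlp.data.dataset_readers import SnliReader

reader = SnliReader(cache_directory="cache/snli")
instances = reader.read("snli_1.0/snli_1.0_dev.jsonl")         # first read writes the cache file
cached_instances = reader.read("snli_1.0/snli_1.0_dev.jsonl")  # later reads load from the cache
assert len(instances) == len(cached_instances)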
Example #14
serialization_dir = 'checkpoints/'

device = 5
torch.cuda.set_device(device)

batch_size = 64
hid_dim = 100
embed_dim = 300

lr = 1e-3
grad_clipping = 5
dropout = 0.1
# lazy=False loads the whole dataset into memory at once; lazy=True loads instances on the fly
# during training (see the standalone sketch after this snippet).
print('data loading, please wait...')
reader = SnliReader(lazy=True)
train_dataset = reader.read("snli_1.0/snli_1.0_train.jsonl")
dev_dataset = reader.read("snli_1.0/snli_1.0_dev.jsonl")
test_dataset = reader.read("snli_1.0/snli_1.0_test.jsonl")

if os.path.exists(vocab_dir):
    vocab = Vocabulary.from_files(vocab_dir)
else:
    vocab = Vocabulary.from_instances(chain(train_dataset, dev_dataset),
                                      max_vocab_size=max_vocab_size)
    vocab.save_to_files(vocab_dir)

print("vocab_size: {}".format(vocab.get_vocab_size()))

train_iterator = BucketIterator(batch_size=batch_size,
                                sorting_keys=[("premise", "num_tokens")])
dev_iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("premise", "num_tokens")])
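
The lazy flag noted in the comment above changes what read() returns: with lazy=False the reader returns a list of instances held in memory, while lazy=True returns an iterable that re-reads the file each time it is consumed. A minimal sketch of the difference, assuming the same snli_1.0 file layout as the snippet above:

# Sketch only: contrasting eager and lazy reads with SnliReader.
# File paths are the same assumptions as in the snippet above.
from allennlp.data.dataset_readers import SnliReader

eager_reader = SnliReader(lazy=False)
lazy_reader = SnliReader(lazy=True)

eager_data = eager_reader.read("snli_1.0/snli_1.0_dev.jsonl")   # a list of Instance objects
lazy_data = lazy_reader.read("snli_1.0/snli_1.0_dev.jsonl")     # a lazily evaluated iterable

print(len(eager_data))              # the list supports len() directly
print(sum(1 for _ in lazy_data))    # the lazy iterable must be consumed to count instances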
Example #15
def main():
    args = parse_args()
    params = Params.from_file(args.params)
    save_dir = Path(args.save)
    save_dir.mkdir(parents=True)

    params.to_file(save_dir / 'params.json')

    train_params, model_params = params.pop('train'), params.pop('model')

    random_seed = train_params.pop_int('random_seed', 2019)
    torch.manual_seed(random_seed)
    random.seed(random_seed)

    log_filename = save_dir / 'stdout.log'
    sys.stdout = TeeLogger(filename=log_filename,
                           terminal=sys.stdout,
                           file_friendly_terminal_output=False)
    sys.stderr = TeeLogger(filename=log_filename,
                           terminal=sys.stderr,
                           file_friendly_terminal_output=False)

    tokenizer = WordTokenizer(start_tokens=['<s>'], end_tokens=['</s>'])
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = SnliReader(tokenizer=tokenizer,
                                token_indexers={'tokens': token_indexer})

    train_labeled_dataset_path = train_params.pop('train_labeled_dataset_path')
    train_unlabeled_dataset_path = train_params.pop(
        'train_unlabeled_dataset_path', None)
    train_labeled_dataset = dataset_reader.read(train_labeled_dataset_path)
    train_labeled_dataset = filter_dataset_by_length(
        dataset=train_labeled_dataset, max_length=30)
    if train_unlabeled_dataset_path is not None:
        train_unlabeled_dataset = dataset_reader.read(
            train_unlabeled_dataset_path)
        train_unlabeled_dataset = filter_dataset_by_length(
            dataset=train_unlabeled_dataset, max_length=30)
    else:
        train_unlabeled_dataset = []

    valid_dataset = dataset_reader.read(train_params.pop('valid_dataset_path'))

    vocab = Vocabulary.from_instances(
        instances=train_labeled_dataset + train_unlabeled_dataset,
        max_vocab_size=train_params.pop_int('max_vocab_size', None))
    vocab.save_to_files(save_dir / 'vocab')

    labeled_batch_size = train_params.pop_int('labeled_batch_size')
    unlabeled_batch_size = train_params.pop_int('unlabeled_batch_size')
    labeled_iterator = BasicIterator(batch_size=labeled_batch_size)
    unlabeled_iterator = BasicIterator(batch_size=unlabeled_batch_size)
    labeled_iterator.index_with(vocab)
    unlabeled_iterator.index_with(vocab)

    if not train_unlabeled_dataset:
        unlabeled_iterator = None

    pretrained_checkpoint_path = train_params.pop('pretrained_checkpoint_path',
                                                  None)
    model = SNLIModel(params=model_params, vocab=vocab)
    if pretrained_checkpoint_path:
        model.load_state_dict(
            torch.load(pretrained_checkpoint_path, map_location='cpu'))
    model.add_finetune_parameters(
        con_autoweight=train_params.pop_bool('con_autoweight', False),
        con_y_weight=train_params.pop_float('con_y_weight'),
        con_z_weight=train_params.pop_float('con_z_weight'),
        con_z2_weight=train_params.pop_float('con_z2_weight'))

    main_optimizer = optim.Adam(params=model.finetune_main_parameters(
        exclude_generator=train_params.pop_bool('exclude_generator')),
                                lr=train_params.pop_float('lr', 1e-3))
    aux_optimizer = optim.Adam(params=model.finetune_aux_parameters(),
                               lr=train_params.pop_float('aux_lr', 1e-4))

    summary_writer = SummaryWriter(log_dir=save_dir / 'log')

    kl_anneal_rate = train_params.pop_float('kl_anneal_rate', None)
    if kl_anneal_rate is None:
        kl_weight_scheduler = None
    else:
        kl_weight_scheduler = (lambda step: min(1.0, kl_anneal_rate * step))
        model.kl_weight = 0.0

    gumbel_anneal_rate = train_params.pop_float('gumbel_anneal_rate', None)
    if gumbel_anneal_rate is None:
        gumbel_temperature_scheduler = None
    else:
        gumbel_temperature_scheduler = (
            lambda step: max(0.1, 1.0 - gumbel_anneal_rate * step))
        model.gumbel_temperature = 1.0
    iters_per_epoch = train_params.pop_int(
        'iters_per_epoch',
        len(train_labeled_dataset) // labeled_batch_size)

    trainer = FineTuningTrainer(
        model=model,
        main_optimizer=main_optimizer,
        aux_optimizer=aux_optimizer,
        labeled_iterator=labeled_iterator,
        unlabeled_iterator=unlabeled_iterator,
        train_labeled_dataset=train_labeled_dataset,
        train_unlabeled_dataset=train_unlabeled_dataset,
        validation_dataset=valid_dataset,
        summary_writer=summary_writer,
        serialization_dir=save_dir,
        num_epochs=train_params.pop_int('num_epochs', 50),
        iters_per_epoch=iters_per_epoch,
        write_summary_every=100,
        validate_every=1000,
        patience=train_params.pop_int('patience', 5),
        clip_grad_max_norm=train_params.pop_float('grad_max_norm', 5.0),
        kl_weight_scheduler=kl_weight_scheduler,
        gumbel_temperature_scheduler=gumbel_temperature_scheduler,
        cuda_device=train_params.pop_int('cuda_device', 0),
    )
    trainer.train()
Example #16
from allennlp.data import Vocabulary
from allennlp.data.dataset_readers import SnliReader
from allennlp.data.iterators.basic_iterator import BasicIterator

from allennlp.modules.token_embedders.embedding import Embedding

from allennlp.training.util import evaluate

from model import BowmanEtAlRNN, BowmanEtAlSumOfWords, NLIPredictor, RocktaschelEtAlConditionalEncoding, RocktaschelEtAlAttention, ChenEtAlESIM

from utils import grad_zero, comb_to_str, re_read_embeddings_from_text_file

t = SnliReader()
### You can choose train/val/test datasets here
train_dataset = t.read('.data/snli_1.0/snli_1.0_train.jsonl')
val_dataset = t.read('.data/snli_1.0/snli_1.0_dev.jsonl')
test_dataset = t.read('.data/snli_1.0/snli_1.0_test.jsonl')

vocab = Vocabulary.from_instances(train_dataset + val_dataset)

# Loading a saved vocabulary here replaces the one just built from the datasets above.
vocab = Vocabulary.from_files('./.vocab/snli_vocab')

glove = Embedding(vocab.get_vocab_size(), 300)

### Choose and load model here
model = RocktaschelEtAlAttention(vocab, glove, word_by_word=False).to("cuda")
with open(
        './.serialization_data/C.E. Attention_Adam_32_0.1_0.0003_5e-05_True/best.th',
        'rb') as f:
    model.load_state_dict(torch.load(f))
Example #17
    def test_read_from_file(self, lazy):
        reader = SnliReader(lazy=lazy)
        instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" /
                                "snli.jsonl")
        instances = ensure_list(instances)

        instance1 = {
            "premise": [
                "A",
                "person",
                "on",
                "a",
                "horse",
                "jumps",
                "over",
                "a",
                "broken",
                "down",
                "airplane",
                ".",
            ],
            "hypothesis": [
                "A",
                "person",
                "is",
                "training",
                "his",
                "horse",
                "for",
                "a",
                "competition",
                ".",
            ],
            "label":
            "neutral",
        }

        instance2 = {
            "premise": [
                "A",
                "person",
                "on",
                "a",
                "horse",
                "jumps",
                "over",
                "a",
                "broken",
                "down",
                "airplane",
                ".",
            ],
            "hypothesis": [
                "A",
                "person",
                "is",
                "at",
                "a",
                "diner",
                ",",
                "ordering",
                "an",
                "omelette",
                ".",
            ],
            "label":
            "contradiction",
        }
        instance3 = {
            "premise": [
                "A",
                "person",
                "on",
                "a",
                "horse",
                "jumps",
                "over",
                "a",
                "broken",
                "down",
                "airplane",
                ".",
            ],
            "hypothesis":
            ["A", "person", "is", "outdoors", ",", "on", "a", "horse", "."],
            "label":
            "entailment",
        }

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text
                for t in fields["premise"].tokens] == instance1["premise"]
        assert [t.text for t in fields["hypothesis"].tokens
                ] == instance1["hypothesis"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        assert [t.text
                for t in fields["premise"].tokens] == instance2["premise"]
        assert [t.text for t in fields["hypothesis"].tokens
                ] == instance2["hypothesis"]
        assert fields["label"].label == instance2["label"]
        fields = instances[2].fields
        assert [t.text
                for t in fields["premise"].tokens] == instance3["premise"]
        assert [t.text for t in fields["hypothesis"].tokens
                ] == instance3["hypothesis"]
        assert fields["label"].label == instance3["label"]
Example #18
    def test_max_instances(self, lazy):
        snli_file = AllenNlpTestCase.FIXTURES_ROOT / "data" / "snli.jsonl"
        reader = SnliReader(max_instances=2, lazy=lazy)
        instances = reader.read(snli_file)
        instance_count = sum(1 for _ in instances)
        assert instance_count == 2
Example #19
def main():
    args = parse_args()
    params = Params.from_file(args.params)
    save_dir = Path(args.save)
    save_dir.mkdir(parents=True)

    params.to_file(save_dir / 'params.json')

    train_params, model_params = params.pop('train'), params.pop('model')

    random_seed = train_params.pop_int('random_seed', 2019)
    torch.manual_seed(random_seed)
    random.seed(random_seed)

    log_filename = save_dir / 'stdout.log'
    sys.stdout = TeeLogger(filename=log_filename,
                           terminal=sys.stdout,
                           file_friendly_terminal_output=False)
    sys.stderr = TeeLogger(filename=log_filename,
                           terminal=sys.stderr,
                           file_friendly_terminal_output=False)

    tokenizer = WordTokenizer(
        start_tokens=['<s>'],
        end_tokens=['</s>'],
    )
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = SnliReader(tokenizer=tokenizer,
                                token_indexers={'tokens': token_indexer})

    train_labeled_dataset_path = train_params.pop('train_labeled_dataset_path')
    train_unlabeled_dataset_path = train_params.pop(
        'train_unlabeled_dataset_path', None)
    train_labeled_dataset = dataset_reader.read(train_labeled_dataset_path)
    train_labeled_dataset = filter_dataset_by_length(
        dataset=train_labeled_dataset, max_length=30)
    if train_unlabeled_dataset_path is not None:
        train_unlabeled_dataset = dataset_reader.read(
            train_unlabeled_dataset_path)
        train_unlabeled_dataset = filter_dataset_by_length(
            dataset=train_unlabeled_dataset, max_length=30)
    else:
        train_unlabeled_dataset = []

    valid_dataset = dataset_reader.read(train_params.pop('valid_dataset_path'))

    vocab = Vocabulary.from_instances(
        instances=train_labeled_dataset + train_unlabeled_dataset,
        max_vocab_size=train_params.pop_int('max_vocab_size', None))
    vocab.save_to_files(save_dir / 'vocab')

    labeled_batch_size = train_params.pop_int('labeled_batch_size')
    unlabeled_batch_size = train_params.pop_int('unlabeled_batch_size')
    labeled_iterator = BasicIterator(batch_size=labeled_batch_size)
    unlabeled_iterator = BasicIterator(batch_size=unlabeled_batch_size)
    labeled_iterator.index_with(vocab)
    unlabeled_iterator.index_with(vocab)

    if not train_unlabeled_dataset:
        unlabeled_iterator = None

    model = SNLIModel(params=model_params, vocab=vocab)
    optimizer = optim.Adam(params=model.parameters(),
                           lr=train_params.pop_float('lr', 1e-3))
    summary_writer = SummaryWriter(log_dir=save_dir / 'log')

    kl_anneal_rate = train_params.pop_float('kl_anneal_rate', None)
    if kl_anneal_rate is None:
        kl_weight_scheduler = None
    else:
        kl_weight_scheduler = (lambda step: min(1.0, kl_anneal_rate * step))
        model.kl_weight = 0.0

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      labeled_iterator=labeled_iterator,
                      unlabeled_iterator=unlabeled_iterator,
                      train_labeled_dataset=train_labeled_dataset,
                      train_unlabeled_dataset=train_unlabeled_dataset,
                      validation_dataset=valid_dataset,
                      summary_writer=summary_writer,
                      serialization_dir=save_dir,
                      num_epochs=train_params.pop('num_epochs', 50),
                      iters_per_epoch=len(train_labeled_dataset) //
                      labeled_batch_size,
                      write_summary_every=100,
                      validate_every=2000,
                      patience=2,
                      clip_grad_max_norm=5,
                      kl_weight_scheduler=kl_weight_scheduler,
                      cuda_device=train_params.pop_int('cuda_device', 0),
                      early_stop=train_params.pop_bool('early_stop', True))
    trainer.train()
Example #20
def getMnliBow(dataset, vocab, bow_type='groundBow'):
    premises, hypothesis, labs = mnliToList(dataset, vocab)

    premises = getBow(premises, vocab, bow_type)
    hypothesis = getBow(hypothesis, vocab, bow_type)

    labels = np.zeros((len(dataset), vocab.get_vocab_size(namespace='labels')))
    for i in range(len(dataset)):
        labels[i, labs[i]] = 1

    return (premises, hypothesis, labels)

reader = SnliReader()

# train_dataset = reader.read(cached_path('datasets/multinli_1.0/multinli_1.0_train.jsonl'))
train_dataset = reader.read('tests/fixtures/train1000.jsonl') # Fixture
validation_dataset = reader.read('tests/fixtures/val1000.jsonl') # Fixture
#validation_dataset = reader.read('datasets/multinli_1.0/multinli_1.0_dev_matched.jsonl')

# print(train_dataset)

vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
# vocab.print_statistics()

t_premises, t_hypothesis, t_labels = getMnliBow(train_dataset, vocab, 'freqBow')
v_premises, v_hypothesis, v_labels = getMnliBow(validation_dataset, vocab, 'freqBow')

# for i in range(3):
#     print(i)
#     print(t_premises[i])
#     print(t_hypothesis[i])
Example #21
def main():
    args = parse_args()
    params = Params.from_file(args.params)
    save_dir = Path(args.save)
    save_dir.mkdir(parents=True)

    params.to_file(save_dir / 'params.json')

    train_params, model_params = params.pop('train'), params.pop('model')

    random_seed = train_params.pop_int('random_seed', 2019)
    torch.manual_seed(random_seed)
    random.seed(random_seed)

    log_filename = save_dir / 'stdout.log'
    sys.stdout = TeeLogger(
        filename=log_filename, terminal=sys.stdout,
        file_friendly_terminal_output=False)
    sys.stderr = TeeLogger(
        filename=log_filename, terminal=sys.stderr,
        file_friendly_terminal_output=False)

    tokenizer = WordTokenizer()
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = SnliReader(
        tokenizer=tokenizer, token_indexers={'tokens': token_indexer})

    train_labeled_dataset_path = train_params.pop(
        'train_labeled_dataset_path')
    train_unlabeled_dataset_path = train_params.pop(
        'train_unlabeled_dataset_path', None)
    train_labeled_dataset = dataset_reader.read(train_labeled_dataset_path)
    truncate_or_pad_dataset(dataset=train_labeled_dataset, length=29)
    if train_unlabeled_dataset_path is not None:
        train_unlabeled_dataset = dataset_reader.read(
            train_unlabeled_dataset_path)
        truncate_or_pad_dataset(
            dataset=train_unlabeled_dataset, length=29)
    else:
        train_unlabeled_dataset = []

    valid_dataset = dataset_reader.read(
        train_params.pop('valid_dataset_path'))
    truncate_or_pad_dataset(valid_dataset, length=29)

    vocab = Vocabulary.from_instances(
        instances=train_labeled_dataset + train_unlabeled_dataset,
        max_vocab_size=train_params.pop_int('max_vocab_size', None))
    vocab.save_to_files(save_dir / 'vocab')

    labeled_batch_size = train_params.pop_int('labeled_batch_size')
    unlabeled_batch_size = train_params.pop_int('unlabeled_batch_size')
    labeled_iterator = BasicIterator(batch_size=labeled_batch_size)
    unlabeled_iterator = BasicIterator(batch_size=unlabeled_batch_size)
    labeled_iterator.index_with(vocab)
    unlabeled_iterator.index_with(vocab)

    if not train_unlabeled_dataset:
        unlabeled_iterator = None

    model = DeconvSNLIModel(params=model_params, vocab=vocab)
    optimizer = optim.Adam(params=model.parameters())
    summary_writer = SummaryWriter(log_dir=save_dir / 'log')

    trainer = SeparatedLVMTrainer(
        model=model,
        optimizer=optimizer,
        labeled_iterator=labeled_iterator,
        unlabeled_iterator=unlabeled_iterator,
        train_labeled_dataset=train_labeled_dataset,
        train_unlabeled_dataset=train_unlabeled_dataset,
        validation_dataset=valid_dataset,
        summary_writer=summary_writer,
        serialization_dir=save_dir,
        num_epochs=train_params.pop('num_epochs', 50),
        iters_per_epoch=len(train_labeled_dataset) // labeled_batch_size,
        write_summary_every=100,
        validate_every=2000,
        patience=2,
        clip_grad_max_norm=5,
        cuda_device=train_params.pop_int('cuda_device', 0)
    )
    trainer.train()
Example #22
import torch
from allennlp.data.dataset_readers import SnliReader
from allennlp.predictors.predictor import Predictor
from allennlp.data.tokenizers import CharacterTokenizer
from allennlp.data.token_indexers import ELMoTokenCharactersIndexer
torch.manual_seed(1)

DATA_PT = 'data/multinli_1.0/multinli_1.0_dev_matched.jsonl'
reader = SnliReader(CharacterTokenizer(),
                    {"elmo": ELMoTokenCharactersIndexer()})
data = reader.read(DATA_PT)

MODEL_PT = "models/decomposable-attention-elmo-2018.02.19.tar.gz"
predictor = Predictor.from_path(MODEL_PT)
result = predictor.predict_instance(data[0])
print(result)