Example #1
 def test_get_embedding_layer_uses_correct_embedding_dim(self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace('word1')
     vocab.add_token_to_namespace('word2')
     embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
     with gzip.open(embeddings_filename, 'wb') as embeddings_file:
         embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8'))
         embeddings_file.write("word2 0.1 0.4 -4.0\n".encode('utf-8'))
     embedding_weights = _read_pretrained_embeddings_file(embeddings_filename, 3, vocab)
     assert tuple(embedding_weights.size()) == (4, 3)  # 4 because of padding and OOV
     with pytest.raises(ConfigurationError):
         _read_pretrained_embeddings_file(embeddings_filename, 4, vocab)
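The (4, 3) shape follows because the returned matrix is sized to the whole `tokens` namespace, which always contains the padding and OOV entries on top of the two added words. A minimal standalone sketch of the same call, assuming the AllenNLP 0.x import path and an uncompressed text file (the reader accepts both):

import pathlib
import tempfile

from allennlp.data import Vocabulary
from allennlp.modules.token_embedders.embedding import _read_pretrained_embeddings_file

vocab = Vocabulary()
vocab.add_token_to_namespace('word1')
vocab.add_token_to_namespace('word2')

# Plain "token v1 v2 ..." lines; gzip is only needed when the file is compressed.
embeddings_path = pathlib.Path(tempfile.mkdtemp()) / "embeddings.txt"
embeddings_path.write_text("word1 1.0 2.3 -1.0\nword2 0.1 0.4 -4.0\n")

weights = _read_pretrained_embeddings_file(str(embeddings_path), 3, vocab)
print(tuple(weights.size()))  # (4, 3): rows 0 and 1 are padding and OOV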
Example #2
 def test_get_embedding_layer_uses_correct_embedding_dim(self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace(u'word1')
     vocab.add_token_to_namespace(u'word2')
     embeddings_filename = unicode(self.TEST_DIR / u"embeddings.gz")
     with gzip.open(embeddings_filename, u'wb') as embeddings_file:
         embeddings_file.write(u"word1 1.0 2.3 -1.0\n".encode(u'utf-8'))
         embeddings_file.write(u"word2 0.1 0.4 -4.0\n".encode(u'utf-8'))
     embedding_weights = _read_pretrained_embeddings_file(embeddings_filename, 3, vocab)
     assert tuple(embedding_weights.size()) == (4, 3)  # 4 because of padding and OOV
     with pytest.raises(ConfigurationError):
         _read_pretrained_embeddings_file(embeddings_filename, 4, vocab)
Example #3
def load_embedding(args, vocab):
    # Randomly initialize vectors
    if args.embedding_type == "None":
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=args.embedding_dim)

    # Load word2vec vectors
    elif args.embedding_type == "w2v":
        embedding_path = args.embedding_path
        save_weight_file = './{}_embedding_weight.pt'.format(args.dataset)
        if os.path.exists(save_weight_file):
            weight = torch.load(save_weight_file)
        else:
            weight = _read_pretrained_embeddings_file(
                embedding_path,
                embedding_dim=args.embedding_dim,
                vocab=vocab,
                namespace="tokens")
            torch.save(weight, save_weight_file)

        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=args.embedding_dim,
            weight=weight,
            trainable=True)
    else:
        # Guard so that token_embedding is always defined before it is used below.
        raise ValueError("Unknown embedding_type: {}".format(args.embedding_type))
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    return word_embeddings
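A hedged usage sketch for `load_embedding`: the `args` attributes mirror exactly the ones the function reads, the embeddings path is a placeholder, and `vocab` is assumed to be an AllenNLP `Vocabulary` built from the training data.

from types import SimpleNamespace

# Hypothetical arguments; embedding_path points to any word2vec/GloVe-style
# text file that _read_pretrained_embeddings_file can parse.
args = SimpleNamespace(
    embedding_type="w2v",                # "None" for random initialization
    embedding_dim=300,
    embedding_path="crawl-300d-2M.vec",  # placeholder path
    dataset="sst",                       # used only to name the cached weight file
)
word_embeddings = load_embedding(args, vocab)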
Example #4
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'Embedding':  # type: ignore
        """ Construct from parameters.
        """
        # pylint: disable=arguments-differ
        num_embeddings = params.pop_int('num_embeddings', None)
        vocab_namespace = params.pop("vocab_namespace", "tokens")
        if num_embeddings is None:
            num_embeddings = vocab.get_vocab_size(vocab_namespace)
        embedding_dim = params.pop_int('embedding_dim')
        pretrained_file = params.pop("pretrained_file", None)
        padding_index = params.pop_int('padding_index', None)
        norm_type = params.pop_float('norm_type', 2.)
        keep_history = params.pop_int('keep_history', 0)

        if pretrained_file:
            weight = _read_pretrained_embeddings_file(pretrained_file,
                                                      embedding_dim,
                                                      vocab,
                                                      vocab_namespace)
        else:
            weight = None

        params.assert_empty(cls.__name__)

        return cls(num_embeddings=num_embeddings, weight=weight, embedding_dim=embedding_dim,
                   padding_index=padding_index,
                   norm_type=norm_type, keep_history=keep_history)
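Since `keep_history` is popped here, this `from_params` belongs to a project-specific `Embedding` subclass rather than stock AllenNLP, but the configuration it consumes would look roughly like the sketch below (the GloVe path is a placeholder and `vocab` is assumed to be an existing `Vocabulary`).

from allennlp.common import Params

# Hypothetical configuration; every key matches one of the pop() calls above,
# so params.assert_empty() passes.
embedding_params = Params({
    "vocab_namespace": "tokens",
    "embedding_dim": 100,
    "pretrained_file": "glove.6B.100d.txt",  # placeholder path
    "padding_index": 0,
    "norm_type": 2.0,
    "keep_history": 0,
})
embedding = Embedding.from_params(vocab, embedding_params)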
Example #5
 def from_params(cls, vocab: Vocabulary, params: Params) -> 'Embedding':
     num_embeddings = params.pop_int('num_embeddings', None)
     vocab_namespace = params.pop("vocab_namespace", "tokens")
     if num_embeddings is None:
         num_embeddings = vocab.get_vocab_size(vocab_namespace)
     embedding_dim = params.pop_int('embedding_dim')
     pretrained_file = params.pop("pretrained_file", None)
     projection_dim = params.pop_int("projection_dim", None)
     trainable = params.pop_bool("trainable", True)
     padding_index = params.pop_int('padding_index', None)
     max_norm = params.pop_float('max_norm', None)
     norm_type = params.pop_float('norm_type', 2.)
     scale_grad_by_freq = params.pop_bool('scale_grad_by_freq', False)
     sparse = params.pop_bool('sparse', False)
     dropout = params.pop_float('dropout', None)
     params.assert_empty(cls.__name__)
     weight = _read_pretrained_embeddings_file(
         pretrained_file, embedding_dim, vocab,
         vocab_namespace) if pretrained_file else None
     return cls(num_embeddings=num_embeddings,
                embedding_dim=embedding_dim,
                projection_dim=projection_dim,
                weight=weight,
                padding_index=padding_index,
                trainable=trainable,
                max_norm=max_norm,
                norm_type=norm_type,
                scale_grad_by_freq=scale_grad_by_freq,
                sparse=sparse,
                dropout=dropout)
Example #6
    def test_fine_tune_works_with_vocab_expansion_with_pretrained_file(self):
        params = Params.from_file(self.config_file)
        # snli2 has a new token (seahorse) in it
        params["train_data_path"] = str(self.FIXTURES_ROOT / "data" / "snli2.jsonl")

        # seahorse_embeddings.gz contains only the token embedding for 'seahorse'.
        embeddings_filename = str(self.FIXTURES_ROOT / "data" / "seahorse_embeddings.gz")
        extra_token_vector = _read_pretrained_embeddings_file(
            embeddings_filename, 300, Vocabulary({"tokens": {"seahorse": 1}})
        )[2, :]
        unavailable_embeddings_filename = "file-not-found"

        def check_embedding_extension(user_pretrained_file, saved_pretrained_file, use_pretrained):
            trained_model = load_archive(self.model_archive).model
            original_weight = trained_model._text_field_embedder.token_embedder_tokens.weight
            # Simulate the behavior of unavailable pretrained_file being stored as an attribute.
            trained_model._text_field_embedder.token_embedder_tokens._pretrained_file = (
                saved_pretrained_file
            )
            embedding_sources_mapping = {
                "_text_field_embedder.token_embedder_tokens": user_pretrained_file
            }
            shutil.rmtree(self.serialization_dir, ignore_errors=True)
            fine_tuned_model = train_model(
                params.duplicate(),
                self.serialization_dir,
                model=trained_model,
                extend_vocab=True,
                embedding_sources_mapping=embedding_sources_mapping,
            )
            extended_weight = fine_tuned_model._text_field_embedder.token_embedder_tokens.weight
            assert original_weight.shape[0] + 1 == extended_weight.shape[0] == 25
            assert torch.all(original_weight == extended_weight[:24, :])
            if use_pretrained:
                assert torch.all(extended_weight[24, :] == extra_token_vector)
            else:
                assert torch.all(extended_weight[24, :] != extra_token_vector)

        # TEST 1: Passing a correct embedding_sources_mapping should work when the pretrained_file
        #         attribute wasn't stored. (The model archive was generated before pretrained_file
        #         was stored as an attribute.)
        check_embedding_extension(embeddings_filename, None, True)

        # TEST 2: Passing correct embedding_sources_mapping should work when pretrained_file
        #         attribute was stored and user's choice should take precedence.
        check_embedding_extension(embeddings_filename, unavailable_embeddings_filename, True)

        # TEST 3: Passing no embedding_sources_mapping should work, if available pretrained_file
        #         attribute was stored.
        check_embedding_extension(None, embeddings_filename, True)

        # TEST 4: Passing incorrect pretrained-file by mapping should raise error.
        with pytest.raises(ConfigurationError):
            check_embedding_extension(unavailable_embeddings_filename, embeddings_filename, True)

        # TEST 5: If neither is available, it should NOT raise an error; the pretrained file
        #         might not have been used in the first place.
        check_embedding_extension(None, unavailable_embeddings_filename, False)
Example #7
def build_v2w(vocab):
    print(f"vocab {NAMESPACE} size:",
          vocab.get_vocab_size(namespace=NAMESPACE))

    weights = _read_pretrained_embeddings_file(EMBEDDING_PATH,
                                               EMBEDDING_DIM,
                                               vocab,
                                               namespace=NAMESPACE)

    print('weights.shape:', weights.shape)

    v2w: EmbeddingToWord = EmbeddingToWord(
        embedding_size=EMBEDDING_DIM,
        words_count=vocab.get_vocab_size(NAMESPACE))

    v2w.init_from_embeddings(weights)

    torch.save(v2w.state_dict(), OUT_MODEL_PATH)
Example #8
    def from_params(cls, vocab: Vocabulary,
                    params: Params) -> 'Embedding':  # type: ignore
        # pylint: disable=arguments-differ
        num_embeddings = params.pop_int('num_embeddings', None)
        # If num_embeddings is present, set default namespace to None so that extend_vocab
        # call doesn't misinterpret that some namespace was originally used.
        vocab_namespace = params.pop("vocab_namespace",
                                     None if num_embeddings else "tokens")
        if num_embeddings is None:
            num_embeddings = vocab.get_vocab_size(vocab_namespace)
        embedding_dim = params.pop_int('embedding_dim')
        pretrained_file = params.pop("pretrained_file", None)
        projection_dim = params.pop_int("projection_dim", None)
        trainable = params.pop_bool("trainable", True)
        padding_index = params.pop_int('padding_index', None)
        max_norm = params.pop_float('max_norm', None)
        norm_type = params.pop_float('norm_type', 2.)
        scale_grad_by_freq = params.pop_bool('scale_grad_by_freq', False)
        sparse = params.pop_bool('sparse', False)
        scale = params.pop_bool('scale', False)
        params.assert_empty(cls.__name__)

        if pretrained_file:
            # If we're loading a saved model, we don't want to actually read a pre-trained
            # embedding file - the embeddings will just be in our saved weights, and we might not
            # have the original embedding file anymore, anyway.
            weight = _read_pretrained_embeddings_file(pretrained_file,
                                                      embedding_dim, vocab,
                                                      vocab_namespace)
        else:
            weight = None

        return cls(num_embeddings=num_embeddings,
                   embedding_dim=embedding_dim,
                   projection_dim=projection_dim,
                   weight=weight,
                   padding_index=padding_index,
                   trainable=trainable,
                   max_norm=max_norm,
                   norm_type=norm_type,
                   scale_grad_by_freq=scale_grad_by_freq,
                   sparse=sparse,
                   scale=scale,
                   vocab_namespace=vocab_namespace)
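The namespace default here is subtle: when `num_embeddings` is given explicitly, `vocab_namespace` falls back to None so a later `extend_vocab` call does not assume the matrix was sized from a namespace. A hedged sketch of the two configurations, again assuming this project's `Embedding` subclass and an existing `vocab`:

from allennlp.common import Params

# Sized from the vocabulary: vocab_namespace defaults to "tokens".
emb_from_vocab = Embedding.from_params(
    vocab, Params({"embedding_dim": 50}))

# Sized explicitly: vocab_namespace stays None, so extend_vocab will not
# try to resize this matrix against a namespace it never used.
emb_explicit = Embedding.from_params(
    vocab, Params({"embedding_dim": 50, "num_embeddings": 10000}))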
Example #9
def load_w2v(
        weights_file,
        vocab,
        namespace='tokens',
        device=None
) -> Embedding:
    cache_file = weights_file + '.cache.hd5'

    if os.path.exists(cache_file):
        weights = _read_embeddings_from_hdf5(cache_file,
                                             embedding_dim=SETTINGS.EMBEDDINGS_SIZE,
                                             vocab=vocab,
                                             namespace=namespace)
    else:
        weights = _read_pretrained_embeddings_file(
            weights_file,
            SETTINGS.EMBEDDINGS_SIZE,
            vocab,
            namespace=namespace
        )

        with h5py.File(cache_file, 'w') as f:
            f.create_dataset("embedding", data=weights.numpy())

    if device is not None:
        weights = weights.cuda(device)

    logger.info(f"W2V size: {weights.shape}")

    token_embedding = ThriftyEmbedding(
        trainable=False,
        weights_file=weights_file,
        num_embeddings=vocab.get_vocab_size(namespace),
        weight=weights,
        embedding_dim=SETTINGS.EMBEDDINGS_SIZE
    )

    return token_embedding
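The cache written above is a plain HDF5 file with a single `embedding` dataset, which is the layout `_read_embeddings_from_hdf5` expects. A minimal round-trip sketch with h5py (file name and sizes are placeholders):

import h5py
import torch

weights = torch.randn(100, 300)  # stand-in for the matrix returned by the reader

with h5py.File("w2v.vec.cache.hd5", "w") as f:
    f.create_dataset("embedding", data=weights.numpy())

with h5py.File("w2v.vec.cache.hd5", "r") as f:
    cached = torch.from_numpy(f["embedding"][()])

assert torch.equal(weights, cached)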
Example #10
def main():
    # load the binary SST dataset.
    single_id_indexer = SingleIdTokenIndexer(
        lowercase_tokens=True)  # word tokenizer
    # use_subtrees gives us a bit of extra data by breaking down each example into sub sentences.
    reader = StanfordSentimentTreeBankDatasetReader(
        granularity="2-class",
        token_indexers={"tokens": single_id_indexer},
        use_subtrees=True)
    train_data = reader.read(
        'https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt')
    reader = StanfordSentimentTreeBankDatasetReader(
        granularity="2-class", token_indexers={"tokens": single_id_indexer})
    dev_data = reader.read(
        'https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt')
    # test_dataset = reader.read('data/sst/test.txt')

    vocab = Vocabulary.from_instances(train_data)

    # Randomly initialize vectors
    if EMBEDDING_TYPE == "None":
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=300)
        word_embedding_dim = 300

    # Load word2vec vectors
    elif EMBEDDING_TYPE == "w2v":
        embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
        weight = _read_pretrained_embeddings_file(embedding_path,
                                                  embedding_dim=300,
                                                  vocab=vocab,
                                                  namespace="tokens")
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=300,
            weight=weight,
            trainable=False)
        word_embedding_dim = 300
    else:
        # Guard against unsupported EMBEDDING_TYPE values.
        raise ValueError("Unknown EMBEDDING_TYPE: {}".format(EMBEDDING_TYPE))

    # Initialize model, cuda(), and optimizer
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(word_embedding_dim,
                      hidden_size=512,
                      num_layers=2,
                      batch_first=True))
    model = LstmClassifier(word_embeddings, encoder, vocab)
    model.cuda()

    # where to save the model
    model_path = "/tmp/" + EMBEDDING_TYPE + "_" + "model.th"
    vocab_path = "/tmp/" + EMBEDDING_TYPE + "_" + "vocab"
    # if the model already exists (it's been trained), load the pre-trained weights and vocabulary
    if os.path.isfile(model_path):
        vocab = Vocabulary.from_files(vocab_path)
        model = LstmClassifier(word_embeddings, encoder, vocab)
        with open(model_path, 'rb') as f:
            model.load_state_dict(torch.load(f))
    # otherwise train model from scratch and save its weights
    else:
        iterator = BucketIterator(batch_size=32,
                                  sorting_keys=[("tokens", "num_tokens")])
        iterator.index_with(vocab)
        optimizer = optim.Adam(model.parameters())
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          iterator=iterator,
                          train_dataset=train_data,
                          validation_dataset=dev_data,
                          num_epochs=5,
                          patience=1,
                          cuda_device=0)
        trainer.train()
        with open(model_path, 'wb') as f:
            torch.save(model.state_dict(), f)
        vocab.save_to_files(vocab_path)
    model.train().cuda()  # rnn cannot do backwards in eval mode

    # Register a gradient hook on the embeddings. This saves the gradient w.r.t. the word embeddings.
    # We use the gradient later in the attack.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(
        model)  # also save the word embedding matrix

    # Use batches of size universal_perturb_batch_size for the attacks.
    universal_perturb_batch_size = 128
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # Build k-d Tree if you are using gradient + nearest neighbor attack
    # tree = KDTree(embedding_weight.numpy())

    # filter the dataset to only positive or negative examples
    # (the trigger will cause the opposite prediction)
    dataset_label_filter = "0"
    targeted_dev_data = []
    for instance in dev_data:
        if instance['label'].label == dataset_label_filter:
            targeted_dev_data.append(instance)

    # get accuracy before adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None)
    model.train()  # rnn cannot do backwards in eval mode

    # initialize triggers which are concatenated to the input
    num_trigger_tokens = 3
    trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens

    # sample batches, update the triggers, and repeat
    for batch in lazy_groups_of(iterator(targeted_dev_data,
                                         num_epochs=5,
                                         shuffle=True),
                                group_size=1):
        # get accuracy with current triggers
        utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
        model.train()  # rnn cannot do backwards in eval mode

        # get gradient w.r.t. trigger embeddings for current batch
        averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids)

        # pass the gradients to a particular attack to generate token candidates for each token.
        cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                        embedding_weight,
                                                        trigger_token_ids,
                                                        num_candidates=40,
                                                        increase_loss=True)
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        num_candidates=40,
        #                                                        increase_loss=True)

        # Tries all of the candidates and returns the trigger sequence with highest loss.
        trigger_token_ids = utils.get_best_candidates(model, batch,
                                                      trigger_token_ids,
                                                      cand_trigger_token_ids)

    # print accuracy after adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
Example #11
train_dataset = reader.read(cached_path(rest_train_fp))
validation_dataset = reader.read(cached_path(rest_dev_fp))
target = train_dataset[0].fields['target']
text = train_dataset[0].fields['text']
label = train_dataset[0].fields['label']

vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
WORD_EMBEDDING_DIM = 50
CHAR_EMBEDDING_DIM = 5
CHAR_WORD_DIM = 30
HIDDEN_DIM = 50


# Model
glove_fp = cached_path('/home/andrew/glove.6B/glove.6B.50d.txt')
glove_50_weights = _read_pretrained_embeddings_file(glove_fp, 50, vocab, 'tokens_id')

token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens_id'),
                            embedding_dim=WORD_EMBEDDING_DIM,
                            weight=glove_50_weights)

id_to_tokens = vocab.get_index_to_token_vocabulary(namespace='tokens_id')
token_names = list(id_to_tokens.values())

word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

text_lstm = PytorchSeq2VecWrapper(torch.nn.LSTM(WORD_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
target_lstm = PytorchSeq2VecWrapper(torch.nn.LSTM(WORD_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
feed_forward = torch.nn.Linear(HIDDEN_DIM * 2,
                               out_features=vocab.get_vocab_size('labels'))
Example #12
File: ner.py  Project: apmoore1/NER
def predict(cuda_device: int,
            char_encoder: str,
            data_dir: Path,
            glove_path: Path,
            temp_dir: Path,
            random_seed: int = 13370,
            numpy_seed: int = 1337,
            torch_seed: int = 133) -> List[Tuple[float, float, str]]:
    '''
    This allows you to train an NER model that has either a CNN character 
    encoder or LSTM based on the `char_encoder` argument. The encoded 
    characters are then combined with 100D Glove vectors and put through 
    a Bi-Directional LSTM.

    This is based on the following two papers:
    
    1. CNN character encoder version `Ma and Hovy \
       <https://arxiv.org/abs/1603.01354>`_
    2. LSTM character encoder version `Lample et al. \
       <https://arxiv.org/abs/1603.01360>`_

    :param cuda_device: Whether to use GPU or CPU, CPU = -1, GPU = 0
    :param char_encoder: Whether to use an LSTM or CNN. Acceptable values are: 
                         1. lstm, 2. cnn
    :param data_dir: A file path to a directory that contains three files: 
                     1. train.txt, 2. dev.txt, 3. test.txt that are the 
                     train, dev, and test files respectively in CONLL 2003 
                     format where the NER labels are in BIO format.
    :param glove_path: A file path to the `Glove 6 billion word vectors 100D \
                       <https://nlp.stanford.edu/projects/glove/>`_
    :returns: The results as a list of tuples which are 
              (dev f1 score, test f1 score, char encoder) where the list 
              represents a different trained model using the same train, dev, 
              and test split but different random seed.
    '''
    #
    # The dataset we are using has already been converted from IOB1 to BIO.
    # When reading the dataset we keep the original coding scheme; this does not
    # affect the labels, i.e. the labels and schema are not checked.

    label_encoding = 'BIO'
    constrain_crf_decoding = True
    dropout = 0.5

    char_embedding_dim = 30
    cnn_window_size = (3, )
    cnn_filters = 50
    cnn_output_dim = len(cnn_window_size) * cnn_filters

    lstm_char_dim = 25
    lstm_char_output_dim = lstm_char_dim * 2

    word_embedding_dim = 100
    # LSTM size is that of Ma and Hovy
    lstm_dim = 100

    # Dropout is applied after the encoded text and after the word embeddings.

    #tensorboard_dir = Path('..', 'tensorboard ner')
    #tensorboard_dir.mkdir(parents=True, exist_ok=True)

    #train_log = SummaryWriter(Path(tensorboard_dir, "log", "train"))
    #validation_log = SummaryWriter(Path(tensorboard_dir, "log", "validation"))

    train_fp = Path(data_dir, 'train.txt')
    dev_fp = Path(data_dir, 'dev.txt')
    test_fp = Path(data_dir, 'test.txt')
    result_fp = Path(data_dir, 'results.json')
    result_data = []
    if result_fp.exists():
        with result_fp.open('r') as json_file:
            result_data = json.load(json_file)

    indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens',
                                       lowercase_tokens=True),
        'chars': TokenCharactersIndexer(namespace='token_characters')
    }

    conll_reader = Conll2003DatasetReader(token_indexers=indexers)
    train_dataset = conll_reader.read(cached_path(train_fp))
    dev_dataset = conll_reader.read(cached_path(dev_fp))
    test_dataset = conll_reader.read(cached_path(test_fp))

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset +
                                      test_dataset)

    char_embedding = Embedding(
        num_embeddings=vocab.get_vocab_size("token_characters"),
        embedding_dim=char_embedding_dim)

    if char_encoder.strip().lower() == 'lstm':
        character_lstm = torch.nn.LSTM(char_embedding_dim,
                                       lstm_char_dim,
                                       batch_first=True,
                                       bidirectional=True)
        character_lstm_wrapper = PytorchSeq2VecWrapper(character_lstm)
        token_character_encoder = TokenCharactersEncoder(
            embedding=char_embedding, encoder=character_lstm_wrapper)
        total_char_embedding_dim = lstm_char_output_dim
    elif char_encoder.strip().lower() == 'cnn':
        character_cnn = CnnEncoder(embedding_dim=char_embedding_dim,
                                   num_filters=cnn_filters,
                                   ngram_filter_sizes=cnn_window_size,
                                   output_dim=cnn_output_dim)
        token_character_encoder = TokenCharactersEncoder(
            embedding=char_embedding, encoder=character_cnn)
        total_char_embedding_dim = cnn_output_dim
    else:
        raise ValueError('The Character encoder can only be `lstm` or `cnn` '
                         f'and not {char_encoder}')

    glove_path = cached_path(glove_path)
    glove_100_weights = _read_pretrained_embeddings_file(
        glove_path, word_embedding_dim, vocab, 'tokens')
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=word_embedding_dim,
                                weight=glove_100_weights)

    word_embeddings = BasicTextFieldEmbedder({
        "tokens": token_embedding,
        "chars": token_character_encoder
    })

    total_embedding_dim = word_embedding_dim + total_char_embedding_dim
    lstm = torch.nn.LSTM(total_embedding_dim,
                         lstm_dim,
                         batch_first=True,
                         bidirectional=True)
    lstm_wrapper = PytorchSeq2SeqWrapper(lstm)

    model = CrfTagger(vocab,
                      word_embeddings,
                      lstm_wrapper,
                      label_encoding=label_encoding,
                      dropout=dropout,
                      constrain_crf_decoding=constrain_crf_decoding)

    optimizer = optim.SGD(model.parameters(), lr=0.015, weight_decay=1e-8)
    schedule = LearningRateWithoutMetricsWrapper(
        torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9524))
    iterator = BucketIterator(batch_size=64,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    temp_dir_fp = str(temp_dir.resolve())
    temp_folder_path = tempfile.mkdtemp(dir=temp_dir_fp)

    set_random_env(cuda_device, random_seed, numpy_seed, torch_seed)
    trainer = Trainer(model=model,
                      grad_clipping=5.0,
                      learning_rate_scheduler=schedule,
                      serialization_dir=temp_folder_path,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      shuffle=True,
                      cuda_device=cuda_device,
                      patience=5,
                      num_epochs=1000)

    #trainer._tensorboard = TensorboardWriter(train_log=train_log,
    #                                        validation_log=validation_log)
    interesting_metrics = trainer.train()
    best_model_weights = Path(temp_folder_path, 'best.th')
    best_model_state = torch.load(best_model_weights)
    model.load_state_dict(best_model_state)
    test_result = evaluate(model, test_dataset, iterator, cuda_device)
    dev_result = evaluate(model, dev_dataset, iterator, cuda_device)
    test_f1 = test_result['f1-measure-overall']
    dev_f1 = dev_result['f1-measure-overall']
    result_data.append((dev_f1, test_f1, char_encoder))

    with result_fp.open('w+') as json_file:
        json.dump(result_data, json_file)
    print(f'{interesting_metrics}')
    return result_data
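A hedged call sketch for `predict`: the paths are placeholders, and `data_dir` must contain train.txt, dev.txt and test.txt in CoNLL 2003 format as the docstring states.

from pathlib import Path

results = predict(cuda_device=-1,                        # CPU; 0 for the first GPU
                  char_encoder='cnn',                    # or 'lstm'
                  data_dir=Path('data/conll2003'),       # placeholder directory
                  glove_path=Path('glove.6B.100d.txt'),  # placeholder path
                  temp_dir=Path('/tmp'))
for dev_f1, test_f1, encoder in results:
    print(encoder, dev_f1, test_f1)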
Example #13
def get_model(pretrained_file: str, WORD_EMB_DIM: int, vocab: Vocabulary,
              num_tags: int):
    """
    This creates a new model and returns it along with some other variables.
    :param pretrained_file:
    :param WORD_EMB_DIM:
    :param vocab:
    :param num_tags:
    :return:
    """

    CNN_EMB_DIM = 128
    CHAR_EMB_DIM = 16

    weight = _read_pretrained_embeddings_file(pretrained_file, WORD_EMB_DIM,
                                              vocab, "tokens")
    token_embedding = Embedding(num_embeddings=weight.shape[0],
                                embedding_dim=weight.shape[1],
                                weight=weight,
                                vocab_namespace="tokens")
    char_embedding = Embedding(
        num_embeddings=vocab.get_vocab_size("token_characters"),
        embedding_dim=CHAR_EMB_DIM,
        vocab_namespace="token_characters")

    char_encoder = CnnEncoder(
        embedding_dim=CHAR_EMB_DIM,
        num_filters=CNN_EMB_DIM,
        ngram_filter_sizes=[3],
        conv_layer_activation=Activation.by_name("relu")())
    token_characters_embedding = TokenCharactersEncoder(
        embedding=char_embedding, encoder=char_encoder)

    if USING_BERT:
        print("USING BERT EMBEDDINGS")
        bert_emb = PretrainedBertEmbedder("bert-base-multilingual-cased")
        tfe = BasicTextFieldEmbedder(
            {
                "bert": bert_emb,
                "token_characters": token_characters_embedding
            },
            embedder_to_indexer_map={
                "bert": ["bert", "bert-offsets"],
                "token_characters": ["token_characters"]
            },
            allow_unmatched_keys=True)

        EMBEDDING_DIM = CNN_EMB_DIM + 768
    else:
        EMBEDDING_DIM = CNN_EMB_DIM + WORD_EMB_DIM
        tfe = BasicTextFieldEmbedder({
            "tokens":
            token_embedding,
            "token_characters":
            token_characters_embedding
        })

    HIDDEN_DIM = 256

    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_DIM,
                      HIDDEN_DIM,
                      batch_first=True,
                      bidirectional=True,
                      dropout=0.5,
                      num_layers=2))

    model = MarginalCrfTagger(vocab,
                              tfe,
                              encoder,
                              num_tags,
                              include_start_end_transitions=False,
                              calculate_span_f1=True,
                              dropout=0.5,
                              label_encoding="BIOUL",
                              constrain_crf_decoding=True)

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    if torch.cuda.is_available():
        print("Using GPU")
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    return model, optimizer, cuda_device
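A hedged usage sketch for `get_model`: the GloVe path is a placeholder, `vocab` is assumed to hold `tokens`, `token_characters` and a tag namespace, and `USING_BERT` is a module-level flag defined elsewhere in the project.

num_tags = vocab.get_vocab_size("labels")  # assumes tags live in a "labels" namespace
model, optimizer, cuda_device = get_model(
    pretrained_file="glove.6B.100d.txt",   # placeholder path
    WORD_EMB_DIM=100,
    vocab=vocab,
    num_tags=num_tags)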
Example #14
def main(file, embeddings, model, emb_wt_key, namespace, output_dir):
    archive = load_archive(model)
    config = archive.config
    os.makedirs(output_dir, exist_ok=True)
    config.to_file(os.path.join(output_dir, CONFIG_NAME))

    model = archive.model
    # first expand the vocabulary
    dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    instances = dataset_reader.read(file)
    vocab = model.vocab

    # get all the tokens in the new file
    namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(
        lambda: defaultdict(int))
    for instance in Tqdm.tqdm(instances):
        instance.count_vocab_items(namespace_token_counts)
    old_token_size = vocab.get_vocab_size(namespace)
    print("Before expansion: Number of instances in {} namespace: {}".format(
        namespace, old_token_size))
    if namespace not in namespace_token_counts:
        logger.error(
            "No tokens found for namespace: {} in the new input file".format(
                namespace))
    # identify the new tokens in the new instances
    token_to_add = set()
    token_hits = 0
    for token, count in namespace_token_counts[namespace].items():
        if token not in vocab._token_to_index[namespace]:
            # new token, must add
            token_to_add.add(token)
        else:
            token_hits += 1
    print("Found {} existing tokens and {} new tokens in {}".format(
        token_hits, len(token_to_add), file))

    # add the new tokens to the vocab
    for token in token_to_add:
        vocab.add_token_to_namespace(token=token, namespace=namespace)
    archived_parameters = dict(model.named_parameters())

    # second, expand the embedding matrix
    for name, weights in archived_parameters.items():
        # find the wt matrix for the embeddings
        if name == emb_wt_key:
            if weights.dim() != 2:
                logger.error(
                    "Expected an embedding matrix for the parameter: {} instead"
                    "found {} tensor".format(emb_wt_key, weights.shape))
            emb_dim = weights.shape[-1]
            print("Before expansion: Size of emb matrix: {}".format(
                weights.shape))
            # Loading embeddings for old and new tokens since that is cleaner than copying all
            # the embedding loading logic here
            all_embeddings = _read_pretrained_embeddings_file(
                embeddings, emb_dim, vocab, namespace)
            # concatenate the new entries i.e last token_to_add embeddings to the original weights
            if len(token_to_add) > 0:
                weights.data = torch.cat(
                    [weights.data, all_embeddings[-len(token_to_add):, :]])
            print("After expansion: Size of emb matrix: {}".format(
                weights.shape))

    # save the files needed by the model archiver
    model_path = os.path.join(output_dir, "weight.th")
    model_state = model.state_dict()
    torch.save(model_state, model_path)
    vocab.save_to_files(os.path.join(output_dir, "vocabulary"))
    archive_model(output_dir, weights="weight.th")

    # more debug messages
    new_token_size = vocab.get_vocab_size(namespace)
    for name, weights in archived_parameters.items():
        if name == emb_wt_key:
            print("Size of emb matrix: {}".format(weights.shape))
    print("After expansion: Number of instances in {} namespace: {}".format(
        namespace, new_token_size))
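A hedged invocation sketch for this vocabulary-expansion routine: every path is a placeholder, and `emb_wt_key` must name the embedding weight exactly as it appears in `model.named_parameters()`; the key shown below is only a common AllenNLP layout, not a guarantee.

main(file="new_domain.jsonl",             # placeholder data file for the new domain
     embeddings="glove.6B.100d.txt",      # placeholder pretrained embeddings
     model="model.tar.gz",                # placeholder model archive
     emb_wt_key="_text_field_embedder.token_embedder_tokens.weight",  # assumed key
     namespace="tokens",
     output_dir="expanded_model")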
Example #15
def main():
    target_namespace = "target_tokens"
    if not USE_COPY:
        reader = Seq2SeqDatasetReader(
            source_tokenizer=WordTokenizer(
                word_splitter=JustSpacesWordSplitter()),
            target_tokenizer=WordTokenizer(
                word_splitter=JustSpacesWordSplitter()),
            source_token_indexers={'tokens': SingleIdTokenIndexer()},
            target_token_indexers={
                'tokens': SingleIdTokenIndexer(namespace=target_namespace)
            })
    else:
        reader = CopyNetDatasetReader(
            source_tokenizer=WordTokenizer(
                word_splitter=JustSpacesWordSplitter()),
            target_tokenizer=WordTokenizer(
                word_splitter=JustSpacesWordSplitter()),
            target_namespace=target_namespace)
    train_dataset = reader.read('./data/data_train.tsv')
    validation_dataset = reader.read('./data/data_val.tsv')

    vocab = Vocabulary.from_instances(train_dataset,
                                      min_count={
                                          'tokens': 3,
                                          'target_tokens': 3
                                      })

    en_embedding = Embedding(
        num_embeddings=vocab.get_vocab_size('tokens'),
        embedding_dim=SRC_EMBEDDING_DIM,
        pretrained_file="../opennmt/glove_dir/glove.840B.300d.txt")
    assert en_embedding.weight.requires_grad
    datas = _read_pretrained_embeddings_file(en_embedding._pretrained_file,
                                             SRC_EMBEDDING_DIM, vocab)
    datas.requires_grad = True
    en_embedding.weight.data = datas
    print(en_embedding.weight.data)
    assert en_embedding.weight.requires_grad
    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(SRC_EMBEDDING_DIM,
                      HIDDEN_DIM,
                      batch_first=True,
                      bidirectional=True,
                      dropout=0.3,
                      num_layers=1))
    #encoder = StackedSelfAttentionEncoder(input_dim=SRC_EMBEDDING_DIM,
    #                                      hidden_dim=HIDDEN_DIM,
    #                                      projection_dim=128, feedforward_hidden_dim=128,
    #                                      num_layers=1, num_attention_heads=8)

    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})
    attention = DotProductAttention()

    if not USE_COPY:
        model = SimpleSeq2Seq(vocab,
                              source_embedder,
                              encoder,
                              MAX_DECODING_STEPS,
                              target_embedding_dim=TGT_EMBEDDING_DIM,
                              target_namespace='target_tokens',
                              attention=attention,
                              beam_size=8,
                              use_bleu=True)
    else:
        model = MyCopyNet(vocab,
                          source_embedder,
                          encoder,
                          max_decoding_steps=MAX_DECODING_STEPS,
                          target_embedding_dim=TGT_EMBEDDING_DIM,
                          target_namespace=target_namespace,
                          attention=attention,
                          beam_size=8,
                          tgt_embedder_pretrain_file=
                          "../opennmt/glove_dir/glove.840B.300d.txt")
    model.to(torch.device('cuda'))
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=64,
                              sorting_keys=[("source_tokens", "num_tokens")],
                              padding_noise=0.2)

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=22,
                      patience=4,
                      serialization_dir="./checkpoints",
                      cuda_device=CUDA_DEVICE,
                      summary_interval=100)
    trainer.train()
    print(en_embedding.weight.data)
    predictor = Seq2SeqPredictor(model, reader)

    # Dump all predictions to a file
    # TODO (DNGros): Is there an automatic way in allennlp to do this??
    pred_toks = []
    with open("pred.txt", "w") as outfile:
        for instance in tqdm(validation_dataset):
            pred = predictor.predict_instance(instance)
            toks = pred['predicted_tokens']
            if toks:
                outfile.write(" ".join(toks[0]) + "\n")
            else:
                outfile.write("" + "\n")