def test_get_embedding_layer_uses_correct_embedding_dim(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace('word1')
    vocab.add_token_to_namespace('word2')
    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8'))
        embeddings_file.write("word2 0.1 0.4 -4.0\n".encode('utf-8'))
    embedding_weights = _read_pretrained_embeddings_file(embeddings_filename, 3, vocab)
    assert tuple(embedding_weights.size()) == (4, 3)  # 4 because of padding and OOV
    with pytest.raises(ConfigurationError):
        _read_pretrained_embeddings_file(embeddings_filename, 4, vocab)
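# A minimal sketch (not from the test above) showing that _read_pretrained_embeddings_file
# also reads a plain-text, whitespace-separated file of the form "<token> <dim_1> ... <dim_n>"
# and returns a FloatTensor of shape (vocab_size, embedding_dim); the file path is hypothetical.
from allennlp.data import Vocabulary
from allennlp.modules.token_embedders.embedding import _read_pretrained_embeddings_file

vocab = Vocabulary()
vocab.add_token_to_namespace('word1')
with open('/tmp/tiny_embeddings.txt', 'w') as embeddings_file:  # hypothetical path
    embeddings_file.write("word1 0.5 -0.25 1.0\n")
weights = _read_pretrained_embeddings_file('/tmp/tiny_embeddings.txt', 3, vocab)
assert tuple(weights.size()) == (3, 3)  # padding + OOV + word1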
def load_embedding(args, vocab):
    # Randomly initialize vectors
    if args.embedding_type == "None":
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=args.embedding_dim)
    # Load word2vec vectors
    elif args.embedding_type == "w2v":
        embedding_path = args.embedding_path
        save_weight_file = './{}_embedding_weight.pt'.format(args.dataset)
        if os.path.exists(save_weight_file):
            weight = torch.load(save_weight_file)
        else:
            weight = _read_pretrained_embeddings_file(embedding_path,
                                                      embedding_dim=args.embedding_dim,
                                                      vocab=vocab,
                                                      namespace="tokens")
            torch.save(weight, save_weight_file)
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=args.embedding_dim,
                                    weight=weight,
                                    trainable=True)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    return word_embeddings
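# A hypothetical driver for load_embedding above; the argument values are assumptions,
# and `vocab` is assumed to be the allennlp Vocabulary built from the training instances.
from types import SimpleNamespace

args = SimpleNamespace(embedding_type="w2v",
                       embedding_dim=300,
                       embedding_path="embeddings/word2vec.300d.txt",  # hypothetical path
                       dataset="sst")
word_embeddings = load_embedding(args, vocab)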
def from_params(cls, vocab: Vocabulary, params: Params) -> 'Embedding': # type: ignore """ Construct from parameters. """ # pylint: disable=arguments-differ num_embeddings = params.pop_int('num_embeddings', None) vocab_namespace = params.pop("vocab_namespace", "tokens") if num_embeddings is None: num_embeddings = vocab.get_vocab_size(vocab_namespace) embedding_dim = params.pop_int('embedding_dim') pretrained_file = params.pop("pretrained_file", None) padding_index = params.pop_int('padding_index', None) norm_type = params.pop_float('norm_type', 2.) keep_history = params.pop_int('keep_history', 0) if pretrained_file: weight = _read_pretrained_embeddings_file(pretrained_file, embedding_dim, vocab, vocab_namespace) else: weight = None params.assert_empty(cls.__name__) return cls(num_embeddings=num_embeddings, weight=weight, embedding_dim=embedding_dim, padding_index=padding_index, norm_type=norm_type, keep_history=keep_history)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'Embedding':
    num_embeddings = params.pop_int('num_embeddings', None)
    vocab_namespace = params.pop("vocab_namespace", "tokens")
    if num_embeddings is None:
        num_embeddings = vocab.get_vocab_size(vocab_namespace)
    embedding_dim = params.pop_int('embedding_dim')
    pretrained_file = params.pop("pretrained_file", None)
    projection_dim = params.pop_int("projection_dim", None)
    trainable = params.pop_bool("trainable", True)
    padding_index = params.pop_int('padding_index', None)
    max_norm = params.pop_float('max_norm', None)
    norm_type = params.pop_float('norm_type', 2.)
    scale_grad_by_freq = params.pop_bool('scale_grad_by_freq', False)
    sparse = params.pop_bool('sparse', False)
    dropout = params.pop_float('dropout', None)
    params.assert_empty(cls.__name__)
    weight = _read_pretrained_embeddings_file(pretrained_file, embedding_dim,
                                              vocab, vocab_namespace) if pretrained_file else None
    return cls(num_embeddings=num_embeddings,
               embedding_dim=embedding_dim,
               projection_dim=projection_dim,
               weight=weight,
               padding_index=padding_index,
               trainable=trainable,
               max_norm=max_norm,
               norm_type=norm_type,
               scale_grad_by_freq=scale_grad_by_freq,
               sparse=sparse,
               dropout=dropout)
def test_fine_tune_works_with_vocab_expansion_with_pretrained_file(self):
    params = Params.from_file(self.config_file)
    # snli2 has a new token (seahorse) in it
    params["train_data_path"] = str(self.FIXTURES_ROOT / "data" / "snli2.jsonl")
    # seahorse_embeddings.gz has only token embedding for 'seahorse'.
    embeddings_filename = str(self.FIXTURES_ROOT / "data" / "seahorse_embeddings.gz")
    extra_token_vector = _read_pretrained_embeddings_file(
        embeddings_filename, 300, Vocabulary({"tokens": {"seahorse": 1}})
    )[2, :]
    unavailable_embeddings_filename = "file-not-found"

    def check_embedding_extension(user_pretrained_file, saved_pretrained_file, use_pretrained):
        trained_model = load_archive(self.model_archive).model
        original_weight = trained_model._text_field_embedder.token_embedder_tokens.weight
        # Simulate the behavior of unavailable pretrained_file being stored as an attribute.
        trained_model._text_field_embedder.token_embedder_tokens._pretrained_file = (
            saved_pretrained_file
        )
        embedding_sources_mapping = {
            "_text_field_embedder.token_embedder_tokens": user_pretrained_file
        }
        shutil.rmtree(self.serialization_dir, ignore_errors=True)
        fine_tuned_model = train_model(
            params.duplicate(),
            self.serialization_dir,
            model=trained_model,
            extend_vocab=True,
            embedding_sources_mapping=embedding_sources_mapping,
        )
        extended_weight = fine_tuned_model._text_field_embedder.token_embedder_tokens.weight
        assert original_weight.shape[0] + 1 == extended_weight.shape[0] == 25
        assert torch.all(original_weight == extended_weight[:24, :])
        if use_pretrained:
            assert torch.all(extended_weight[24, :] == extra_token_vector)
        else:
            assert torch.all(extended_weight[24, :] != extra_token_vector)

    # TEST 1: Passing correct embedding_sources_mapping should work when pretrained_file attribute
    # wasn't stored. (Model archive was generated without behaviour of storing pretrained_file)
    check_embedding_extension(embeddings_filename, None, True)
    # TEST 2: Passing correct embedding_sources_mapping should work when pretrained_file
    # attribute was stored and user's choice should take precedence.
    check_embedding_extension(embeddings_filename, unavailable_embeddings_filename, True)
    # TEST 3: Passing no embedding_sources_mapping should work, if available pretrained_file
    # attribute was stored.
    check_embedding_extension(None, embeddings_filename, True)
    # TEST 4: Passing incorrect pretrained-file by mapping should raise error.
    with pytest.raises(ConfigurationError):
        check_embedding_extension(unavailable_embeddings_filename, embeddings_filename, True)
    # TEST 5: If none is available, it should NOT raise error. Pretrained file could
    # possibly not have been used in first place.
    check_embedding_extension(None, unavailable_embeddings_filename, False)
def build_v2w(vocab):
    print(f"vocab {NAMESPACE} size:", vocab.get_vocab_size(namespace=NAMESPACE))
    weights = _read_pretrained_embeddings_file(EMBEDDING_PATH, EMBEDDING_DIM, vocab,
                                               namespace=NAMESPACE)
    print('weights.shape:', weights.shape)
    v2w: EmbeddingToWord = EmbeddingToWord(embedding_size=EMBEDDING_DIM,
                                           words_count=vocab.get_vocab_size(NAMESPACE))
    v2w.init_from_embeddings(weights)
    torch.save(v2w.state_dict(), OUT_MODEL_PATH)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'Embedding':  # type: ignore
    # pylint: disable=arguments-differ
    num_embeddings = params.pop_int('num_embeddings', None)
    # If num_embeddings is present, set default namespace to None so that extend_vocab
    # call doesn't misinterpret that some namespace was originally used.
    vocab_namespace = params.pop("vocab_namespace", None if num_embeddings else "tokens")
    if num_embeddings is None:
        num_embeddings = vocab.get_vocab_size(vocab_namespace)
    embedding_dim = params.pop_int('embedding_dim')
    pretrained_file = params.pop("pretrained_file", None)
    projection_dim = params.pop_int("projection_dim", None)
    trainable = params.pop_bool("trainable", True)
    padding_index = params.pop_int('padding_index', None)
    max_norm = params.pop_float('max_norm', None)
    norm_type = params.pop_float('norm_type', 2.)
    scale_grad_by_freq = params.pop_bool('scale_grad_by_freq', False)
    sparse = params.pop_bool('sparse', False)
    scale = params.pop_bool('scale', False)
    params.assert_empty(cls.__name__)
    if pretrained_file:
        # If we're loading a saved model, we don't want to actually read a pre-trained
        # embedding file - the embeddings will just be in our saved weights, and we might not
        # have the original embedding file anymore, anyway.
        weight = _read_pretrained_embeddings_file(pretrained_file,
                                                  embedding_dim,
                                                  vocab,
                                                  vocab_namespace)
    else:
        weight = None
    return cls(num_embeddings=num_embeddings,
               embedding_dim=embedding_dim,
               projection_dim=projection_dim,
               weight=weight,
               padding_index=padding_index,
               trainable=trainable,
               max_norm=max_norm,
               norm_type=norm_type,
               scale_grad_by_freq=scale_grad_by_freq,
               sparse=sparse,
               scale=scale,
               vocab_namespace=vocab_namespace)
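# A minimal sketch of driving the from_params variants above through a Params dict; the config
# values and GloVe path are assumptions. A "pretrained_file" entry is what triggers the
# _read_pretrained_embeddings_file call, and the assumed vocabulary would normally be built
# from the dataset rather than left empty.
from allennlp.common.params import Params
from allennlp.data import Vocabulary
from allennlp.modules.token_embedders import Embedding

vocab = Vocabulary()  # assumed to be populated from the dataset in practice
params = Params({"embedding_dim": 300,
                 "pretrained_file": "glove.840B.300d.txt",  # hypothetical path
                 "trainable": False})
token_embedding = Embedding.from_params(vocab=vocab, params=params)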
def load_w2v(weights_file, vocab, namespace='tokens', device=None) -> Embedding:
    cache_file = weights_file + '.cache.hd5'
    if os.path.exists(cache_file):
        weights = _read_embeddings_from_hdf5(cache_file,
                                             embedding_dim=SETTINGS.EMBEDDINGS_SIZE,
                                             vocab=vocab,
                                             namespace=namespace)
    else:
        weights = _read_pretrained_embeddings_file(weights_file,
                                                   SETTINGS.EMBEDDINGS_SIZE,
                                                   vocab,
                                                   namespace=namespace)
        with h5py.File(cache_file, 'w') as f:
            f.create_dataset("embedding", data=weights.numpy())
    if device is not None:
        weights = weights.cuda(device)
    logger.info(f"W2V size: {weights.shape}")
    token_embedding = ThriftyEmbedding(trainable=False,
                                       weights_file=weights_file,
                                       num_embeddings=vocab.get_vocab_size(namespace),
                                       weight=weights,
                                       embedding_dim=SETTINGS.EMBEDDINGS_SIZE)
    return token_embedding
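# Hypothetical usage of load_w2v above; the weights-file path and GPU id are assumptions, and
# `vocab` is assumed to be an allennlp Vocabulary built from the dataset. The HDF5 cache file
# '<weights_file>.cache.hd5' is created on the first call and reused on later calls.
token_embedding = load_w2v('embeddings/word2vec.300d.txt', vocab, namespace='tokens', device=0)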
def main():
    # load the binary SST dataset.
    single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True)  # word tokenizer
    # use_subtrees gives us a bit of extra data by breaking down each example into sub sentences.
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer},
                                                    use_subtrees=True)
    train_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt')
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer})
    dev_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt')
    # test_dataset = reader.read('data/sst/test.txt')

    vocab = Vocabulary.from_instances(train_data)

    # Randomly initialize vectors
    if EMBEDDING_TYPE == "None":
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=300)
        word_embedding_dim = 300
    # Load word2vec vectors
    elif EMBEDDING_TYPE == "w2v":
        embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
        weight = _read_pretrained_embeddings_file(embedding_path,
                                                  embedding_dim=300,
                                                  vocab=vocab,
                                                  namespace="tokens")
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=300,
                                    weight=weight,
                                    trainable=False)
        word_embedding_dim = 300

    # Initialize model, cuda(), and optimizer
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(word_embedding_dim,
                                                  hidden_size=512,
                                                  num_layers=2,
                                                  batch_first=True))
    model = LstmClassifier(word_embeddings, encoder, vocab)
    model.cuda()

    # where to save the model
    model_path = "/tmp/" + EMBEDDING_TYPE + "_" + "model.th"
    vocab_path = "/tmp/" + EMBEDDING_TYPE + "_" + "vocab"
    # if the model already exists (it has been trained), load the pre-trained weights and vocabulary
    if os.path.isfile(model_path):
        vocab = Vocabulary.from_files(vocab_path)
        model = LstmClassifier(word_embeddings, encoder, vocab)
        with open(model_path, 'rb') as f:
            model.load_state_dict(torch.load(f))
    # otherwise train model from scratch and save its weights
    else:
        iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
        iterator.index_with(vocab)
        optimizer = optim.Adam(model.parameters())
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          iterator=iterator,
                          train_dataset=train_data,
                          validation_dataset=dev_data,
                          num_epochs=5,
                          patience=1,
                          cuda_device=0)
        trainer.train()
        with open(model_path, 'wb') as f:
            torch.save(model.state_dict(), f)
        vocab.save_to_files(vocab_path)

    model.train().cuda()  # rnn cannot do backwards in eval mode

    # Register a gradient hook on the embeddings. This saves the gradient w.r.t. the word
    # embeddings. We use the gradient later in the attack.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(model)  # also save the word embedding matrix

    # Use batches of size universal_perturb_batch_size for the attacks.
    universal_perturb_batch_size = 128
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # Build k-d Tree if you are using gradient + nearest neighbor attack
    # tree = KDTree(embedding_weight.numpy())

    # filter the dataset to only positive or negative examples
    # (the trigger will cause the opposite prediction)
    dataset_label_filter = "0"
    targeted_dev_data = []
    for instance in dev_data:
        if instance['label'].label == dataset_label_filter:
            targeted_dev_data.append(instance)

    # get accuracy before adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None)
    model.train()  # rnn cannot do backwards in eval mode

    # initialize triggers which are concatenated to the input
    num_trigger_tokens = 3
    trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens

    # sample batches, update the triggers, and repeat
    for batch in lazy_groups_of(iterator(targeted_dev_data, num_epochs=5, shuffle=True),
                                group_size=1):
        # get accuracy with current triggers
        utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
        model.train()  # rnn cannot do backwards in eval mode

        # get gradient w.r.t. trigger embeddings for current batch
        averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids)
        # pass the gradients to a particular attack to generate token candidates for each token.
        cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                        embedding_weight,
                                                        trigger_token_ids,
                                                        num_candidates=40,
                                                        increase_loss=True)
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        num_candidates=40,
        #                                                        increase_loss=True)

        # Tries all of the candidates and returns the trigger sequence with highest loss.
        trigger_token_ids = utils.get_best_candidates(model,
                                                      batch,
                                                      trigger_token_ids,
                                                      cand_trigger_token_ids)
    # print accuracy after adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
train_dataset = reader.read(cached_path(rest_train_fp))
validation_dataset = reader.read(cached_path(rest_dev_fp))

target = train_dataset[0].fields['target']
text = train_dataset[0].fields['text']
label = train_dataset[0].fields['label']

vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

WORD_EMBEDDING_DIM = 50
CHAR_EMBEDDING_DIM = 5
CHAR_WORD_DIM = 30
HIDDEN_DIM = 50

# Model
glove_fp = cached_path('/home/andrew/glove.6B/glove.6B.50d.txt')
glove_50_weights = _read_pretrained_embeddings_file(glove_fp, 50, vocab, 'tokens_id')
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens_id'),
                            embedding_dim=WORD_EMBEDDING_DIM,
                            weight=glove_50_weights)

id_to_tokens = vocab.get_index_to_token_vocabulary(namespace='tokens_id')
token_names = list(id_to_tokens.values())

word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
text_lstm = PytorchSeq2VecWrapper(torch.nn.LSTM(WORD_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
target_lstm = PytorchSeq2VecWrapper(torch.nn.LSTM(WORD_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
feed_forward = torch.nn.Linear(HIDDEN_DIM * 2, out_features=vocab.get_vocab_size('labels'))
def predict(cuda_device: int,
            char_encoder: str,
            data_dir: Path,
            glove_path: Path,
            temp_dir: Path,
            random_seed: int = 13370,
            numpy_seed: int = 1337,
            torch_seed: int = 133) -> List[Tuple[float, float, str]]:
    '''
    This allows you to train an NER model that has either a CNN or LSTM character
    encoder, based on the `char_encoder` argument. The encoded characters are then
    combined with 100D GloVe vectors and put through a Bi-Directional LSTM.

    This is based on the following two papers:

    1. CNN character encoder version `Ma and Hovy \
       <https://arxiv.org/abs/1603.01354>`_
    2. LSTM character encoder version `Lample et al. \
       <https://arxiv.org/abs/1603.01360>`_

    :param cuda_device: Whether to use GPU or CPU, CPU = -1, GPU = 0
    :param char_encoder: Whether to use an LSTM or CNN. Acceptable values are:
                         1. lstm, 2. cnn
    :param data_dir: A file path to a directory that contains three files:
                     1. train.txt, 2. dev.txt, 3. test.txt, which are the train, dev,
                     and test files respectively, in CONLL 2003 format with the NER
                     labels in BIO format.
    :param glove_path: A file path to the `GloVe 6 billion word vectors 100D \
                       <https://nlp.stanford.edu/projects/glove/>`_
    :returns: The results as a list of tuples of (dev f1 score, test f1 score,
              char encoder), where each entry represents a different trained model
              using the same train, dev, and test split but a different random seed.
    '''
    # The dataset we are using has already been formatted from IOB1 to BIO.
    # When reading the dataset, state that the coding is the original one; this will not
    # affect the labels, i.e. the labels and schema are not checked.
    label_encoding = 'BIO'
    constrain_crf_decoding = True
    dropout = 0.5
    char_embedding_dim = 30
    cnn_window_size = (3,)
    cnn_filters = 50
    cnn_output_dim = len(cnn_window_size) * cnn_filters
    lstm_char_dim = 25
    lstm_char_output_dim = lstm_char_dim * 2
    word_embedding_dim = 100
    # LSTM size is that of Ma and Hovy
    lstm_dim = 100

    # Dropout applies dropout after the encoded text and after the word embedding.
    #tensorboard_dir = Path('..', 'tensorboard ner')
    #tensorboard_dir.mkdir(parents=True, exist_ok=True)
    #train_log = SummaryWriter(Path(tensorboard_dir, "log", "train"))
    #validation_log = SummaryWriter(Path(tensorboard_dir, "log", "validation"))

    train_fp = Path(data_dir, 'train.txt')
    dev_fp = Path(data_dir, 'dev.txt')
    test_fp = Path(data_dir, 'test.txt')
    result_fp = Path(data_dir, 'results.json')
    result_data = []
    if result_fp.exists():
        with result_fp.open('r') as json_file:
            result_data = json.load(json_file)

    indexers = {'tokens': SingleIdTokenIndexer(namespace='tokens', lowercase_tokens=True),
                'chars': TokenCharactersIndexer(namespace='token_characters')}

    conll_reader = Conll2003DatasetReader(token_indexers=indexers)
    train_dataset = conll_reader.read(cached_path(train_fp))
    dev_dataset = conll_reader.read(cached_path(dev_fp))
    test_dataset = conll_reader.read(cached_path(test_fp))

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset + test_dataset)

    char_embedding = Embedding(num_embeddings=vocab.get_vocab_size("token_characters"),
                               embedding_dim=char_embedding_dim)

    if char_encoder.strip().lower() == 'lstm':
        character_lstm = torch.nn.LSTM(char_embedding_dim, lstm_char_dim,
                                       batch_first=True, bidirectional=True)
        character_lstm_wrapper = PytorchSeq2VecWrapper(character_lstm)
        token_character_encoder = TokenCharactersEncoder(embedding=char_embedding,
                                                         encoder=character_lstm_wrapper)
        total_char_embedding_dim = lstm_char_output_dim
    elif char_encoder.strip().lower() == 'cnn':
        character_cnn = CnnEncoder(embedding_dim=char_embedding_dim,
                                   num_filters=cnn_filters,
                                   ngram_filter_sizes=cnn_window_size,
                                   output_dim=cnn_output_dim)
        token_character_encoder = TokenCharactersEncoder(embedding=char_embedding,
                                                         encoder=character_cnn)
        total_char_embedding_dim = cnn_output_dim
    else:
        raise ValueError('The Character encoder can only be `lstm` or `cnn` '
                         f'and not {char_encoder}')

    glove_path = cached_path(glove_path)
    glove_100_weights = _read_pretrained_embeddings_file(glove_path, word_embedding_dim,
                                                         vocab, 'tokens')
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=word_embedding_dim,
                                weight=glove_100_weights)

    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding,
                                              "chars": token_character_encoder})

    total_embedding_dim = word_embedding_dim + total_char_embedding_dim
    lstm = torch.nn.LSTM(total_embedding_dim, lstm_dim, batch_first=True, bidirectional=True)
    lstm_wrapper = PytorchSeq2SeqWrapper(lstm)

    model = CrfTagger(vocab, word_embeddings, lstm_wrapper,
                      label_encoding=label_encoding,
                      dropout=dropout,
                      constrain_crf_decoding=constrain_crf_decoding)

    optimizer = optim.SGD(model.parameters(), lr=0.015, weight_decay=1e-8)
    schedule = LearningRateWithoutMetricsWrapper(
        torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9524))

    iterator = BucketIterator(batch_size=64, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    temp_dir_fp = str(temp_dir.resolve())
    temp_folder_path = tempfile.mkdtemp(dir=temp_dir_fp)

    set_random_env(cuda_device, random_seed, numpy_seed, torch_seed)
    trainer = Trainer(model=model,
                      grad_clipping=5.0,
                      learning_rate_scheduler=schedule,
                      serialization_dir=temp_folder_path,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      shuffle=True,
                      cuda_device=cuda_device,
                      patience=5,
                      num_epochs=1000)
    #trainer._tensorboard = TensorboardWriter(train_log=train_log,
    #                                         validation_log=validation_log)
    interesting_metrics = trainer.train()
    best_model_weights = Path(temp_folder_path, 'best.th')
    best_model_state = torch.load(best_model_weights)
    model.load_state_dict(best_model_state)
    test_result = evaluate(model, test_dataset, iterator, cuda_device)
    dev_result = evaluate(model, dev_dataset, iterator, cuda_device)
    test_f1 = test_result['f1-measure-overall']
    dev_f1 = dev_result['f1-measure-overall']
    result_data.append((dev_f1, test_f1, char_encoder))

    with result_fp.open('w+') as json_file:
        json.dump(result_data, json_file)
    print(f'{interesting_metrics}')
    return result_data
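# Hypothetical invocation of predict() above; the data/GloVe paths and the encoder choice are
# assumptions rather than values from the original script.
from pathlib import Path

results = predict(cuda_device=0,
                  char_encoder='cnn',
                  data_dir=Path('data/conll2003'),
                  glove_path=Path('glove/glove.6B.100d.txt'),
                  temp_dir=Path('/tmp'))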
def get_model(pretrained_file: str, WORD_EMB_DIM: int, vocab: Vocabulary, num_tags: int):
    """
    This creates a new model and returns it along with some other variables.

    :param pretrained_file:
    :param WORD_EMB_DIM:
    :param vocab:
    :param num_tags:
    :return:
    """
    CNN_EMB_DIM = 128
    CHAR_EMB_DIM = 16

    weight = _read_pretrained_embeddings_file(pretrained_file, WORD_EMB_DIM, vocab, "tokens")
    token_embedding = Embedding(num_embeddings=weight.shape[0],
                                embedding_dim=weight.shape[1],
                                weight=weight,
                                vocab_namespace="tokens")
    char_embedding = Embedding(num_embeddings=vocab.get_vocab_size("token_characters"),
                               embedding_dim=CHAR_EMB_DIM,
                               vocab_namespace="token_characters")
    char_encoder = CnnEncoder(embedding_dim=CHAR_EMB_DIM,
                              num_filters=CNN_EMB_DIM,
                              ngram_filter_sizes=[3],
                              conv_layer_activation=Activation.by_name("relu")())
    token_characters_embedding = TokenCharactersEncoder(embedding=char_embedding,
                                                        encoder=char_encoder)

    if USING_BERT:
        print("USING BERT EMBEDDINGS")
        bert_emb = PretrainedBertEmbedder("bert-base-multilingual-cased")
        tfe = BasicTextFieldEmbedder({"bert": bert_emb,
                                      "token_characters": token_characters_embedding},
                                     embedder_to_indexer_map={
                                         "bert": ["bert", "bert-offsets"],
                                         "token_characters": ["token_characters"]},
                                     allow_unmatched_keys=True)
        EMBEDDING_DIM = CNN_EMB_DIM + 768
    else:
        EMBEDDING_DIM = CNN_EMB_DIM + WORD_EMB_DIM
        tfe = BasicTextFieldEmbedder({"tokens": token_embedding,
                                      "token_characters": token_characters_embedding})

    HIDDEN_DIM = 256
    encoder = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM,
                                                  batch_first=True,
                                                  bidirectional=True,
                                                  dropout=0.5,
                                                  num_layers=2))

    model = MarginalCrfTagger(vocab, tfe, encoder, num_tags,
                              include_start_end_transitions=False,
                              calculate_span_f1=True,
                              dropout=0.5,
                              label_encoding="BIOUL",
                              constrain_crf_decoding=True)

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    if torch.cuda.is_available():
        print("Using GPU")
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    return model, optimizer, cuda_device
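# Hypothetical call of get_model above; the GloVe path and embedding dimension are assumptions,
# and num_tags is taken from an assumed "labels" namespace of the vocabulary.
model, optimizer, cuda_device = get_model("glove/glove.6B.100d.txt", 100, vocab,
                                          num_tags=vocab.get_vocab_size("labels"))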
def main(file, embeddings, model, emb_wt_key, namespace, output_dir):
    archive = load_archive(model)
    config = archive.config
    os.makedirs(output_dir, exist_ok=True)
    config.to_file(os.path.join(output_dir, CONFIG_NAME))
    model = archive.model

    # first expand the vocabulary
    dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    instances = dataset_reader.read(file)
    vocab = model.vocab

    # get all the tokens in the new file
    namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
    for instance in Tqdm.tqdm(instances):
        instance.count_vocab_items(namespace_token_counts)
    old_token_size = vocab.get_vocab_size(namespace)
    print("Before expansion: Number of instances in {} namespace: {}".format(namespace,
                                                                             old_token_size))
    if namespace not in namespace_token_counts:
        logger.error("No tokens found for namespace: {} in the new input file".format(namespace))

    # identify the new tokens in the new instances
    token_to_add = set()
    token_hits = 0
    for token, count in namespace_token_counts[namespace].items():
        if token not in vocab._token_to_index[namespace]:
            # new token, must add
            token_to_add.add(token)
        else:
            token_hits += 1
    print("Found {} existing tokens and {} new tokens in {}".format(token_hits,
                                                                    len(token_to_add), file))

    # add the new tokens to the vocab
    for token in token_to_add:
        vocab.add_token_to_namespace(token=token, namespace=namespace)
    archived_parameters = dict(model.named_parameters())

    # second, expand the embedding matrix
    for name, weights in archived_parameters.items():
        # find the weight matrix for the embeddings
        if name == emb_wt_key:
            if weights.dim() != 2:
                logger.error("Expected an embedding matrix for the parameter: {} instead "
                             "found {} tensor".format(emb_wt_key, weights.shape))
            emb_dim = weights.shape[-1]
            print("Before expansion: Size of emb matrix: {}".format(weights.shape))
            # Loading embeddings for old and new tokens since that is cleaner than copying all
            # the embedding loading logic here
            all_embeddings = _read_pretrained_embeddings_file(embeddings, emb_dim,
                                                              vocab, namespace)
            # concatenate the new entries, i.e. the last len(token_to_add) embeddings,
            # to the original weights
            if len(token_to_add) > 0:
                weights.data = torch.cat([weights.data, all_embeddings[-len(token_to_add):, :]])
            print("After expansion: Size of emb matrix: {}".format(weights.shape))

    # save the files needed by the model archiver
    model_path = os.path.join(output_dir, "weight.th")
    model_state = model.state_dict()
    torch.save(model_state, model_path)
    vocab.save_to_files(os.path.join(output_dir, "vocabulary"))
    archive_model(output_dir, weights="weight.th")

    # more debug messages
    new_token_size = vocab.get_vocab_size(namespace)
    for name, weights in archived_parameters.items():
        if name == emb_wt_key:
            print("Size of emb matrix: {}".format(weights.shape))
    print("After expansion: Number of instances in {} namespace: {}".format(namespace,
                                                                            new_token_size))
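# Hypothetical command-line wiring for main() above; the argument names and defaults are
# assumptions, not part of the original script.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Expand a trained model's vocabulary and "
                                                 "embedding matrix using a pretrained file.")
    parser.add_argument("--file", help="new data file used to expand the vocabulary")
    parser.add_argument("--embeddings", help="pretrained embeddings file")
    parser.add_argument("--model", help="path to the trained model archive")
    parser.add_argument("--emb-wt-key", help="parameter name of the embedding weight matrix")
    parser.add_argument("--namespace", default="tokens")
    parser.add_argument("--output-dir", default="expanded_model")
    args = parser.parse_args()
    main(args.file, args.embeddings, args.model, args.emb_wt_key, args.namespace, args.output_dir)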
def main():
    target_namespace = "target_tokens"
    if not USE_COPY:
        reader = Seq2SeqDatasetReader(
            source_tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()),
            target_tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()),
            source_token_indexers={'tokens': SingleIdTokenIndexer()},
            target_token_indexers={'tokens': SingleIdTokenIndexer(namespace=target_namespace)})
    else:
        reader = CopyNetDatasetReader(
            source_tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()),
            target_tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()),
            target_namespace=target_namespace)
    train_dataset = reader.read('./data/data_train.tsv')
    validation_dataset = reader.read('./data/data_val.tsv')

    vocab = Vocabulary.from_instances(train_dataset,
                                      min_count={'tokens': 3, 'target_tokens': 3})

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=SRC_EMBEDDING_DIM,
                             pretrained_file="../opennmt/glove_dir/glove.840B.300d.txt")
    assert en_embedding.weight.requires_grad
    datas = _read_pretrained_embeddings_file(en_embedding._pretrained_file,
                                             SRC_EMBEDDING_DIM, vocab)
    datas.requires_grad = True
    en_embedding.weight.data = datas
    print(en_embedding.weight.data)
    assert en_embedding.weight.requires_grad

    encoder = PytorchSeq2SeqWrapper(torch.nn.LSTM(SRC_EMBEDDING_DIM, HIDDEN_DIM,
                                                  batch_first=True,
                                                  bidirectional=True,
                                                  dropout=0.3,
                                                  num_layers=1))
    #encoder = StackedSelfAttentionEncoder(input_dim=SRC_EMBEDDING_DIM,
    #                                      hidden_dim=HIDDEN_DIM,
    #                                      projection_dim=128,
    #                                      feedforward_hidden_dim=128,
    #                                      num_layers=1,
    #                                      num_attention_heads=8)

    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})
    attention = DotProductAttention()

    if not USE_COPY:
        model = SimpleSeq2Seq(vocab, source_embedder, encoder, MAX_DECODING_STEPS,
                              target_embedding_dim=TGT_EMBEDDING_DIM,
                              target_namespace='target_tokens',
                              attention=attention,
                              beam_size=8,
                              use_bleu=True)
    else:
        model = MyCopyNet(vocab, source_embedder, encoder,
                          max_decoding_steps=MAX_DECODING_STEPS,
                          target_embedding_dim=TGT_EMBEDDING_DIM,
                          target_namespace=target_namespace,
                          attention=attention,
                          beam_size=8,
                          tgt_embedder_pretrain_file="../opennmt/glove_dir/glove.840B.300d.txt")
    model.to(torch.device('cuda'))

    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=64,
                              sorting_keys=[("source_tokens", "num_tokens")],
                              padding_noise=0.2)
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=22,
                      patience=4,
                      serialization_dir="./checkpoints",
                      cuda_device=CUDA_DEVICE,
                      summary_interval=100)
    trainer.train()
    print(en_embedding.weight.data)

    predictor = Seq2SeqPredictor(model, reader)
    # Dump all predictions to a file
    # TODO (DNGros): Is there an automatic way in allennlp to do this??
    pred_toks = []
    with open("pred.txt", "w") as outfile:
        for instance in tqdm(validation_dataset):
            pred = predictor.predict_instance(instance)
            toks = pred['predicted_tokens']
            if toks:
                outfile.write(" ".join(toks[0]) + "\n")
            else:
                outfile.write("" + "\n")