def setUp(self):
    super().setUp()
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("1", "token_characters")
    self.vocab.add_token_to_namespace("2", "token_characters")
    self.vocab.add_token_to_namespace("3", "token_characters")
    self.vocab.add_token_to_namespace("4", "token_characters")
    params = Params({
        "embedding": {
            "embedding_dim": 2,
            "vocab_namespace": "token_characters"
        },
        "encoder": {
            "type": "cnn",
            "embedding_dim": 2,
            "num_filters": 4,
            "ngram_filter_sizes": [1, 2],
            "output_dim": 3
        }
    })
    self.encoder = TokenCharactersEncoder.from_params(vocab=self.vocab, params=deepcopy(params))
    self.embedding = Embedding.from_params(vocab=self.vocab, params=params["embedding"])
    self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
    constant_init = Initializer.from_params(Params({"type": "constant", "val": 1.}))
    initializer = InitializerApplicator([(".*", constant_init)])
    initializer(self.encoder)
    initializer(self.embedding)
    initializer(self.inner_encoder)
def setUp(self):
    super(TestTokenCharactersEncoder, self).setUp()
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("1", "token_characters")
    self.vocab.add_token_to_namespace("2", "token_characters")
    self.vocab.add_token_to_namespace("3", "token_characters")
    self.vocab.add_token_to_namespace("4", "token_characters")
    params = Params({
        "embedding": {
            "embedding_dim": 2,
            "vocab_namespace": "token_characters"
        },
        "encoder": {
            "type": "cnn",
            "embedding_dim": 2,
            "num_filters": 4,
            "ngram_filter_sizes": [1, 2],
            "output_dim": 3
        }
    })
    self.encoder = TokenCharactersEncoder.from_params(vocab=self.vocab, params=deepcopy(params))
    self.embedding = Embedding.from_params(vocab=self.vocab, params=params["embedding"])
    self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
    constant_init = Initializer.from_params(Params({"type": "constant", "val": 1.}))
    initializer = InitializerApplicator([(".*", constant_init)])
    initializer(self.encoder)
    initializer(self.embedding)
    initializer(self.inner_encoder)
def setUp(self):
    super(TestTokenCharactersEncoder, self).setUp()
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("1", "token_characters")
    self.vocab.add_token_to_namespace("2", "token_characters")
    self.vocab.add_token_to_namespace("3", "token_characters")
    self.vocab.add_token_to_namespace("4", "token_characters")
    params = Params({
        "embedding": {
            "embedding_dim": 2,
            "vocab_namespace": "token_characters"
        },
        "encoder": {
            "type": "cnn",
            "embedding_dim": 2,
            "num_filters": 4,
            "ngram_filter_sizes": [1, 2],
            "output_dim": 3
        }
    })
    self.encoder = TokenCharactersEncoder.from_params(self.vocab, deepcopy(params))
    self.embedding = Embedding.from_params(self.vocab, params["embedding"])
    self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
    constant_init = lambda tensor: torch.nn.init.constant(tensor, 1.)
    initializer = InitializerApplicator(default_initializer=constant_init)
    initializer(self.encoder)
    initializer(self.embedding)
    initializer(self.inner_encoder)
def build_model(vocab: Vocabulary, bert_model: str = None) -> Model:
    if bert_model:
        embedder = BasicTextFieldEmbedder(
            {"bert": PretrainedTransformerEmbedder(model_name=bert_model, train_parameters=True)})
        encoder = BertPooler(pretrained_model=bert_model, requires_grad=True)
    else:
        # (3) How to get vectors for each Token ID:
        # (3.1) embed each token
        token_embedding = Embedding(embedding_dim=10,
                                    num_embeddings=vocab.get_vocab_size("token_vocab"))
        # pretrained_file='https://allennlp.s3.amazonaws.com/datasets/glove/glove.6B.50d.txt.gz'
        # (3.2) embed each character in each token
        character_embedding = Embedding(embedding_dim=3,
                                        num_embeddings=vocab.get_vocab_size("character_vocab"))
        cnn_encoder = CnnEncoder(embedding_dim=3, num_filters=4, ngram_filter_sizes=[3])
        token_encoder = TokenCharactersEncoder(character_embedding, cnn_encoder)
        # (3.3) embed the POS of each token
        pos_tag_embedding = Embedding(embedding_dim=10,
                                      num_embeddings=vocab.get_vocab_size("pos_tag_vocab"))
        # Each TokenEmbedder embeds its input, and the results are concatenated in an
        # arbitrary (but consistent) order.
        # cf. https://docs.allennlp.org/master/api/modules/text_field_embedders/basic_text_field_embedder/
        embedder = BasicTextFieldEmbedder(
            token_embedders={"tokens": token_embedding,
                             "token_characters": token_encoder,
                             "pos_tags": pos_tag_embedding})
        # emb_dim = 10 + 4 + 10 = 24
        encoder = BagOfEmbeddingsEncoder(embedding_dim=24, averaged=True)
        # `averaged=True` averages the embeddings across time rather than simply summing them
        # (i.e. the summed embeddings are divided by the length of the sentence).
    return SimpleClassifier(vocab, embedder, encoder)
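# A minimal sanity-check sketch (not part of the example above) showing where the
# `emb_dim = 10 + 4 + 10 = 24` comment comes from: BasicTextFieldEmbedder concatenates its
# token embedders, so its output dim is the sum of theirs. The namespaces and the placeholder
# token "a" below are assumptions chosen only to make the snippet self-contained.
from allennlp.data import Vocabulary
from allennlp.modules.token_embedders import Embedding, TokenCharactersEncoder
from allennlp.modules.seq2vec_encoders import CnnEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

vocab = Vocabulary()
for namespace in ("token_vocab", "character_vocab", "pos_tag_vocab"):
    vocab.add_token_to_namespace("a", namespace)

token_embedding = Embedding(embedding_dim=10, num_embeddings=vocab.get_vocab_size("token_vocab"))
token_encoder = TokenCharactersEncoder(
    Embedding(embedding_dim=3, num_embeddings=vocab.get_vocab_size("character_vocab")),
    CnnEncoder(embedding_dim=3, num_filters=4, ngram_filter_sizes=[3]))  # output dim = 4
pos_tag_embedding = Embedding(embedding_dim=10, num_embeddings=vocab.get_vocab_size("pos_tag_vocab"))
embedder = BasicTextFieldEmbedder(token_embedders={"tokens": token_embedding,
                                                   "token_characters": token_encoder,
                                                   "pos_tags": pos_tag_embedding})
assert embedder.get_output_dim() == 24  # 10 + 4 + 10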
def __init__(self, vocab_size, embedding_size, char_vocab_size, char_embedding_size,
             num_filter, ngram_filter_size, num_classes, bert_weight_path=False):
    super().__init__()
    self.char_embedding = nn.Embedding(char_vocab_size, char_embedding_size)
    init.uniform_(self.char_embedding.weight, -0.1, 0.1)
    if bert_weight_path:
        self.bert = PretrainedBertEmbedder(bert_weight_path)
    else:
        self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_size)
        init.uniform_(self.embedding.weight, -0.1, 0.1)
        self.bert = None
    self.cnn_encoder = CnnEncoder(char_embedding_size,
                                  num_filters=num_filter,
                                  ngram_filter_sizes=ngram_filter_size)
    self.char_encoder = TokenCharactersEncoder(self.char_embedding, self.cnn_encoder)
    if bert_weight_path:
        embedding_size = 768
    self.linear_layer = nn.Linear(embedding_size + num_filter, num_classes)
    init.xavier_normal_(self.linear_layer.weight)
def setUp(self):
    super(TestTokenCharactersEncoder, self).setUp()
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace(u"1", u"token_characters")
    self.vocab.add_token_to_namespace(u"2", u"token_characters")
    self.vocab.add_token_to_namespace(u"3", u"token_characters")
    self.vocab.add_token_to_namespace(u"4", u"token_characters")
    params = Params({
        u"embedding": {
            u"embedding_dim": 2,
            u"vocab_namespace": u"token_characters"
        },
        u"encoder": {
            u"type": u"cnn",
            u"embedding_dim": 2,
            u"num_filters": 4,
            u"ngram_filter_sizes": [1, 2],
            u"output_dim": 3
        }
    })
    self.encoder = TokenCharactersEncoder.from_params(vocab=self.vocab, params=deepcopy(params))
    self.embedding = Embedding.from_params(vocab=self.vocab, params=params[u"embedding"])
    self.inner_encoder = Seq2VecEncoder.from_params(params[u"encoder"])
    constant_init = lambda tensor: torch.nn.init.constant_(tensor, 1.)
    initializer = InitializerApplicator([(u".*", constant_init)])
    initializer(self.encoder)
    initializer(self.embedding)
    initializer(self.inner_encoder)
def get_model(vocab: Vocabulary) -> CrfTagger:
    hidden_dimension = 256
    layers = 2
    bidirectional = True
    total_embedding_dim = 0

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size("tokens"),
                                embedding_dim=100,
                                trainable=True)
    total_embedding_dim += 100

    params = Params({
        "embedding": {
            "embedding_dim": 16,
            "vocab_namespace": "token_characters"
        },
        "encoder": {
            "type": "cnn",
            "embedding_dim": 16,
            "num_filters": 128,
            "ngram_filter_sizes": [3],
            "conv_layer_activation": "relu",
        },
    })
    char_embedding = TokenCharactersEncoder.from_params(vocab=vocab, params=params)
    total_embedding_dim += 128

    active_embedders = {
        "tokens": token_embedding,
        "token_characters": char_embedding,
    }
    word_embeddings = BasicTextFieldEmbedder(active_embedders)

    network = LSTM(total_embedding_dim,
                   hidden_dimension,
                   num_layers=layers,
                   batch_first=True,
                   bidirectional=bidirectional)
    encoder = PytorchSeq2SeqWrapper(network, stateful=True)

    # Finally, we can instantiate the model.
    model = CrfTagger(
        vocab=vocab,
        text_field_embedder=word_embeddings,
        encoder=encoder,
        label_encoding="BIO",
        constrain_crf_decoding=True,
        calculate_span_f1=True,
    )
    return model
def get_embedder(self, vocab, Word_embedding_dim, char_embeddedng_dim, CNN_num_filters, CNN_encoder_dim):
    # The word embedding transforms every word into a "Word_embedding_dim" real-valued vector,
    # giving a tensor (batch_size, max_sentence_length, Word_embedding_dim).
    indexers_dict = dict()
    if Word_embedding_dim > 0:
        word_embedding = Embedding(num_embeddings=vocab.get_vocab_size("token_ids"),
                                   embedding_dim=Word_embedding_dim)
        word_embedding = word_embedding.to(device=self.cf_a.device, dtype=self.cf_a.dtype)
        indexers_dict["tokens"] = word_embedding
    if CNN_encoder_dim > 0:
        # The char embedding transforms every character into a "char_embeddedng_dim" real-valued
        # vector, giving a tensor (batch_size, max_sentence_length, max_word_length, char_embeddedng_dim).
        char_embedding = Embedding(num_embeddings=vocab.get_vocab_size("token_chars"),
                                   embedding_dim=char_embeddedng_dim)
        # The encoder applies the CNN over the max_word_length dimension,
        # giving a tensor (batch_size, max_sentence_length, num_filters * ngram_filter_sizes).
        character_cnn = CnnEncoder(ngram_filter_sizes=(1, 1),
                                   embedding_dim=char_embeddedng_dim,
                                   num_filters=CNN_num_filters,
                                   output_dim=CNN_encoder_dim)
        # We combine the char embedding and the encoder into a single token-level encoder.
        token_character_encoder = TokenCharactersEncoder(embedding=char_embedding,
                                                         encoder=character_cnn)
        token_character_encoder = token_character_encoder.to(device=self.cf_a.device,
                                                             dtype=self.cf_a.dtype)
        indexers_dict["chars"] = token_character_encoder
    # Finally, create the text field embedder, indicating which token ids it embeds.
    text_field_embedder = BasicTextFieldEmbedder(indexers_dict)
    return text_field_embedder
def construct_model(vocab, args):
    # token embedding
    word_embedding = Embedding.from_params(vocab=vocab, params=Params({
        "pretrained_file": "glove\\glove.vocab.100d.txt",
        "embedding_dim": 100,
        "trainable": True,
        "padding_index": 0
    }))
    word_embedding = BasicTextFieldEmbedder({"token_words": word_embedding})
    char_embedding = BasicTextFieldEmbedder({
        "token_characters": TokenCharactersEncoder(
            embedding=Embedding(embedding_dim=20, num_embeddings=262),
            encoder=CnnEncoder(embedding_dim=20, ngram_filter_sizes=[5], num_filters=50)),
    })
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(input_size=100, num_layers=1, hidden_size=100,
                      bidirectional=True, batch_first=True))
    model = FollowUpSnippetModel(vocab=vocab,
                                 word_embedder=word_embedding,
                                 char_embedder=char_embedding,
                                 tokens_encoder=lstm,
                                 model_args=args)
    return model
def build_model(vocab: Vocabulary) -> Model:
    print("Building the model")
    vocab_size_tokens = vocab.get_vocab_size("tokens")
    vocab_size_chars = vocab.get_vocab_size("token_characters")
    embedder = BasicTextFieldEmbedder({
        "tokens": Embedding(embedding_dim=embedding_dim,
                            pretrained_file=f"{cur_dir}/glove/glove.6B.200d.txt",
                            trainable=False,
                            num_embeddings=vocab_size_tokens,
                            vocab=vocab),
        "elmo": ElmoTokenEmbedder(
            weight_file="https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5",
            options_file="https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json",
            do_layer_norm=False,
            dropout=0.0),
        "token_characters": TokenCharactersEncoder(
            embedding=Embedding(embedding_dim=16, num_embeddings=vocab_size_chars, vocab=vocab),
            encoder=CnnEncoder(embedding_dim=16, num_filters=128, ngram_filter_sizes=[3]))
    })
    encoder = PytorchTransformer(input_dim=1352, num_layers=6, positional_encoding="sinusoidal")
    # embedder = BasicTextFieldEmbedder({"tokens": Embedding(embedding_dim=embedding_dim, num_embeddings=vocab_size)})
    # encoder = BagOfEmbeddingsEncoder(embedding_dim=embedding_dim)
    # embedder = BasicTextFieldEmbedder({"tokens": PretrainedTransformerMismatchedEmbedder("bert-large-uncased")})
    # encoder = LstmSeq2SeqEncoder(input_size=1024, hidden_size=1024, num_layers=2, dropout=0.5, bidirectional=True)
    if args.pseudo:
        return PseudoCrfTagger(vocab, embedder, encoder,
                               label_encoding="BIOUL",
                               include_start_end_transitions=False,
                               num_virtual_models=num_virtual_models)
    else:
        return CrfTagger(vocab, embedder, encoder,
                         label_encoding="BIOUL",
                         include_start_end_transitions=False)
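# Where the hard-coded input_dim=1352 above comes from (a note added here, not in the original),
# assuming the module-level `embedding_dim` is 200 as the GloVe 200d file suggests:
# 200 (GloVe) + 1024 (ELMo) + 128 (char CNN: 128 filters * 1 ngram size) = 1352.
# Since BasicTextFieldEmbedder exposes this sum directly, the magic number could instead be
# written, for example, as:
#
#     encoder = PytorchTransformer(input_dim=embedder.get_output_dim(),
#                                  num_layers=6, positional_encoding="sinusoidal")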
def build_embeddings(args, vocab, pretrained_embs=None):
    ''' Build embeddings according to options in args '''
    d_emb, d_char = 0, args.d_char
    token_embedder = {}
    # Word embeddings
    if args.word_embs != 'none':
        if args.word_embs in ['glove', 'fastText'] and pretrained_embs is not None:
            log.info("\tUsing word embeddings from %s", args.word_embs_file)
            word_embs = pretrained_embs
            d_word = pretrained_embs.size()[-1]
        else:
            log.info("\tLearning word embeddings from scratch!")
            word_embs = None
            d_word = args.d_word
        embeddings = Embedding(vocab.get_vocab_size('tokens'), d_word,
                               weight=word_embs, trainable=False,
                               padding_index=vocab.get_token_index('@@PADDING@@'))
        token_embedder["words"] = embeddings
        d_emb += d_word
    else:
        log.info("\tNot using word embeddings!")

    # Handle cove
    if args.cove:
        sys.path.append(args.path_to_cove)
        try:
            from cove import MTLSTM as cove_lstm
            cove_emb = cove_lstm(n_vocab=vocab.get_vocab_size('tokens'),
                                 vectors=embeddings.weight.data)
            d_emb += 600
            log.info("\tUsing CoVe embeddings!")
        except ImportError:
            log.info("Failed to import CoVE!")
    else:
        cove_emb = None

    # Character embeddings
    if args.char_embs:
        log.info("\tUsing character embeddings!")
        char_embeddings = Embedding(vocab.get_vocab_size('chars'), d_char)
        filter_sizes = tuple([int(i) for i in args.char_filter_sizes.split(',')])
        char_encoder = CnnEncoder(d_char,
                                  num_filters=args.n_char_filters,
                                  ngram_filter_sizes=filter_sizes,
                                  output_dim=d_char)
        char_embedder = TokenCharactersEncoder(char_embeddings, char_encoder,
                                               dropout=args.dropout_embs)
        d_emb += d_char
        token_embedder["chars"] = char_embedder
    else:
        log.info("\tNot using character embeddings!")

    # Handle elmo
    if args.elmo:
        log.info("Loading ELMo from files:")
        log.info("ELMO_OPT_PATH = %s", ELMO_OPT_PATH)
        log.info("ELMO_WEIGHTS_PATH = %s", ELMO_WEIGHTS_PATH)
        if args.elmo_chars_only:
            log.info("\tUsing ELMo character CNN only!")
            # elmo_embedder = elmo_embedder._elmo._elmo_lstm._token_embedder
            elmo_embedder = ElmoCharacterEncoder(options_file=ELMO_OPT_PATH,
                                                 weight_file=ELMO_WEIGHTS_PATH,
                                                 requires_grad=False)
            d_emb += 512
        else:
            log.info("\tUsing full ELMo!")
            elmo_embedder = ElmoTokenEmbedder(options_file=ELMO_OPT_PATH,
                                              weight_file=ELMO_WEIGHTS_PATH,
                                              dropout=args.dropout)
            d_emb += 1024
        token_embedder["elmo"] = elmo_embedder

    embedder = BasicTextFieldEmbedder(token_embedder)
    assert d_emb, "You turned off all the embeddings, ya goof!"
    return d_emb, embedder, cove_emb
def predict(cuda_device: int,
            char_encoder: str,
            data_dir: Path,
            glove_path: Path,
            temp_dir: Path,
            random_seed: int = 13370,
            numpy_seed: int = 1337,
            torch_seed: int = 133) -> List[Tuple[float, float, str]]:
    '''
    This allows you to train an NER model that has either a CNN character
    encoder or an LSTM character encoder, chosen via the `char_encoder`
    argument. The encoded characters are then combined with 100D GloVe
    vectors and put through a bi-directional LSTM.

    This is based on the following two papers:

    1. CNN character encoder version `Ma and Hovy \
       <https://arxiv.org/abs/1603.01354>`_
    2. LSTM character encoder version `Lample et al. \
       <https://arxiv.org/abs/1603.01360>`_

    :param cuda_device: Whether to use GPU or CPU, CPU = -1, GPU = 0
    :param char_encoder: Whether to use an LSTM or CNN. Acceptable values are:
                         1. lstm, 2. cnn
    :param data_dir: A file path to a directory that contains three files:
                     1. train.txt, 2. dev.txt, 3. test.txt, which are the train,
                     dev, and test files respectively in CoNLL 2003 format
                     with the NER labels in BIO format.
    :param glove_path: A file path to the `Glove 6 billion word vectors 100D \
                       <https://nlp.stanford.edu/projects/glove/>`_
    :returns: The results as a list of tuples of
              (dev f1 score, test f1 score, char encoder), where each tuple
              represents a different trained model using the same train, dev,
              and test split but a different random seed.
    '''
    # The dataset we are using has already been formatted from IOB1 to BIO.
    # When reading the dataset, state that the coding is the original, as this
    # will not affect the labels, i.e. the labels and schema are not checked.
    label_encoding = 'BIO'
    constrain_crf_decoding = True
    dropout = 0.5

    char_embedding_dim = 30
    cnn_window_size = (3,)
    cnn_filters = 50
    cnn_output_dim = len(cnn_window_size) * cnn_filters
    lstm_char_dim = 25
    lstm_char_output_dim = lstm_char_dim * 2

    word_embedding_dim = 100
    # LSTM size is that of Ma and Hovy
    lstm_dim = 100

    # Dropout applies dropout after the encoded text and after the word embedding.

    # tensorboard_dir = Path('..', 'tensorboard ner')
    # tensorboard_dir.mkdir(parents=True, exist_ok=True)
    # train_log = SummaryWriter(Path(tensorboard_dir, "log", "train"))
    # validation_log = SummaryWriter(Path(tensorboard_dir, "log", "validation"))

    train_fp = Path(data_dir, 'train.txt')
    dev_fp = Path(data_dir, 'dev.txt')
    test_fp = Path(data_dir, 'test.txt')
    result_fp = Path(data_dir, 'results.json')
    result_data = []
    if result_fp.exists():
        with result_fp.open('r') as json_file:
            result_data = json.load(json_file)

    indexers = {'tokens': SingleIdTokenIndexer(namespace='tokens', lowercase_tokens=True),
                'chars': TokenCharactersIndexer(namespace='token_characters')}

    conll_reader = Conll2003DatasetReader(token_indexers=indexers)
    train_dataset = conll_reader.read(cached_path(train_fp))
    dev_dataset = conll_reader.read(cached_path(dev_fp))
    test_dataset = conll_reader.read(cached_path(test_fp))

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset + test_dataset)

    char_embedding = Embedding(num_embeddings=vocab.get_vocab_size("token_characters"),
                               embedding_dim=char_embedding_dim)

    if char_encoder.strip().lower() == 'lstm':
        character_lstm = torch.nn.LSTM(char_embedding_dim, lstm_char_dim,
                                       batch_first=True, bidirectional=True)
        character_lstm_wrapper = PytorchSeq2VecWrapper(character_lstm)
        token_character_encoder = TokenCharactersEncoder(embedding=char_embedding,
                                                         encoder=character_lstm_wrapper)
        total_char_embedding_dim = lstm_char_output_dim
    elif char_encoder.strip().lower() == 'cnn':
        character_cnn = CnnEncoder(embedding_dim=char_embedding_dim,
                                   num_filters=cnn_filters,
                                   ngram_filter_sizes=cnn_window_size,
                                   output_dim=cnn_output_dim)
        token_character_encoder = TokenCharactersEncoder(embedding=char_embedding,
                                                         encoder=character_cnn)
        total_char_embedding_dim = cnn_output_dim
    else:
        raise ValueError('The Character encoder can only be `lstm` or `cnn` '
                         f'and not {char_encoder}')

    glove_path = cached_path(glove_path)
    glove_100_weights = _read_pretrained_embeddings_file(glove_path, word_embedding_dim,
                                                         vocab, 'tokens')
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=word_embedding_dim,
                                weight=glove_100_weights)

    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding,
                                              "chars": token_character_encoder})

    total_embedding_dim = word_embedding_dim + total_char_embedding_dim
    lstm = torch.nn.LSTM(total_embedding_dim, lstm_dim,
                         batch_first=True, bidirectional=True)
    lstm_wrapper = PytorchSeq2SeqWrapper(lstm)

    model = CrfTagger(vocab, word_embeddings, lstm_wrapper,
                      label_encoding=label_encoding, dropout=dropout,
                      constrain_crf_decoding=constrain_crf_decoding)

    optimizer = optim.SGD(model.parameters(), lr=0.015, weight_decay=1e-8)
    schedule = LearningRateWithoutMetricsWrapper(
        torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9524))
    iterator = BucketIterator(batch_size=64, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    temp_dir_fp = str(temp_dir.resolve())
    temp_folder_path = tempfile.mkdtemp(dir=temp_dir_fp)

    set_random_env(cuda_device, random_seed, numpy_seed, torch_seed)
    trainer = Trainer(model=model,
                      grad_clipping=5.0,
                      learning_rate_scheduler=schedule,
                      serialization_dir=temp_folder_path,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      shuffle=True,
                      cuda_device=cuda_device,
                      patience=5,
                      num_epochs=1000)
    # trainer._tensorboard = TensorboardWriter(train_log=train_log,
    #                                          validation_log=validation_log)
    interesting_metrics = trainer.train()
    best_model_weights = Path(temp_folder_path, 'best.th')
    best_model_state = torch.load(best_model_weights)
    model.load_state_dict(best_model_state)

    test_result = evaluate(model, test_dataset, iterator, cuda_device)
    dev_result = evaluate(model, dev_dataset, iterator, cuda_device)
    test_f1 = test_result['f1-measure-overall']
    dev_f1 = dev_result['f1-measure-overall']
    result_data.append((dev_f1, test_f1, char_encoder))

    with result_fp.open('w+') as json_file:
        json.dump(result_data, json_file)
    print(f'{interesting_metrics}')
    return result_data
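# A hypothetical invocation of `predict` (the paths below are placeholders, not from the source):
#
#     results = predict(cuda_device=-1, char_encoder='cnn',
#                       data_dir=Path('data/conll2003'),
#                       glove_path=Path('embeddings/glove.6B.100d.txt'),
#                       temp_dir=Path('.'))
#     # -> [(dev_f1, test_f1, 'cnn'), ...], also appended to data_dir/results.json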
def get_model(pretrained_file: str, WORD_EMB_DIM: int, vocab: Vocabulary, num_tags: int):
    """
    Builds the tagger model and returns it together with its optimizer and the CUDA device
    it was placed on.

    :param pretrained_file: path to the pretrained word-embedding file
    :param WORD_EMB_DIM: dimension of the word embeddings
    :param vocab: the vocabulary to build the embedders against
    :param num_tags: number of tags in the tag space
    :return: (model, optimizer, cuda_device)
    """
    CNN_EMB_DIM = 128
    CHAR_EMB_DIM = 16

    weight = _read_pretrained_embeddings_file(pretrained_file, WORD_EMB_DIM, vocab, "tokens")
    token_embedding = Embedding(num_embeddings=weight.shape[0],
                                embedding_dim=weight.shape[1],
                                weight=weight,
                                vocab_namespace="tokens")
    char_embedding = Embedding(num_embeddings=vocab.get_vocab_size("token_characters"),
                               embedding_dim=CHAR_EMB_DIM,
                               vocab_namespace="token_characters")
    char_encoder = CnnEncoder(embedding_dim=CHAR_EMB_DIM,
                              num_filters=CNN_EMB_DIM,
                              ngram_filter_sizes=[3],
                              conv_layer_activation=Activation.by_name("relu")())
    token_characters_embedding = TokenCharactersEncoder(embedding=char_embedding,
                                                        encoder=char_encoder)

    if USING_BERT:
        print("USING BERT EMBEDDINGS")
        bert_emb = PretrainedBertEmbedder("bert-base-multilingual-cased")
        tfe = BasicTextFieldEmbedder(
            {"bert": bert_emb, "token_characters": token_characters_embedding},
            embedder_to_indexer_map={"bert": ["bert", "bert-offsets"],
                                     "token_characters": ["token_characters"]},
            allow_unmatched_keys=True)
        EMBEDDING_DIM = CNN_EMB_DIM + 768
    else:
        EMBEDDING_DIM = CNN_EMB_DIM + WORD_EMB_DIM
        tfe = BasicTextFieldEmbedder({"tokens": token_embedding,
                                      "token_characters": token_characters_embedding})

    HIDDEN_DIM = 256
    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM,
                      batch_first=True, bidirectional=True,
                      dropout=0.5, num_layers=2))

    model = MarginalCrfTagger(vocab, tfe, encoder, num_tags,
                              include_start_end_transitions=False,
                              calculate_span_f1=True,
                              dropout=0.5,
                              label_encoding="BIOUL",
                              constrain_crf_decoding=True)

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    if torch.cuda.is_available():
        print("Using GPU")
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    return model, optimizer, cuda_device
def build_model(vocab: Vocabulary) -> Model:
    print("Building the model")
    EMBEDDING_DIM = 300
    ELMO_DIM = 1024
    NUM_FILTERS = 60
    NGRAM_FILTER_SIZES = (2, 3, 4, 5, 6)  # out_dim for char = len(NGRAM_FILTER_SIZES) * NUM_FILTERS
    HIDDEN_DIM = 300
    F_OUT1 = 900
    F_OUT2 = 200
    F_OUT = 2
    RMF_DIM = 140
    RMF_DIM_OUT = 100

    elmo_options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    elmo_weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
    elmo_embedding = ElmoTokenEmbedder(options_file=elmo_options_file,
                                       weight_file=elmo_weight_file)

    # This is for encoding the characters in each token.
    character_embedding = Embedding(vocab=vocab,
                                    embedding_dim=EMBEDDING_DIM,
                                    vocab_namespace='character_vocab')
    cnn_encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM,
                             num_filters=NUM_FILTERS,
                             ngram_filter_sizes=NGRAM_FILTER_SIZES)
    token_encoder = TokenCharactersEncoder(character_embedding, cnn_encoder)

    # This is for embedding the part of speech tag of each token.
    pos_tag_embedding = Embedding(vocab=vocab,
                                  embedding_dim=EMBEDDING_DIM,
                                  vocab_namespace='pos_tag_vocab')

    text_embedder = BasicTextFieldEmbedder(token_embedders={'elmo_tokens': elmo_embedding,
                                                            'token_characters': token_encoder,
                                                            'pos_tags': pos_tag_embedding})

    # encoder
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM + ELMO_DIM + len(NGRAM_FILTER_SIZES) * NUM_FILTERS,
                      HIDDEN_DIM, num_layers=2, batch_first=True, bidirectional=True))

    # FF that combines the two LSTM outputs
    final_linear_layer = FeedForward(HIDDEN_DIM * 4 + RMF_DIM_OUT, 3,
                                     [F_OUT1, F_OUT2, F_OUT],
                                     torch.nn.ReLU(), 0.3)

    # FF applied to the RMF features
    rmf_linear_layer = FeedForward(RMF_DIM, 1, RMF_DIM_OUT, torch.nn.Sigmoid(), 0.3)

    # Matching model
    model = Matcher(vocab=vocab,
                    text_field_embedder=text_embedder,
                    encoder=encoder,
                    rmf_layer=rmf_linear_layer,
                    classifier_feedforward=final_linear_layer)
    return model
def run_experiment(use_similarity_targets, embedding_type, rnn_type, hparams):
    log = {}
    log["name"] = "{} {} {} {}".format(
        rnn_type,
        embedding_type,
        "similarity_target" if use_similarity_targets else "hard_target",
        hparams["update_targets"]
    )

    vocab = Vocabulary().from_files(hparams["vocab_path"])

    if embedding_type == "Chord":
        # data reader
        reader = CpmDatasetReader()
        # chord embedder
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size("tokens"),
            embedding_dim=hparams["chord_token_embedding_dim"],
        )
        chord_embedder = BasicTextFieldEmbedder({"tokens": token_embedding})
    elif embedding_type == "Note":
        # data reader
        note_tokenizer = NoteTokenizer()
        note_indexer = TokenCharactersIndexer(
            namespace="notes", min_padding_length=4, character_tokenizer=note_tokenizer
        )
        reader = CpmDatasetReader(
            token_indexers={"tokens": SingleIdTokenIndexer(), "notes": note_indexer}
        )
        # chord embedder
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size("tokens"),
            embedding_dim=hparams["chord_token_embedding_dim"],
        )
        note_token_embedding = Embedding(
            vocab.get_vocab_size("notes"), hparams["note_embedding_dim"]
        )
        note_encoder = CnnEncoder(
            num_filters=hparams["cnn_encoder_num_filters"],
            ngram_filter_sizes=hparams["cnn_encoder_n_gram_filter_sizes"],
            embedding_dim=hparams["note_embedding_dim"],
            output_dim=hparams["note_level_embedding_dim"],
        )
        note_embedding = TokenCharactersEncoder(note_token_embedding, note_encoder)
        chord_embedder = BasicTextFieldEmbedder(
            {"tokens": token_embedding, "notes": note_embedding}
        )
    else:
        raise ValueError("Unknown embedding type:", embedding_type)

    # read data
    train_dataset = reader.read(os.path.join(hparams["data_path"], "train.txt"))
    val_dataset = reader.read(os.path.join(hparams["data_path"], "val.txt"))
    test_dataset = reader.read(os.path.join(hparams["data_path"], "test.txt"))

    # contextualizer
    contextual_input_dim = chord_embedder.get_output_dim()
    if rnn_type == "RNN":
        contextualizer = PytorchSeq2SeqWrapper(
            torch.nn.RNN(contextual_input_dim, hparams["rnn_hidden_dim"],
                         batch_first=True, bidirectional=False)
        )
    elif rnn_type == "LSTM":
        contextualizer = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(contextual_input_dim, hparams["lstm_hidden_dim"],
                          batch_first=True, bidirectional=False)
        )
    elif rnn_type == "GRU":
        contextualizer = PytorchSeq2SeqWrapper(
            torch.nn.GRU(contextual_input_dim, hparams["gru_hidden_dim"],
                         batch_first=True, bidirectional=False)
        )
    else:
        raise ValueError("Unknown rnn type:", rnn_type)

    if use_similarity_targets:
        vocab_size = vocab.get_vocab_size("tokens")
        similarity_targets = Embedding(
            num_embeddings=vocab_size,
            embedding_dim=vocab_size,
            weight=torch.load(hparams["similarity_target_path"]),
            trainable=False,
        )
    else:
        similarity_targets = None

    iterator = BucketIterator(
        batch_size=hparams["batch_size"],
        sorting_keys=[("input_tokens", "num_tokens")]
    )
    iterator.index_with(vocab)

    batches_per_epoch = math.ceil(len(train_dataset) / hparams["batch_size"])
    model_hparams = {
        "dropout": None,
        "similarity_targets": similarity_targets,
        "update_targets": hparams["update_targets"],
        "T_initial": hparams["T_initial"],
        "decay_rate": hparams["decay_rate"],
        "batches_per_epoch": batches_per_epoch,
        "fc_hidden_dim": hparams["fc_hidden_dim"]
    }

    # chord progression model
    model = Cpm(vocab, chord_embedder, contextualizer, model_hparams)

    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
        print("GPU available.")
    else:
        cuda_device = -1

    optimizer = optim.Adam(model.parameters(), lr=hparams["lr"])

    ts = time.gmtime()
    saved_model_path = os.path.join(
        hparams["saved_model_path"], time.strftime("%Y-%m-%d %H-%M-%S", ts))
    serialization_dir = os.path.join(saved_model_path, "checkpoints")

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=val_dataset,
        serialization_dir=serialization_dir,
        patience=hparams["patience"],
        num_epochs=hparams["num_epochs"],
        cuda_device=cuda_device,
    )
    trainer.train()

    saved_model_path = os.path.join(saved_model_path, "{}.th".format(log["name"]))
    torch.save(model.state_dict(), saved_model_path)

    predictor = Predictor(model=model, iterator=iterator, cuda_device=cuda_device)
    pred_metrics = predictor.predict(test_dataset)
    log["metrics"] = pred_metrics
    log["saved_mode_path"] = saved_model_path

    return log
# Having a tensor (batch_size, max_sentence_length, Word_embedding_dim)
word_embedding = Embedding(num_embeddings=vocab.get_vocab_size("token_ids"),
                           embedding_dim=Word_embedding_dim)

# The char embedding transforms every character into a "char_embeddedng_dim" real-valued vector,
# giving a tensor (batch_size, max_sentence_length, max_word_length, char_embeddedng_dim)
char_embedding = Embedding(num_embeddings=vocab.get_vocab_size("token_chars"),
                           embedding_dim=char_embeddedng_dim)

# The encoder applies the CNN over the max_word_length dimension,
# giving a tensor (batch_size, max_sentence_length, num_filters * ngram_filter_sizes)
character_cnn = CnnEncoder(embedding_dim=char_embeddedng_dim,
                           num_filters=CNN_num_filters,
                           output_dim=CNN_encoder_dim)

# We combine the char embedding and the encoder into a single token-level encoder.
token_character_encoder = TokenCharactersEncoder(embedding=char_embedding,
                                                 encoder=character_cnn)

# Now we finally create the text field embedder, indicating which token ids it embeds.
text_field_embedder = BasicTextFieldEmbedder({"tokens": word_embedding,
                                              "chars": token_character_encoder})

# Apply the embedder to the batch.
# This will have shape: (batch_size, sentence_length, word_embedding_dim + character_cnn_output_dim)
embedded_text = text_field_embedder(tensor_dict["text_field"])
print(embedded_text.shape)

dimensions = list(embedded_text.size())
print("Post embedding with our TextFieldEmbedder: ")
print("Batch Size: ", dimensions[0])
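# For reference (not part of the original snippet): because `output_dim` was passed to
# CnnEncoder, the character branch is projected to CNN_encoder_dim, so the last dimension
# printed above equals Word_embedding_dim + CNN_encoder_dim. A hypothetical continuation
# of the prints:
#
#     print("Max sentence length: ", dimensions[1])
#     print("Embedding dim: ", dimensions[2])  # == Word_embedding_dim + CNN_encoder_dim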
def build_model(vocab: Vocabulary) -> Model:
    print("Building the model")
    EMBEDDING_DIM = 300
    HIDDEN_DIM = 300
    NUM_FILTERS = 60
    NGRAM_FILTER_SIZES = (2, 3, 4, 5, 6)  # out_dim for char = len(NGRAM_FILTER_SIZES) * NUM_FILTERS
    F_OUT = 200

    elmo_options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    elmo_weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
    elmo_embedding = ElmoTokenEmbedder(options_file=elmo_options_file,
                                       weight_file=elmo_weight_file)

    character_embedding = Embedding(vocab=vocab, embedding_dim=EMBEDDING_DIM,
                                    vocab_namespace='character_vocab')
    cnn_encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM, num_filters=NUM_FILTERS,
                             ngram_filter_sizes=NGRAM_FILTER_SIZES)
    token_encoder = TokenCharactersEncoder(character_embedding, cnn_encoder)

    pos_tag_embedding = Embedding(vocab=vocab, embedding_dim=EMBEDDING_DIM,
                                  vocab_namespace='pos_tag_vocab')
    ner_tag_embedding = Embedding(vocab=vocab, embedding_dim=EMBEDDING_DIM,
                                  vocab_namespace='ner_tag_vocab')
    word_embedding = Embedding(vocab=vocab, embedding_dim=EMBEDDING_DIM,
                               vocab_namespace='token_vocab')

    utterance_embedder = BasicTextFieldEmbedder(
        token_embedders={'elmo_tokens': elmo_embedding,
                         'token_characters': token_encoder,
                         'pos_tags': pos_tag_embedding,
                         'ner_tags': ner_tag_embedding})

    # slot embedder
    slot_embedder = BasicTextFieldEmbedder(
        token_embedders={'elmo_tokens': elmo_embedding,
                         'token_characters': token_encoder})

    utterance_lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(2 * EMBEDDING_DIM + 1024 + len(NGRAM_FILTER_SIZES) * NUM_FILTERS,
                      HIDDEN_DIM, num_layers=2, batch_first=True, bidirectional=True))
    slot_lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(1024 + len(NGRAM_FILTER_SIZES) * NUM_FILTERS,
                      HIDDEN_DIM, num_layers=2, batch_first=True, bidirectional=True))

    similarity = LinearMatrixAttention(tensor_1_dim=2 * HIDDEN_DIM,
                                       tensor_2_dim=2 * HIDDEN_DIM,
                                       combination="x,y,x*y",
                                       activation=Activation.by_name('tanh')())

    modeling_lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(2 * 5 * HIDDEN_DIM,  # bi-direction
                      HIDDEN_DIM, num_layers=2, batch_first=True, bidirectional=True))

    # step-1 utterance
    utterance_embedder2 = BasicTextFieldEmbedder(
        token_embedders={'elmo_tokens': elmo_embedding,
                         'token_characters': token_encoder,
                         'pos_tags': pos_tag_embedding,
                         'ner_tags': ner_tag_embedding})
    utterance_lstm2 = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(2 * EMBEDDING_DIM + 1024 + len(NGRAM_FILTER_SIZES) * NUM_FILTERS,
                      HIDDEN_DIM, num_layers=2, batch_first=True, bidirectional=True))

    # FF that combines the two LSTM outputs
    final_linear_layer = FeedForward(2 * HIDDEN_DIM, 2, [HIDDEN_DIM, F_OUT],
                                     torch.nn.ReLU(), 0.3)

    # CRF model
    model = CrfTagger(vocab=vocab,
                      utterance_embedder=utterance_embedder,
                      utterance_embedder2=utterance_embedder2,
                      slot_embedder=slot_embedder,
                      utterance_encoder=utterance_lstm,
                      utterance_encoder2=utterance_lstm2,
                      slot_encoder=slot_lstm,
                      matrix_attention=similarity,
                      modeling_layer=modeling_lstm,
                      fc_ff_layer=final_linear_layer)
    return model
dataset_reader = Seq2SeqDatasetReaderV1(source_token_indexers=source_token_indexers,
                                        target_token_indexers=target_token_indexers)
train_data = list(dataset_reader.read(args.train_file))
vocab = Vocabulary.from_instances(train_data)
# valid_data = dataset_reader.read(args.valid_file)
test_data = dataset_reader.read(args.test_file)

src_embedding = Embedding(embedding_dim=args.emb_dim,
                          vocab_namespace="source_tokens",
                          vocab=vocab)
src_char_embedding = Embedding(embedding_dim=args.emb_dim,
                               vocab_namespace="source_char_tokens",
                               vocab=vocab)
src_char_encoder = TokenCharactersEncoder(
    embedding=src_char_embedding,
    encoder=GruSeq2VecEncoder(input_size=args.emb_dim, hidden_size=args.hid_dim))

tgt_embedding = Embedding(embedding_dim=args.emb_dim,
                          vocab_namespace="target_tokens",
                          vocab=vocab)
tgt_char_embedding = Embedding(embedding_dim=args.emb_dim,
                               vocab_namespace="target_char_tokens",
                               vocab=vocab)
tgt_char_encoder = TokenCharactersEncoder(
    embedding=tgt_char_embedding,
    encoder=GruSeq2VecEncoder(input_size=args.emb_dim, hidden_size=args.hid_dim))

src_embedders = BasicTextFieldEmbedder({
    "tokens": src_embedding,
    "character_tokens": src_char_encoder
})
tgt_embedders = BasicTextFieldEmbedder({
def build_embeddings(args, vocab, tasks, pretrained_embs=None):
    ''' Build embeddings according to options in args '''
    d_emb, d_char = 0, args.d_char
    token_embedders = {}
    # Word embeddings
    n_token_vocab = vocab.get_vocab_size('tokens')
    if args.word_embs != 'none':
        if args.word_embs in ['glove', 'fastText'] and pretrained_embs is not None:
            word_embs = pretrained_embs
            assert word_embs.size()[0] == n_token_vocab
            d_word = word_embs.size()[1]
            log.info("\tUsing pre-trained word embeddings: %s", str(word_embs.size()))
        else:
            log.info("\tLearning word embeddings from scratch!")
            word_embs = None
            d_word = args.d_word
        embeddings = Embedding(num_embeddings=n_token_vocab,
                               embedding_dim=d_word,
                               weight=word_embs,
                               trainable=False,
                               padding_index=vocab.get_token_index('@@PADDING@@'))
        token_embedders["words"] = embeddings
        d_emb += d_word
    else:
        embeddings = None
        log.info("\tNot using word embeddings!")

    # Handle cove
    cove_layer = None
    if args.cove:
        assert embeddings is not None
        assert args.word_embs == "glove", "CoVe requires GloVe embeddings."
        assert d_word == 300, "CoVe expects 300-dimensional GloVe embeddings."
        try:
            from .cove.cove import MTLSTM as cove_lstm
            # Have CoVe do an internal GloVe lookup, but don't add a residual.
            # We'll do this manually in modules.py; see SentenceEncoder.forward().
            cove_layer = cove_lstm(n_vocab=n_token_vocab, vectors=embeddings.weight.data)
            # Control whether CoVe is trainable.
            for param in cove_layer.parameters():
                param.requires_grad = bool(args.cove_fine_tune)
            d_emb += 600  # 300 x 2 for biLSTM activations
            log.info("\tUsing CoVe embeddings!")
        except ImportError as e:
            log.info("Failed to import CoVe!")
            raise e

    # Character embeddings
    if args.char_embs:
        log.info("\tUsing character embeddings!")
        char_embeddings = Embedding(vocab.get_vocab_size('chars'), d_char)
        filter_sizes = tuple([int(i) for i in args.char_filter_sizes.split(',')])
        char_encoder = CnnEncoder(d_char,
                                  num_filters=args.n_char_filters,
                                  ngram_filter_sizes=filter_sizes,
                                  output_dim=d_char)
        char_embedder = TokenCharactersEncoder(char_embeddings, char_encoder,
                                               dropout=args.dropout_embs)
        d_emb += d_char
        token_embedders["chars"] = char_embedder
    else:
        log.info("\tNot using character embeddings!")

    # If we want separate ELMo scalar weights (a different ELMo representation for each
    # classifier), then we need to count and reliably map each classifier to an index used
    # by AllenNLP's internal ELMo.
    if args.sep_embs_for_skip:
        # Determine a deterministic list of classifier names to use for each task.
        classifiers = sorted(set(map(lambda x: x._classifier_name, tasks)))
        # Reload the existing classifier map, if it exists.
        classifier_save_path = args.run_dir + "/classifier_task_map.json"
        if os.path.isfile(classifier_save_path):
            loaded_classifiers = json.load(open(args.run_dir + "/classifier_task_map.json", 'r'))
        else:
            # No file exists, so assume we are just starting to pretrain. If pretraining is to
            # be skipped, this assertion can be bypassed by explicitly allowing a missing
            # classifier task map.
            assert_for_log(args.do_pretrain or args.allow_missing_task_map,
                           "Error: {} should already exist.".format(classifier_save_path))
            if args.allow_missing_task_map:
                log.warning("Warning: classifier task map not found in model"
                            " directory. Creating a new one from scratch.")
            loaded_classifiers = {"@pretrain@": 0}  # default is always @pretrain@
        # Add the new tasks and update the map, keeping the internal ELMo index consistent.
        max_number_classifiers = max(loaded_classifiers.values())
        offset = 1
        for classifier in classifiers:
            if classifier not in loaded_classifiers:
                loaded_classifiers[classifier] = max_number_classifiers + offset
                offset += 1
        log.info("Classifiers:{}".format(loaded_classifiers))
        open(classifier_save_path, 'w+').write(json.dumps(loaded_classifiers))
        # Every index in classifiers needs to correspond to a valid ELMo output representation.
        num_reps = 1 + max(loaded_classifiers.values())
    else:
        # All tasks share the same scalars.
        # Not used if self.elmo_chars_only = 1 (i.e. no ELMo).
        loaded_classifiers = {"@pretrain@": 0}
        num_reps = 1

    if args.elmo:
        log.info("Loading ELMo from files:")
        log.info("ELMO_OPT_PATH = %s", ELMO_OPT_PATH)
        if args.elmo_chars_only:
            log.info("\tUsing ELMo character CNN only!")
            log.info("ELMO_WEIGHTS_PATH = %s", ELMO_WEIGHTS_PATH)
            elmo_embedder = ElmoCharacterEncoder(options_file=ELMO_OPT_PATH,
                                                 weight_file=ELMO_WEIGHTS_PATH,
                                                 requires_grad=False)
            d_emb += 512
        else:
            log.info("\tUsing full ELMo! (separate scalars/task)")
            if args.elmo_weight_file_path != 'none':
                assert os.path.exists(args.elmo_weight_file_path), \
                    "ELMo weight file path \"" + args.elmo_weight_file_path + "\" does not exist."
                weight_file = args.elmo_weight_file_path
            else:
                weight_file = ELMO_WEIGHTS_PATH
            log.info("ELMO_WEIGHTS_PATH = %s", weight_file)
            elmo_embedder = ElmoTokenEmbedderWrapper(
                options_file=ELMO_OPT_PATH,
                weight_file=weight_file,
                num_output_representations=num_reps,
                # Dropout is added by the sentence encoder later.
                dropout=0.)
            d_emb += 1024
        token_embedders["elmo"] = elmo_embedder

    # Wrap ELMo and the other embedders, concatenating the resulting
    # representations along the last (vector) dimension.
    embedder = ElmoTextFieldEmbedder(token_embedders, loaded_classifiers,
                                     elmo_chars_only=args.elmo_chars_only,
                                     sep_embs_for_skip=args.sep_embs_for_skip)

    assert d_emb, "You turned off all the embeddings, ya goof!"
    return d_emb, embedder, cove_layer
# Note that we added the batch dimension at the front. Don't worry too much
# about the magic 'token_characters' key - that is hard-coded to be produced
# by the TokenCharactersIndexer, and accepted by TokenCharactersEncoder;
# you don't have to produce those yourself in normal settings, it's done for you.
token_tensor = {
    'indexer2': {
        'token_characters': torch.tensor([[[1, 3, 0], [4, 2, 3], [1, 9, 5], [6, 0, 0]]])
    }
}

character_embedding = Embedding(num_embeddings=10, embedding_dim=3)
cnn_encoder = CnnEncoder(embedding_dim=3, num_filters=4, ngram_filter_sizes=(3,))
token_encoder = TokenCharactersEncoder(character_embedding, cnn_encoder)

# Again here, the 'indexer2' key is arbitrary. It just has to match whatever key
# you gave to the corresponding TokenIndexer in your data code, which ends up
# as the top-level key in the token_tensor dictionary.
embedder = BasicTextFieldEmbedder(token_embedders={'indexer2': token_encoder})

embedded_tokens = embedder(token_tensor)
print("With a character CNN:", embedded_tokens)

# This is what gets created by TextField.as_tensor with a SingleIdTokenIndexer
# and a TokenCharactersIndexer; see the code snippet above. This time we're using
# more intuitive names for the indexers and embedders.
token_tensor = {
    'tokens': {
        'tokens': torch.LongTensor([[2, 4, 3, 5]])