def from_params(cls, vocab: Vocabulary, params: Params) -> 'SpanConstituencyParser':
    embedder_params = params.pop("text_field_embedder")
    text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params)
    span_extractor = SpanExtractor.from_params(params.pop("span_extractor"))
    encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))

    feed_forward_params = params.pop("feedforward", None)
    if feed_forward_params is not None:
        feedforward_layer = FeedForward.from_params(feed_forward_params)
    else:
        feedforward_layer = None

    pos_tag_embedding_params = params.pop("pos_tag_embedding", None)
    if pos_tag_embedding_params is not None:
        pos_tag_embedding = Embedding.from_params(vocab, pos_tag_embedding_params)
    else:
        pos_tag_embedding = None

    initializer = InitializerApplicator.from_params(params.pop('initializer', []))
    regularizer = RegularizerApplicator.from_params(params.pop('regularizer', []))
    evalb_directory_path = params.pop("evalb_directory_path", None)
    params.assert_empty(cls.__name__)

    return cls(vocab=vocab,
               text_field_embedder=text_field_embedder,
               span_extractor=span_extractor,
               encoder=encoder,
               feedforward_layer=feedforward_layer,
               pos_tag_embedding=pos_tag_embedding,
               initializer=initializer,
               regularizer=regularizer,
               evalb_directory_path=evalb_directory_path)
def setUp(self):
    super(TestTokenCharactersEncoder, self).setUp()
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("1", "token_characters")
    self.vocab.add_token_to_namespace("2", "token_characters")
    self.vocab.add_token_to_namespace("3", "token_characters")
    self.vocab.add_token_to_namespace("4", "token_characters")
    params = Params({
        "embedding": {
            "embedding_dim": 2,
            "vocab_namespace": "token_characters"
        },
        "encoder": {
            "type": "cnn",
            "embedding_dim": 2,
            "num_filters": 4,
            "ngram_filter_sizes": [1, 2],
            "output_dim": 3
        }
    })
    self.encoder = TokenCharactersEncoder.from_params(self.vocab, deepcopy(params))
    self.embedding = Embedding.from_params(self.vocab, params["embedding"])
    self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
    constant_init = lambda tensor: torch.nn.init.constant(tensor, 1.)
    initializer = InitializerApplicator(default_initializer=constant_init)
    initializer(self.encoder)
    initializer(self.embedding)
    initializer(self.inner_encoder)
def setUp(self):
    super(TestNgramWordsEncoder, self).setUp()
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("i", "ngram_words")
    self.vocab.add_token_to_namespace("go", "ngram_words")
    self.vocab.add_token_to_namespace("to", "ngram_words")
    self.vocab.add_token_to_namespace("school", "ngram_words")
    params = Params({
        "embedding": {
            "embedding_dim": 2,
            "vocab_namespace": "ngram_words"
        },
        "encoder": {
            "type": "cnn",
            "embedding_dim": 2,
            "num_filters": 4,
            "ngram_filter_sizes": [1, 2],
            "output_dim": 3
        }
    })
    self.encoder = NgramWordsEncoder.from_params(self.vocab, deepcopy(params))
    self.embedding = Embedding.from_params(self.vocab, params["embedding"])
    self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
    constant_init = lambda tensor: torch.nn.init.constant(tensor, 1.)
    initializer = InitializerApplicator([(".*", constant_init)])
    initializer(self.encoder)
    initializer(self.embedding)
    initializer(self.inner_encoder)
def setUp(self):
    super(TestTokenCharactersEncoder, self).setUp()
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace(u"1", u"token_characters")
    self.vocab.add_token_to_namespace(u"2", u"token_characters")
    self.vocab.add_token_to_namespace(u"3", u"token_characters")
    self.vocab.add_token_to_namespace(u"4", u"token_characters")
    params = Params({
        u"embedding": {
            u"embedding_dim": 2,
            u"vocab_namespace": u"token_characters"
        },
        u"encoder": {
            u"type": u"cnn",
            u"embedding_dim": 2,
            u"num_filters": 4,
            u"ngram_filter_sizes": [1, 2],
            u"output_dim": 3
        }
    })
    self.encoder = TokenCharactersEncoder.from_params(vocab=self.vocab,
                                                      params=deepcopy(params))
    self.embedding = Embedding.from_params(vocab=self.vocab, params=params[u"embedding"])
    self.inner_encoder = Seq2VecEncoder.from_params(params[u"encoder"])
    constant_init = lambda tensor: torch.nn.init.constant_(tensor, 1.)
    initializer = InitializerApplicator([(u".*", constant_init)])
    initializer(self.encoder)
    initializer(self.embedding)
    initializer(self.inner_encoder)
def setUp(self):
    super().setUp()
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("1", "token_characters")
    self.vocab.add_token_to_namespace("2", "token_characters")
    self.vocab.add_token_to_namespace("3", "token_characters")
    self.vocab.add_token_to_namespace("4", "token_characters")
    params = Params({
        "embedding": {
            "embedding_dim": 2,
            "vocab_namespace": "token_characters"
        },
        "encoder": {
            "type": "cnn",
            "embedding_dim": 2,
            "num_filters": 4,
            "ngram_filter_sizes": [1, 2],
            "output_dim": 3
        }
    })
    self.encoder = TokenCharactersEncoder.from_params(vocab=self.vocab,
                                                      params=deepcopy(params))
    self.embedding = Embedding.from_params(vocab=self.vocab, params=params["embedding"])
    self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
    constant_init = Initializer.from_params(Params({"type": "constant", "val": 1.}))
    initializer = InitializerApplicator([(".*", constant_init)])
    initializer(self.encoder)
    initializer(self.embedding)
    initializer(self.inner_encoder)
def setUp(self):
    super(TestTokenCharactersEncoder, self).setUp()
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("1", "token_characters")
    self.vocab.add_token_to_namespace("2", "token_characters")
    self.vocab.add_token_to_namespace("3", "token_characters")
    self.vocab.add_token_to_namespace("4", "token_characters")
    params = Params({
        "embedding": {
            "embedding_dim": 2,
            "vocab_namespace": "token_characters"
        },
        "encoder": {
            "type": "cnn",
            "embedding_dim": 2,
            "num_filters": 4,
            "ngram_filter_sizes": [1, 2],
            "output_dim": 3
        }
    })
    self.encoder = TokenCharactersEncoder.from_params(vocab=self.vocab,
                                                      params=deepcopy(params))
    self.embedding = Embedding.from_params(vocab=self.vocab, params=params["embedding"])
    self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
    constant_init = Initializer.from_params(Params({"type": "constant", "val": 1.}))
    initializer = InitializerApplicator([(".*", constant_init)])
    initializer(self.encoder)
    initializer(self.embedding)
    initializer(self.inner_encoder)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             inp_dim,
             hid_dim,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             dropout: float = 0.4,
             dropout_emb: float = 0.2,
             pretrain_embedding_file=None,
             gather='sum'):
    # Note: inp_dim and hid_dim are moved before the defaulted arguments so the
    # signature is valid Python (non-default parameters cannot follow defaults).
    super(EncDoc, self).__init__(vocab, regularizer)
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=inp_dim)
    if dropout_emb > 0:
        self._lexical_dropout = torch.nn.Dropout(p=dropout_emb)
    else:
        self._lexical_dropout = lambda x: x
    self.hid_dim = hid_dim
    self.sent_enc = EncWord2Sent(inp_dim=inp_dim, hid_dim=hid_dim,
                                 dropout=dropout, gather=gather)
    if pretrain_embedding_file is not None:
        logger = logging.getLogger()
        logger.info("Loading word embedding: {}".format(pretrain_embedding_file))
        # Embedding.from_params is a classmethod returning a new Embedding;
        # rebind token_embedding instead of discarding the result.
        token_embedding = Embedding.from_params(
            vocab=vocab,
            params=Params({"pretrained_file": pretrain_embedding_file,
                           "embedding_dim": inp_dim}))
        print("token_embedding size: {}".format(token_embedding.num_embeddings))
    self._text_field_embedder = BasicTextFieldEmbedder({"tokens": token_embedding})
    self.sent2doc = EncWord2Sent(inp_dim=self.sent_enc.get_output_dim(),
                                 hid_dim=hid_dim, nenc_lay=1, dropout=dropout)
def __init__(self, device, inp_dim, hid_dim, compression, vocab,
             dropout: float = 0.4, dropout_emb: float = 0.2,
             pretrain_embedding_file=None):
    super().__init__()
    self.compression = compression
    self.hid_dim = hid_dim
    self.sent_enc = EncSent(device=device, inp_dim=inp_dim, hid_dim=hid_dim,
                            compression=compression)
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=inp_dim)
    if dropout_emb > 0:
        self._lexical_dropout = torch.nn.Dropout(p=dropout_emb)
    else:
        self._lexical_dropout = lambda x: x
    if pretrain_embedding_file is not None:
        logger = logging.getLogger()
        logger.info("Loading word embedding: {}".format(pretrain_embedding_file))
        # Rebind to the Embedding returned by the classmethod rather than calling
        # from_params on the instance and discarding the result.
        token_embedding = Embedding.from_params(
            vocab=vocab,
            params=Params({"pretrained_file": pretrain_embedding_file,
                           "embedding_dim": inp_dim}))
    self._text_field_embedder = BasicTextFieldEmbedder({"tokens": token_embedding})
    self.sent2doc = EncWord2Sent(device=device,
                                 inp_dim=self.sent_enc.get_output_dim(),
                                 hidden_dim=hid_dim, nenc_lay=2, dropout=dropout)
def glove_embeddings(vocab: Vocabulary,
                     file_path: Path,
                     dimension: int,
                     training: bool = True,
                     namespace: str = 'tokens') -> BasicTextFieldEmbedder:
    "Pre-trained embeddings using GloVe"
    token_embedding = Embedding.from_params(vocab, Params({
        "embedding_dim": dimension,
        "vocab_namespace": 'tokens',
        "pretrained_file": str(file_path),
        "trainable": training,
    }))
    word_embeddings = BasicTextFieldEmbedder({namespace: token_embedding})
    return word_embeddings
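A minimal usage sketch for glove_embeddings above, assuming an already-built vocabulary; the GloVe file path shown is hypothetical.

# Sketch only: build a frozen GloVe-backed embedder for an existing vocab.
from pathlib import Path

embedder = glove_embeddings(vocab,
                            file_path=Path("data/glove.6B.300d.txt"),  # hypothetical path
                            dimension=300,
                            training=False)
assert embedder.get_output_dim() == 300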
def from_params(cls, vocab: Vocabulary, params: Params) -> 'SyllableEmbedder':  # type: ignore
    # pylint: disable=arguments-differ
    embedding_params: Params = params.pop("syllable_embedding")
    # Embedding.from_params() uses "tokens" as the default namespace, but we need to change
    # that to be "token_characters" by default. If num_embeddings is present, set default
    # namespace to None so that extend_vocab call doesn't misinterpret that some namespace
    # was originally used.
    default_namespace = None if embedding_params.get("num_embeddings", None) else "token_characters"
    embedding_params.setdefault("vocab_namespace", default_namespace)
    embedding = Embedding.from_params(vocab, embedding_params)
    encoder_params: Params = params.pop("syllable_encoder")
    encoder = Seq2VecEncoder.from_params(encoder_params)
    dropout = params.pop_float("dropout", 0.0)
    params.assert_empty(cls.__name__)
    return cls(embedding, encoder, dropout)
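A hedged sketch of the kind of Params block this from_params consumes; the key names follow the pops above, while the dimensions and filter sizes are made-up illustrative values.

# Hypothetical config for SyllableEmbedder.from_params; "vocab_namespace" is
# filled in as "token_characters" by the method when it is omitted here.
syllable_params = Params({
    "syllable_embedding": {"embedding_dim": 16},
    "syllable_encoder": {
        "type": "cnn",
        "embedding_dim": 16,
        "num_filters": 32,
        "ngram_filter_sizes": [2, 3],
    },
    "dropout": 0.1,
})
syllable_embedder = SyllableEmbedder.from_params(vocab, syllable_params)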
def __init__(self, embedding_file, vocab):
    super(GloveEncoder, self).__init__()
    _out_dim = 100
    self.token_embedding = Embedding.from_params(
        vocab=vocab,
        params=Params({'pretrained_file': embedding_file,
                       'embedding_dim': GLOVE_EMBEDDING_DIM}))
    self.embed = BasicTextFieldEmbedder({"tokens": self.token_embedding})
    self.encoder = PytorchSeq2SeqWrapper(torch.nn.LSTM(
        batch_first=True,
        bidirectional=True,
        input_size=GLOVE_EMBEDDING_DIM,
        hidden_size=_out_dim))
    self._dropout = torch.nn.Dropout(0.5)
def __init__(self, params: Params, vocab: Vocabulary) -> None:
    super().__init__(vocab=vocab)
    enc_hidden_dim = params.pop_int('enc_hidden_dim', 300)
    disc_hidden_dim = params.pop_int('disc_hidden_dim', 1200)
    disc_num_layers = params.pop_int('disc_num_layers', 1)
    emb_dropout = params.pop_float('emb_dropout', 0.0)
    disc_dropout = params.pop_float('disc_dropout', 0.0)
    l2_weight = params.pop_float('l2_weight', 0.0)

    self.emb_dropout = nn.Dropout(emb_dropout)
    self.disc_dropout = nn.Dropout(disc_dropout)
    self._l2_weight = l2_weight

    self._token_embedder = Embedding.from_params(
        vocab=vocab, params=params.pop('token_embedder'))
    self._discriminator_encoder = PytorchSeq2VecWrapper(
        nn.LSTM(input_size=self._token_embedder.get_output_dim(),
                hidden_size=enc_hidden_dim, batch_first=True))
    self._discriminator = FeedForward(
        input_dim=4 * self._discriminator_encoder.get_output_dim(),
        hidden_dims=[disc_hidden_dim] * disc_num_layers + [self._NUM_LABELS],
        num_layers=disc_num_layers + 1,
        activations=[Activation.by_name('relu')()] * disc_num_layers
                    + [Activation.by_name('linear')()])

    # Metrics
    self._metrics = {
        'labeled': {
            'discriminator_entropy': ScalarMetric(),
            'discriminator_accuracy': CategoricalAccuracy(),
            'loss': ScalarMetric()
        }
    }
def construct_model(vocab, args):
    # token embedding
    word_embedding = Embedding.from_params(vocab=vocab, params=Params({
        "pretrained_file": "glove\\glove.vocab.100d.txt",
        "embedding_dim": 100,
        "trainable": True,
        "padding_index": 0
    }))
    word_embedding = BasicTextFieldEmbedder({"token_words": word_embedding})
    char_embedding = BasicTextFieldEmbedder({
        "token_characters": TokenCharactersEncoder(
            embedding=Embedding(embedding_dim=20, num_embeddings=262),
            encoder=CnnEncoder(embedding_dim=20, ngram_filter_sizes=[5], num_filters=50)),
    })
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(input_size=100, num_layers=1, hidden_size=100,
                      bidirectional=True, batch_first=True))
    model = FollowUpSnippetModel(vocab=vocab,
                                 word_embedder=word_embedding,
                                 char_embedder=char_embedding,
                                 tokens_encoder=lstm,
                                 model_args=args)
    return model
def from_params(cls, vocab: Vocabulary, params: Params,
                constructor_to_call=None, constructor_to_inspect=None) -> 'ProLocalModel':
    embedder_params = params.pop("text_field_embedder")
    token_params = embedder_params.pop("tokens")
    embedding = Embedding.from_params(vocab=vocab, params=token_params)
    text_field_embedder = BasicTextFieldEmbedder(token_embedders={'tokens': embedding})
    # text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params)
    seq2seq_encoder_params = params.pop("seq2seq_encoder")
    seq2seq_encoder = Seq2SeqEncoder.from_params(seq2seq_encoder_params)
    initializer = InitializerApplicator()  # .from_params(params.pop("initializer", []))
    params.assert_empty(cls.__name__)
    # print(cls)
    return cls(vocab=vocab,
               text_field_embedder=text_field_embedder,
               seq2seq_encoder=seq2seq_encoder,
               initializer=initializer)
def load_decomposable_attention_elmo_softmax_model(): NEGATIVE_PERCENTAGE = 100 # EMBEDDING_TYPE = "" # LOSS_TYPE = "" # NLL # LOSS_TYPE = "_nll" # NLL LOSS_TYPE = "_mse" # MSE # EMBEDDING_TYPE = "" # EMBEDDING_TYPE = "_glove" # EMBEDDING_TYPE = "_bert" EMBEDDING_TYPE = "_elmo" # EMBEDDING_TYPE = "_elmo_retrained" # EMBEDDING_TYPE = "_elmo_retrained_2" token_indexers = None if EMBEDDING_TYPE == "_elmo" or EMBEDDING_TYPE == "_elmo_retrained" or EMBEDDING_TYPE == "_elmo_retrained_2": token_indexers = {"tokens": ELMoTokenCharactersIndexer()} MAX_BATCH_SIZE = 0 # MAX_BATCH_SIZE = 150 # for bert and elmo reader = QuestionResponseSoftmaxReader(token_indexers=token_indexers, max_batch_size=MAX_BATCH_SIZE) model_file = os.path.join( "saved_softmax_models", "decomposable_attention{}{}_model_{}.th".format( LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE)) vocabulary_filepath = os.path.join( "saved_softmax_models", "vocabulary{}{}_{}".format(LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE)) print("LOADING VOCABULARY") # Load vocabulary vocab = Vocabulary.from_files(vocabulary_filepath) EMBEDDING_DIM = 300 PROJECT_DIM = 200 DROPOUT = 0.2 NUM_LAYERS = 2 if EMBEDDING_TYPE == "": token_embedding = Embedding( num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=EMBEDDING_DIM, projection_dim=PROJECT_DIM) elif EMBEDDING_TYPE == "_glove": token_embedding = Embedding.from_params(vocab=vocab, params=Params({ 'pretrained_file': glove_embeddings_file, 'embedding_dim': EMBEDDING_DIM, 'projection_dim': PROJECT_DIM, 'trainable': False })) elif EMBEDDING_TYPE == "_elmo": # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json" # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5" options_file = os.path.join( "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_options.json") weights_file = os.path.join( "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5") # NOTE: using Small size as medium size gave CUDA out of memory error # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json" # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5" # options_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_options.json") # weights_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5") token_embedding = ElmoTokenEmbedder(options_file, weights_file, dropout=DROPOUT, projection_dim=PROJECT_DIM) elif EMBEDDING_TYPE == "_elmo_retrained": options_file = os.path.join("data", "bilm-tf", "elmo_retrained", "options.json") weights_file = os.path.join("data", "bilm-tf", "elmo_retrained", "weights.hdf5") token_embedding = ElmoTokenEmbedder(options_file, weights_file, dropout=DROPOUT, projection_dim=PROJECT_DIM) elif EMBEDDING_TYPE == "_elmo_retrained_2": options_file = os.path.join("data", "bilm-tf", "elmo_retrained", "options_2.json") weights_file = os.path.join("data", "bilm-tf", "elmo_retrained", "weights_2.hdf5") token_embedding = ElmoTokenEmbedder(options_file, weights_file, dropout=DROPOUT, projection_dim=PROJECT_DIM) elif EMBEDDING_TYPE == "_bert": print("Loading bert model") model = BertModel.from_pretrained('bert-base-uncased') token_embedding = BertEmbedder(model) PROJECT_DIM = 768 else: 
print("Error: Some weird Embedding type", EMBEDDING_TYPE) exit() word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding}) HIDDEN_DIM = 200 params = Params({ 'input_dim': PROJECT_DIM, 'hidden_dims': HIDDEN_DIM, 'activations': 'relu', 'num_layers': NUM_LAYERS, 'dropout': DROPOUT }) attend_feedforward = FeedForward.from_params(params) similarity_function = DotProductSimilarity() params = Params({ 'input_dim': 2 * PROJECT_DIM, 'hidden_dims': HIDDEN_DIM, 'activations': 'relu', 'num_layers': NUM_LAYERS, 'dropout': DROPOUT }) compare_feedforward = FeedForward.from_params(params) params = Params({ 'input_dim': 2 * HIDDEN_DIM, 'hidden_dims': 1, 'activations': 'linear', 'num_layers': 1 }) aggregate_feedforward = FeedForward.from_params(params) model = DecomposableAttentionSoftmax(vocab, word_embeddings, attend_feedforward, similarity_function, compare_feedforward, aggregate_feedforward) print("MODEL CREATED") # Load model state with open(model_file, 'rb') as f: model.load_state_dict(torch.load(f, map_location='cuda:0')) print("MODEL LOADED!") if torch.cuda.is_available(): # cuda_device = 3 # model = model.cuda(cuda_device) cuda_device = -1 else: cuda_device = -1 predictor = DecomposableAttentionSoftmaxPredictor(model, dataset_reader=reader) return model, predictor
def embeddings_returner(self, vocab=None): ''' Either the name of the pretrained model to use (e.g. bert-base-uncased),or the path to the .tar.gz file with the model weights. :param args: vocab_size and vocab is needed only when pretrained embeddings is used. :return: embedder ''' ''' "bert-base-uncased", do_lower_case=True "bert-base-cased" , do_lower_case=False https://github.com/huggingface/pytorch-transformers/issues/712 https://qiita.com/uedake722/items/b7f4b75b4d77d9bd358b ''' if self.embedding_strategy == 'bert': self.bertmodel_dir = '' if self.ifbert_use_whichmodel == 'general': self.bertmodel_dir += 'bert-base-uncased/' # recomendded ver is uncased, in original repository self.bertmodel_relative_dirpath = self.bert_src_dir + self.bertmodel_dir # included in pytorch_transformers, so we replace it with model name itself self.bert_weight_filepath = copy.copy('bert-base-uncased') elif self.ifbert_use_whichmodel == 'scibert': self.bertmodel_dir += 'scibert_scivocab_uncased/' # recomendded ver is uncased, in original repository self.bertmodel_relative_dirpath = self.bert_src_dir + self.bertmodel_dir self.bert_weight_filepath = self.bertmodel_relative_dirpath + 'weights.tar.gz' elif self.ifbert_use_whichmodel == 'biobert': self.bertmodel_dir += 'biobert_v1.1_pubmed/' # currently cased version only supported self.bertmodel_relative_dirpath = self.bert_src_dir + self.bertmodel_dir self.bert_weight_filepath = self.bertmodel_relative_dirpath + 'weights.tar.gz' # including bert_config.json and bin. # Load embedder bert_embedder = PretrainedBertEmbedder( pretrained_model=self.bert_weight_filepath, top_layer_only=self.bert_top_layer_only, requires_grad=self.emb_requires_grad) return bert_embedder, bert_embedder.get_output_dim( ), BasicTextFieldEmbedder({'tokens': bert_embedder}, allow_unmatched_keys=True) elif self.embedding_strategy == 'elmo': if self.ifelmo_use_whichmodel == 'general': options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json' weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5' elif self.ifelmo_use_whichmodel == 'pubmed': options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pubmed/elmo_2x4096_512_2048cnn_2xhighway_options.json' weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pubmed/elmo_2x4096_512_2048cnn_2xhighway_weights_PubMed_only.hdf5' elif self.ifelmo_use_whichmodel == 'bioelmo': options_file = self.elmo_src_dir + 'BioELMo/weights/biomed_elmo_options.json' weight_file = self.elmo_src_dir + 'BioELMo/weights/biomed_elmo_weights.hdf5' else: options_file = -1 weight_file = -1 assert options_file != -1 elmo_embedder = ElmoTokenEmbedder( options_file=options_file, weight_file=weight_file, requires_grad=self.emb_requires_grad) return elmo_embedder, elmo_embedder.get_output_dim( ), BasicTextFieldEmbedder({'tokens': elmo_embedder}) elif self.embedding_strategy == 'pretrained': print('\nGloVe pretrained vocab loading\n') if 'glove' in self.args.ifpretrained_use_whichmodel: embedding_dim = 300 else: embedding_dim = 200 pretrain_emb_embedder = Embedding.from_params( vocab=vocab, params=Params({ 'pretrained_file': self.glove_embeddings_file, 'embedding_dim': embedding_dim, 'trainable': False, 'padding_index': 0 })) return pretrain_emb_embedder, pretrain_emb_embedder.get_output_dim( ), 
BasicTextFieldEmbedder({'tokens': pretrain_emb_embedder})
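A hedged sketch of how embeddings_returner might be consumed; the builder object name and the downstream encoder sizes are assumptions for illustration, only the three-element return value follows from the snippet above.

# Hypothetical call site: the method returns the raw token embedder, its output
# dimension, and a ready-to-use BasicTextFieldEmbedder.
token_embedder, emb_dim, text_field_embedder = model_builder.embeddings_returner(vocab=vocab)
encoder = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(emb_dim, 256, batch_first=True, bidirectional=True))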
def __init__(self, params: Params, vocab: Vocabulary) -> None: super().__init__(vocab=vocab) disc_hidden_dim = params.pop_int('disc_hidden_dim', 1200) disc_num_layers = params.pop_int('disc_num_layers', 1) code_dist_type = params.pop_choice('code_dist_type', ['gaussian', 'vmf'], default_to_first_choice=True) code_dim = params.pop_int('code_dim', 500) emb_dropout = params.pop_float('emb_dropout', 0.0) disc_dropout = params.pop_float('disc_dropout', 0.0) latent_dropout = params.pop_float('latent_dropout', 0.0) l2_weight = params.pop_float('l2_weight', 0.0) self.emb_dropout = nn.Dropout(emb_dropout) self.disc_dropout = nn.Dropout(disc_dropout) self.latent_dropout = nn.Dropout(latent_dropout) self._l2_weight = l2_weight self._token_embedder = Embedding.from_params( vocab=vocab, params=params.pop('token_embedder')) self._encoder = nn.Sequential( nn.Conv1d(in_channels=300, out_channels=300, kernel_size=5, stride=2), nn.Conv1d(in_channels=300, out_channels=600, kernel_size=5, stride=2), nn.Conv1d(in_channels=600, out_channels=500, kernel_size=5, stride=2)) self._generator = nn.Sequential( nn.ConvTranspose1d(in_channels=500, out_channels=600, kernel_size=5, stride=2), nn.ReLU(), nn.ConvTranspose1d(in_channels=600, out_channels=300, kernel_size=5, stride=2), nn.ReLU(), nn.ConvTranspose1d(in_channels=300, out_channels=300, kernel_size=5, stride=2), nn.ReLU()) self._generator_projector = nn.Linear( in_features=300, out_features=vocab.get_vocab_size(), bias=False) self._generator_projector.weight = self._token_embedder.weight if code_dist_type == 'vmf': vmf_kappa = params.pop_int('vmf_kappa', 150) self._code_generator = VmfCodeGenerator(input_dim=500, code_dim=code_dim, kappa=vmf_kappa) elif code_dist_type == 'gaussian': self._code_generator = GaussianCodeGenerator(input_dim=500, code_dim=code_dim) else: raise ValueError('Unknown code_dist_type') self._discriminator = FeedForward( input_dim=4 * self._code_generator.get_output_dim(), hidden_dims=[disc_hidden_dim] * disc_num_layers + [self._NUM_LABELS], num_layers=disc_num_layers + 1, activations=[Activation.by_name('relu')()] * disc_num_layers + [Activation.by_name('linear')()], dropout=disc_dropout) self._kl_weight = 1.0 self._discriminator_weight = params.pop_float('discriminator_weight', 0.1) self._gumbel_temperature = 1.0 # Metrics self._metrics = { 'generator_loss': ScalarMetric(), 'kl_divergence': ScalarMetric(), 'discriminator_accuracy': CategoricalAccuracy(), 'discriminator_loss': ScalarMetric(), 'loss': ScalarMetric() }
"pre_trained_embedding": "../data/glove.42B.300d.txt", "model": "knrm", "train_data": "../data/triples.train.tsv", "validation_data": "../data/tuples.validation.tsv", "test_data": "../data/tuples.test.tsv", } # # data loading # vocab = Vocabulary.from_files(config["vocab_directory"]) tokens_embedder = Embedding.from_params( vocab, Params({ "pretrained_file": config["pre_trained_embedding"], "embedding_dim": 300, "trainable": True, "padding_index": 0 })) word_embedder = BasicTextFieldEmbedder({"tokens": tokens_embedder}) # recommended default params for the models (but you may change them if you want) if config["model"] == "knrm": model = KNRM(word_embedder, n_kernels=11) elif config["model"] == "conv_knrm": model = Conv_KNRM(word_embedder, n_grams=3, n_kernels=11, conv_out_dim=128) elif config["model"] == "match_pyramid": model = MatchPyramid(word_embedder, conv_output_size=[16, 16, 16, 16, 16], conv_kernel_size=[[3, 3], [3, 3], [3, 3], [3, 3],
def instantiate_word_embedding(self):
    embedding_params = Params(OmegaConf.to_container(self.c.dataset.embedding))
    token_embedding = Embedding.from_params(self.vocab, embedding_params)
    self.word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
def train(self, args_hpo, index): """ trains the model, and return the metrics to the meta optimizer. :param args_hpo: :param index: :return: """ PrintColors.prYellow('\n===== training with: {}'.format(args_hpo)) PrintColors.prGreen('----- in {} mode -----'.format('train')) ''' ============ LOAD DATA ================================================================================ ''' starting_time = time.time() lm_dataset_reader = LanguageModelSegmentReader(global_constants=GLOBAL_CONSTANTS) train_data, val_data = (lm_dataset_reader.read(folder) for folder in [_train_data_path, _val_data_path]) lm_vocabulary = Vocabulary.from_instances(train_data + val_data) iterator = BasicIterator(batch_size=args_hpo.batch_size) iterator.index_with(lm_vocabulary) ''' ============ DEFINE MODEL ============================================================================= ''' ''' the class params 'pop' its parameters i.e. they disappear after first use. So we instantiate a Params instance for each model defining execution. More than that, they turn dicts into Mutable mappings and destroys the original dict. So here's your copy allennlp. Thanks. (I still love you) ''' token_embedding = Embedding.from_params(vocab=lm_vocabulary, params=Params(copy.deepcopy(GLOBAL_CONSTANTS.GLOVE_PARAMS_CONFIG))) token_embedder: TextFieldEmbedder = BasicTextFieldEmbedder({'tokens': token_embedding}) ''' define encoder to wrap up an lstm feature extractor ''' contextualizer: Seq2SeqEncoder = PytorchSeq2SeqWrapper( torch.nn.LSTM(input_size=args_hpo.word_embedding_size, hidden_size=args_hpo.ed_ncoder_size, bidirectional=False, batch_first=True)) model = LanguageModel(vocab=lm_vocabulary, text_field_embedder=token_embedder, contextualizer=contextualizer, dropout=args_hpo.dropout, regularizer=RegularizerApplicator([('l2', L2Regularizer(alpha=args_hpo.l2))]), )\ .cuda(_device) ''' ============ TRAIN ================================================================================ ''' ''' callbacks ''' if index == 0: for file in os.listdir(os.path.join(*['.', 'lm_models'])): path = os.path.join(*['.', 'lm_models', file]) if os.path.isfile(path): os.remove(path) else: shutil.rmtree(path) serialization_path = 'models_lm_{}_{}'.format(_tag, index) serialization_path_longer = os.path.join(*['.', 'lm_models', serialization_path]) vocab_path = 'vocab_lm_{}_{}'.format(_tag, index) vocab_dir_longer = os.path.join(*['.', 'lm_models', vocab_path]) if not os.path.exists(serialization_path_longer): os.mkdir(serialization_path_longer) callbacks = list() ''' for validation ''' callbacks.append(validate.Validate(validation_data=val_data, validation_iterator=iterator)) ''' for early stopping. it tracks 'loss' returned by model.forward() ''' callbacks.append(track_metrics.TrackMetrics(patience=3)) ''' for grad clipping ''' callbacks.append(gradient_norm_and_clip.GradientNormAndClip(grad_clipping=args_hpo.clip)) ''' for checkpointing TODO: NOTE:serialization path CANNOT exist before training ?? 
''' model_checkpointer = checkpointer.Checkpointer(serialization_dir=serialization_path_longer, num_serialized_models_to_keep=1) callbacks.append(checkpoint.Checkpoint(checkpointer=model_checkpointer)) ''' for sample generations ''' callback_trainer = CallbackTrainer( model=model, training_data=train_data, iterator=iterator, optimizer=torch.optim.Adam(model.parameters(), lr=args_hpo.lr), num_epochs=_n_epochs, serialization_dir=serialization_path_longer, cuda_device=_device, callbacks=callbacks ) ''' trainer saves the model, but the vocabulary needs to be saved, too ''' lm_vocabulary.save_to_files(vocab_dir_longer) ''' check the metric names to synchronize with the class ''' metrics = callback_trainer.train() metrics['time_consumed(hrs)'] = round((time.time() - starting_time) / 3600, 4) return metrics
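The comment in the training snippet above about Params "popping" its parameters is why the GloVe config is wrapped in copy.deepcopy before each use. A small sketch of that behaviour, with illustrative dictionary contents, assuming the usual AllenNLP Params import:

# Why the snippet deep-copies its config: Params.pop removes the key from the
# wrapped mapping, so reusing the same Params (or the same underlying dict)
# for a second from_params call would no longer find the popped keys.
import copy
from allennlp.common import Params

cfg = {"embedding_dim": 100, "pretrained_file": "glove.txt"}  # illustrative values
p = Params(copy.deepcopy(cfg))
p.pop("embedding_dim")   # returns 100; the key is now gone from p
# cfg itself is untouched because it was copied before being wrapped in Params.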
EMBEDDING_DIM = 300 PROJECT_DIM = 200 DROPOUT = 0.2 NUM_LAYERS = 2 if EMBEDDING_TYPE == "": token_embedding = Embedding( num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=EMBEDDING_DIM, projection_dim=PROJECT_DIM) elif EMBEDDING_TYPE == "_glove": token_embedding = Embedding.from_params(vocab=vocab, params=Params({ 'pretrained_file': glove_embeddings_file, 'embedding_dim': EMBEDDING_DIM, 'projection_dim': PROJECT_DIM, 'trainable': False })) elif EMBEDDING_TYPE == "_elmo": # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json" # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5" options_file = os.path.join( "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_options.json") weights_file = os.path.join( "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5") # NOTE: using Small size as medium size gave CUDA out of memory error # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json" # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
def train(train_dataset, val_dataset, cfg):
    # Build the Vocabulary
    VOCAB_SIZE = cfg.w2v.vocab_size
    vocab = Vocabulary.from_instances(train_dataset + val_dataset,
                                      max_vocab_size=VOCAB_SIZE)
    BATCH_SIZE = cfg.training.batch_size

    # Iterator that produces padded mini-batches
    iterator = BucketIterator(batch_size=BATCH_SIZE,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    # Use the pretrained Japanese Wikipedia entity vectors provided by Tohoku University
    # http://www.cl.ecei.tohoku.ac.jp/~m-suzuki/jawiki_vector/
    model_name = cfg.w2v.model_name
    norm = cfg.w2v.norm
    cwd = hydra.utils.get_original_cwd()
    params = Params({
        'embedding_dim': 200,
        'padding_index': 0,
        'pretrained_file': os.path.join(cwd, f'embs/jawiki.{model_name}_vectors.200d.txt'),
        'norm_type': norm
    })
    token_embedding = Embedding.from_params(vocab=vocab, params=params)

    HIDDEN_SIZE = cfg.model.hidden_size
    dropout = cfg.model.dropout
    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder: Seq2SeqEncoder = PytorchSeq2SeqWrapper(
        nn.LSTM(word_embeddings.get_output_dim(), HIDDEN_SIZE,
                bidirectional=True, batch_first=True))
    model = ClassifierWithAttn(word_embeddings, encoder, vocab, dropout)
    model.train()

    USE_GPU = True
    if USE_GPU and torch.cuda.is_available():
        model = model.cuda(0)

    LR = cfg.training.learning_rate
    EPOCHS = cfg.training.epoch
    patience = cfg.training.patience if cfg.training.patience > 0 else None
    optimizer = optim.Adam(model.parameters(), lr=LR)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=val_dataset,
                      patience=patience,
                      cuda_device=0 if USE_GPU else -1,
                      num_epochs=EPOCHS)
    metrics = trainer.train()
    logger.info(metrics)
    return model, metrics
config["validation_end_candidate_set_from_to"][1]) test_candidate_set = None if "test_candidate_set_max" in config and "test_candidate_set_path" in config: test_candidate_set = parse_candidate_set( config["test_candidate_set_path"], config["test_candidate_set_max"]) # embedding layer (use pre-trained, but make it trainable as well) if config["token_embedder_type"] == "embedding": vocab = Vocabulary.from_files(config["vocab_directory"]) tokens_embedder = Embedding.from_params( vocab, Params({ "pretrained_file": config["pre_trained_embedding"], "embedding_dim": config["pre_trained_embedding_dim"], "trainable": config["train_embedding"], "padding_index": 0, "sparse": config["sparse_gradient_embedding"] })) elif config["token_embedder_type"] == "fasttext": vocab = None #FastTextVocab(config["fasttext_vocab_mapping"]) tokens_embedder = FastTextEmbeddingBag(numpy.load( config["fasttext_weights"]), sparse=True) elif config["token_embedder_type"] == "elmo": vocab = None tokens_embedder = ElmoTokenEmbedder(config["elmo_options_file"], config["elmo_weights_file"]) else:
iterator.index_with(vocab)
val_iterator = BucketIterator(batch_size=config.eval_batch_size,
                              sorting_keys=[("text", "num_tokens")])
val_iterator.index_with(vocab)

if args.embedding_type == 'glove':
    param_dict = {
        "pretrained_file": "(https://nlp.stanford.edu/data/glove.6B.zip)#glove.6B.300d.txt",
        "embedding_dim": 300
    }
    params = Params(params=param_dict)
    token_embedding = Embedding.from_params(vocab=vocab, params=params)
elif args.embedding_type == 'elmo':
    token_embedding = ElmoTokenEmbedder(args.options_file, args.weights_file,
                                        requires_grad=args.finetune_embeddings)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

if args.encoder_type == 'bag':
    encoder = BagOfEmbeddingsEncoder(word_embeddings.get_output_dim())
elif args.encoder_type == 'lstm':
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(word_embeddings.get_output_dim(), config.hidden_sz,
                      bidirectional=True, batch_first=True))
def __init__( self, vocab: Vocabulary, text_field_embedder: TextFieldEmbedder, encoder: Seq2SeqEncoder, initializer: InitializerApplicator = InitializerApplicator(), regularizer: Optional[RegularizerApplicator] = None, word_embedding_dim: int = 200, hidden_dim: int = 200, dropout_emb: float = 0.5, min_dec_step: int = 2, max_decoding_steps=3, fix_edu_num=-1, dropout: float = 0.5, alpha: float = 0.5, span_encoder_type='self_attentive', use_elmo: bool = True, attn_type: str = 'general', schedule_ratio_from_ground_truth: float = 0.8, pretrain_embedding_file=None, nenc_lay: int = 2, mult_orac_sampling: bool = False, word_token_indexers=None, compression: bool = True, dbg: bool = False, dec_avd_trigram_rep: bool = True, aggressive_compression: int = -1, compress_leadn: int = -1, subsentence: bool = False, gather='mean', keep_threshold: float = 0.5, abs_board_file: str = "/home/cc/exComp/board.txt", abs_dir_root: str = "/scratch/cluster/jcxu", serilization_name: str = "", ) -> None: super(Seq2IdxSum, self).__init__(vocab, regularizer) self.text_field_embedder = text_field_embedder elmo_weight = os.path.join( abs_dir_root, "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5") # if not os.path.isfile(elmo_weight): # import subprocess # x = "wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5 -P {}".format(abs_dir_root) # subprocess.run(x.split(" ")) self.device = get_device() self.vocab = vocab self.dbg = dbg self.loss_thres = keep_threshold self.compression = compression self.comp_leadn = compress_leadn # Just encode the whole document without looking at compression options self.enc_doc = EncDoc(inp_dim=word_embedding_dim, hid_dim=hidden_dim, vocab=vocab, dropout=dropout, dropout_emb=dropout_emb, pretrain_embedding_file=pretrain_embedding_file, gather=gather) self.sent_dec = SentRNNDecoder( rnn_type='lstm', dec_hidden_size=self.enc_doc.get_output_dim(), dec_input_size=self.enc_doc.get_output_dim(), dropout=dropout, fixed_dec_step=fix_edu_num, max_dec_steps=max_decoding_steps, min_dec_steps=min_dec_step, schedule_ratio_from_ground_truth=schedule_ratio_from_ground_truth, dec_avd_trigram_rep=dec_avd_trigram_rep, mult_orac_sample_one=mult_orac_sampling, abs_board_file=abs_board_file, valid_tmp_path=abs_dir_root, serilization_name=serilization_name) if compression: self.compression_dec = CompressDecoder( context_dim=hidden_dim * 2, dec_state_dim=hidden_dim * 2, enc_hid_dim=hidden_dim, text_field_embedder=self.enc_doc._text_field_embedder, aggressive_compression=aggressive_compression, keep_threshold=keep_threshold, abs_board_file=abs_board_file, gather=gather, dropout=dropout, dropout_emb=dropout_emb, valid_tmp_path=abs_dir_root, serilization_name=serilization_name, vocab=vocab, elmo=use_elmo, elmo_weight=elmo_weight) self.aggressive_compression = aggressive_compression self.use_elmo = use_elmo if use_elmo: options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json" weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5" self.elmo = Elmo(options_file, weight_file, 1, dropout=0) # print(self.elmo.get_output_dim()) self._context_layer = PytorchSeq2SeqWrapper( torch.nn.LSTM(word_embedding_dim + self.elmo.get_output_dim(), hidden_dim, batch_first=True, bidirectional=True)) else: self._context_layer = PytorchSeq2SeqWrapper( 
                torch.nn.LSTM(word_embedding_dim, hidden_dim,
                              batch_first=True, bidirectional=True))

        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=word_embedding_dim)
        if pretrain_embedding_file is not None:
            logger = logging.getLogger()
            logger.info("Loading word embedding: {}".format(pretrain_embedding_file))
            # Embedding.from_params is a classmethod; rebind token_embedding to the
            # returned pretrained Embedding instead of discarding the result.
            token_embedding = Embedding.from_params(
                vocab=vocab,
                params=Params({"pretrained_file": pretrain_embedding_file,
                               "embedding_dim": word_embedding_dim}))
        self._text_field_embedder = BasicTextFieldEmbedder({"tokens": token_embedding})

        # if span_encoder_type == 'self_attentive':
        #     self._span_encoder = SelfAttentiveSpanExtractor(
        #         self._context_layer.get_output_dim())
        # else:
        #     raise NotImplementedError
        self._dropout = torch.nn.Dropout(p=dropout)
        self._max_decoding_steps = max_decoding_steps
        self._fix_edu_num = fix_edu_num
        if compression:
            pass
            # self.rouge_metrics_compression = self.compression_dec.rouge_metrics_compression
            # self.rouge_metrics_compression_upper_bound = self.compression_dec.rouge_metrics_compression_best_possible
        self.rouge_metrics_sent = self.sent_dec.rouge_metrics_sent
        self.mult_orac_sampling = mult_orac_sampling
        self.alpha = alpha
        initializer(self)
        if regularizer is not None:
            regularizer(self)
        self.counter = 0  # used for controlling compression and extraction
def from_params(cls, vocab, params):
    return UncontextualizedEmbedding(
        embedding=Embedding.from_params(vocab, params))
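A minimal sketch of how this thin wrapper might be invoked; the embedding_dim shown is a made-up value, and the params are forwarded unchanged to Embedding.from_params.

# Hypothetical call: any Embedding option (pretrained_file, trainable,
# vocab_namespace, ...) is accepted because the Params pass straight through.
uncontextualized = UncontextualizedEmbedding.from_params(
    vocab, Params({"embedding_dim": 128, "trainable": True}))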
def __init__(self, params: Params, vocab: Vocabulary) -> None: super().__init__(vocab=vocab) enc_hidden_dim = params.pop_int('enc_hidden_dim', 300) gen_hidden_dim = params.pop_int('gen_hidden_dim', 300) disc_hidden_dim = params.pop_int('disc_hidden_dim', 1200) disc_num_layers = params.pop_int('disc_num_layers', 1) code_dist_type = params.pop_choice('code_dist_type', ['gaussian', 'vmf'], default_to_first_choice=True) code_dim = params.pop_int('code_dim', 300) label_emb_dim = params.pop_int('label_emb_dim', 50) shared_encoder = params.pop_bool('shared_encoder', True) tie_embedding = params.pop_bool('tie_embedding', False) auto_weighting = params.pop_bool('auto_weighting', False) emb_dropout = params.pop_float('emb_dropout', 0.0) disc_dropout = params.pop_float('disc_dropout', 0.0) l2_weight = params.pop_float('l2_weight', 0.0) self.emb_dropout = nn.Dropout(emb_dropout) self.disc_dropout = nn.Dropout(disc_dropout) self._l2_weight = l2_weight self.auto_weighting = auto_weighting self._token_embedder = Embedding.from_params( vocab=vocab, params=params.pop('token_embedder')) self._label_embedder = Embedding(num_embeddings=self._NUM_LABELS, embedding_dim=label_emb_dim) self._encoder = PytorchSeq2VecWrapper( nn.LSTM(input_size=self._token_embedder.get_output_dim(), hidden_size=enc_hidden_dim, batch_first=True)) self._generator = PytorchSeq2SeqWrapper( nn.LSTM(input_size=(self._token_embedder.get_output_dim() + code_dim + label_emb_dim), hidden_size=gen_hidden_dim, batch_first=True)) self._generator_projector = nn.Linear( in_features=self._generator.get_output_dim(), out_features=vocab.get_vocab_size()) self._discriminator_encoder = PytorchSeq2VecWrapper( nn.LSTM(input_size=self._token_embedder.get_output_dim(), hidden_size=enc_hidden_dim, batch_first=True)) if shared_encoder: self._discriminator_encoder = self._encoder if tie_embedding: self._generator_projector.weight = self._token_embedder.weight self._discriminator = FeedForward( input_dim=4 * self._discriminator_encoder.get_output_dim(), hidden_dims=[disc_hidden_dim] * disc_num_layers + [self._NUM_LABELS], num_layers=disc_num_layers + 1, activations=[Activation.by_name('relu')()] * disc_num_layers + [Activation.by_name('linear')()], dropout=disc_dropout) if code_dist_type == 'vmf': vmf_kappa = params.pop_int('vmf_kappa', 150) self._code_generator = VmfCodeGenerator( input_dim=self._encoder.get_output_dim(), code_dim=code_dim, kappa=vmf_kappa) elif code_dist_type == 'gaussian': self._code_generator = GaussianCodeGenerator( input_dim=self._encoder.get_output_dim(), code_dim=code_dim) else: raise ValueError('Unknown z_dist') self._kl_weight = 1.0 self._discriminator_weight = params.pop_float('discriminator_weight', 0.1) self._gumbel_temperature = 1.0 self._use_sampling = params.pop_bool('use_sampling', False) if auto_weighting: self.num_tasks = num_tasks = 3 self.task_weights = nn.Parameter(torch.zeros(num_tasks)) # Metrics self._metrics = { 'labeled': { 'generator_loss': ScalarMetric(), 'kl_divergence': ScalarMetric(), 'discriminator_entropy': ScalarMetric(), 'discriminator_accuracy': CategoricalAccuracy(), 'discriminator_loss': ScalarMetric(), 'loss': ScalarMetric() }, 'unlabeled': { 'generator_loss': ScalarMetric(), 'kl_divergence': ScalarMetric(), 'discriminator_entropy': ScalarMetric(), 'loss': ScalarMetric() }, 'aux': { 'discriminator_entropy': ScalarMetric(), 'discriminator_accuracy': CategoricalAccuracy(), 'discriminator_loss': ScalarMetric(), 'gumbel_temperature': ScalarMetric(), 'loss': ScalarMetric(), 'code_log_prob': ScalarMetric(), 
'cosine_dist': ScalarMetric() } }
token_indexer = SingleIdTokenIndexer()
reader = lstmDatasetReader(token_indexers={"tokens": token_indexer})
full_dataset = reader.read("pol_train_semibalanced.csv")
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
train_dataset, validation_dataset = random_split(full_dataset, [train_size, test_size])
test_dataset = reader.read("pol_test_semibalanced.csv")

vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                  max_vocab_size=config.max_vocab_size)
token_embedder = Embedding.from_params(vocab=vocab, params=Params({
    'pretrained_file': 'glove.twitter.27B.50d.txt',
    'embedding_dim': 50
}))
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedder})

# Iterator: batching data + preparing it for input
from allennlp.data.iterators import BucketIterator
iterator = BucketIterator(batch_size=config.batch_size,
                          sorting_keys=[("tokens", "num_tokens")],
                          max_instances_in_memory=512)
iterator.index_with(vocab)

lstm = PytorchSeq2VecWrapper(
    nn.LSTM(word_embeddings.get_output_dim(), config.hidden_sz,
def save_top_results(process_no, start_index, end_index): print("Starting process {} with start at {} and end at {}".format( process_no, start_index, end_index)) DATA_FOLDER = "train_data" # EMBEDDING_TYPE = "" LOSS_TYPE = "" # NLL LOSS_TYPE = "_mse" # MSE # EMBEDDING_TYPE = "" # EMBEDDING_TYPE = "_glove" # EMBEDDING_TYPE = "_bert" EMBEDDING_TYPE = "_elmo" # EMBEDDING_TYPE = "_elmo_retrained" # EMBEDDING_TYPE = "_elmo_retrained_2" token_indexers = None if EMBEDDING_TYPE == "_elmo" or EMBEDDING_TYPE == "_elmo_retrained" or EMBEDDING_TYPE == "_elmo_retrained_2": token_indexers = {"tokens": ELMoTokenCharactersIndexer()} MAX_BATCH_SIZE = 0 # MAX_BATCH_SIZE = 150 # for bert and elmo # q_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_saved_questions_lexparser_sh.txt") # r_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_generated_answers_lexparser_sh.txt") # rules_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_generated_answer_rules_lexparser_sh.txt") #NOTE: Squad dev test set q_file = os.path.join( "squad_seq2seq_dev_moses_tokenized", "rule_based_system_squad_seq2seq_dev_test_saved_questions.txt") r_file = os.path.join( "squad_seq2seq_dev_moses_tokenized", "rule_based_system_squad_seq2seq_dev_test_generated_answers.txt") rules_file = os.path.join( "squad_seq2seq_dev_moses_tokenized", "rule_based_system_squad_seq2seq_dev_test_generated_answer_rules.txt") reader = QuestionResponseSoftmaxReader(q_file, r_file, token_indexers=token_indexers, max_batch_size=MAX_BATCH_SIZE) glove_embeddings_file = os.path.join("data", "glove", "glove.840B.300d.txt") # RESULTS_DIR = "squad_seq2seq_train2" #NOTE: All other experiments # RESULTS_DIR = "squad_seq2seq_train_moses_tokenized" # make_dir_if_not_exists(RESULTS_DIR) # all_results_save_file = os.path.join(RESULTS_DIR, "squad_seq2seq_train_predictions_start_{}_end_{}.txt".format(start_index, end_index)) #NOTE: Squad dev test set RESULTS_DIR = "squad_seq2seq_dev_moses_tokenized" make_dir_if_not_exists(RESULTS_DIR) all_results_save_file = os.path.join( RESULTS_DIR, "squad_seq2seq_dev_test_predictions_start_{}_end_{}.txt".format( start_index, end_index)) with open(all_results_save_file, "w") as all_writer: print("Testing out model with", EMBEDDING_TYPE, "embeddings") print("Testing out model with", LOSS_TYPE, "loss") # for NEGATIVE_PERCENTAGE in [100,50,20,10,5,1]: for NEGATIVE_PERCENTAGE in [100]: model_file = os.path.join( "saved_softmax_models", "decomposable_attention{}{}_model_{}.th".format( LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE)) vocabulary_filepath = os.path.join( "saved_softmax_models", "vocabulary{}{}_{}".format(LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE)) print("LOADING VOCABULARY") # Load vocabulary vocab = Vocabulary.from_files(vocabulary_filepath) EMBEDDING_DIM = 300 PROJECT_DIM = 200 DROPOUT = 0.2 NUM_LAYERS = 2 if EMBEDDING_TYPE == "": token_embedding = Embedding( num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=EMBEDDING_DIM, projection_dim=PROJECT_DIM) elif EMBEDDING_TYPE == "_glove": token_embedding = Embedding.from_params( vocab=vocab, params=Params({ 'pretrained_file': glove_embeddings_file, 'embedding_dim': EMBEDDING_DIM, 'projection_dim': PROJECT_DIM, 'trainable': False })) elif EMBEDDING_TYPE == "_elmo": # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json" # weights_file = 
"https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5" options_file = os.path.join( "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_options.json") weights_file = os.path.join( "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5") # NOTE: using Small size as medium size gave CUDA out of memory error # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json" # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5" # options_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_options.json") # weights_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5") token_embedding = ElmoTokenEmbedder(options_file, weights_file, dropout=DROPOUT, projection_dim=PROJECT_DIM) elif EMBEDDING_TYPE == "_elmo_retrained": options_file = os.path.join("data", "bilm-tf", "elmo_retrained", "options.json") weights_file = os.path.join("data", "bilm-tf", "elmo_retrained", "weights.hdf5") token_embedding = ElmoTokenEmbedder(options_file, weights_file, dropout=DROPOUT, projection_dim=PROJECT_DIM) elif EMBEDDING_TYPE == "_elmo_retrained_2": options_file = os.path.join("data", "bilm-tf", "elmo_retrained", "options_2.json") weights_file = os.path.join("data", "bilm-tf", "elmo_retrained", "weights_2.hdf5") token_embedding = ElmoTokenEmbedder(options_file, weights_file, dropout=DROPOUT, projection_dim=PROJECT_DIM) elif EMBEDDING_TYPE == "_bert": print("Loading bert model") model = BertModel.from_pretrained('bert-base-uncased') token_embedding = BertEmbedder(model) PROJECT_DIM = 768 else: print("Error: Some weird Embedding type", EMBEDDING_TYPE) exit() word_embeddings = BasicTextFieldEmbedder( {"tokens": token_embedding}) HIDDEN_DIM = 200 params = Params({ 'input_dim': PROJECT_DIM, 'hidden_dims': HIDDEN_DIM, 'activations': 'relu', 'num_layers': NUM_LAYERS, 'dropout': DROPOUT }) attend_feedforward = FeedForward.from_params(params) similarity_function = DotProductSimilarity() params = Params({ 'input_dim': 2 * PROJECT_DIM, 'hidden_dims': HIDDEN_DIM, 'activations': 'relu', 'num_layers': NUM_LAYERS, 'dropout': DROPOUT }) compare_feedforward = FeedForward.from_params(params) params = Params({ 'input_dim': 2 * HIDDEN_DIM, 'hidden_dims': 1, 'activations': 'linear', 'num_layers': 1 }) aggregate_feedforward = FeedForward.from_params(params) model = DecomposableAttentionSoftmax(vocab, word_embeddings, attend_feedforward, similarity_function, compare_feedforward, aggregate_feedforward) print("MODEL CREATED") # Load model state with open(model_file, 'rb') as f: device = torch.device('cpu') model.load_state_dict(torch.load(f, map_location=device)) print("MODEL LOADED!") if torch.cuda.is_available(): # cuda_device = 3 # model = model.cuda(cuda_device) cuda_device = -1 else: cuda_device = -1 predictor = DecomposableAttentionSoftmaxPredictor( model, dataset_reader=reader) # Read test file and get predictions gold = list() predicted_labels = list() probs = list() total_time = avg_time = 0.0 print("Started Testing:", NEGATIVE_PERCENTAGE) # before working on anything just save all the questions and responses in a list all_data = list() examples_count = processed_examples_count = 0 with open(q_file, 'r') as q_reader, open(r_file, "r") as r_reader, open( rules_file, "r") as rule_reader: logger.info("Reading questions from : 
%s", q_file) logger.info("Reading responses from : %s", r_file) q = next(q_reader).lower().strip() q = mt.tokenize(q, return_str=True, escape=False) current_qa = (q, "") current_rules_and_responses = list() for i, (response, rule) in enumerate(zip(r_reader, rule_reader)): response = response.strip() rule = rule.strip() if response and rule: # get current_answer from response a = get_answer_from_response(response) if not current_qa[1]: current_qa = (q, a) else: # verify if the a is same as the one in current_qa if a != current_qa[1]: # print("answer phrase mismatch!!", current_qa, ":::", a, ":::", response) current_qa = (current_qa[0], a) # print(current_rules_and_responses) # exit() # Add it to the current responses current_rules_and_responses.append((response, rule)) elif len(current_rules_and_responses) > 0: # Create a instance # print(current_qa) # print(current_rules_and_responses) # exit() if rule or response: print("Rule Response mismatch") print(current_qa) print(response) print(rule) print(examples_count) print(i) exit() if examples_count < start_index: examples_count += 1 q = next(q_reader).lower().strip() q = mt.tokenize(q, return_str=True, escape=False) current_qa = (q, "") current_rules_and_responses = list() continue elif examples_count > end_index: break all_data.append( (current_qa, current_rules_and_responses)) try: q = next(q_reader).lower().strip() q = mt.tokenize(q, return_str=True, escape=False) except StopIteration: # previous one was the last question q = "" current_qa = (q, "") current_rules_and_responses = list() examples_count += 1 # if(examples_count%100 == 0): # print(examples_count) else: # Serious Bug print("Serious BUG!!") print(current_qa) print(response) print(rule) print(examples_count) print(i) exit() print("{}:\tFINISHED IO".format(process_no)) examples_count = start_index processed_examples_count = 0 for current_qa, responses_and_rules in all_data: start_time = time.time() # Tokenize and preprocess the responses preprocessed_responses = [ mt.tokenize(remove_answer_brackets(response), return_str=True, escape=False) for response, rule in responses_and_rules ] # predictions = predictor.predict(current_qa[0], [remove_answer_brackets(response) for response, rule in responses_and_rules]) predictions = predictor.predict(current_qa[0], preprocessed_responses) label_probs = predictions["label_probs"] tuples = zip(responses_and_rules, label_probs) sorted_by_score = sorted(tuples, key=lambda tup: tup[1], reverse=True) count = 0 all_writer.write("{}\n".format(current_qa[0])) all_writer.write("{}\n".format(current_qa[1])) for index, ((response, rule), label_prob) in enumerate(sorted_by_score): if index == 3: break all_writer.write("{}\t{}\t{}\t{}\n".format( response, mt.tokenize(remove_answer_brackets(response), return_str=True, escape=False), rule, label_prob)) all_writer.write("\n") all_writer.flush() end_time = time.time() processed_examples_count += 1 examples_count += 1 total_time += end_time - start_time avg_time = total_time / float(processed_examples_count) print( "{}:\ttime to write {} with {} responses is {} secs. {} avg time" .format(process_no, examples_count, len(responses_and_rules), end_time - start_time, avg_time))
    return np.concatenate(preds, axis=0), np.concatenate(labels, axis=0)


token_indexer = SingleIdTokenIndexer()
reader = rnnDatasetReader(token_indexers={"tokens": token_indexer})
test_dataset = reader.read(
    "/home/dkeren/Documents/Spring2019/CIS520/project/pol_test_semibalanced.csv")

# preloaded vocab, required to do lazy computations
vocab = Vocabulary.from_files("/tmp/vocabulary")
token_embedder = Embedding.from_params(vocab=vocab, params=Params({
    'pretrained_file': '/home/dkeren/Documents/Spring2019/CIS520/project/glove.twitter.27B.50d.txt',
    'embedding_dim': 50
}))
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedder})

lstm = PytorchSeq2VecWrapper(
    nn.LSTM(word_embeddings.get_output_dim(), config.hidden_sz,
            bidirectional=True, batch_first=True))

save_file = "model_v12.th"  # models saved by lstm training
model2 = LSTM_Model(word_embeddings, lstm, 2)
with open(save_file, 'rb') as f:
    model2.load_state_dict(torch.load(f))
"type": "gru", "input_size": 100, "hidden_size": 50, "num_layers": 2, "dropout": 0.25, "bidirectional": True } } }) #text_field_embedder = TextFieldEmbedder.from_params(text_field_embedder_cfg,vocab=vocab) # token based -> maybe do average from this glove_text_field_embedder = Embedding.from_params( vocab, Params({ "pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz", "embedding_dim": 100, "trainable": False })) #text_field_embedder= TextFieldEmbedder.from_params(text_field_embedder_cfg) # """You need to be sure that the TextFieldEmbedder is expecting the same thing that your DatasetReader is producing, but that happens in the configuration file, and we'll talk about it later.""" trainer_cfg = Params({ "iterator": { "type": "basic", "batch_size": 32 }, "trainer": { "optimizer": { "type": "adam"