Example #1
    def test_sliding_window_with_batch(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        sentence = "the quickest quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path), truncate_long_sequences=False, max_pieces=8)

        config_path = self.FIXTURES_ROOT / 'bert' / 'config.json'
        config = BertConfig(str(config_path))
        bert_model = BertModel(config)
        token_embedder = BertEmbedder(bert_model, max_pieces=8)

        instance = Instance({"tokens": TextField(tokens, {"bert": token_indexer})})
        instance2 = Instance({"tokens": TextField(tokens + tokens + tokens, {"bert": token_indexer})})

        batch = Batch([instance, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        bert_vectors = token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
        assert bert_vectors is not None
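The `offsets=` argument is what reduces per-wordpiece vectors down to one vector per original token. A minimal torch sketch of that selection, using the offset values asserted in `test_end_to_end` further down (shapes are illustrative, not the library's code):

import torch

wordpiece_vectors = torch.randn(2, 14, 12)  # (batch, num_wordpieces, dim)
offsets = torch.tensor([[1, 3, 4, 5, 6, 7, 8, 9, 10, 11],
                        [1, 2, 3, 4, 5, 6, 7, 10, 11, 12]])

# Picking one wordpiece vector per original token is a gather along the
# wordpiece axis; this is the effect of passing offsets= to BertEmbedder.
batch_index = torch.arange(2).unsqueeze(-1)          # (2, 1), broadcasts
token_vectors = wordpiece_vectors[batch_index, offsets]
print(token_vectors.shape)                           # torch.Size([2, 10, 12])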
    def test_squad_with_unwordpieceable_passage(self):

        tokenizer = SpacyTokenizer()

        token_indexer = PretrainedBertIndexer("bert-base-uncased")

        passage1 = (
            "There were four major HDTV systems tested by SMPTE in the late 1970s, "
            "and in 1979 an SMPTE study group released A Study of High Definition Television Systems:"
        )
        question1 = "Who released A Study of High Definition Television Systems?"

        passage2 = (
            "Broca, being what today would be called a neurosurgeon, "
            "had taken an interest in the pathology of speech. He wanted "
            "to localize the difference between man and the other animals, "
            "which appeared to reside in speech. He discovered the speech "
            "center of the human brain, today called Broca's area after him. "
            "His interest was mainly in Biological anthropology, but a German "
            "philosopher specializing in psychology, Theodor Waitz, took up the "
            "theme of general and social anthropology in his six-volume work, "
            "entitled Die Anthropologie der Naturvölker, 1859–1864. The title was "
            """soon translated as "The Anthropology of Primitive Peoples". """
            "The last two volumes were published posthumously.")
        question2 = "What did Broca discover in the human brain?"

        from allennlp.data.dataset_readers.reading_comprehension.util import (
            make_reading_comprehension_instance, )

        instance1 = make_reading_comprehension_instance(
            tokenizer.tokenize(question1),
            tokenizer.tokenize(passage1),
            {"bert": token_indexer},
            passage1,
        )

        instance2 = make_reading_comprehension_instance(
            tokenizer.tokenize(question2),
            tokenizer.tokenize(passage2),
            {"bert": token_indexer},
            passage2,
        )

        vocab = Vocabulary()

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        qtokens = tensor_dict["question"]
        ptokens = tensor_dict["passage"]

        config = BertConfig(len(token_indexer.vocab))
        model = BertModel(config)
        embedder = BertEmbedder(model)

        _ = embedder(ptokens["bert"], offsets=ptokens["bert-offsets"])
        _ = embedder(qtokens["bert"], offsets=qtokens["bert-offsets"])
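Note that the BERT weights here are random, not pretrained: in the pytorch-pretrained-bert API these tests appear to use, `BertConfig`'s first argument is the wordpiece vocabulary size, so `BertConfig(len(token_indexer.vocab))` simply sizes a fresh model to the indexer's vocab. A standalone sketch of that pattern (30522 is bert-base-uncased's vocab size; the package choice is an assumption):

from pytorch_pretrained_bert.modeling import BertConfig, BertModel

config = BertConfig(30522)   # vocab_size_or_config_json_file, passed as an int
model = BertModel(config)    # randomly initialized, but shape-correct for tests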
    def setUp(self):
        super().setUp()

        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        self.token_indexer = PretrainedBertIndexer(str(vocab_path))

        config_path = self.FIXTURES_ROOT / 'bert' / 'config.json'
        config = BertConfig(str(config_path))
        self.bert_model = BertModel(config)
        self.token_embedder = BertEmbedder(self.bert_model)
Example #4
    def setUp(self):
        super().setUp()

        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        self.token_indexer = PretrainedBertIndexer(str(vocab_path))

        config_path = self.FIXTURES_ROOT / "bert" / "config.json"
        config = BertConfig.from_json_file(str(config_path))
        self.bert_model = BertModel(config)
        self.token_embedder = BertEmbedder(self.bert_model)
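This `setUp` differs from the earlier one only in how the config is read. Both forms appear across these examples and both existed in the pytorch-pretrained-bert lineage (an inference from the variants shown here, not a guarantee for any specific release):

config = BertConfig(str(config_path))                 # constructor's first arg may be a JSON path
config = BertConfig.from_json_file(str(config_path))  # explicit classmethod, used in later versions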
    def _get_bert_word_embedder(self):
        pretrained_model = self.bert_file_path
        bert_model = PretrainedBertModel.load(pretrained_model, cache_model=False)
        for param in bert_model.parameters():
            param.requires_grad = self.configuration['train_bert']
        bert_embedder = BertEmbedder(bert_model=bert_model, top_layer_only=True)

        bert_word_embedder: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": bert_embedder},
                                                                       # we'll be ignoring masks so we'll need to set this to True
                                                                       allow_unmatched_keys=True)
        return bert_word_embedder
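`allow_unmatched_keys=True` is needed because the BERT indexer emits extra keys (offsets, type ids) that have no token embedder of their own. A fuller wiring (allennlp 0.8/0.9-era API, assumed) routes those keys into `BertEmbedder.forward` explicitly instead of dropping them; `bert_embedder` is the embedder built above:

from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

bert_word_embedder = BasicTextFieldEmbedder(
    token_embedders={"bert": bert_embedder},
    # map the embedder's forward() arguments to the indexer's output keys
    embedder_to_indexer_map={"bert": ["bert", "bert-offsets"]},
    allow_unmatched_keys=True,
)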
    def test_end_to_end(self):
        tokenizer = BertPreTokenizer()

        #            2   3    4   3     5     6   8      9    2   14   12
        sentence1 = "the quickest quick brown fox jumped over the lazy dog"
        tokens1 = tokenizer.tokenize(sentence1)

        #            2   3     5     6   8      9    2  15 10 11 14   1
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens2 = tokenizer.tokenize(sentence2)

        vocab = Vocabulary()

        instance1 = Instance(
            {"tokens": TextField(tokens1, {"bert": self.token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens2, {"bert": self.token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        # 16 = [CLS], 17 = [SEP]
        assert tokens["bert"].tolist() == [
            [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 14, 12, 17, 0],
            [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17],
        ]

        assert tokens["bert-offsets"].tolist() == [
            [1, 3, 4, 5, 6, 7, 8, 9, 10, 11],
            [1, 2, 3, 4, 5, 6, 7, 10, 11, 12],
        ]

        # No offsets, should get 14 vectors back ([CLS] + 12 token wordpieces + [SEP])
        bert_vectors = self.token_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 14, 12]

        # Offsets, should get 10 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"],
                                           offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 10, 12]

        # Now try top_layer_only = True
        tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
        bert_vectors = tlo_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 14, 12]

        bert_vectors = tlo_embedder(tokens["bert"],
                                    offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 10, 12]
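By default `BertEmbedder` combines all encoder layers with a learned scalar mix; `top_layer_only=True` skips that and returns just the last layer, which is why both paths above yield identical shapes. A minimal torch sketch of the scalar-mix idea (layer tensors are hypothetical):

import torch

layers = [torch.randn(2, 14, 12) for _ in range(4)]       # per-layer outputs
weights = torch.softmax(torch.zeros(len(layers)), dim=0)  # learned in practice
gamma = torch.tensor(1.0)                                  # learned scale
mixed = gamma * sum(w * layer for w, layer in zip(weights, layers))
print(mixed.shape)  # torch.Size([2, 14, 12])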
    def test_sliding_window(self):
        tokenizer = BertPreTokenizer()

        sentence = "the quickest quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=False,
                                              max_pieces=8)

        config_path = self.FIXTURES_ROOT / "bert" / "config.json"
        config = BertConfig(str(config_path))
        bert_model = BertModel(config)
        token_embedder = BertEmbedder(bert_model, max_pieces=8)

        instance = Instance(
            {"tokens": TextField(tokens, {"bert": token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert tokens["bert"].tolist() == [[
            16, 2, 3, 4, 3, 5, 6, 17, 16, 3, 5, 6, 8, 9, 2, 17, 16, 8, 9, 2,
            14, 12, 17
        ]]
        assert tokens["bert-offsets"].tolist() == [[
            1, 3, 4, 5, 6, 7, 8, 9, 10, 11
        ]]

        bert_vectors = token_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [1, 13, 12]

        # Testing without token_type_ids
        bert_vectors = token_embedder(tokens["bert"],
                                      offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [1, 10, 12]

        # Testing with token_type_ids
        bert_vectors = token_embedder(tokens["bert"],
                                      offsets=tokens["bert-offsets"],
                                      token_type_ids=tokens["bert-type-ids"])
        assert list(bert_vectors.shape) == [1, 10, 12]
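The piece layout asserted above can be reproduced in a few lines: wordpieces are split into windows of `max_pieces - 2` with a stride of half that, and each window is wrapped in [CLS]/[SEP]. A pure-Python sketch (the helper is hypothetical, not the library's code; ids follow the fixture):

CLS, SEP = 16, 17

def sliding_windows(pieces, max_pieces=8):
    content = max_pieces - 2   # room left after [CLS] and [SEP]
    stride = content // 2      # consecutive windows overlap by half
    windows = []
    for start in range(0, len(pieces), stride):
        windows.extend([CLS] + pieces[start:start + content] + [SEP])
        if start + content >= len(pieces):
            break
    return windows

pieces = [2, 3, 4, 3, 5, 6, 8, 9, 2, 14, 12]  # "the quick ##est quick ..."
assert sliding_windows(pieces) == [
    16, 2, 3, 4, 3, 5, 6, 17, 16, 3, 5, 6, 8, 9, 2, 17, 16, 8, 9, 2, 14, 12, 17]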
Example #8
    def test_end_to_end_with_higher_order_inputs(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3    4   3     5     6   8      9    2   14   12
        sentence1 = "the quickest quick brown fox jumped over the lazy dog"
        tokens1 = tokenizer.tokenize(sentence1)
        text_field1 = TextField(tokens1, {"bert": self.token_indexer})

        #            2   3     5     6   8      9    2  15 10 11 14   1
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens2 = tokenizer.tokenize(sentence2)
        text_field2 = TextField(tokens2, {"bert": self.token_indexer})

        #            2   5    15 10 11 6
        sentence3 = "the brown laziest fox"
        tokens3 = tokenizer.tokenize(sentence3)
        text_field3 = TextField(tokens3, {"bert": self.token_indexer})

        vocab = Vocabulary()

        instance1 = Instance({"tokens": ListField([text_field1])})
        instance2 = Instance({"tokens": ListField([text_field2, text_field3])})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths, verbose=True)
        tokens = tensor_dict["tokens"]

        # No offsets, should get 12 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 2, 12, 12]

        # Offsets, should get 10 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"],
                                           offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 2, 10, 12]

        # Now try top_layer_only = True
        tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
        bert_vectors = tlo_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 2, 12, 12]

        bert_vectors = tlo_embedder(tokens["bert"],
                                    offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 2, 10, 12]
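Wrapping the TextFields in a ListField adds one dimension, so the embedded output is (batch, list_size, pieces, dim); instance1's single sentence is padded with an empty row up to list_size 2. The leading dimensions are flattened before the embedder runs and restored afterwards, a shape-only sketch of that assumption:

import torch

batch, list_size, pieces, dim = 2, 2, 12, 12
flat = torch.randn(batch * list_size, pieces, dim)  # what the embedder sees
unflat = flat.view(batch, list_size, pieces, dim)   # what the caller gets back
assert list(unflat.shape) == [2, 2, 12, 12]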
Example #9
    def test_sliding_window_with_batch(self):
        tokenizer = BertPreTokenizer()

        sentence = "the quickest quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=False,
                                              max_pieces=8)

        config_path = self.FIXTURES_ROOT / "bert" / "config.json"
        config = BertConfig.from_json_file(str(config_path))
        bert_model = BertModel(config)
        token_embedder = BertEmbedder(bert_model, max_pieces=8)

        instance = Instance(
            {"tokens": TextField(tokens, {"bert": token_indexer})})
        instance2 = Instance({
            "tokens":
            TextField(tokens + tokens + tokens, {"bert": token_indexer})
        })

        batch = Batch([instance, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]["bert"]

        # Testing without token_type_ids
        bert_vectors = token_embedder(tokens["input_ids"],
                                      offsets=tokens["offsets"])
        assert bert_vectors is not None

        # Testing with token_type_ids
        bert_vectors = token_embedder(tokens["input_ids"],
                                      offsets=tokens["offsets"],
                                      token_type_ids=tokens["token_type_ids"])
        assert bert_vectors is not None
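Unlike the earlier examples, this variant reads a nested dict: the tensors sit under the index name, with generic key names. Illustrative layouts only (values elided):

old_style = {"bert": ..., "bert-offsets": ..., "bert-type-ids": ...}
new_style = {"bert": {"input_ids": ..., "offsets": ..., "token_type_ids": ...}}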
    def test_max_length(self):
        config = BertConfig(len(self.token_indexer.vocab))
        model = BertModel(config)
        embedder = BertEmbedder(model)

        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
        sentence = "the " * 1000
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        embedder(tokens["bert"], tokens["bert-offsets"])
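The 1000-token sentence above still fits through the model because `PretrainedBertIndexer` truncates long sequences by default (`truncate_long_sequences=True` with `max_pieces=512`, as these examples suggest). A hypothetical truncation helper, not the library's code:

def truncate(pieces, max_pieces=512):
    return pieces[:max_pieces - 2]  # keep room for [CLS] and [SEP]

assert len(truncate(list(range(1000)))) == 510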
    def _get_bert_word_embedder(self):
        # bert_embedder = PretrainedBertEmbedder(
        #     pretrained_model=self.bert_file_path,
        #     top_layer_only=True,  # conserve memory
        #     requires_grad=(not self.configuration['fixed'])
        # )

        pretrained_model = self.bert_file_path
        bert_model = PretrainedBertModel.load(pretrained_model,
                                              cache_model=False)
        for param in bert_model.parameters():
            param.requires_grad = (not self.configuration['fixed'])
        bert_embedder = BertEmbedder(bert_model=bert_model,
                                     top_layer_only=True)

        bert_word_embedder: TextFieldEmbedder = BasicTextFieldEmbedder(
            {"bert": bert_embedder},
            # we'll be ignoring masks so we'll need to set this to True
            allow_unmatched_keys=True)
        bert_word_embedder.to(self.configuration['device'])
        return bert_word_embedder
Example #13
def load_decomposable_attention_elmo_softmax_model():
    NEGATIVE_PERCENTAGE = 100
    # EMBEDDING_TYPE = ""
    # LOSS_TYPE = ""				# NLL
    # LOSS_TYPE = "_nll"				# NLL
    LOSS_TYPE = "_mse"  # MSE
    # EMBEDDING_TYPE = ""
    # EMBEDDING_TYPE = "_glove"
    # EMBEDDING_TYPE = "_bert"
    EMBEDDING_TYPE = "_elmo"
    # EMBEDDING_TYPE = "_elmo_retrained"
    # EMBEDDING_TYPE = "_elmo_retrained_2"
    token_indexers = None
    if EMBEDDING_TYPE in ("_elmo", "_elmo_retrained", "_elmo_retrained_2"):
        token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
    MAX_BATCH_SIZE = 0
    # MAX_BATCH_SIZE = 150 # for bert and elmo
    reader = QuestionResponseSoftmaxReader(token_indexers=token_indexers,
                                           max_batch_size=MAX_BATCH_SIZE)
    model_file = os.path.join(
        "saved_softmax_models",
        "decomposable_attention{}{}_model_{}.th".format(
            LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE))

    vocabulary_filepath = os.path.join(
        "saved_softmax_models",
        "vocabulary{}{}_{}".format(LOSS_TYPE, EMBEDDING_TYPE,
                                   NEGATIVE_PERCENTAGE))
    print("LOADING VOCABULARY")
    # Load vocabulary
    vocab = Vocabulary.from_files(vocabulary_filepath)

    EMBEDDING_DIM = 300
    PROJECT_DIM = 200
    DROPOUT = 0.2
    NUM_LAYERS = 2
    if EMBEDDING_TYPE == "":
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=EMBEDDING_DIM,
            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_glove":
        token_embedding = Embedding.from_params(vocab=vocab,
                                                params=Params({
                                                    'pretrained_file':
                                                    glove_embeddings_file,
                                                    'embedding_dim':
                                                    EMBEDDING_DIM,
                                                    'projection_dim':
                                                    PROJECT_DIM,
                                                    'trainable':
                                                    False
                                                }))
    elif EMBEDDING_TYPE == "_elmo":
        # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
        # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"
        options_file = os.path.join(
            "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_options.json")
        weights_file = os.path.join(
            "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5")
        # NOTE: using the small model; the medium one gave a CUDA out-of-memory error
        # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
        # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
        # options_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_options.json")
        # weights_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weights_file,
                                            dropout=DROPOUT,
                                            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_elmo_retrained":
        options_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "options.json")
        weights_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "weights.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weights_file,
                                            dropout=DROPOUT,
                                            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_elmo_retrained_2":
        options_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "options_2.json")
        weights_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "weights_2.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weights_file,
                                            dropout=DROPOUT,
                                            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_bert":
        print("Loading bert model")
        model = BertModel.from_pretrained('bert-base-uncased')
        token_embedding = BertEmbedder(model)
        PROJECT_DIM = 768
    else:
        print("Error: Some weird Embedding type", EMBEDDING_TYPE)
        exit()
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    HIDDEN_DIM = 200
    params = Params({
        'input_dim': PROJECT_DIM,
        'hidden_dims': HIDDEN_DIM,
        'activations': 'relu',
        'num_layers': NUM_LAYERS,
        'dropout': DROPOUT
    })
    attend_feedforward = FeedForward.from_params(params)
    similarity_function = DotProductSimilarity()
    params = Params({
        'input_dim': 2 * PROJECT_DIM,
        'hidden_dims': HIDDEN_DIM,
        'activations': 'relu',
        'num_layers': NUM_LAYERS,
        'dropout': DROPOUT
    })
    compare_feedforward = FeedForward.from_params(params)
    params = Params({
        'input_dim': 2 * HIDDEN_DIM,
        'hidden_dims': 1,
        'activations': 'linear',
        'num_layers': 1
    })
    aggregate_feedforward = FeedForward.from_params(params)
    model = DecomposableAttentionSoftmax(vocab, word_embeddings,
                                         attend_feedforward,
                                         similarity_function,
                                         compare_feedforward,
                                         aggregate_feedforward)
    print("MODEL CREATED")
    # Load model state
    with open(model_file, 'rb') as f:
        model.load_state_dict(torch.load(f, map_location='cuda:0'))
    print("MODEL LOADED!")
    if torch.cuda.is_available():
        # cuda_device = 3
        # model = model.cuda(cuda_device)
        cuda_device = -1
    else:
        cuda_device = -1

    predictor = DecomposableAttentionSoftmaxPredictor(model,
                                                      dataset_reader=reader)
    return model, predictor
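Hypothetical usage of the loader above, following the `predict(question, responses)` call and the `label_probs` key used later in `save_top_results`:

model, predictor = load_decomposable_attention_elmo_softmax_model()
predictions = predictor.predict(
    "Who released A Study of High Definition Television Systems?",
    ["SMPTE released it.", "A study group released it."],
)
print(predictions["label_probs"])  # one score per candidate response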
def save_top_results(process_no, start_index, end_index):
    print("Starting process {} with start at {} and end at {}".format(
        process_no, start_index, end_index))
    DATA_FOLDER = "train_data"
    # EMBEDDING_TYPE = ""
    LOSS_TYPE = ""  # NLL
    LOSS_TYPE = "_mse"  # MSE
    # EMBEDDING_TYPE = ""
    # EMBEDDING_TYPE = "_glove"
    # EMBEDDING_TYPE = "_bert"
    EMBEDDING_TYPE = "_elmo"
    # EMBEDDING_TYPE = "_elmo_retrained"
    # EMBEDDING_TYPE = "_elmo_retrained_2"
    token_indexers = None
    if EMBEDDING_TYPE in ("_elmo", "_elmo_retrained", "_elmo_retrained_2"):
        token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
    MAX_BATCH_SIZE = 0
    # MAX_BATCH_SIZE = 150 # for bert and elmo
    # q_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_saved_questions_lexparser_sh.txt")
    # r_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_generated_answers_lexparser_sh.txt")
    # rules_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_generated_answer_rules_lexparser_sh.txt")

    # NOTE: Squad dev test set
    q_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_saved_questions.txt")
    r_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_generated_answers.txt")
    rules_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_generated_answer_rules.txt")
    reader = QuestionResponseSoftmaxReader(q_file,
                                           r_file,
                                           token_indexers=token_indexers,
                                           max_batch_size=MAX_BATCH_SIZE)
    glove_embeddings_file = os.path.join("data", "glove",
                                         "glove.840B.300d.txt")
    # RESULTS_DIR = "squad_seq2seq_train2"
    # NOTE: All other experiments
    # RESULTS_DIR = "squad_seq2seq_train_moses_tokenized"
    # make_dir_if_not_exists(RESULTS_DIR)
    # all_results_save_file = os.path.join(RESULTS_DIR, "squad_seq2seq_train_predictions_start_{}_end_{}.txt".format(start_index, end_index))

    # NOTE: Squad dev test set
    RESULTS_DIR = "squad_seq2seq_dev_moses_tokenized"
    make_dir_if_not_exists(RESULTS_DIR)
    all_results_save_file = os.path.join(
        RESULTS_DIR,
        "squad_seq2seq_dev_test_predictions_start_{}_end_{}.txt".format(
            start_index, end_index))

    with open(all_results_save_file, "w") as all_writer:
        print("Testing out model with", EMBEDDING_TYPE, "embeddings")
        print("Testing out model with", LOSS_TYPE, "loss")
        # for NEGATIVE_PERCENTAGE in [100,50,20,10,5,1]:
        for NEGATIVE_PERCENTAGE in [100]:
            model_file = os.path.join(
                "saved_softmax_models",
                "decomposable_attention{}{}_model_{}.th".format(
                    LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE))

            vocabulary_filepath = os.path.join(
                "saved_softmax_models",
                "vocabulary{}{}_{}".format(LOSS_TYPE, EMBEDDING_TYPE,
                                           NEGATIVE_PERCENTAGE))
            print("LOADING VOCABULARY")
            # Load vocabulary
            vocab = Vocabulary.from_files(vocabulary_filepath)

            EMBEDDING_DIM = 300
            PROJECT_DIM = 200
            DROPOUT = 0.2
            NUM_LAYERS = 2
            if EMBEDDING_TYPE == "":
                token_embedding = Embedding(
                    num_embeddings=vocab.get_vocab_size('tokens'),
                    embedding_dim=EMBEDDING_DIM,
                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_glove":
                token_embedding = Embedding.from_params(
                    vocab=vocab,
                    params=Params({
                        'pretrained_file': glove_embeddings_file,
                        'embedding_dim': EMBEDDING_DIM,
                        'projection_dim': PROJECT_DIM,
                        'trainable': False
                    }))
            elif EMBEDDING_TYPE == "_elmo":
                # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
                # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"
                options_file = os.path.join(
                    "data", "elmo",
                    "elmo_2x2048_256_2048cnn_1xhighway_options.json")
                weights_file = os.path.join(
                    "data", "elmo",
                    "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5")
                # NOTE: using the small model; the medium one gave a CUDA out-of-memory error
                # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
                # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
                # options_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_options.json")
                # weights_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_elmo_retrained":
                options_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "options.json")
                weights_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "weights.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_elmo_retrained_2":
                options_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "options_2.json")
                weights_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "weights_2.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_bert":
                print("Loading bert model")
                model = BertModel.from_pretrained('bert-base-uncased')
                token_embedding = BertEmbedder(model)
                PROJECT_DIM = 768
            else:
                print("Error: Some weird Embedding type", EMBEDDING_TYPE)
                exit()
            word_embeddings = BasicTextFieldEmbedder(
                {"tokens": token_embedding})
            HIDDEN_DIM = 200
            params = Params({
                'input_dim': PROJECT_DIM,
                'hidden_dims': HIDDEN_DIM,
                'activations': 'relu',
                'num_layers': NUM_LAYERS,
                'dropout': DROPOUT
            })
            attend_feedforward = FeedForward.from_params(params)
            similarity_function = DotProductSimilarity()
            params = Params({
                'input_dim': 2 * PROJECT_DIM,
                'hidden_dims': HIDDEN_DIM,
                'activations': 'relu',
                'num_layers': NUM_LAYERS,
                'dropout': DROPOUT
            })
            compare_feedforward = FeedForward.from_params(params)
            params = Params({
                'input_dim': 2 * HIDDEN_DIM,
                'hidden_dims': 1,
                'activations': 'linear',
                'num_layers': 1
            })
            aggregate_feedforward = FeedForward.from_params(params)
            model = DecomposableAttentionSoftmax(vocab, word_embeddings,
                                                 attend_feedforward,
                                                 similarity_function,
                                                 compare_feedforward,
                                                 aggregate_feedforward)
            print("MODEL CREATED")
            # Load model state
            with open(model_file, 'rb') as f:
                device = torch.device('cpu')
                model.load_state_dict(torch.load(f, map_location=device))
            print("MODEL LOADED!")
            if torch.cuda.is_available():
                # cuda_device = 3
                # model = model.cuda(cuda_device)
                cuda_device = -1
            else:
                cuda_device = -1

            predictor = DecomposableAttentionSoftmaxPredictor(
                model, dataset_reader=reader)
            # Read test file and get predictions
            gold = list()
            predicted_labels = list()
            probs = list()
            total_time = avg_time = 0.0
            print("Started Testing:", NEGATIVE_PERCENTAGE)
            # Before doing anything else, collect all the questions and responses in a list
            all_data = list()
            examples_count = processed_examples_count = 0
            with open(q_file,
                      'r') as q_reader, open(r_file, "r") as r_reader, open(
                          rules_file, "r") as rule_reader:
                logger.info("Reading questions from : %s", q_file)
                logger.info("Reading responses from : %s", r_file)
                q = next(q_reader).lower().strip()
                q = mt.tokenize(q, return_str=True, escape=False)
                current_qa = (q, "")
                current_rules_and_responses = list()
                for i, (response,
                        rule) in enumerate(zip(r_reader, rule_reader)):
                    response = response.strip()
                    rule = rule.strip()
                    if response and rule:
                        # get current_answer from response
                        a = get_answer_from_response(response)
                        if not current_qa[1]:
                            current_qa = (q, a)
                        else:
                            # verify that the answer matches the one in current_qa
                            if a != current_qa[1]:
                                # print("answer phrase mismatch!!", current_qa, ":::", a, ":::", response)
                                current_qa = (current_qa[0], a)
                                # print(current_rules_and_responses)
                                # exit()
                        # Add it to the current responses
                        current_rules_and_responses.append((response, rule))
                    elif len(current_rules_and_responses) > 0:
                        # Create an instance
                        # print(current_qa)
                        # print(current_rules_and_responses)
                        # exit()
                        if rule or response:
                            print("Rule Response mismatch")
                            print(current_qa)
                            print(response)
                            print(rule)
                            print(examples_count)
                            print(i)
                            exit()

                        if examples_count < start_index:
                            examples_count += 1
                            q = next(q_reader).lower().strip()
                            q = mt.tokenize(q, return_str=True, escape=False)
                            current_qa = (q, "")
                            current_rules_and_responses = list()
                            continue
                        elif examples_count > end_index:
                            break

                        all_data.append(
                            (current_qa, current_rules_and_responses))
                        try:
                            q = next(q_reader).lower().strip()
                            q = mt.tokenize(q, return_str=True, escape=False)
                        except StopIteration:
                            # previous one was the last question
                            q = ""
                        current_qa = (q, "")
                        current_rules_and_responses = list()
                        examples_count += 1
                        # if(examples_count%100 == 0):
                        # 	print(examples_count)
                    else:
                        # Serious Bug
                        print("Serious BUG!!")
                        print(current_qa)
                        print(response)
                        print(rule)
                        print(examples_count)
                        print(i)
                        exit()
            print("{}:\tFINISHED IO".format(process_no))
            examples_count = start_index
            processed_examples_count = 0
            for current_qa, responses_and_rules in all_data:
                start_time = time.time()
                # Tokenize and preprocess the responses
                preprocessed_responses = [
                    mt.tokenize(remove_answer_brackets(response),
                                return_str=True,
                                escape=False)
                    for response, rule in responses_and_rules
                ]
                # predictions = predictor.predict(current_qa[0], [remove_answer_brackets(response) for response, rule in responses_and_rules])
                predictions = predictor.predict(current_qa[0],
                                                preprocessed_responses)
                label_probs = predictions["label_probs"]
                tuples = zip(responses_and_rules, label_probs)
                sorted_by_score = sorted(tuples,
                                         key=lambda tup: tup[1],
                                         reverse=True)
                count = 0
                all_writer.write("{}\n".format(current_qa[0]))
                all_writer.write("{}\n".format(current_qa[1]))
                for index, ((response, rule),
                            label_prob) in enumerate(sorted_by_score):
                    if index == 3:
                        break
                    all_writer.write("{}\t{}\t{}\t{}\n".format(
                        response,
                        mt.tokenize(remove_answer_brackets(response),
                                    return_str=True,
                                    escape=False), rule, label_prob))
                all_writer.write("\n")
                all_writer.flush()
                end_time = time.time()
                processed_examples_count += 1
                examples_count += 1
                total_time += end_time - start_time
                avg_time = total_time / float(processed_examples_count)
                print(
                    "{}:\ttime to write {} with {} responses is {} secs. {} avg time"
                    .format(process_no, examples_count,
                            len(responses_and_rules), end_time - start_time,
                            avg_time))
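The `process_no`/`start_index`/`end_index` parameters suggest the work is sharded across processes; a minimal driver sketch with hypothetical shard ranges:

from multiprocessing import Process

if __name__ == "__main__":
    shards = [(0, 0, 2500), (1, 2501, 5000)]  # (process_no, start, end)
    procs = [Process(target=save_top_results, args=shard) for shard in shards]
    for p in procs:
        p.start()
    for p in procs:
        p.join()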