Example No. 1
def __init__(self, num_mac_cells: int, hidden_dim: int):
    self.cells = num_mac_cells
    self.mac = Mac(hidden_dim)
    self.hidden_dim = hidden_dim
    self.acts = []
    self.qenc = CudnnGru(hidden_dim // 2,
                         w_init=TruncatedNormal(stddev=0.05))
    self.question_drop = DropoutLayer(0.92)
    self.control_proj = FullyConnected(hidden_dim)
    for _ in range(num_mac_cells):
        self.acts.append(FullyConnected(hidden_dim))
Example No. 2
class MacNetwork(Configurable):
    """ Basic non-recurrent attention using the given SimilarityFunction """
    def __init__(self, num_mac_cells: int, hidden_dim: int):
        self.cells = num_mac_cells
        self.mac = Mac(hidden_dim)
        self.hidden_dim = hidden_dim
        self.acts = []  # one position-aware control projection per MAC cell
        self.qenc = CudnnGru(hidden_dim // 2,
                             w_init=TruncatedNormal(stddev=0.05))
        self.question_drop = DropoutLayer(0.92)
        self.control_proj = FullyConnected(hidden_dim)
        for _ in range(num_mac_cells):
            self.acts.append(FullyConnected(hidden_dim))

    def apply(self,
              is_train,
              document,
              questions,
              document_mask=None,
              question_mask=None):
        # Create the question vector: the CudnnGru layer handles sequence
        # reversal for the backward direction internally, so the last hidden
        # state is a valid summary of the question.
        question_hidden = self.qenc.apply(is_train, questions,
                                          question_mask)[:, -1]
        question_hidden = self.question_drop.apply(is_train, question_hidden)
        # shared projection
        question_vec = tf.tanh(
            self.control_proj.apply(is_train, question_hidden))
        # create initial memory and control states
        init_control = question_hidden
        init_memory = tf.get_variable(
            'init_memory',
            shape=(1, self.hidden_dim),
            trainable=True,
        )
        init_memory = tf.tile(init_memory, [tf.shape(questions)[0], 1])
        # going through the cells!
        control, memory = init_control, init_memory
        for i in range(self.cells):
            # control projection stuff
            position_cont = self.acts[i].apply(is_train, question_vec)
            # call mac cell
            with tf.variable_scope('macmsc', reuse=i > 0):
                next_control, next_mem, out = self.mac.apply(
                    is_train, document, questions, question_vec, control,
                    position_cont, memory, i > 0,
                    document_mask, question_mask)
            control, memory = next_control, next_mem
        # No yes/no questions here, so only the final read output is returned.
        return out
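Note on the loop above: every MAC step shares one set of cell weights because the 'macmsc' variable scope is re-entered with reuse=True after the first iteration. A minimal, standalone TF 1.x sketch of that reuse pattern (not using the document-qa layer classes) looks like this:

import tensorflow as tf  # assumes a TF 1.x graph-mode API (or tf.compat.v1)

def cell_step(x):
    # get_variable creates "w" on the first call and returns the same
    # variable on later calls made with reuse=True in the same scope.
    w = tf.get_variable("w", shape=(4, 4))
    return tf.matmul(x, w)

x = tf.placeholder(tf.float32, shape=(None, 4))
state = x
for i in range(3):
    with tf.variable_scope("macmsc", reuse=i > 0):
        state = cell_step(state)

# Only one weight matrix exists, shared by all three steps.
print([v.name for v in tf.trainable_variables()])  # ['macmsc/w:0']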
Example No. 3
def main():
    parser = argparse.ArgumentParser("Train our ELMo model on SQuAD")
    parser.add_argument("loss_mode", choices=['default', 'confidence'])
    parser.add_argument("output_dir")
    parser.add_argument("--dim", type=int, default=90)
    parser.add_argument("--l2", type=float, default=0)
    parser.add_argument("--mode",
                        choices=["input", "output", "both", "none"],
                        default="both")
    parser.add_argument("--top_layer_only", action="store_true")
    parser.add_argument("--no-tfidf",
                        action='store_true',
                        help="Don't add TF-IDF negative examples")
    args = parser.parse_args()

    out = args.output_dir + "-" + datetime.now().strftime("%m%d-%H%M%S")

    dim = args.dim
    recurrent_layer = CudnnGru(dim, w_init=TruncatedNormal(stddev=0.05))

    if args.loss_mode == 'default':
        n_epochs = 24
        answer_encoder = SingleSpanAnswerEncoder()
        predictor = BoundsPredictor(
            ChainBiMapper(first_layer=recurrent_layer,
                          second_layer=recurrent_layer))
        batcher = ClusteredBatcher(45, ContextLenKey(), False, False)
        data = DocumentQaTrainingData(SquadCorpus(), None, batcher, batcher)
    elif args.loss_mode == 'confidence':
        if args.no_tfidf:
            prepro = SquadDefault()
            n_epochs = 15
        else:
            prepro = SquadTfIdfRanker(NltkPlusStopWords(True), 4, True)
            n_epochs = 50
        answer_encoder = DenseMultiSpanAnswerEncoder()
        predictor = ConfidencePredictor(ChainBiMapper(
            first_layer=recurrent_layer,
            second_layer=recurrent_layer,
        ),
                                        AttentionEncoder(),
                                        FullyConnected(80, activation="tanh"),
                                        aggregate="sum")
        eval_dataset = RandomParagraphSetDatasetBuilder(
            100, 'flatten', True, 0)
        train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True,
                                          False)
        data = PreprocessedData(SquadCorpus(),
                                prepro,
                                StratifyParagraphsBuilder(train_batching, 1),
                                eval_dataset,
                                eval_on_verified=False)
        data.preprocess(1)

    params = trainer.TrainParams(trainer.SerializableOptimizer(
        "Adadelta", dict(learning_rate=1.0)),
                                 ema=0.999,
                                 max_checkpoints_to_keep=2,
                                 async_encoding=10,
                                 num_epochs=n_epochs,
                                 log_period=30,
                                 eval_period=1200,
                                 save_period=1200,
                                 best_weights=("dev", "b17/text-f1"),
                                 eval_samples=dict(dev=None, train=8000))

    lm_reduce = MapperSeq(
        ElmoLayer(args.l2,
                  layer_norm=False,
                  top_layer_only=args.top_layer_only),
        DropoutLayer(0.5),
    )
    model = AttentionWithElmo(
        encoder=DocumentAndQuestionEncoder(answer_encoder),
        lm_model=SquadContextConcatSkip(),
        append_before_atten=(args.mode == "both" or args.mode == "output"),
        append_embed=(args.mode == "both" or args.mode == "input"),
        max_batch_size=128,
        word_embed=FixedWordEmbedder(vec_name="glove.840B.300d",
                                     word_vec_init_scale=0,
                                     learn_unk=False,
                                     cpu=True),
        char_embed=CharWordEmbedder(LearnedCharEmbedder(word_size_th=14,
                                                        char_th=49,
                                                        char_dim=20,
                                                        init_scale=0.05,
                                                        force_cpu=True),
                                    MaxPool(Conv1d(100, 5, 0.8)),
                                    shared_parameters=True),
        embed_mapper=SequenceMapperSeq(
            VariationalDropoutLayer(0.8),
            recurrent_layer,
            VariationalDropoutLayer(0.8),
        ),
        lm_reduce=None,
        lm_reduce_shared=lm_reduce,
        per_sentence=False,
        memory_builder=NullBiMapper(),
        attention=BiAttention(TriLinear(bias=True), True),
        match_encoder=SequenceMapperSeq(
            FullyConnected(dim * 2, activation="relu"),
            ResidualLayer(
                SequenceMapperSeq(
                    VariationalDropoutLayer(0.8),
                    recurrent_layer,
                    VariationalDropoutLayer(0.8),
                    StaticAttentionSelf(TriLinear(bias=True),
                                        ConcatWithProduct()),
                    FullyConnected(dim * 2, activation="relu"),
                )), VariationalDropoutLayer(0.8)),
        predictor=predictor)

    with open(__file__, "r") as f:
        notes = f.read()
        notes = str(sorted(args.__dict__.items(),
                           key=lambda x: x[0])) + "\n" + notes

    trainer.start_training(
        data, model, params,
        [LossEvaluator(),
         SpanEvaluator(bound=[17], text_eval="squad")], ModelDir(out), notes)
Example No. 4
def get_model(char_th: int, dim: int, mode: str, preprocess: Optional[TextPreprocessor]):
    recurrent_layer = CudnnGru(dim, w_init=TruncatedNormal(stddev=0.05))
    if mode.startswith("shared-norm"):
        answer_encoder = GroupedSpanAnswerEncoder()
        predictor = BoundsPredictor(
            ChainBiMapper(
                first_layer=recurrent_layer,
                second_layer=recurrent_layer
            ),
            span_predictor=IndependentBoundsGrouped(aggregate="sum")
        )
    elif mode == "confidence":
        answer_encoder = DenseMultiSpanAnswerEncoder()
        predictor = ConfidencePredictor(
            ChainBiMapper(
                first_layer=recurrent_layer,
                second_layer=recurrent_layer,
            ),
            AttentionEncoder(),
            FullyConnected(80, activation="tanh"),
            aggregate="sum"
        )
    elif mode == "sigmoid":
        answer_encoder = DenseMultiSpanAnswerEncoder()
        predictor = BoundsPredictor(
            ChainBiMapper(
                first_layer=recurrent_layer,
                second_layer=recurrent_layer
            ),
            span_predictor=IndependentBoundsSigmoidLoss()
        )
    elif mode == "paragraph" or mode == "merge":
        answer_encoder = MultiChoiceAnswerEncoder()
        predictor = MultiChoicePredictor(4)
    else:
        raise NotImplementedError(mode)

    return Attention(
        encoder=DocumentAndQuestionEncoder(answer_encoder),
        word_embed=FixedWordEmbedder(vec_name="glove.840B.300d", word_vec_init_scale=0, learn_unk=False, cpu=True),
        char_embed=CharWordEmbedder(
            LearnedCharEmbedder(word_size_th=14, char_th=char_th, char_dim=20, init_scale=0.05, force_cpu=True),
            MaxPool(Conv1d(100, 5, 0.8)),
            shared_parameters=True
        ),
        preprocess=preprocess,
        word_embed_layer=None,
        embed_mapper=SequenceMapperSeq(
            VariationalDropoutLayer(0.8),
            recurrent_layer,
            VariationalDropoutLayer(0.8),
        ),
        question_mapper=None,
        context_mapper=None,
        memory_builder=NullBiMapper(),
        attention=BiAttention(TriLinear(bias=True), True),
        match_encoder=SequenceMapperSeq(FullyConnected(dim * 2, activation="relu"),
                                        ResidualLayer(SequenceMapperSeq(
                                            VariationalDropoutLayer(0.8),
                                            recurrent_layer,
                                            VariationalDropoutLayer(0.8),
                                            StaticAttentionSelf(TriLinear(bias=True), ConcatWithProduct()),
                                            FullyConnected(dim * 2, activation="relu"),
                                        )),
                                        VariationalDropoutLayer(0.8)),
        predictor=predictor
    )
Example No. 5
def build_model(preprocess: Optional[TextPreprocessor], train_config, use_cudnn=False):
    if use_cudnn:
        print('Using cuDNN')
        recurrent_layer = CudnnGru(train_config.dim, w_init=TruncatedNormal(stddev=train_config.recurrent_stdev))
    else:
        recurrent_layer = BiRecurrentMapper(CompatGruCellSpec(train_config.dim))

    lm_reduce = MapperSeq(
        ElmoLayer(
            train_config.l2,
            layer_norm=train_config.lm_layernorm,
            top_layer_only=train_config.top_layer_only
        ),
        DropoutLayer(train_config.elmo_dropout),
    )

    answer_encoder = GroupedSpanAnswerEncoder()
    predictor = BoundsPredictor(
        ChainBiMapper(
            first_layer=recurrent_layer,
            second_layer=recurrent_layer
        ),
        span_predictor=IndependentBoundsGrouped(aggregate="sum")
    )
    word_embed = FixedWordEmbedder(
        vec_name=train_config.word_vectors,
        word_vec_init_scale=0,
        learn_unk=train_config.learn_unk_vector,
        cpu=True
    )
    char_embed = CharWordEmbedder(
        LearnedCharEmbedder(
            word_size_th=14,
            char_th=train_config.char_th,
            char_dim=train_config.char_dim,
            init_scale=0.05,
            force_cpu=True
        ),
        MaxPool(Conv1d(100, 5, 0.8)),
        shared_parameters=True
    )
    embed_mapper = SequenceMapperSeq(
        VariationalDropoutLayer(train_config.var_dropout),
        recurrent_layer,
        VariationalDropoutLayer(train_config.var_dropout)
    )
    attention = BiAttention(TriLinear(bias=True), True)
    match_encoder = SequenceMapperSeq(
        FullyConnected(train_config.dim * 2, activation="relu"),
        ResidualLayer(SequenceMapperSeq(
            VariationalDropoutLayer(train_config.var_dropout),
            recurrent_layer,
            VariationalDropoutLayer(train_config.var_dropout),
            StaticAttentionSelf(TriLinear(bias=True), ConcatWithProduct()),
            FullyConnected(train_config.dim * 2, activation="relu"),
        )),
        VariationalDropoutLayer(train_config.var_dropout)
    )
    lm_model = LanguageModel(LM_VOCAB, LM_OPTIONS, LM_WEIGHTS, LM_TOKEN_WEIGHTS)
    model = CapeAttentionWithElmo(
        encoder=DocumentAndQuestionEncoder(answer_encoder),
        lm_model=lm_model,
        max_batch_size=train_config.max_batch_size,
        preprocess=preprocess,
        per_sentence=False,
        append_embed=(train_config.elmo_mode == "both" or train_config.elmo_mode == "input"),
        append_before_atten=(train_config.elmo_mode == "both" or train_config.elmo_mode == "output"),
        word_embed=word_embed,
        char_embed=char_embed,
        embed_mapper=embed_mapper,
        lm_reduce=None,
        lm_reduce_shared=lm_reduce,
        memory_builder=NullBiMapper(),
        attention=attention,
        match_encoder=match_encoder,
        predictor=predictor
    )
    return model
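build_model only reads a fixed set of attributes from train_config. A hypothetical config object (the field names are the ones accessed above; the values here are illustrative placeholders, not the repo's defaults) could be assembled like this:

from types import SimpleNamespace

# Hypothetical config; build_model accesses exactly these fields.
train_config = SimpleNamespace(
    dim=100,
    recurrent_stdev=0.05,
    l2=0.0,
    lm_layernorm=False,
    top_layer_only=False,
    elmo_dropout=0.5,
    word_vectors="glove.840B.300d",
    learn_unk_vector=False,
    char_th=14,
    char_dim=20,
    var_dropout=0.8,
    max_batch_size=128,
    elmo_mode="both",
)

# model = build_model(preprocess=None, train_config=train_config, use_cudnn=True)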


def main():
    parser = argparse.ArgumentParser("Train rejection model on SQuAD")

    parser.add_argument("--corpus_dir", type=str, default="~/data/document-qa")
    parser.add_argument("--output_dir",
                        type=str,
                        default="~/model/document-qa/squad")
    parser.add_argument("--lm_dir", type=str, default="~/data/lm")
    parser.add_argument("--exp_id", type=str, default="rejection")

    parser.add_argument("--lr", type=float, default=0.5)
    parser.add_argument("--epoch", type=int, default=20)

    parser.add_argument("--dim", type=int, default=100)
    parser.add_argument("--batch_size", type=int, default=45)

    parser.add_argument("--l2", type=float, default=0)
    parser.add_argument("--mode",
                        choices=["input", "output", "both", "none"],
                        default="both")
    parser.add_argument("--top_layer_only", action="store_true")

    args = parser.parse_args()

    print("Arguments : ", args)

    out = args.output_dir + "_" + args.exp_id + "_lr" + str(
        args.lr) + "-" + datetime.now().strftime("%m%d-%H%M%S")
    dim = args.dim
    batch_size = args.batch_size
    out = expanduser(out)
    lm_dir = expanduser(args.lm_dir)
    corpus_dir = expanduser(args.corpus_dir)

    print("Make global recurrent_layer...")
    recurrent_layer = CudnnGru(
        dim, w_init=tf.keras.initializers.TruncatedNormal(stddev=0.05))
    params = trainer.TrainParams(trainer.SerializableOptimizer(
        "Adadelta", dict(learning_rate=args.lr)),
                                 ema=0.999,
                                 max_checkpoints_to_keep=2,
                                 async_encoding=10,
                                 num_epochs=args.epoch,
                                 log_period=30,
                                 eval_period=1200,
                                 save_period=1200,
                                 best_weights=("dev", "b17/text-f1"),
                                 eval_samples=dict(dev=None, train=8000))

    lm_reduce = MapperSeq(
        ElmoLayer(args.l2,
                  layer_norm=False,
                  top_layer_only=args.top_layer_only),
        DropoutLayer(0.5),
    )

    model = AttentionWithElmo(
        encoder=DocumentAndQuestionEncoder(SingleSpanAnswerEncoder()),
        lm_model=SquadContextConcatSkip(lm_dir=lm_dir),
        append_before_atten=(args.mode == "both" or args.mode == "output"),
        append_embed=(args.mode == "both" or args.mode == "input"),
        max_batch_size=128,
        word_embed=FixedWordEmbedder(vec_name="glove.840B.300d",
                                     word_vec_init_scale=0,
                                     learn_unk=False,
                                     cpu=True),
        char_embed=CharWordEmbedder(LearnedCharEmbedder(word_size_th=14,
                                                        char_th=49,
                                                        char_dim=20,
                                                        init_scale=0.05,
                                                        force_cpu=True),
                                    MaxPool(Conv1d(100, 5, 0.8)),
                                    shared_parameters=True),
        embed_mapper=SequenceMapperSeq(
            VariationalDropoutLayer(0.8),
            recurrent_layer,
            VariationalDropoutLayer(0.8),
        ),
        lm_reduce=None,
        lm_reduce_shared=lm_reduce,
        per_sentence=False,
        memory_builder=NullBiMapper(),
        attention=BiAttention(TriLinear(bias=True), True),
        match_encoder=SequenceMapperSeq(
            FullyConnected(dim * 2, activation="relu"),
            ResidualLayer(
                SequenceMapperSeq(
                    VariationalDropoutLayer(0.8),
                    recurrent_layer,
                    VariationalDropoutLayer(0.8),
                    StaticAttentionSelf(TriLinear(bias=True),
                                        ConcatWithProduct()),
                    FullyConnected(dim * 2, activation="relu"),
                )), VariationalDropoutLayer(0.8)),
        predictor=BoundsPredictor(
            ChainBiMapper(first_layer=recurrent_layer,
                          second_layer=recurrent_layer)))

    batcher = ClusteredBatcher(batch_size, ContextLenKey(), False, False)
    data = DocumentQaTrainingData(SquadCorpus(corpus_dir), None, batcher,
                                  batcher)

    with open(__file__, "r") as f:
        notes = f.read()
        notes = str(sorted(args.__dict__.items(),
                           key=lambda x: x[0])) + "\n" + notes

    trainer.start_training(
        data, model, params,
        [LossEvaluator(),
         SpanEvaluator(bound=[17], text_eval="squad")], ModelDir(out), notes)
Example No. 7
def __init__(self, hidden_dim):
    # control
    self.control_lin = FullyConnected(hidden_dim)
    self.attn = FullyConnected(1)
    # read
    self.mem_drop = DropoutLayer(0.85)
    self.read_drop = DropoutLayer(0.85)
    self.mem_proj = FullyConnected(hidden_dim)
    self.kb_proj = FullyConnected(hidden_dim)
    self.concat = FullyConnected(hidden_dim)
    self.concat2 = FullyConnected(hidden_dim)
    self.bi = CtrlBiAttention(TriLinear(bias=True))
    self.lin = FullyConnected(hidden_dim)
    self.rattn = FullyConnected(1)
    # write
    self.write = FullyConnected(hidden_dim)
    self.gate = FullyConnected(1)
Example No. 8
class Mac(Configurable):
    def __init__(self, hidden_dim):
        # control
        self.control_lin = FullyConnected(hidden_dim)
        self.attn = FullyConnected(1)
        # read
        self.mem_drop = DropoutLayer(0.85)
        self.read_drop = DropoutLayer(0.85)
        self.mem_proj = FullyConnected(hidden_dim)
        self.kb_proj = FullyConnected(hidden_dim)
        self.concat = FullyConnected(hidden_dim)
        self.concat2 = FullyConnected(hidden_dim)
        self.bi = CtrlBiAttention(TriLinear(bias=True))
        self.lin = FullyConnected(hidden_dim)
        self.rattn = FullyConnected(1)
        # write
        self.write = FullyConnected(hidden_dim)
        self.gate = FullyConnected(1)

    def apply(self,
              is_train,
              document,
              question_words,
              question_vec,
              prev_cont,
              position_aware_cont,
              prev_mem,
              reuse,
              document_mask=None,
              question_mask=None):
        # control unit
        with tf.variable_scope("control", reuse=reuse):
            control = tf.concat([prev_cont, position_aware_cont],
                                axis=1)  # B, 2xF
            control_question = self.control_lin.apply(is_train,
                                                      control)  # B, F
            control_question = tf.expand_dims(control_question,
                                              axis=1)  # B, 1, F
            context_prod = control_question * question_words  # B, L, F
            attn_weight = tf.squeeze(self.attn.apply(is_train, context_prod),
                                     axis=2)  # B, L
            if question_mask is not None:
                m = tf.sequence_mask(question_mask)
                attn_weight += VERY_NEGATIVE_NUMBER * (
                    1 - tf.cast(m, context_prod.dtype))
            ctrl_attn = tf.nn.softmax(attn_weight, 1)  # B, L
            attn = tf.expand_dims(ctrl_attn, axis=2)  # B, L, 1
            next_control = tf.reduce_sum(attn * question_words, axis=1)  # B, F
        # read unit
        with tf.variable_scope("read", reuse=reuse):
            last_mem = self.mem_drop.apply(is_train, prev_mem)
            know = self.read_drop.apply(is_train, document)
            proj_mem = tf.expand_dims(self.mem_proj.apply(is_train, last_mem),
                                      axis=1)
            proj_know = self.kb_proj.apply(is_train, know)
            concat = self.concat.apply(
                is_train, tf.concat([proj_mem * proj_know, proj_know], axis=2))
            out = tf.nn.elu(
                self.lin.apply(
                    is_train,
                    self.bi.apply(is_train, concat, question_words, ctrl_attn,
                                  document_mask, question_mask)))
            attn = self.read_drop.apply(is_train, out)
            attn = tf.squeeze(self.rattn.apply(is_train, attn), axis=-1)
            if document_mask is not None:
                m = tf.sequence_mask(document_mask)
                attn += VERY_NEGATIVE_NUMBER * (1 - tf.cast(m, attn.dtype))
            attn = tf.expand_dims(tf.nn.softmax(attn, 1), axis=2)
            read = tf.reduce_sum(attn * know, axis=1)
        # write unit, with memory gate.
        with tf.variable_scope("write", reuse=reuse):
            concat = self.write.apply(
                is_train, tf.concat([read, prev_mem, next_control], axis=1))
            gate = tf.sigmoid(self.gate.apply(is_train, next_control) + 1.0)
            next_mem = gate * prev_mem + (1 - gate) * concat
        # return results of cell!
        return next_control, next_mem, out
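The write unit at the end of Mac.apply interpolates between the previous memory and the new candidate with a per-example scalar gate; the +1.0 added to the gate logit biases the cell towards keeping the old memory. A self-contained NumPy sketch of that step (shapes and values are illustrative only):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

batch, hidden = 2, 4
prev_mem = np.random.randn(batch, hidden)
candidate = np.random.randn(batch, hidden)   # stands in for self.write.apply([read, prev_mem, control])
gate_logit = np.random.randn(batch, 1)       # stands in for self.gate.apply(next_control)

gate = sigmoid(gate_logit + 1.0)             # sigmoid(x + 1) > sigmoid(x): favors retaining prev_mem
next_mem = gate * prev_mem + (1.0 - gate) * candidate
print(next_mem.shape)                        # (2, 4)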