def __init__(self,
             kind: str,
             n_units,
             n_layers=1,
             # It's not obvious how to compute fan_in/fan_out for these models,
             # so we recommend avoiding Glorot initialization for now
             w_init=TruncatedNormal(stddev=0.05),
             recurrent_init=None,
             bidirectional=True,
             learn_initial_states: bool = False,
             lstm_bias=1,
             keep_recurrent: float = 1):
    if bidirectional is None or n_layers is None or n_units is None:
        raise ValueError("`bidirectional`, `n_layers`, and `n_units` must not be None")
    if kind not in ["GRU", "LSTM"]:
        raise ValueError("`kind` must be \"GRU\" or \"LSTM\", got %r" % kind)
    self._kind = kind
    self.keep_recurrent = keep_recurrent
    self.lstm_bias = lstm_bias
    self.n_units = n_units
    self.n_layers = n_layers
    self.bidirectional = bidirectional
    self.w_init = w_init
    self.recurrent_init = recurrent_init
    self.learn_initial_states = learn_initial_states
def __init__(self,
             n_units,
             n_layers=1,
             lstm_bias=1,
             w_init=TruncatedNormal(stddev=0.05),
             recurrent_init=None,
             bidirectional=True,
             learn_initial_states=False):
    super().__init__("LSTM", n_units, n_layers, w_init, recurrent_init,
                     bidirectional, learn_initial_states, lstm_bias)
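# A minimal construction sketch for the CuDNN recurrent layers above. The
# subclass name is assumed to be `CudnnLstm` (inferred from the "LSTM" kind it
# passes to super()); `CudnnGru` is the GRU counterpart used elsewhere in this
# codebase. Illustration only; never called.
def _recurrent_layer_sketch():
    # A bidirectional GRU with 90 units per direction, matching main() below
    gru = CudnnGru(90, w_init=TruncatedNormal(stddev=0.05))
    # The base __init__ rejects any `kind` other than "GRU"/"LSTM" with a ValueError
    return gru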
def __init__(self, num_mac_cells: int, hidden_dim: int):
    self.cells = num_mac_cells
    self.mac = Mac(hidden_dim)
    self.hidden_dim = hidden_dim
    self.acts = []
    # Bidirectional question encoder: hidden_dim // 2 units per direction,
    # so its outputs have size hidden_dim
    self.qenc = CudnnGru(hidden_dim // 2, w_init=TruncatedNormal(stddev=0.05))
    self.question_drop = DropoutLayer(0.92)
    self.control_proj = FullyConnected(hidden_dim)
    # One "act" projection layer per MAC cell
    for _ in range(num_mac_cells):
        self.acts.append(FullyConnected(hidden_dim))
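# A usage sketch for the MAC module above, mirroring get_model() below, which
# builds MacNetwork(2, dim * 2). The class name `MacNetwork` is taken from that
# call; illustration only, never invoked.
def _mac_network_sketch(dim: int = 90):
    network = MacNetwork(2, dim * 2)
    # One "act" projection per MAC cell, plus the shared control projection
    assert len(network.acts) == 2
    return network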
def main():
    parser = argparse.ArgumentParser(description="Train our ELMo model on SQuAD")
    parser.add_argument("loss_mode", choices=['default', 'confidence'])
    parser.add_argument("output_dir")
    parser.add_argument("--dim", type=int, default=90)
    parser.add_argument("--l2", type=float, default=0)
    parser.add_argument("--mode", choices=["input", "output", "both", "none"],
                        default="both")
    parser.add_argument("--top_layer_only", action="store_true")
    parser.add_argument("--no-tfidf", action='store_true',
                        help="Don't add TF-IDF negative examples")
    args = parser.parse_args()

    out = args.output_dir + "-" + datetime.now().strftime("%m%d-%H%M%S")

    dim = args.dim
    recurrent_layer = CudnnGru(dim, w_init=TruncatedNormal(stddev=0.05))

    if args.loss_mode == 'default':
        n_epochs = 24
        answer_encoder = SingleSpanAnswerEncoder()
        predictor = BoundsPredictor(ChainBiMapper(
            first_layer=recurrent_layer,
            second_layer=recurrent_layer
        ))
        batcher = ClusteredBatcher(45, ContextLenKey(), False, False)
        data = DocumentQaTrainingData(SquadCorpus(), None, batcher, batcher)
    elif args.loss_mode == 'confidence':
        if args.no_tfidf:
            prepro = SquadDefault()
            n_epochs = 15
        else:
            prepro = SquadTfIdfRanker(NltkPlusStopWords(True), 4, True)
            n_epochs = 50
        answer_encoder = DenseMultiSpanAnswerEncoder()
        predictor = ConfidencePredictor(
            ChainBiMapper(
                first_layer=recurrent_layer,
                second_layer=recurrent_layer,
            ),
            AttentionEncoder(),
            FullyConnected(80, activation="tanh"),
            aggregate="sum"
        )
        eval_dataset = RandomParagraphSetDatasetBuilder(100, 'flatten', True, 0)
        train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True, False)
        data = PreprocessedData(SquadCorpus(), prepro,
                                StratifyParagraphsBuilder(train_batching, 1),
                                eval_dataset,
                                eval_on_verified=False)
        data.preprocess(1)

    params = trainer.TrainParams(trainer.SerializableOptimizer("Adadelta", dict(learning_rate=1.0)),
                                 ema=0.999, max_checkpoints_to_keep=2,
                                 async_encoding=10, num_epochs=n_epochs,
                                 log_period=30, eval_period=1200, save_period=1200,
                                 best_weights=("dev", "b17/text-f1"),
                                 eval_samples=dict(dev=None, train=8000))

    lm_reduce = MapperSeq(
        ElmoLayer(args.l2, layer_norm=False, top_layer_only=args.top_layer_only),
        DropoutLayer(0.5),
    )
    model = AttentionWithElmo(
        encoder=DocumentAndQuestionEncoder(answer_encoder),
        lm_model=SquadContextConcatSkip(),
        append_before_atten=(args.mode == "both" or args.mode == "output"),
        append_embed=(args.mode == "both" or args.mode == "input"),
        max_batch_size=128,
        word_embed=FixedWordEmbedder(vec_name="glove.840B.300d", word_vec_init_scale=0,
                                     learn_unk=False, cpu=True),
        char_embed=CharWordEmbedder(
            LearnedCharEmbedder(word_size_th=14, char_th=49, char_dim=20,
                                init_scale=0.05, force_cpu=True),
            MaxPool(Conv1d(100, 5, 0.8)),
            shared_parameters=True
        ),
        embed_mapper=SequenceMapperSeq(
            VariationalDropoutLayer(0.8),
            recurrent_layer,
            VariationalDropoutLayer(0.8),
        ),
        lm_reduce=None,
        lm_reduce_shared=lm_reduce,
        per_sentence=False,
        memory_builder=NullBiMapper(),
        attention=BiAttention(TriLinear(bias=True), True),
        match_encoder=SequenceMapperSeq(
            FullyConnected(dim * 2, activation="relu"),
            # Residual self-attention block over the passage
            ResidualLayer(SequenceMapperSeq(
                VariationalDropoutLayer(0.8),
                recurrent_layer,
                VariationalDropoutLayer(0.8),
                StaticAttentionSelf(TriLinear(bias=True), ConcatWithProduct()),
                FullyConnected(dim * 2, activation="relu"),
            )),
            VariationalDropoutLayer(0.8)
        ),
        predictor=predictor
    )

    # Record this script and the run's sorted arguments alongside the saved model
    with open(__file__, "r") as f:
        notes = f.read()
    notes = str(sorted(args.__dict__.items(), key=lambda x: x[0])) + "\n" + notes

    trainer.start_training(data, model, params,
                           [LossEvaluator(), SpanEvaluator(bound=[17], text_eval="squad")],
                           ModelDir(out), notes)
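# Illustrative invocation (the script filename here is hypothetical):
#   python train_squad_elmo.py confidence /path/to/models/elmo --dim 90 --mode both
# Standard entry-point guard so the script runs main() when executed directly:
if __name__ == "__main__":
    main()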
def get_model(char_th: int, dim: int, mode: str, preprocess: Optional[TextPreprocessor]):
    recurrent_layer = CudnnGru(dim, w_init=TruncatedNormal(stddev=0.05))

    if mode.startswith("shared-norm"):
        answer_encoder = GroupedSpanAnswerEncoder()
        predictor = BoundsPredictor(
            ChainBiMapper(first_layer=recurrent_layer, second_layer=recurrent_layer),
            span_predictor=IndependentBoundsGrouped(aggregate="sum")
        )
    elif mode == "confidence":
        answer_encoder = DenseMultiSpanAnswerEncoder()
        predictor = ConfidencePredictor(
            ChainBiMapper(
                first_layer=recurrent_layer,
                second_layer=recurrent_layer,
            ),
            AttentionEncoder(),
            FullyConnected(80, activation="tanh"),
            aggregate="sum"
        )
    elif mode == "sigmoid":
        answer_encoder = DenseMultiSpanAnswerEncoder()
        predictor = BoundsPredictor(
            ChainBiMapper(first_layer=recurrent_layer, second_layer=recurrent_layer),
            span_predictor=IndependentBoundsSigmoidLoss()
        )
    elif mode == "paragraph" or mode == "merge":
        answer_encoder = DenseMultiSpanAnswerEncoder()
        predictor = BoundsPredictor(
            ChainBiMapper(first_layer=recurrent_layer, second_layer=recurrent_layer)
        )
    else:
        raise NotImplementedError(mode)

    return MacAttention(
        encoder=DocumentAndQuestionEncoder(answer_encoder),
        word_embed=FixedWordEmbedder(vec_name="glove.840B.300d", word_vec_init_scale=0,
                                     learn_unk=False, cpu=True),
        char_embed=CharWordEmbedder(
            LearnedCharEmbedder(word_size_th=14, char_th=char_th, char_dim=20,
                                init_scale=0.05, force_cpu=True),
            MaxPool(Conv1d(100, 5, 0.8)),
            shared_parameters=True
        ),
        preprocess=preprocess,
        word_embed_layer=None,
        embed_mapper=SequenceMapperSeq(
            VariationalDropoutLayer(0.8),
            recurrent_layer,
            VariationalDropoutLayer(0.8),
        ),
        question_mapper=None,
        context_mapper=None,
        memory_builder=NullBiMapper(),
        mac=MacNetwork(2, dim * 2),
        match_encoder=SequenceMapperSeq(
            FullyConnected(dim * 2, activation="relu"),
            ResidualLayer(SequenceMapperSeq(
                VariationalDropoutLayer(0.8),
                recurrent_layer,
                VariationalDropoutLayer(0.8),
                StaticAttentionSelf(TriLinear(bias=True), ConcatWithProduct()),
                FullyConnected(dim * 2, activation="relu"),
            )),
            VariationalDropoutLayer(0.8)
        ),
        predictor=predictor
    )
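# A minimal usage sketch for get_model(). The argument values mirror those used
# in main() above (char_th=49, dim=90); passing preprocess=None relies on the
# Optional[TextPreprocessor] signature. Illustration only, never invoked.
def _get_model_sketch():
    model = get_model(char_th=49, dim=90, mode="shared-norm", preprocess=None)
    return model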