def test_tokens_dictfeat_contextual(self):
    """Script a Seq2SeqModel with dict and contextual features and call it once.

    The model is built with a gazetteer (dict) tensorizer and a byte-token
    tensorizer for contextual embeddings, exported via ``torchscriptify()``,
    and invoked on a two-token utterance. The only check is that the scripted
    module returns something other than ``None``.
    """
    # TODO (T65593688): this should be removed after
    # https://github.com/pytorch/pytorch/pull/33645 is merged.
    with torch.no_grad():
        config = Seq2SeqModel.Config(
            source_embedding=WordEmbedding.Config(embed_dim=512),
            target_embedding=WordEmbedding.Config(embed_dim=512),
            inputs=Seq2SeqModel.Config.ModelInput(
                dict_feat=GazetteerTensorizer.Config(
                    text_column="source_sequence"
                ),
                contextual_token_embedding=ByteTokenTensorizer.Config(),
            ),
            # NOTE(review): 619 presumably equals 512 (word) + 7 (contextual)
            # + the default DictEmbedding width -- confirm against defaults.
            encoder_decoder=RNNModel.Config(
                encoder=LSTMSequenceEncoder.Config(embed_dim=619)
            ),
            dict_embedding=DictEmbedding.Config(),
            contextual_token_embedding=ContextualTokenEmbedding.Config(
                embed_dim=7
            ),
        )
        tensorizers = get_tensorizers(
            add_dict_feat=True, add_contextual_feat=True
        )
        model = Seq2SeqModel.from_config(config, tensorizers)
        model.eval()

        scripted = model.torchscriptify()
        # Arguments: tokens, a dict-feature triple, and a flat list of
        # 7 * 2 floats (7 matches the contextual embed_dim, 2 the token count).
        res = scripted(
            ["call", "mom"],
            (["call", "mom"], [0.42, 0.17], [4, 3]),
            [0.42] * (7 * 2),
        )
        assert res is not None
def test_reset_incremental_states(self):
    """
    This test might seem trivial. However, interacting with the scripted
    sequence generator crosses the Torchscript boundary, which can lead
    to weird behavior. If the incremental states don't get properly
    reset, the model will produce garbage _after_ the first call, which
    is a pain to debug when you only catch it after training.
    """
    tensorizers = get_tensorizers()

    # Avoid numeric issues with quantization by setting a known seed.
    torch.manual_seed(42)

    model_config = Seq2SeqModel.Config(
        source_embedding=WordEmbedding.Config(embed_dim=512),
        target_embedding=WordEmbedding.Config(embed_dim=512),
    )
    model = Seq2SeqModel.from_config(model_config, tensorizers)

    # Get sample inputs using a data source.
    schema = {
        "source_sequence": str,
        "dict_feat": Gazetteer,
        "target_sequence": str,
    }
    source_config = TSVDataSource.Config(
        train_filename=TEST_FILE_NAME,
        field_names=["source_sequence", "dict_feat", "target_sequence"],
    )
    data = Data.from_config(
        Data.Config(source=source_config), schema, tensorizers
    )
    data.batcher = Batcher(1, 1, 1)

    # Pull a single (raw_batch, batch) pair from the training stage.
    train_batches = data.batches(Stage.TRAIN, load_early=True)
    raw_batch, batch = next(iter(train_batches))
    inputs = model.arrange_model_inputs(batch)

    model.eval()
    outputs = model(*inputs)
    test_context = {"stage": Stage.TEST}
    pred, scores = model.get_pred(outputs, test_context)

    # Verify that the incremental states reset correctly.
    decoder = model.sequence_generator.beam_search.decoder_ens
    decoder.reset_incremental_states()
    self.assertDictEqual(decoder.incremental_states, {"0": {}})

    # Verify that the model returns the same predictions.
    new_pred, new_scores = model.get_pred(outputs, {"stage": Stage.TEST})
    self.assertEqual(new_scores, scores)
def test_tokens(self):
    """Script a tokens-only Seq2SeqModel and check that a call returns a result."""
    config = Seq2SeqModel.Config(
        source_embedding=WordEmbedding.Config(embed_dim=512),
        target_embedding=WordEmbedding.Config(embed_dim=512),
    )
    model = Seq2SeqModel.from_config(config, get_tensorizers())
    model.eval()

    # Export to TorchScript and run the scripted module on a two-token input.
    scripted = model.torchscriptify()
    res = scripted(["call", "mom"])
    assert res is not None
class Config(Model.Config):
    """Model config: tensorizer inputs plus embedding, encoder/decoder,
    output-layer and sequence-generator sub-configs, all with defaults."""

    class ModelInput(Model.Config.ModelInput):
        """Tensorizer configs consumed by the model."""

        # Token tensorizers for the two sequences (names suggest
        # source / target).
        src_seq_tokens: TokenTensorizer.Config = TokenTensorizer.Config()
        trg_seq_tokens: TokenTensorizer.Config = TokenTensorizer.Config()
        # Optional gazetteer features; off (None) by default.
        dict_feat: Optional[GazetteerTensorizer.Config] = None

    inputs: ModelInput = ModelInput()
    encoder_decoder: RNNModel.Config = RNNModel.Config()
    source_embedding: WordEmbedding.Config = WordEmbedding.Config()
    target_embedding: WordEmbedding.Config = WordEmbedding.Config()
    # Optional; None by default, like the dict_feat input above.
    dict_embedding: Optional[DictEmbedding.Config] = None
    output_layer: Seq2SeqOutputLayer.Config = Seq2SeqOutputLayer.Config()
    sequence_generator: ScriptedSequenceGenerator.Config = (
        ScriptedSequenceGenerator.Config()
    )
def test_tokens(self):
    """Script a tokens-only Seq2SeqModel under ``no_grad`` and call it once."""
    # TODO: this should be removed after
    # https://github.com/pytorch/pytorch/pull/33645 is merged.
    with torch.no_grad():
        config = Seq2SeqModel.Config(
            source_embedding=WordEmbedding.Config(embed_dim=512),
            target_embedding=WordEmbedding.Config(embed_dim=512),
        )
        model = Seq2SeqModel.from_config(config, get_tensorizers())
        model.eval()

        # Export to TorchScript and run the scripted module on two tokens.
        scripted = model.torchscriptify()
        res = scripted(["call", "mom"])
        assert res is not None
class Config(Model.Config):
    """Model config pairing token inputs, doc/word labels and weights with an
    IntentSlot decoder and output layer."""

    class ModelInput(Model.Config.ModelInput):
        """Tensorizer configs consumed by the model."""

        tokens: TokenTensorizer.Config = TokenTensorizer.Config()
        # Both label tensorizers are configured with allow_unknown=True.
        word_labels: SlotLabelTensorizer.Config = SlotLabelTensorizer.Config(
            allow_unknown=True
        )
        doc_labels: LabelTensorizer.Config = LabelTensorizer.Config(
            allow_unknown=True
        )
        # Float inputs read from the "doc_weight" / "word_weight" columns.
        doc_weight: FloatTensorizer.Config = FloatTensorizer.Config(
            column="doc_weight"
        )
        word_weight: FloatTensorizer.Config = FloatTensorizer.Config(
            column="word_weight"
        )

    inputs: ModelInput = ModelInput()
    word_embedding: WordEmbedding.Config = WordEmbedding.Config()
    # One of three representation configs; BiLSTMDocSlotAttention by default.
    representation: Union[
        BiLSTMDocSlotAttention.Config,
        JointCNNRepresentation.Config,
        PassThroughRepresentation.Config,
    ] = BiLSTMDocSlotAttention.Config()
    output_layer: IntentSlotOutputLayer.Config = IntentSlotOutputLayer.Config()
    decoder: IntentSlotModelDecoder.Config = IntentSlotModelDecoder.Config()
    default_doc_loss_weight: float = 0.2
    default_word_loss_weight: float = 0.5
class Config(BaseModel.Config):
    """Model config: SQuAD-style inputs, a pretrained word embedding, two
    stacked bidirectional RNNs and a SQuAD output layer."""

    class ModelInput(BaseModel.Config.ModelInput):
        """Tensorizer configs consumed by the model."""

        squad_input: SquadTensorizer.Config = SquadTensorizer.Config()
        # Label read from the "has_answer" column.
        has_answer: LabelTensorizer.Config = LabelTensorizer.Config(
            column="has_answer"
        )

    # Model inputs.
    inputs: ModelInput = ModelInput()

    # Configurable modules for the model.
    dropout: float = 0.4  # Overrides dropout in sub-modules of the model.
    embedding: WordEmbedding.Config = WordEmbedding.Config(
        embed_dim=300,
        pretrained_embeddings_path=GLOVE_840B_300D,
        vocab_from_pretrained_embeddings=True,
    )
    # `dropout` below is the class-level name defined above; it is read while
    # the class body executes, so these defaults capture the value 0.4.
    ques_rnn: StackedBidirectionalRNN.Config = StackedBidirectionalRNN.Config(
        dropout=dropout
    )
    doc_rnn: StackedBidirectionalRNN.Config = StackedBidirectionalRNN.Config(
        dropout=dropout
    )

    # Output layer.
    output_layer: SquadOutputLayer.Config = SquadOutputLayer.Config()
class Config(BasePairwiseModel.Config):
    """
    Attributes:
        encode_relations (bool): if `false`, return the concatenation of the two
            representations; if `true`, also concatenate their pairwise absolute
            difference and pairwise elementwise product (à la arXiv:1705.02364).
            Default: `true`. (Not declared in this class; presumably inherited
            from `BasePairwiseModel.Config` -- TODO confirm.)
        shared_representations (bool): whether to use the same representation,
            with tied weights, for all the input subrepresentations.
            Default: `true`.
    """

    class ModelInput(BasePairwiseModel.Config.ModelInput):
        """Tensorizer configs for the two text inputs and the label."""

        # Token inputs read from the "text1" and "text2" columns.
        tokens1: TokenTensorizer.Config = TokenTensorizer.Config(column="text1")
        tokens2: TokenTensorizer.Config = TokenTensorizer.Config(column="text2")
        labels: LabelTensorizer.Config = LabelTensorizer.Config()
        # for metric reporter
        raw_text: JoinStringTensorizer.Config = JoinStringTensorizer.Config(
            columns=["text1", "text2"]
        )

    inputs: ModelInput = ModelInput()
    embedding: WordEmbedding.Config = WordEmbedding.Config()
    # Either representation config is accepted; BiLSTMDocAttention by default.
    representation: Union[
        BiLSTMDocAttention.Config, DocNNRepresentation.Config
    ] = BiLSTMDocAttention.Config()
    shared_representations: bool = True
class Config(DocModel.Config):
    """DocModel config extended with a user-id input and a user embedding."""

    class PersonalizedModelInput(DocModel.Config.ModelInput):
        """DocModel inputs plus an optional `uid` tensorizer."""

        # Typed Optional, but defaults to a UidTensorizer config (not None).
        uid: Optional[UidTensorizer.Config] = UidTensorizer.Config()

    inputs: PersonalizedModelInput = PersonalizedModelInput()

    # user_embedding is a representation for a user and is jointly trained
    # with the model. Consider user ids as "words" to reuse WordEmbedding class.
    user_embedding: WordEmbedding.Config = WordEmbedding.Config()
class Config(DocModel_Deprecated.Config):
    """Config overriding the inputs and embedding of DocModel_Deprecated.Config."""

    class ModelInput(Model.Config.ModelInput):
        """Tensorizer configs: tokens, labels and the raw text string."""

        tokens: TokenTensorizer.Config = TokenTensorizer.Config()
        labels: LabelTensorizer.Config = LabelTensorizer.Config(
            allow_unknown=True
        )
        # for metric reporter
        raw_text: RawString.Config = RawString.Config(column="text")

    inputs: ModelInput = ModelInput()
    embedding: WordEmbedding.Config = WordEmbedding.Config()
class Config(DocModel.Config):
    """Config overriding the inputs and embedding of DocModel.Config."""

    class ModelInput(Model.Config.ModelInput):
        """Tensorizer configs: word tokens, labels and the raw text."""

        tokens: WordTensorizer.Config = WordTensorizer.Config()
        labels: LabelTensorizer.Config = LabelTensorizer.Config(
            allow_unknown=True
        )
        # for metric reporter
        raw_text: MetaInput.Config = MetaInput.Config(column="text")

    inputs: ModelInput = ModelInput()
    embedding: WordEmbedding.Config = WordEmbedding.Config()
def test_tokens_dictfeat(self):
    """Script a Seq2SeqModel with dict features and check a call returns a result."""
    config = Seq2SeqModel.Config(
        source_embedding=WordEmbedding.Config(embed_dim=512),
        target_embedding=WordEmbedding.Config(embed_dim=512),
        inputs=Seq2SeqModel.Config.ModelInput(
            dict_feat=GazetteerTensorizer.Config(text_column="source_sequence")
        ),
        # NOTE(review): 612 presumably equals 512 (word) + the default
        # DictEmbedding width -- confirm against DictEmbedding.Config.
        encoder_decoder=RNNModel.Config(
            encoder=LSTMSequenceEncoder.Config(embed_dim=612)
        ),
        dict_embedding=DictEmbedding.Config(),
    )
    model = Seq2SeqModel.from_config(
        config, get_tensorizers(add_dict_feat=True)
    )
    model.eval()

    # Export to TorchScript; the second argument is the dict-feature triple.
    scripted = model.torchscriptify()
    res = scripted(["call", "mom"], (["call", "mom"], [0.42, 0.17], [4, 3]))
    assert res is not None
def test_tokens_contextual(self):
    """Script a Seq2SeqModel with contextual embeddings and call it once."""
    config = Seq2SeqModel.Config(
        source_embedding=WordEmbedding.Config(embed_dim=512),
        target_embedding=WordEmbedding.Config(embed_dim=512),
        inputs=Seq2SeqModel.Config.ModelInput(
            contextual_token_embedding=ByteTokenTensorizer.Config()
        ),
        contextual_token_embedding=ContextualTokenEmbedding.Config(embed_dim=7),
        # NOTE(review): 519 presumably equals 512 (word) + 7 (contextual)
        # -- confirm.
        encoder_decoder=RNNModel.Config(
            encoder=LSTMSequenceEncoder.Config(embed_dim=519)
        ),
    )
    model = Seq2SeqModel.from_config(
        config, get_tensorizers(add_contextual_feat=True)
    )
    model.eval()

    scripted = model.torchscriptify()
    # Flat list of 7 * 2 floats: 7 matches the contextual embed_dim above,
    # 2 the number of tokens passed.
    res = scripted(
        ["call", "mom"], contextual_token_embedding=[0.42] * (7 * 2)
    )
    assert res is not None
class Config(IntentSlotModel.Config):
    """IntentSlotModel config extended with a sequence-token input, a sequence
    embedding and a contextual representation."""

    class ModelInput(IntentSlotModel.Config.ModelInput):
        """IntentSlotModel inputs plus an optional `seq_tokens` tensorizer."""

        # Typed Optional, but defaults to a SeqTokenTensorizer config.
        seq_tokens: Optional[
            SeqTokenTensorizer.Config
        ] = SeqTokenTensorizer.Config()

    inputs: ModelInput = ModelInput()
    # Typed Optional, but defaults to a WordEmbedding config.
    seq_embedding: Optional[WordEmbedding.Config] = WordEmbedding.Config()
    representation: ContextualIntentSlotRepresentation.Config = (
        ContextualIntentSlotRepresentation.Config()
    )
def create_embedding(cls, config, tensorizers):
    """Build a WordEmbedding sized to the vocabulary of the "tokens" tensorizer.

    Args:
        config: model config; `config.embedding.embed_dim` is read.
        tensorizers: mapping that holds a "tokens" tensorizer exposing `.vocab`.

    Returns:
        A WordEmbedding constructed from positional arguments: vocabulary
        size, embedding dimension, two `None` placeholders, the index of the
        UNK token and an empty list.
    """
    token_vocab = tensorizers["tokens"].vocab
    num_tokens = len(token_vocab)
    embed_dim = config.embedding.embed_dim
    unk_index = token_vocab.idx[SpecialTokens.UNK]
    return WordEmbedding(num_tokens, embed_dim, None, None, unk_index, [])
def __init__(
    self,
    # word embedding config
    pretrained_embeddings_path: str,
    embedding_dim: int,
    mlp_layer_dims: Optional[Sequence[int]] = (150,),
    lowercase_tokens: bool = False,
    skip_header: bool = True,
    delimiter: str = " ",
    # DocNN config
    kernel_num: int = 100,
    kernel_sizes: Optional[Sequence[int]] = (3, 4, 5),
    pooling_type: str = "max",
    dropout: float = 0.4,
    # decoder config
    dense_dim: int = 0,
    decoder_hidden_dims: Optional[Sequence[int]] = (128,),
    out_dim: int = 2,
    vocab: Optional[ScriptVocabulary] = None,
) -> None:
    """Assemble the word embedding, DocNN encoder and MLP decoder.

    The three groups of arguments are forwarded unchanged to
    ``WordEmbedding``, ``DocNNEncoder`` and ``MLPDecoder`` respectively.
    ``vocab`` defaults to ``None`` and is therefore annotated ``Optional``;
    it is passed straight through to ``WordEmbedding``.
    """
    super().__init__()
    self.word_embedding = WordEmbedding(
        pretrained_embeddings_path=pretrained_embeddings_path,
        embedding_dim=embedding_dim,
        mlp_layer_dims=mlp_layer_dims,
        lowercase_tokens=lowercase_tokens,
        skip_header=skip_header,
        delimiter=delimiter,
        vocab=vocab,
    )
    # The encoder input width is whatever the embedding reports as its
    # output dimension.
    self.encoder = DocNNEncoder(
        embed_dim=self.word_embedding.get_output_dim(),
        kernel_num=kernel_num,
        kernel_sizes=kernel_sizes,
        pooling_type=pooling_type,
        dropout=dropout,
    )
    # The decoder input is widened by `dense_dim` on top of the encoder
    # output width.
    self.decoder = MLPDecoder(
        in_dim=self.encoder.out_dim + dense_dim,
        out_dim=out_dim,
        bias=True,
        hidden_dims=decoder_hidden_dims,
    )
def test_force_predictions_on_eval(self):
    """Check `get_pred` at EVAL stage, without and with `force_eval_predictions`.

    By default `get_pred` is expected to return `(None, None)` for
    `Stage.EVAL`; after setting `force_eval_predictions = True` the same call
    is expected to raise `AssertionError`.
    """
    tensorizers = get_tensorizers()

    model_config = Seq2SeqModel.Config(
        source_embedding=WordEmbedding.Config(embed_dim=512),
        target_embedding=WordEmbedding.Config(embed_dim=512),
    )
    model = Seq2SeqModel.from_config(model_config, tensorizers)

    # Get sample inputs using a data source.
    schema = {
        "source_sequence": str,
        "dict_feat": Gazetteer,
        "target_sequence": str,
    }
    source_config = TSVDataSource.Config(
        train_filename=TEST_FILE_NAME,
        field_names=["source_sequence", "dict_feat", "target_sequence"],
    )
    data = Data.from_config(
        Data.Config(source=source_config), schema, tensorizers
    )
    data.batcher = Batcher(1, 1, 1)

    # Pull a single (raw_batch, batch) pair from the training stage.
    train_batches = data.batches(Stage.TRAIN, load_early=True)
    raw_batch, batch = next(iter(train_batches))
    inputs = model.arrange_model_inputs(batch)

    # Verify that model does not run sequence generation on prediction.
    outputs = model(*inputs)
    eval_context = {"stage": Stage.EVAL}
    pred = model.get_pred(outputs, eval_context)
    self.assertEqual(pred, (None, None))

    # Verify that attempting to set force_eval_predictions is correctly
    # accounted for.
    model.force_eval_predictions = True
    with self.assertRaises(AssertionError):
        _ = model.get_pred(outputs, {"stage": Stage.EVAL})
def create_embedding(cls, config, tensorizers):
    """Build a one-element EmbeddingList holding a vocabulary-sized WordEmbedding.

    Args:
        config: model config; `config.word_embedding.embed_dim` is read.
        tensorizers: mapping that holds a "tokens" tensorizer exposing `.vocab`.

    Returns:
        `EmbeddingList([word_embedding], concat=True)`.
    """
    token_vocab = tensorizers["tokens"].vocab
    num_tokens = len(token_vocab)
    embed_dim = config.word_embedding.embed_dim
    unk_index = token_vocab.idx[SpecialTokens.UNK]
    embeddings = [
        WordEmbedding(num_tokens, embed_dim, None, None, unk_index, [])
    ]
    return EmbeddingList(embeddings, concat=True)
class Config(ConfigBase):
    """Model config: token and slot inputs, a word embedding, a BiLSTM slot
    attention representation, an MLP decoder and a tagging output layer."""

    class ModelInput(Model.Config.ModelInput):
        """Tensorizer configs consumed by the model."""

        tokens: TokenTensorizer.Config = TokenTensorizer.Config()
        # A second token tensorizer, reading the "slots" column.
        slots: TokenTensorizer.Config = TokenTensorizer.Config(column="slots")

    inputs: ModelInput = ModelInput()
    embedding: WordEmbedding.Config = WordEmbedding.Config()
    representation: BiLSTMSlotAttention.Config = BiLSTMSlotAttention.Config()
    decoder: MLPDecoder.Config = MLPDecoder.Config()
    output_layer: MyTaggingOutputLayer.Config = MyTaggingOutputLayer.Config()
class Config(Model.Config):
    """Model config: token/label inputs, a word embedding, a selectable
    document representation, an MLP decoder and a classification output layer."""

    class ModelInput(Model.Config.ModelInput):
        """Tensorizer configs consumed by the model."""

        tokens: TokenTensorizer.Config = TokenTensorizer.Config()
        labels: LabelTensorizer.Config = LabelTensorizer.Config()

    inputs: ModelInput = ModelInput()
    embedding: WordEmbedding.Config = WordEmbedding.Config()
    # One of three representation configs; BiLSTMDocAttention by default.
    representation: Union[
        PureDocAttention.Config,
        BiLSTMDocAttention.Config,
        DocNNRepresentation.Config,
    ] = BiLSTMDocAttention.Config()
    decoder: MLPDecoder.Config = MLPDecoder.Config()
    output_layer: ClassificationOutputLayer.Config = (
        ClassificationOutputLayer.Config()
    )
class DocModel(nn.Module):
    """Document model chaining a word embedding, a DocNN encoder and an MLP decoder."""

    def __init__(
        self,
        # word embedding config
        pretrained_embeddings_path: str,
        embedding_dim: int,
        mlp_layer_dims: Optional[Sequence[int]] = (150,),
        lowercase_tokens: bool = False,
        skip_header: bool = True,
        delimiter: str = " ",
        # DocNN config
        kernel_num: int = 100,
        kernel_sizes: Optional[Sequence[int]] = (3, 4, 5),
        pooling_type: str = "max",
        dropout: float = 0.4,
        # decoder config
        dense_dim: int = 0,
        decoder_hidden_dims: Optional[Sequence[int]] = (128,),
        out_dim: int = 2,
        vocab: Optional[ScriptVocabulary] = None,
    ) -> None:
        """Assemble the three sub-modules.

        The three groups of arguments are forwarded unchanged to
        ``WordEmbedding``, ``DocNNEncoder`` and ``MLPDecoder`` respectively.
        ``vocab`` defaults to ``None`` and is therefore annotated
        ``Optional``; it is passed straight through to ``WordEmbedding``.
        """
        super().__init__()
        self.word_embedding = WordEmbedding(
            pretrained_embeddings_path=pretrained_embeddings_path,
            embedding_dim=embedding_dim,
            mlp_layer_dims=mlp_layer_dims,
            lowercase_tokens=lowercase_tokens,
            skip_header=skip_header,
            delimiter=delimiter,
            vocab=vocab,
        )
        # The encoder input width is whatever the embedding reports as its
        # output dimension.
        self.encoder = DocNNEncoder(
            embed_dim=self.word_embedding.get_output_dim(),
            kernel_num=kernel_num,
            kernel_sizes=kernel_sizes,
            pooling_type=pooling_type,
            dropout=dropout,
        )
        # The decoder input is widened by `dense_dim` on top of the encoder
        # output width.
        self.decoder = MLPDecoder(
            in_dim=self.encoder.out_dim + dense_dim,
            out_dim=out_dim,
            bias=True,
            hidden_dims=decoder_hidden_dims,
        )

    def forward(self, inputs: Dict[str, torch.Tensor]) -> torch.Tensor:
        """Run embedding -> encoder -> decoder on a dict of input tensors.

        Args:
            inputs: must contain "token_ids"; may contain "dense", which is
                handed to the decoder alongside the encoder output (``None``
                is passed when the key is absent).

        Returns:
            The decoder output.
        """
        tokens = inputs["token_ids"]
        # Expression kept in its original form on purpose: this module may be
        # compiled with TorchScript, so the lookup is not rewritten.
        denses = inputs["dense"] if "dense" in inputs else None
        word_embedding_output = self.word_embedding(tokens)
        encoder_output = self.encoder(word_embedding_output)
        return self.decoder(encoder_output, denses)
def setUp(self):
    """Create `self.parser`, an RNNGParser over a 7-action vocab, in train mode."""
    # Every action is counted exactly once.
    action_names = [
        "IN:A",
        "IN:B",
        "IN:UNSUPPORTED",
        "REDUCE",
        "SHIFT",
        "SL:C",
        "SL:D",
    ]
    actions_vocab = Vocab(Counter(action_names), specials=[])

    # Word embedding (20-d) and dict embedding (10-d), concatenated.
    embedding = EmbeddingList(
        embeddings=[
            WordEmbedding(
                num_embeddings=5,
                embedding_dim=20,
                embeddings_weight=None,
                init_range=[-1, 1],
                unk_token_idx=4,
                mlp_layer_dims=[],
            ),
            DictEmbedding(
                num_embeddings=4, embed_dim=10, pooling_type=PoolingType.MEAN
            ),
        ],
        concat=True,
    )

    # NOTE(review): the index arguments below match the positions of the
    # names in `action_names`; this presumably relies on Vocab preserving
    # that (alphabetical) order for equal counts -- confirm.
    self.parser = RNNGParser(
        ablation=RNNGParser.Config.AblationParams(),
        constraints=RNNGParser.Config.RNNGConstraints(),
        lstm_num_layers=2,
        lstm_dim=20,
        max_open_NT=10,
        dropout=0.2,
        beam_size=3,
        top_k=3,
        actions_vocab=actions_vocab,
        shift_idx=4,
        reduce_idx=3,
        ignore_subNTs_roots=[2],
        valid_NT_idxs=[0, 1, 2, 5, 6],
        valid_IN_idxs=[0, 1, 2],
        valid_SL_idxs=[5, 6],
        embedding=embedding,
        p_compositional=CompositionalNN(lstm_dim=20),
    )
    self.parser.train()
def from_config(cls, config: Config, tensorizers: Dict[str, Tensorizer]):
    """Construct the model from its config and fitted tensorizers.

    Args:
        config: model config; `embedding.embed_dim`, `representation` and
            `decoder` are read.
        tensorizers: must contain "tokens" (exposing `.vocab`) and "labels"
            (exposing `.labels`).

    Returns:
        `cls(embedding, representation, decoder, output_layer)`.
    """
    token_vocab = tensorizers["tokens"].vocab
    label_names = tensorizers["labels"].labels

    # Embedding table sized to the vocabulary, with the UNK index supplied.
    embedding = WordEmbedding(
        len(token_vocab),
        config.embedding.embed_dim,
        None,
        None,
        token_vocab.idx[UNK],
        [],
    )
    # Each stage is sized from the output width of the previous one.
    representation = create_module(
        config.representation, embed_dim=embedding.embedding_dim
    )
    decoder = create_module(
        config.decoder,
        in_dim=representation.representation_dim,
        out_dim=len(label_names),
    )
    loss = CrossEntropyLoss(None)
    output_layer = ClassificationOutputLayer(label_names, loss)
    return cls(embedding, representation, decoder, output_layer)
class Config(Model.Config):
    """Model config: token/slot-label inputs, a word embedding, a selectable
    representation, a selectable output layer and an MLP decoder."""

    class ModelInput(Model.Config.ModelInput):
        """Tensorizer configs consumed by the model."""

        tokens: TokenTensorizer.Config = TokenTensorizer.Config()
        labels: SlotLabelTensorizer.Config = SlotLabelTensorizer.Config()

    inputs: ModelInput = ModelInput()
    embedding: WordEmbedding.Config = WordEmbedding.Config()
    # PassThroughRepresentation is the current default.
    representation: Union[
        BiLSTMSlotAttention.Config,  # TODO: make default when sorting solved
        BSeqCNNRepresentation.Config,
        PassThroughRepresentation.Config,
    ] = PassThroughRepresentation.Config()
    # Either a plain word-tagging layer (default) or a CRF layer.
    output_layer: Union[
        WordTaggingOutputLayer.Config, CRFOutputLayer.Config
    ] = WordTaggingOutputLayer.Config()
    decoder: MLPDecoder.Config = MLPDecoder.Config()
def setUp(self):
    """Create `self.training_model` (train mode) and `self.inference_model`
    (eval mode) built from its traced embedding and jit model."""
    contextual_emb_dim = 1

    # Word, dict and contextual embeddings, concatenated.
    emb_module = EmbeddingList(
        embeddings=[
            WordEmbedding(num_embeddings=103, embedding_dim=100),
            DictEmbedding(
                num_embeddings=59, embed_dim=10, pooling_type=PoolingType.MEAN
            ),
            ContextualTokenEmbedding(contextual_emb_dim),
        ],
        concat=True,
    )

    trace_input = RNNGModel.get_input_for_trace(contextual_emb_dim)
    self.training_model = RNNGModel(
        input_for_trace=trace_input,
        embedding=emb_module,
        ablation=RNNGParser.Config.AblationParams(),
        constraints=RNNGParser.Config.RNNGConstraints(),
        lstm_num_layers=2,
        lstm_dim=32,
        max_open_NT=10,
        dropout=0.4,
        num_actions=20,
        shift_idx=0,
        reduce_idx=1,
        ignore_subNTs_roots=[8, 15],
        # Every index from 2 through 19.
        valid_NT_idxs=list(range(2, 20)),
        valid_IN_idxs=[2, 4, 7, 8, 10, 12, 13, 14, 15],
        valid_SL_idxs=[3, 5, 6, 9, 11, 16, 17, 18, 19],
        embedding_dim=emb_module.embedding_dim,
        p_compositional=CompositionalNN(lstm_dim=32, device="cpu"),
        device="cpu",
    )
    self.training_model.train()

    # Wrap the traced embedding and jit model with three mock vocabularies.
    self.inference_model = RNNGInference(
        self.training_model.trace_embedding(),
        self.training_model.jit_model,
        MockVocab(["<unk>", "foo", "bar"]),
        MockVocab(["<unk>", "a", "b"]),
        MockVocab(["SHIFT", "REDUCE", "IN:END_CALL", "SL:METHOD_CALL"]),
    )
    self.inference_model.eval()
def build_model(
    model_name: str,
    word_embedding_file: str,
    embedding_dim: int,
    mlp_layer_dims: Sequence[int],
    lowercase_tokens: bool,
    skip_header: bool,
    kernel_num: int,
    kernel_sizes: Sequence[int],
    dropout: float,
    dense_dim: int,
    decoder_hidden_dims: Sequence[int],
):
    """Build a custom doc model.

    Args:
        model_name: class name of the model to build; only
            ``DocModelForBinaryDocClassification`` is recognised.
        word_embedding_file: path used both to build the vocabulary and as
            the pretrained-embeddings path of the word embedding.
        embedding_dim, mlp_layer_dims, lowercase_tokens, skip_header:
            forwarded to ``WordEmbedding``.
        kernel_num, kernel_sizes, dropout, dense_dim, decoder_hidden_dims:
            forwarded to the model constructor.

    Returns:
        The constructed ``DocModelForBinaryDocClassification``.

    Raises:
        RuntimeError: if ``model_name`` is not a recognised model name.
    """
    # Reject an unknown model name up front so that the vocabulary and the
    # embedding are not built only to be thrown away.
    if model_name != DocModelForBinaryDocClassification.__name__:
        raise RuntimeError(f"unknown model_name: {model_name}")

    vocab = build_vocab(word_embedding_file)
    word_embedding = WordEmbedding(
        pretrained_embeddings_path=word_embedding_file,
        vocab=vocab,
        embedding_dim=embedding_dim,
        mlp_layer_dims=mlp_layer_dims,
        lowercase_tokens=lowercase_tokens,
        skip_header=skip_header,
    )
    return DocModelForBinaryDocClassification(
        word_embedding=word_embedding,
        # DocNN config
        kernel_num=kernel_num,
        kernel_sizes=kernel_sizes,
        dropout=dropout,
        # decoder config
        dense_dim=dense_dim,
        decoder_hidden_dims=decoder_hidden_dims,
    )
def create_embedding(cls, config: Config, tensorizers: Dict[str, Tensorizer]):
    """Build a WordEmbedding sized to the vocabulary of the "tokens" tensorizer.

    Args:
        config: model config; `config.embedding.embed_dim` is read.
        tensorizers: must contain a "tokens" tensorizer exposing `.vocab`.

    Returns:
        A WordEmbedding constructed from positional arguments: vocabulary
        size, embedding dimension, two `None` placeholders, the index of
        `UNK` and an empty list.
    """
    token_vocab = tensorizers["tokens"].vocab
    num_tokens = len(token_vocab)
    embed_dim = config.embedding.embed_dim
    unk_index = token_vocab.idx[UNK]
    return WordEmbedding(num_tokens, embed_dim, None, None, unk_index, [])