class Config(DocModel.Config):
    """Config for a document model that combines the inherited word-token
    input with a byte-level representation of each token."""

    class ByteModelInput(DocModel.Config.ModelInput):
        # Byte-level tokens, added alongside the inputs inherited from
        # DocModel.Config.ModelInput.
        token_bytes: ByteTokenTensorizer.Config = ByteTokenTensorizer.Config()

    inputs: ByteModelInput = ByteModelInput()
    # Embedding applied to the per-token byte sequences; kept separate from
    # the inherited word `embedding` config.
    byte_embedding: CharacterEmbedding.Config = CharacterEmbedding.Config()
class Config(WordTaggingModel.Config):
    """Config for a word-tagging model whose tokens are represented by
    their raw bytes (embedded via CharacterEmbedding)."""

    class ByteModelInput(WordTaggingModel.Config.ModelInput):
        # We should support characters as well, but CharacterTokenTensorizer
        # does not support adding characters to vocab yet.
        tokens: ByteTokenTensorizer.Config = ByteTokenTensorizer.Config()

    inputs: ByteModelInput = ByteModelInput()
    # Byte-level embedding config (embed_dim, cnn kernels, highway layers,
    # projection) consumed by create_embedding.
    embedding: CharacterEmbedding.Config = CharacterEmbedding.Config()
def create_embedding(cls, config, tensorizers):
    """Build the byte-level CharacterEmbedding from ``config.embedding``.

    Args:
        config: model config whose ``embedding`` field is a
            CharacterEmbedding.Config (embed_dim, cnn, highway_layers,
            projection_dim).
        tensorizers: mapping of input name -> tensorizer (unused here).

    Returns:
        A CharacterEmbedding over the byte vocabulary.
    """
    # FIX: NUM_BYTES is a class-level constant of ByteTokenTensorizer
    # (it is read via the class elsewhere in this file), so take it from
    # the class directly. The previous tensorizers["token_bytes"] lookup
    # would KeyError when the tensorizer is registered under the key
    # "tokens", as the visible Config in this file declares it.
    return CharacterEmbedding(
        ByteTokenTensorizer.NUM_BYTES,
        config.embedding.embed_dim,
        config.embedding.cnn.kernel_num,
        config.embedding.cnn.kernel_sizes,
        config.embedding.highway_layers,
        config.embedding.projection_dim,
    )
def create_embedding(cls, config, tensorizers: Dict[str, Tensorizer]):
    """Create the combined word + byte embedding for this model.

    The word tensorizer and the byte tensorizer must read the same input
    column (presumably so the two embeddings align per token — enforced by
    the assert below); their outputs are wrapped in a concatenating
    EmbeddingList.
    """
    # Both tensorizers have to consume the same raw data column.
    assert config.inputs.tokens.column == config.inputs.token_bytes.column

    embeddings = [
        # Word-level embedding, built from the registered "tokens" tensorizer.
        create_module(config.embedding, tensorizer=tensorizers["tokens"]),
        # Byte-level embedding over the fixed byte vocabulary.
        CharacterEmbedding(
            ByteTokenTensorizer.NUM_BYTES,
            config.byte_embedding.embed_dim,
            config.byte_embedding.cnn.kernel_num,
            config.byte_embedding.cnn.kernel_sizes,
            config.byte_embedding.highway_layers,
            config.byte_embedding.projection_dim,
        ),
    ]
    return EmbeddingList(embeddings, concat=True)