Example #1
File: i2b2.py Project: yyht/sciwing
    def _get_model(self) -> nn.Module:
        word_embedder = TrainableWordEmbedder(
            embedding_type=self.hparams.get("emb_type"),
            datasets_manager=self.data_manager,
        )

        elmo_embedder = BowElmoEmbedder(datasets_manager=self.data_manager,
                                        layer_aggregation="sum")

        embedder = ConcatEmbedders([word_embedder, elmo_embedder])

        lstm2seqencoder = Lstm2SeqEncoder(
            embedder=embedder,
            hidden_dim=self.hparams.get("hidden_dim"),
            bidirectional=self.hparams.get("bidirectional"),
            combine_strategy=self.hparams.get("combine_strategy"),
            rnn_bias=True,
            dropout_value=self.hparams.get("lstm2seq_dropout", 0.0),
            add_projection_layer=False,
        )
        model = RnnSeqCrfTagger(
            rnn2seqencoder=lstm2seqencoder,
            encoding_dim=2 * self.hparams.get("hidden_dim")
            if self.hparams.get("bidirectional")
            and self.hparams.get("combine_strategy") == "concat"
            else self.hparams.get("hidden_dim"),
            datasets_manager=self.data_manager,
        )

        return model
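Note: throughout these snippets the encoding_dim handed to RnnSeqCrfTagger follows the same rule: it is twice the encoder's hidden size only when the LSTM is bidirectional and combine_strategy is "concat". The helper below is a minimal sketch that makes that rule explicit; it is hypothetical and not part of sciwing.

def crf_encoding_dim(hidden_dim: int, bidirectional: bool, combine_strategy: str) -> int:
    # A bidirectional LSTM whose forward and backward states are concatenated
    # emits vectors of size 2 * hidden_dim; every other configuration keeps hidden_dim.
    if bidirectional and combine_strategy == "concat":
        return 2 * hidden_dim
    return hidden_dim

# Hypothetical usage mirroring the inline expression used in these examples:
# encoding_dim=crf_encoding_dim(self.hparams.get("hidden_dim"),
#                               self.hparams.get("bidirectional"),
#                               self.hparams.get("combine_strategy"))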
Example #2
    def build_model(self):
        embedder = TrainableWordEmbedder(
            embedding_type=self.hparams.get("emb_type"),
            datasets_manager=self.data_manager,
            device=self.hparams.get("device"),
        )

        embedder = ConcatEmbedders([embedder])

        lstm2seqencoder = Lstm2SeqEncoder(
            embedder=embedder,
            dropout_value=self.hparams.get("dropout"),
            hidden_dim=self.hparams.get("hidden_dim"),
            bidirectional=self.hparams.get("bidirectional"),
            combine_strategy=self.hparams.get("combine_strategy"),
            rnn_bias=True,
            device=self.hparams.get("device"),
            num_layers=self.hparams.get("num_layers"),
        )
        model = RnnSeqCrfTagger(
            rnn2seqencoder=lstm2seqencoder,
            encoding_dim=2 * self.hparams.get("hidden_dim")
            if self.hparams.get("bidirectional")
            and self.hparams.get("combine_strategy") == "concat"
            else self.hparams.get("hidden_dim"),
            device=self.hparams.get("device"),
            tagging_type="IOB1",
            datasets_manager=self.data_manager,
        )
        return model
Example #3
def setup_seq2seq_model(abs_sum_dataset_manager):
    EMBEDDING_DIM = 100
    HIDDEN_DIM = 100
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"
    NUM_LAYERS = 1
    MAX_LENGTH = 4
    datasets_manager = abs_sum_dataset_manager

    embedder = WordEmbedder(embedding_type="glove_6B_50")

    encoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
        add_projection_layer=False,
    )

    decoder = Lstm2SeqDecoder(
        embedder=embedder,
        vocab=datasets_manager.namespace_to_vocab["tokens"],
        max_length=MAX_LENGTH,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        num_layers=NUM_LAYERS,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
    )

    model = Seq2SeqModel(
        rnn2seqencoder=encoder,
        rnn2seqdecoder=decoder,
        datasets_manager=datasets_manager,
        enc_hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
    )

    return (
        model,
        datasets_manager,
        {
            "EMBEDDING_DIM": EMBEDDING_DIM,
            "MAX_LENGTH": MAX_LENGTH,
            "HIDDEN_DIM": 2 * HIDDEN_DIM
            if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
            else HIDDEN_DIM,
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "EXPECTED_HIDDEN_DIM": 2 * HIDDEN_DIM
            if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL
            else HIDDEN_DIM,
        },
    )
Example #4
def setup_lstm2seqencoder(request):
    EMBEDDING_DIM = 100
    VOCAB_SIZE = 1000
    BATCH_SIZE = 2
    HIDDEN_DIM = 1024
    NUM_TIME_STEPS = 10
    BIDIRECTIONAL = request.param[0]
    COMBINE_STRATEGY = request.param[1]
    NUM_LAYERS = request.param[2]
    EMBEDDING = nn.Embedding.from_pretrained(
        torch.zeros([VOCAB_SIZE, EMBEDDING_DIM]))
    tokens = np.random.randint(0,
                               VOCAB_SIZE - 1,
                               size=(BATCH_SIZE, NUM_TIME_STEPS))
    tokens = torch.LongTensor(tokens)

    embedder = VanillaEmbedder(embedding=EMBEDDING,
                               embedding_dim=EMBEDDING_DIM)

    encoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIM,
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
        num_layers=NUM_LAYERS,
    )

    return (
        encoder,
        {
            "EMBEDDING_DIM": EMBEDDING_DIM,
            "VOCAB_SIZE": VOCAB_SIZE,
            "BATCH_SIZE": BATCH_SIZE,
            "HIDDEN_DIM": HIDDEN_DIM,
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "tokens": tokens,
            "EXPECTED_HIDDEN_DIM": 2 * HIDDEN_DIM
            if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL
            else HIDDEN_DIM,
            "TIME_STEPS": NUM_TIME_STEPS,
            "NUM_LAYERS": NUM_LAYERS,
        },
    )
Example #5
def setup_parscit_tagger(seq_dataset_manager):
    EMBEDDING_DIM = 100
    HIDDEN_DIM = 100
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"
    dataset_manager = seq_dataset_manager

    embedder = WordEmbedder(embedding_type="glove_6B_50")

    char_embedder = CharEmbedder(
        char_embedding_dimension=10,
        hidden_dimension=20,
        datasets_manager=dataset_manager,
    )
    embedder = ConcatEmbedders([embedder, char_embedder])

    encoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
        add_projection_layer=False,
    )

    tagger = RnnSeqCrfTagger(
        rnn2seqencoder=encoder,
        encoding_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat" else HIDDEN_DIM,
        datasets_manager=dataset_manager,
    )

    return (
        tagger,
        dataset_manager,
        {
            "EMBEDDING_DIM": EMBEDDING_DIM,
            "HIDDEN_DIM": 2 * HIDDEN_DIM
            if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
            else HIDDEN_DIM,
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "EXPECTED_HIDDEN_DIM": 2 * HIDDEN_DIM
            if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL
            else HIDDEN_DIM,
        },
    )
Example #6
def build_science_ie_model(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    data_dir = pathlib.Path(DATA_DIR)
    train_filename = data_dir.joinpath("train_science_ie_conll.txt")
    dev_filename = data_dir.joinpath("dev_science_ie_conll.txt")
    test_filename = data_dir.joinpath("dev_science_ie_conll.txt")
    data_manager = CoNLLDatasetManager(
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=test_filename,
        column_names=["TASK", "PROCESS", "MATERIAL"],
    )

    word_embedder = TrainableWordEmbedder(
        embedding_type="glove_6B_100", datasets_manager=data_manager
    )
    char_embedder = CharEmbedder(
        char_embedding_dimension=20, hidden_dimension=25, datasets_manager=data_manager
    )
    embedder = ConcatEmbedders([word_embedder, char_embedder])

    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=embedder,
        hidden_dim=350,
        bidirectional=True,
        combine_strategy="concat",
        rnn_bias=True,
        device=torch.device("cpu"),
        num_layers=2,
    )

    model = RnnSeqCrfTagger(
        rnn2seqencoder=lstm2seqencoder,
        encoding_dim=700,
        datasets_manager=data_manager,
        namespace_to_constraints=None,
        tagging_type="BIOUL",
    )

    infer = SequenceLabellingInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints", "best_model.pt")),
        datasets_manager=data_manager,
    )

    return infer
Example #7
def setup_lstm2seqencoder(request):
    HIDDEN_DIM = 1024
    BIDIRECTIONAL = request.param[0]
    COMBINE_STRATEGY = request.param[1]
    NUM_LAYERS = request.param[2]
    ADD_PROJECTION_LAYER = request.param[3]
    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
        num_layers=NUM_LAYERS,
        add_projection_layer=ADD_PROJECTION_LAYER,
    )

    lines = []
    texts = ["First sentence", "second sentence"]
    for text in texts:
        line = Line(text=text)
        lines.append(line)

    return (
        encoder,
        {
            "HIDDEN_DIM": HIDDEN_DIM,
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "EXPECTED_HIDDEN_DIM": 2 * HIDDEN_DIM
            if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL and not ADD_PROJECTION_LAYER
            else HIDDEN_DIM,
            "NUM_LAYERS": NUM_LAYERS,
            "LINES": lines,
            "TIME_STEPS": 2,
        },
    )
Example #8
def encoder():
    embedder = WordEmbedder(embedding_type="glove_6B_50")
    hidden_dim = 50

    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=embedder, hidden_dim=hidden_dim, bidirectional=False
    )

    attn_module = DotProductAttention()

    context_embedder = WordEmbedder(
        embedding_type="glove_6B_50", word_tokens_namespace="tokens"
    )

    encoder = Lstm2SeqAttnContextEncoder(
        rnn2seqencoder=lstm2seqencoder,
        attn_module=attn_module,
        context_embedder=context_embedder,
    )

    return encoder
Example #9
    def build_model(self):

        word_embedder = TrainableWordEmbedder(
            embedding_type=self.hparams.get("emb_type"),
            datasets_manager=self.data_manager,
        )

        char_embedder = CharEmbedder(
            char_embedding_dimension=self.hparams.get("char_emb_dim"),
            hidden_dimension=self.hparams.get("char_encoder_hidden_dim"),
            datasets_manager=self.data_manager,
        )

        elmo_embedder = BowElmoEmbedder(datasets_manager=self.data_manager)

        embedder = ConcatEmbedders([word_embedder, char_embedder, elmo_embedder])

        lstm2seqencoder = Lstm2SeqEncoder(
            embedder=embedder,
            hidden_dim=self.hparams.get("hidden_dim"),
            bidirectional=self.hparams.get("bidirectional"),
            combine_strategy=self.hparams.get("combine_strategy"),
            rnn_bias=True,
        )

        model = RnnSeqCrfTagger(
            rnn2seqencoder=lstm2seqencoder,
            encoding_dim=2 * self.hparams.get("hidden_dim")
            if self.hparams.get("bidirectional")
            and self.hparams.get("combine_strategy") == "concat"
            else self.hparams.get("hidden_dim"),
            datasets_manager=self.data_manager,
        )

        self.printer.good("Finished Loading the Model")
        return model
Example #10
File: science_ie.py Project: yyht/sciwing
                                     device=args.device)

    char_embedder = CharEmbedder(
        char_embedding_dimension=args.char_emb_dim,
        hidden_dimension=args.char_encoder_hidden_dim,
        datasets_manager=data_manager,
        device=args.device,
    )
    embedder = ConcatEmbedders([embedder, char_embedder])

    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=args.dropout,
        hidden_dim=args.hidden_dim,
        bidirectional=args.bidirectional,
        combine_strategy=args.combine_strategy,
        rnn_bias=True,
        device=torch.device(args.device),
        num_layers=args.num_layers,
        add_projection_layer=False,
    )
    model = RnnSeqCrfTagger(
        rnn2seqencoder=lstm2seqencoder,
        encoding_dim=2 * args.hidden_dim if args.bidirectional
        and args.combine_strategy == "concat" else args.hidden_dim,
        device=torch.device(args.device),
        tagging_type="BIOUL",
        datasets_manager=data_manager,
    )

    optimizer = optim.Adam(params=model.parameters(),
Example #11
            char_emb_dim=CHAR_EMBEDDING_DIMENSION,
            char_embedder=char_embedder,
            bidirectional=True,
            hidden_dim=CHAR_ENCODER_HIDDEN_DIM,
            combine_strategy="concat",
            device=torch.device(DEVICE),
        )
        embedder = ConcatEmbedders([embedder, char_encoder])
        EMBEDDING_DIMENSION += 2 * CHAR_ENCODER_HIDDEN_DIM

    lstm2seqencoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIMENSION,
        embedder=embedder,
        dropout_value=DROPOUT,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        num_layers=NUM_LAYERS,
        rnn_bias=True,
        device=torch.device(DEVICE),
    )
    model = ScienceIETagger(
        rnn2seqencoder=lstm2seqencoder,
        num_classes=NUM_CLASSES,
        hid_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat" else HIDDEN_DIM,
        task_constraints=task_constraints,
        process_constraints=process_constraints,
        material_constraints=material_constraints,
        device=torch.device(DEVICE),
    )
Example #12
        hidden_dimension=args.char_encoder_hidden_dim,
        datasets_manager=data_manager,
        device=args.device,
    )

    elmo_embedder = BowElmoEmbedder(
        datasets_manager=data_manager, layer_aggregation="sum", device=args.device
    )

    embedder = ConcatEmbedders([word_embedder, char_embedder, elmo_embedder])

    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=embedder,
        hidden_dim=args.hidden_dim,
        bidirectional=args.bidirectional,
        combine_strategy=args.combine_strategy,
        rnn_bias=True,
        device=torch.device(args.device),
        dropout_value=0.1,
    )
    model = RnnSeqCrfTagger(
        rnn2seqencoder=lstm2seqencoder,
        encoding_dim=2 * args.hidden_dim
        if args.bidirectional and args.combine_strategy == "concat"
        else args.hidden_dim,
        device=torch.device(args.device),
        datasets_manager=data_manager,
    )

    optimizer = optim.Adam(params=model.parameters(), lr=args.lr)
    train_metric = TokenClassificationAccuracy(datasets_manager=data_manager)
Example #13
    def setup_engine_once(
        config_dict: Dict[str, str],
        experiment_name: str,
        train_data_filepath: pathlib.Path,
        test_data_filepath: pathlib.Path,
    ):
        DEBUG = config_dict["DEBUG"]
        DEBUG_DATASET_PROPORTION = config_dict["DEBUG_DATASET_PROPORTION"]
        BATCH_SIZE = config_dict["BATCH_SIZE"]
        LEARNING_RATE = config_dict["LEARNING_RATE"]
        NUM_EPOCHS = config_dict["NUM_EPOCHS"]
        SAVE_EVERY = config_dict["SAVE_EVERY"]
        LOG_TRAIN_METRICS_EVERY = config_dict["LOG_TRAIN_METRICS_EVERY"]
        EMBEDDING_DIMENSION = config_dict["EMBEDDING_DIMENSION"]
        CHAR_EMBEDDING_DIMENSION = config_dict["CHAR_EMBEDDING_DIMENSION"]
        EMBEDDING_TYPE = config_dict["EMBEDDING_TYPE"]
        MAX_NUM_WORDS = config_dict["MAX_NUM_WORDS"]
        MAX_LENGTH = config_dict["MAX_LENGTH"]
        DEVICE = config_dict["DEVICE"]
        HIDDEN_DIM = config_dict["HIDDEN_DIM"]
        BIDIRECTIONAL = config_dict["BIDIRECTIONAL"]
        COMBINE_STRATEGY = config_dict["COMBINE_STRATEGY"]
        MAX_CHAR_LENGTH = config_dict["MAX_CHAR_LENGTH"]
        USE_CHAR_ENCODER = config_dict["USE_CHAR_ENCODER"]
        CHAR_ENCODER_HIDDEN_DIM = config_dict["CHAR_ENCODER_HIDDEN_DIM"]
        DROPOUT = config_dict["DROPOUT"]

        EXP_NAME = experiment_name
        EXP_DIR_PATH = os.path.join(OUTPUT_DIR, EXP_NAME)

        if not os.path.isdir(EXP_DIR_PATH):
            os.mkdir(EXP_DIR_PATH)

        MODEL_SAVE_DIR = os.path.join(EXP_DIR_PATH, "checkpoints")

        if not os.path.isdir(MODEL_SAVE_DIR):
            os.mkdir(MODEL_SAVE_DIR)

        VOCAB_STORE_LOCATION = os.path.join(EXP_DIR_PATH, "vocab.json")
        CHAR_VOCAB_STORE_LOCATION = os.path.join(EXP_DIR_PATH,
                                                 "char_vocab.json")
        CAPITALIZATION_VOCAB_STORE_LOCATION = os.path.join(
            EXP_DIR_PATH, "capitalization_vocab.json")
        CAPITALIZATION_EMBEDDING_DIMENSION = 10
        TENSORBOARD_LOGDIR = os.path.join(".", "runs", EXP_NAME)

        train_dataset = ParscitDataset(
            filename=str(train_data_filepath),
            dataset_type="train",
            max_num_words=MAX_NUM_WORDS,
            max_instance_length=MAX_LENGTH,
            max_char_length=MAX_CHAR_LENGTH,
            word_vocab_store_location=VOCAB_STORE_LOCATION,
            char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
            captialization_vocab_store_location=
            CAPITALIZATION_VOCAB_STORE_LOCATION,
            capitalization_emb_dim=CAPITALIZATION_EMBEDDING_DIMENSION,
            debug=DEBUG,
            debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
            word_embedding_type=EMBEDDING_TYPE,
            word_embedding_dimension=EMBEDDING_DIMENSION,
            char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
            word_start_token="<SOS>",
            word_end_token="<EOS>",
            word_pad_token="<PAD>",
            word_unk_token="<UNK>",
            word_add_start_end_token=False,
        )

        validation_dataset = ParscitDataset(
            filename=str(test_data_filepath),
            dataset_type="valid",
            max_num_words=MAX_NUM_WORDS,
            max_instance_length=MAX_LENGTH,
            max_char_length=MAX_CHAR_LENGTH,
            word_vocab_store_location=VOCAB_STORE_LOCATION,
            char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
            captialization_vocab_store_location=
            CAPITALIZATION_VOCAB_STORE_LOCATION,
            capitalization_emb_dim=CAPITALIZATION_EMBEDDING_DIMENSION,
            debug=DEBUG,
            debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
            word_embedding_type=EMBEDDING_TYPE,
            word_embedding_dimension=EMBEDDING_DIMENSION,
            char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
            word_start_token="<SOS>",
            word_end_token="<EOS>",
            word_pad_token="<PAD>",
            word_unk_token="<UNK>",
            word_add_start_end_token=False,
        )

        test_dataset = ParscitDataset(
            filename=str(test_data_filepath),
            dataset_type="test",
            max_num_words=MAX_NUM_WORDS,
            max_instance_length=MAX_LENGTH,
            max_char_length=MAX_CHAR_LENGTH,
            word_vocab_store_location=VOCAB_STORE_LOCATION,
            char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
            captialization_vocab_store_location=
            CAPITALIZATION_VOCAB_STORE_LOCATION,
            capitalization_emb_dim=CAPITALIZATION_EMBEDDING_DIMENSION,
            debug=DEBUG,
            debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
            word_embedding_type=EMBEDDING_TYPE,
            word_embedding_dimension=EMBEDDING_DIMENSION,
            char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
            word_start_token="<SOS>",
            word_end_token="<EOS>",
            word_pad_token="<PAD>",
            word_unk_token="<UNK>",
            word_add_start_end_token=False,
        )

        train_dataset.print_stats()
        validation_dataset.print_stats()
        test_dataset.print_stats()

        VOCAB_SIZE = train_dataset.word_vocab.get_vocab_len()
        NUM_CLASSES = train_dataset.get_num_classes()
        embedding = train_dataset.word_vocab.load_embedding()
        embedding = nn.Embedding.from_pretrained(embedding, freeze=False)
        char_embedding = train_dataset.char_vocab.load_embedding()
        char_embedding = nn.Embedding.from_pretrained(char_embedding,
                                                      freeze=False)

        embedder = VanillaEmbedder(embedding=embedding,
                                   embedding_dim=EMBEDDING_DIMENSION)

        if USE_CHAR_ENCODER:
            char_embedder = VanillaEmbedder(
                embedding=char_embedding,
                embedding_dim=CHAR_EMBEDDING_DIMENSION)
            char_encoder = CharLSTMEncoder(
                char_emb_dim=CHAR_EMBEDDING_DIMENSION,
                char_embedder=char_embedder,
                bidirectional=True,
                hidden_dim=CHAR_ENCODER_HIDDEN_DIM,
                combine_strategy="concat",
                device=torch.device(DEVICE),
            )
            embedder = ConcatEmbedders([embedder, char_encoder])
            EMBEDDING_DIMENSION += 2 * CHAR_ENCODER_HIDDEN_DIM

        lstm2seqencoder = Lstm2SeqEncoder(
            emb_dim=EMBEDDING_DIMENSION,
            embedder=embedder,
            dropout_value=DROPOUT,
            hidden_dim=HIDDEN_DIM,
            bidirectional=BIDIRECTIONAL,
            combine_strategy=COMBINE_STRATEGY,
            rnn_bias=True,
            device=torch.device(DEVICE),
        )
        model = ParscitTagger(
            rnn2seqencoder=lstm2seqencoder,
            num_classes=NUM_CLASSES,
            hid_dim=2 * HIDDEN_DIM
            if BIDIRECTIONAL and COMBINE_STRATEGY == "concat" else HIDDEN_DIM,
        )

        optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
        metric = TokenClassificationAccuracy(
            idx2labelname_mapping=train_dataset.idx2classname)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer=optimizer, mode="max", factor=0.1, patience=2)

        engine = Engine(
            model=model,
            train_dataset=train_dataset,
            validation_dataset=validation_dataset,
            test_dataset=test_dataset,
            optimizer=optimizer,
            batch_size=BATCH_SIZE,
            save_dir=MODEL_SAVE_DIR,
            num_epochs=NUM_EPOCHS,
            save_every=SAVE_EVERY,
            log_train_metrics_every=LOG_TRAIN_METRICS_EVERY,
            tensorboard_logdir=TENSORBOARD_LOGDIR,
            device=torch.device(DEVICE),
            metric=metric,
            track_for_best="macro_fscore",
            lr_scheduler=scheduler,
        )

        config_dict["VOCAB_STORE_LOCATION"] = VOCAB_STORE_LOCATION
        config_dict["CHAR_VOCAB_STORE_LOCATION"] = CHAR_VOCAB_STORE_LOCATION
        config_dict["MODEL_SAVE_DIR"] = MODEL_SAVE_DIR
        config_dict["VOCAB_SIZE"] = VOCAB_SIZE
        config_dict["NUM_CLASSES"] = NUM_CLASSES

        with open(os.path.join(f"{EXP_DIR_PATH}", "config.json"), "w") as fp:
            json.dump(config_dict, fp)

        return engine
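setup_engine_once reads every hyperparameter it needs out of config_dict, so a caller has to supply all of the keys listed at the top of the function. The driver below is a minimal sketch (assuming setup_engine_once is reachable from where the driver runs); the hyperparameter values, experiment name, and file paths are illustrative assumptions, and engine.run() mirrors how the engine is started in Example #20.

# Hypothetical configuration; the keys match the ones read inside
# setup_engine_once, the values are illustrative only.
config = {
    "DEBUG": False,
    "DEBUG_DATASET_PROPORTION": 0.1,
    "BATCH_SIZE": 32,
    "LEARNING_RATE": 1e-3,
    "NUM_EPOCHS": 10,
    "SAVE_EVERY": 1,
    "LOG_TRAIN_METRICS_EVERY": 10,
    "EMBEDDING_DIMENSION": 100,
    "CHAR_EMBEDDING_DIMENSION": 25,
    "EMBEDDING_TYPE": "glove_6B_100",
    "MAX_NUM_WORDS": 10000,
    "MAX_LENGTH": 10,
    "DEVICE": "cpu",
    "HIDDEN_DIM": 512,
    "BIDIRECTIONAL": True,
    "COMBINE_STRATEGY": "concat",
    "MAX_CHAR_LENGTH": 25,
    "USE_CHAR_ENCODER": True,
    "CHAR_ENCODER_HIDDEN_DIM": 100,
    "DROPOUT": 0.4,
}

engine = setup_engine_once(
    config_dict=config,
    experiment_name="parscit_bilstm_crf",  # assumed experiment name
    train_data_filepath=pathlib.Path(DATA_DIR, "parscit_train_conll.txt"),  # assumed path
    test_data_filepath=pathlib.Path(DATA_DIR, "parscit_test_conll.txt"),  # assumed path
)
engine.run()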
Example #14
def get_science_ie_infer(dirname: str):
    model_folder = pathlib.Path(dirname)
    hyperparam_config_filename = model_folder.joinpath("config.json")

    with open(hyperparam_config_filename, "r") as fp:
        config = json.load(fp)

    MAX_NUM_WORDS = config.get("MAX_NUM_WORDS", None)
    MAX_LENGTH = config.get("MAX_LENGTH", None)
    MAX_CHAR_LENGTH = config.get("MAX_CHAR_LENGTH", None)
    VOCAB_STORE_LOCATION = config.get("VOCAB_STORE_LOCATION", None)
    DEBUG = config.get("DEBUG", None)
    DEBUG_DATASET_PROPORTION = config.get("DEBUG_DATASET_PROPORTION", None)
    EMBEDDING_TYPE = config.get("EMBEDDING_TYPE", None)
    EMBEDDING_DIMENSION = config.get("EMBEDDING_DIMENSION", None)
    HIDDEN_DIMENSION = config.get("HIDDEN_DIM", None)
    BIDIRECTIONAL = config.get("BIDIRECTIONAL", None)
    COMBINE_STRATEGY = config.get("COMBINE_STRATEGY", None)
    DEVICE = config.get("DEVICE", "cpu")
    NUM_CLASSES = config.get("NUM_CLASSES", None)
    MODEL_SAVE_DIR = config.get("MODEL_SAVE_DIR", None)
    model_filepath = pathlib.Path(MODEL_SAVE_DIR, "best_model.pt")
    CHAR_VOCAB_STORE_LOCATION = config.get("CHAR_VOCAB_STORE_LOCATION", None)
    CHAR_EMBEDDING_DIMENSION = config.get("CHAR_EMBEDDING_DIMENSION", None)
    USE_CHAR_ENCODER = config.get("USE_CHAR_ENCODER", None)
    CHAR_ENCODER_HIDDEN_DIM = config.get("CHAR_ENCODER_HIDDEN_DIM", None)
    NUM_LAYERS = config.get("NUM_LAYERS", 1)
    DROPOUT = config.get("DROPOUT", 0.0)

    print(f"NUM_LAYERS", NUM_LAYERS)

    test_science_ie_conll_filepath = pathlib.Path(DATA_DIR, "dev_science_ie_conll.txt")

    test_dataset = ScienceIEDataset(
        filename=test_science_ie_conll_filepath,
        dataset_type="test",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        max_char_length=MAX_CHAR_LENGTH,
        word_vocab_store_location=VOCAB_STORE_LOCATION,
        char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
        debug=DEBUG,
        debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
        word_embedding_type=EMBEDDING_TYPE,
        word_embedding_dimension=EMBEDDING_DIMENSION,
        char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
        word_start_token="<SOS>",
        word_end_token="<EOS>",
        word_pad_token="<PAD>",
        word_unk_token="<UNK>",
        word_add_start_end_token=False,
    )

    embedding = test_dataset.word_vocab.load_embedding()
    embedding = nn.Embedding.from_pretrained(embedding)

    char_embedding = test_dataset.char_vocab.load_embedding()
    char_embedding = nn.Embedding.from_pretrained(char_embedding)

    classnames2idx = ScienceIEDataset.get_classname2idx()
    idx2classnames = {idx: classname for classname, idx in classnames2idx.items()}

    task_idx2classnames = {
        idx: classname
        for idx, classname in idx2classnames.items()
        if idx in range(0, 8)
    }
    process_idx2classnames = {
        idx - 8: classname
        for idx, classname in idx2classnames.items()
        if idx in range(8, 16)
    }
    material_idx2classnames = {
        idx - 16: classname
        for idx, classname in idx2classnames.items()
        if idx in range(16, 24)
    }

    task_constraints = allowed_transitions(
        constraint_type="BIOUL", labels=task_idx2classnames
    )
    process_constraints = allowed_transitions(
        constraint_type="BIOUL", labels=process_idx2classnames
    )
    material_constraints = allowed_transitions(
        constraint_type="BIOUL", labels=material_idx2classnames
    )

    embedder = VanillaEmbedder(embedding=embedding, embedding_dim=EMBEDDING_DIMENSION)

    if USE_CHAR_ENCODER:
        char_embedder = VanillaEmbedder(
            embedding=char_embedding, embedding_dim=CHAR_EMBEDDING_DIMENSION
        )
        char_encoder = CharLSTMEncoder(
            char_embedder=char_embedder,
            char_emb_dim=CHAR_EMBEDDING_DIMENSION,
            bidirectional=True,
            hidden_dim=CHAR_ENCODER_HIDDEN_DIM,
            combine_strategy="concat",
            device=torch.device(DEVICE),
        )
        embedder = ConcatEmbedders([embedder, char_encoder])
        EMBEDDING_DIMENSION += 2 * CHAR_ENCODER_HIDDEN_DIM

    lstm2seqencoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIMENSION,
        embedder=embedder,
        dropout_value=DROPOUT,
        hidden_dim=HIDDEN_DIMENSION,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        num_layers=NUM_LAYERS,
        rnn_bias=True,
        device=torch.device(DEVICE),
    )
    model = ScienceIETagger(
        rnn2seqencoder=lstm2seqencoder,
        num_classes=NUM_CLASSES,
        hid_dim=2 * HIDDEN_DIMENSION
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIMENSION,
        task_constraints=task_constraints,
        process_constraints=process_constraints,
        material_constraints=material_constraints,
        device=torch.device(DEVICE),
    )

    inference_client = ScienceIEInference(
        model=model, model_filepath=str(model_filepath), dataset=test_dataset
    )
    return inference_client
Example #15
def setup_science_ie_tagger(request):
    EMBEDDING_DIM = 100
    CHARACTER_EMBEDDING_DIM = 25
    VOCAB_SIZE = 1000
    BATCH_SIZE = 2
    HIDDEN_DIM = 1024
    CHARACTER_ENCODER_HIDDEN_DIM = 100
    NUM_TIME_STEPS = 10
    MAX_CHAR_LENGTH = 25
    CHAR_VOCAB_SIZE = 100
    BIDIRECTIONAL = request.param[0]
    COMBINE_STRATEGY = request.param[1]
    HAVE_CHARACTER_ENCODER = request.param[2]
    DEVICE = torch.device("cpu")
    NUM_CLASSES = 8
    EMBEDDING = nn.Embedding.from_pretrained(
        torch.zeros([VOCAB_SIZE, EMBEDDING_DIM]))
    CHARACTER_EMBEDDING = nn.Embedding.from_pretrained(
        torch.zeros([CHAR_VOCAB_SIZE, CHARACTER_EMBEDDING_DIM]))
    tokens = np.random.randint(0,
                               VOCAB_SIZE,
                               size=(BATCH_SIZE, NUM_TIME_STEPS))

    task_labels = np.random.randint(0, 8, size=(BATCH_SIZE, NUM_TIME_STEPS))
    process_labels = np.random.randint(8,
                                       16,
                                       size=(BATCH_SIZE, NUM_TIME_STEPS))
    material_labels = np.random.randint(16,
                                        24,
                                        size=(BATCH_SIZE, NUM_TIME_STEPS))
    task_labels = torch.LongTensor(task_labels)
    process_labels = torch.LongTensor(process_labels)
    material_labels = torch.LongTensor(material_labels)
    labels = torch.cat([task_labels, process_labels, material_labels], dim=1)

    char_tokens = np.random.randint(0,
                                    CHAR_VOCAB_SIZE - 1,
                                    size=(BATCH_SIZE, NUM_TIME_STEPS,
                                          MAX_CHAR_LENGTH))
    tokens = torch.LongTensor(tokens)
    labels = torch.LongTensor(labels)
    char_tokens = torch.LongTensor(char_tokens)

    classnames2idx = ScienceIEDataset.get_classname2idx()
    idx2classnames = {
        idx: classname
        for classname, idx in classnames2idx.items()
    }
    task_idx2classnames = {
        idx: classname
        for idx, classname in idx2classnames.items() if idx in range(0, 8)
    }
    process_idx2classnames = {
        idx - 8: classname
        for idx, classname in idx2classnames.items() if idx in range(8, 16)
    }
    material_idx2classnames = {
        idx - 16: classname
        for idx, classname in idx2classnames.items() if idx in range(16, 24)
    }

    task_constraints: List[Tuple[int, int]] = allowed_transitions(
        constraint_type="BIOUL", labels=task_idx2classnames)
    process_constraints: List[Tuple[int, int]] = allowed_transitions(
        constraint_type="BIOUL", labels=process_idx2classnames)
    material_constraints: List[Tuple[int, int]] = allowed_transitions(
        constraint_type="BIOUL", labels=material_idx2classnames)

    embedder = VanillaEmbedder(embedding=EMBEDDING,
                               embedding_dim=EMBEDDING_DIM)

    if HAVE_CHARACTER_ENCODER:
        char_embedder = VanillaEmbedder(embedding=CHARACTER_EMBEDDING,
                                        embedding_dim=CHARACTER_EMBEDDING_DIM)
        char_encoder = CharLSTMEncoder(
            char_embedder=char_embedder,
            char_emb_dim=CHARACTER_EMBEDDING_DIM,
            hidden_dim=CHARACTER_ENCODER_HIDDEN_DIM,
            bidirectional=True,
            combine_strategy="concat",
        )
        embedder = ConcatEmbedders([embedder, char_encoder])
        EMBEDDING_DIM += 2 * CHARACTER_ENCODER_HIDDEN_DIM

    encoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIM,
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
    )

    tagger = ScienceIETagger(
        rnn2seqencoder=encoder,
        hid_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat" else HIDDEN_DIM,
        num_classes=NUM_CLASSES,
        task_constraints=task_constraints,
        process_constraints=process_constraints,
        material_constraints=material_constraints,
        device=DEVICE,
    )

    return (
        tagger,
        {
            "EMBEDDING_DIM": EMBEDDING_DIM,
            "VOCAB_SIZE": VOCAB_SIZE,
            "BATCH_SIZE": BATCH_SIZE,
            "HIDDEN_DIM": 2 * HIDDEN_DIM
            if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
            else HIDDEN_DIM,
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "tokens": tokens,
            "labels": labels,
            "EXPECTED_HIDDEN_DIM": 2 * HIDDEN_DIM
            if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL
            else HIDDEN_DIM,
            "TIME_STEPS": NUM_TIME_STEPS,
            "NUM_CLASSES": NUM_CLASSES,
            "HAVE_CHAR_ENCODER": HAVE_CHARACTER_ENCODER,
            "char_tokens": char_tokens,
        },
    )
Example #16
    # # instantiate the elmo embedder
    # elmo_embedder = BowElmoEmbedder(layer_aggregation="sum", device=args.device)
    #
    # # instantiate the vanilla embedder
    # vanilla_embedder = WordEmbedder(embedding_type=args.emb_type, device=args.device)
    #
    # # concat the embeddings
    # embedder = ConcatEmbedders([vanilla_embedder, elmo_embedder])

    embedder = WordEmbedder(embedding_type=args.emb_type, device=args.device)

    encoder = Lstm2SeqEncoder(
        embedder=embedder,
        hidden_dim=args.hidden_dim,
        bidirectional=args.bidirectional,
        combine_strategy=args.combine_strategy,
        device=torch.device(args.device),
    )

    encoding_dim = (2 * args.hidden_dim if args.bidirectional
                    and args.combine_strategy == "concat" else args.hidden_dim)

    decoder = Lstm2SeqDecoder(
        embedder=embedder,
        hidden_dim=args.hidden_dim,
        bidirectional=args.bidirectional,
        combine_strategy=args.combine_strategy,
        device=torch.device(args.device),
        max_length=args.pred_max_length,
        vocab=vocab,
Example #17
                                     device=args.device)

    char_embedder = CharEmbedder(
        char_embedding_dimension=args.char_emb_dim,
        hidden_dimension=args.char_encoder_hidden_dim,
        datasets_manager=data_manager,
        device=args.device,
    )
    embedder = ConcatEmbedders([embedder, char_embedder])

    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=args.dropout,
        hidden_dim=2 * args.hidden_dim if args.bidirectional
        and args.combine_strategy == "concat" else args.hidden_dim,
        bidirectional=args.bidirectional,
        combine_strategy=args.combine_strategy,
        rnn_bias=True,
        device=torch.device(args.device),
        num_layers=args.num_layers,
    )
    model = RnnSeqCrfTagger(
        rnn2seqencoder=lstm2seqencoder,
        encoding_dim=2 * args.hidden_dim if args.bidirectional
        and args.combine_strategy == "concat" else args.hidden_dim,
        device=torch.device(args.device),
        tagging_type="BIOUL",
        datasets_manager=data_manager,
    )

    optimizer = optim.Adam(params=model.parameters(),
Example #18
def get_bilstm_crf_infer(dirname: str):
    hyperparam_config_filepath = pathlib.Path(dirname, "config.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    MAX_NUM_WORDS = config.get("MAX_NUM_WORDS", None)
    MAX_LENGTH = config.get("MAX_LENGTH", None)
    MAX_CHAR_LENGTH = config.get("MAX_CHAR_LENGTH", None)
    VOCAB_STORE_LOCATION = config.get("VOCAB_STORE_LOCATION", None)
    DEBUG = config.get("DEBUG", None)
    DEBUG_DATASET_PROPORTION = config.get("DEBUG_DATASET_PROPORTION", None)
    EMBEDDING_TYPE = config.get("EMBEDDING_TYPE", None)
    EMBEDDING_DIMENSION = config.get("EMBEDDING_DIMENSION", None)
    HIDDEN_DIMENSION = config.get("HIDDEN_DIM", None)
    BIDIRECTIONAL = config.get("BIDIRECTIONAL", None)
    COMBINE_STRATEGY = config.get("COMBINE_STRATEGY", None)
    DEVICE = config.get("DEVICE", "cpu")
    NUM_CLASSES = config.get("NUM_CLASSES", None)
    MODEL_SAVE_DIR = config.get("MODEL_SAVE_DIR", None)
    model_filepath = pathlib.Path(MODEL_SAVE_DIR, "best_model.pt")
    CHAR_VOCAB_STORE_LOCATION = config.get("CHAR_VOCAB_STORE_LOCATION", None)
    CHAR_EMBEDDING_DIMENSION = config.get("CHAR_EMBEDDING_DIMENSION", None)
    USE_CHAR_ENCODER = config.get("USE_CHAR_ENCODER", None)
    CHAR_ENCODER_HIDDEN_DIM = config.get("CHAR_ENCODER_HIDDEN_DIM", None)
    DROPOUT = config.get("DROPOUT", 0.0)

    test_conll_filepath = pathlib.Path(DATA_DIR, "cora_conll.txt")

    test_dataset = ParscitDataset(
        filename=test_conll_filepath,
        dataset_type="test",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        max_char_length=MAX_CHAR_LENGTH,
        word_vocab_store_location=VOCAB_STORE_LOCATION,
        char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
        debug=DEBUG,
        debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
        word_embedding_type=EMBEDDING_TYPE,
        word_embedding_dimension=EMBEDDING_DIMENSION,
        char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
        word_start_token="<SOS>",
        word_end_token="<EOS>",
        word_pad_token="<PAD>",
        word_unk_token="<UNK>",
        word_add_start_end_token=False,
    )

    embedding = test_dataset.word_vocab.load_embedding()
    embedding = nn.Embedding.from_pretrained(embedding)
    embedder = VanillaEmbedder(embedding=embedding,
                               embedding_dim=EMBEDDING_DIMENSION)

    char_embedding = test_dataset.char_vocab.load_embedding()
    char_embedding = nn.Embedding.from_pretrained(char_embedding)

    if USE_CHAR_ENCODER:
        char_embedder = VanillaEmbedder(embedding=char_embedding,
                                        embedding_dim=CHAR_EMBEDDING_DIMENSION)
        char_encoder = CharLSTMEncoder(
            char_embedder=char_embedder,
            char_emb_dim=CHAR_EMBEDDING_DIMENSION,
            hidden_dim=CHAR_ENCODER_HIDDEN_DIM,
            bidirectional=True,
            combine_strategy="concat",
        )
        embedder = ConcatEmbedders([embedder, char_encoder])

        EMBEDDING_DIMENSION += 2 * CHAR_ENCODER_HIDDEN_DIM

    lstm2seqencoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIMENSION,
        embedder=embedder,
        dropout_value=DROPOUT,
        hidden_dim=HIDDEN_DIMENSION,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=True,
        device=torch.device(DEVICE),
    )
    model = ParscitTagger(
        rnn2seqencoder=lstm2seqencoder,
        num_classes=NUM_CLASSES,
        hid_dim=2 * HIDDEN_DIMENSION if BIDIRECTIONAL
        and COMBINE_STRATEGY == "concat" else HIDDEN_DIMENSION,
    )

    inference_client = ParscitInference(model=model,
                                        model_filepath=str(model_filepath),
                                        dataset=test_dataset)
    return inference_client
Example #19
def setup_parscit_tagger(request):
    EMBEDDING_DIM = 100
    CHARACTER_EMBEDDING_DIM = 25
    VOCAB_SIZE = 1000
    BATCH_SIZE = 2
    HIDDEN_DIM = 1024
    CHARACTER_ENCODER_HIDDEN_DIM = 100
    NUM_TIME_STEPS = 10
    MAX_CHAR_LENGTH = 25
    CHAR_VOCAB_SIZE = 100
    BIDIRECTIONAL = request.param[0]
    COMBINE_STRATEGY = request.param[1]
    HAVE_CHARACTER_ENCODER = request.param[2]
    NUM_CLASSES = 5
    EMBEDDING = nn.Embedding.from_pretrained(
        torch.zeros([VOCAB_SIZE, EMBEDDING_DIM]))
    CHARACTER_EMBEDDING = nn.Embedding.from_pretrained(
        torch.zeros([CHAR_VOCAB_SIZE, CHARACTER_EMBEDDING_DIM]))
    tokens = np.random.randint(0,
                               VOCAB_SIZE - 1,
                               size=(BATCH_SIZE, NUM_TIME_STEPS))
    labels = np.random.randint(0,
                               NUM_CLASSES - 1,
                               size=(BATCH_SIZE, NUM_TIME_STEPS))
    char_tokens = np.random.randint(0,
                                    CHAR_VOCAB_SIZE - 1,
                                    size=(BATCH_SIZE, NUM_TIME_STEPS,
                                          MAX_CHAR_LENGTH))
    tokens = torch.LongTensor(tokens)
    labels = torch.LongTensor(labels)
    char_tokens = torch.LongTensor(char_tokens)

    embedder = VanillaEmbedder(embedding=EMBEDDING,
                               embedding_dim=EMBEDDING_DIM)
    if HAVE_CHARACTER_ENCODER:
        char_embedder = VanillaEmbedder(embedding=CHARACTER_EMBEDDING,
                                        embedding_dim=CHARACTER_EMBEDDING_DIM)
        char_encoder = CharLSTMEncoder(
            char_embedder=char_embedder,
            char_emb_dim=CHARACTER_EMBEDDING_DIM,
            hidden_dim=CHARACTER_ENCODER_HIDDEN_DIM,
            bidirectional=True,
            combine_strategy="concat",
        )
        embedder = ConcatEmbedders([embedder, char_encoder])
        EMBEDDING_DIM = EMBEDDING_DIM + (2 * CHARACTER_ENCODER_HIDDEN_DIM)

    encoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIM,
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
    )

    tagger = ParscitTagger(
        rnn2seqencoder=encoder,
        hid_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat" else HIDDEN_DIM,
        num_classes=NUM_CLASSES,
    )

    return (
        tagger,
        {
            "EMBEDDING_DIM": EMBEDDING_DIM,
            "VOCAB_SIZE": VOCAB_SIZE,
            "BATCH_SIZE": BATCH_SIZE,
            "HIDDEN_DIM": 2 * HIDDEN_DIM
            if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
            else HIDDEN_DIM,
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "tokens": tokens,
            "labels": labels,
            "EXPECTED_HIDDEN_DIM": 2 * HIDDEN_DIM
            if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL
            else HIDDEN_DIM,
            "TIME_STEPS": NUM_TIME_STEPS,
            "NUM_CLASSES": NUM_CLASSES,
            "HAVE_CHAR_ENCODER": HAVE_CHARACTER_ENCODER,
            "char_tokens": char_tokens,
        },
    )
Example #20
def setup_parscit_inference(seq_dataset_manager, tmpdir_factory):
    HIDDEN_DIM = 100
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"
    dataset_manager = seq_dataset_manager

    embedder = WordEmbedder(embedding_type="glove_6B_50")

    char_embedder = CharEmbedder(
        char_embedding_dimension=10,
        hidden_dimension=20,
        datasets_manager=dataset_manager,
    )
    embedder = ConcatEmbedders([embedder, char_embedder])

    encoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
        add_projection_layer=False,
    )

    tagger = RnnSeqCrfTagger(
        rnn2seqencoder=encoder,
        encoding_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIM,
        datasets_manager=dataset_manager,
    )

    train_metric = TokenClassificationAccuracy(datasets_manager=dataset_manager)
    dev_metric = TokenClassificationAccuracy(datasets_manager=dataset_manager)
    test_metric = TokenClassificationAccuracy(datasets_manager=dataset_manager)

    optimizer = torch.optim.Adam(params=tagger.parameters())
    batch_size = 1
    save_dir = tmpdir_factory.mktemp("experiment_1")
    num_epochs = 1
    save_every = 1
    log_train_metrics_every = 10

    engine = Engine(
        model=tagger,
        datasets_manager=dataset_manager,
        optimizer=optimizer,
        batch_size=batch_size,
        save_dir=save_dir,
        num_epochs=num_epochs,
        save_every=save_every,
        log_train_metrics_every=log_train_metrics_every,
        train_metric=train_metric,
        validation_metric=dev_metric,
        test_metric=test_metric,
        track_for_best="macro_fscore",
    )

    engine.run()
    model_filepath = pathlib.Path(save_dir).joinpath("best_model.pt")

    inference_client = SequenceLabellingInference(
        model=tagger, model_filepath=model_filepath, datasets_manager=dataset_manager
    )

    return inference_client