Example #1
    def _get_model(self) -> nn.Module:
        word_embedder = TrainableWordEmbedder(
            embedding_type=self.hparams.get("emb_type"),
            datasets_manager=self.data_manager,
        )

        elmo_embedder = BowElmoEmbedder(datasets_manager=self.data_manager,
                                        layer_aggregation="sum")

        embedder = ConcatEmbedders([word_embedder, elmo_embedder])

        lstm2seqencoder = Lstm2SeqEncoder(
            embedder=embedder,
            hidden_dim=self.hparams.get("hidden_dim"),
            bidirectional=self.hparams.get("bidirectional"),
            combine_strategy=self.hparams.get("combine_strategy"),
            rnn_bias=True,
            dropout_value=self.hparams.get("lstm2seq_dropout", 0.0),
            add_projection_layer=False,
        )
        model = RnnSeqCrfTagger(
            rnn2seqencoder=lstm2seqencoder,
            encoding_dim=2 * self.hparams.get("hidden_dim")
            if self.hparams.get("bidirectional")
            and self.hparams.get("combine_strategy") == "concat"
            else self.hparams.get("hidden_dim"),
            datasets_manager=self.data_manager,
        )

        return model
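
The hyperparameters read through self.hparams.get(...) are not shown in the snippet. A minimal sketch of the dictionary this method expects, with purely illustrative values (the embedding type and sizes are assumptions, not taken from the source):

hparams = {
    "emb_type": "glove_6B_100",    # hypothetical embedding type (see Example #3)
    "hidden_dim": 256,             # hypothetical LSTM hidden size
    "bidirectional": True,
    "combine_strategy": "concat",  # concatenate forward and backward states
    "lstm2seq_dropout": 0.1,       # optional; defaults to 0.0 above
}

With bidirectional=True and combine_strategy="concat" the encoder emits 2 * hidden_dim features per token, which is why encoding_dim is doubled in that case.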
Example #2
    def build_model(self):
        embedder = TrainableWordEmbedder(
            embedding_type=self.hparams.get("emb_type"),
            datasets_manager=self.data_manager,
            device=self.hparams.get("device"),
        )

        embedder = ConcatEmbedders([embedder])

        lstm2seqencoder = Lstm2SeqEncoder(
            embedder=embedder,
            dropout_value=self.hparams.get("dropout"),
            hidden_dim=self.hparams.get("hidden_dim"),
            bidirectional=self.hparams.get("bidirectional"),
            combine_strategy=self.hparams.get("combine_strategy"),
            rnn_bias=True,
            device=self.hparams.get("device"),
            num_layers=self.hparams.get("num_layers"),
        )
        model = RnnSeqCrfTagger(
            rnn2seqencoder=lstm2seqencoder,
            encoding_dim=2 * self.hparams.get("hidden_dim")
            if self.hparams.get("bidirectional")
            and self.hparams.get("combine_strategy") == "concat"
            else self.hparams.get("hidden_dim"),
            device=self.hparams.get("device"),
            tagging_type="IOB1",
            datasets_manager=self.data_manager,
        )
        return model
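
Note that ConcatEmbedders([embedder]) with a single element is effectively a pass-through; it is kept so this method has the same shape as the multi-embedder variants in Examples #4 and #5, where additional embedders are simply appended to the list.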
Example #3
def build_science_ie_model(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    data_dir = pathlib.Path(DATA_DIR)
    train_filename = data_dir.joinpath("train_science_ie_conll.txt")
    dev_filename = data_dir.joinpath("dev_science_ie_conll.txt")
    test_filename = data_dir.joinpath("dev_science_ie_conll.txt")  # dev split reused as the test split (as in Example #5)
    data_manager = CoNLLDatasetManager(
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=test_filename,
        column_names=["TASK", "PROCESS", "MATERIAL"],
    )

    word_embedder = TrainableWordEmbedder(
        embedding_type="glove_6B_100", datasets_manager=data_manager
    )
    char_embedder = CharEmbedder(
        char_embedding_dimension=20, hidden_dimension=25, datasets_manager=data_manager
    )
    embedder = ConcatEmbedders([word_embedder, char_embedder])

    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=embedder,
        hidden_dim=350,
        bidirectional=True,
        combine_strategy="concat",
        rnn_bias=True,
        device=torch.device("cpu"),
        num_layers=2,
    )

    model = RnnSeqCrfTagger(
        rnn2seqencoder=lstm2seqencoder,
        encoding_dim=700,
        datasets_manager=data_manager,
        namespace_to_constraints=None,
        tagging_type="BIOUL",
    )

    infer = SequenceLabellingInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints", "best_model.pt")),
        datasets_manager=data_manager,
    )

    return infer
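
A hedged usage sketch (the experiment directory is hypothetical; it must contain checkpoints/best_model.pt, and DATA_DIR must point at the ScienceIE CoNLL files):

infer = build_science_ie_model("experiments/science_ie")  # hypothetical path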
Example #4
    def build_model(self):

        word_embedder = TrainableWordEmbedder(
            embedding_type=self.hparams.get("emb_type"),
            datasets_manager=self.data_manager,
        )

        char_embedder = CharEmbedder(
            char_embedding_dimension=self.hparams.get("char_emb_dim"),
            hidden_dimension=self.hparams.get("char_encoder_hidden_dim"),
            datasets_manager=self.data_manager,
        )

        elmo_embedder = BowElmoEmbedder(datasets_manager=self.data_manager)

        embedder = ConcatEmbedders([word_embedder, char_embedder, elmo_embedder])

        lstm2seqencoder = Lstm2SeqEncoder(
            embedder=embedder,
            hidden_dim=self.hparams.get("hidden_dim"),
            bidirectional=self.hparams.get("bidirectional"),
            combine_strategy=self.hparams.get("combine_strategy"),
            rnn_bias=True,
        )

        model = RnnSeqCrfTagger(
            rnn2seqencoder=lstm2seqencoder,
            encoding_dim=2 * self.hparams.get("hidden_dim")
            if self.hparams.get("bidirectional")
            and self.hparams.get("combine_strategy") == "concat"
            else self.hparams.get("hidden_dim"),
            datasets_manager=self.data_manager,
        )

        self.printer.good("Finished Loading the Model")
        return model
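
ConcatEmbedders concatenates the word, character, and ELMo representations along the feature dimension, so the encoder's input size is the sum of the three embedding widths. An illustrative calculation (all numbers are assumptions, not values from the source):

# hypothetical widths: 100-dim word vectors, 50-dim char encoding,
# 1024-dim ELMo vectors
encoder_input_dim = 100 + 50 + 1024  # = 1174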
Example #5
    args = parser.parse_args()
    msg_printer = wasabi.Printer()

    DATA_DIR = pathlib.Path(DATA_DIR)
    train_filename = DATA_DIR.joinpath("train_science_ie_conll.txt")
    dev_filename = DATA_DIR.joinpath("dev_science_ie_conll.txt")

    data_manager = CoNLLDatasetManager(
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=dev_filename,  # dev split reused as the test split
        column_names=["TASK", "PROCESS", "MATERIAL"],
    )

    embedder = TrainableWordEmbedder(embedding_type=args.emb_type,
                                     datasets_manager=data_manager,
                                     device=args.device)

    char_embedder = CharEmbedder(
        char_embedding_dimension=args.char_emb_dim,
        hidden_dimension=args.char_encoder_hidden_dim,
        datasets_manager=data_manager,
        device=args.device,
    )
    embedder = ConcatEmbedders([embedder, char_embedder])

    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=args.dropout,
        hidden_dim=args.hidden_dim,
        bidirectional=args.bidirectional,
        # the source snippet is truncated here; the remaining arguments are
        # completed following the identical call in Example #7
        combine_strategy=args.combine_strategy,
        rnn_bias=True,
        device=args.device,
        num_layers=args.num_layers,
    )
Example #6
def setup_embedder(setup_parscit_dataset_manager, request):
    data_manager = setup_parscit_dataset_manager
    embedding_type = request.param
    embedder = TrainableWordEmbedder(datasets_manager=data_manager,
                                     embedding_type=embedding_type)
    return embedder, data_manager
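
The use of request.param marks this as a parametrized pytest fixture. A hedged sketch of how it might be declared and consumed (the decorator's parameter list and the test body are assumptions):

import pytest

@pytest.fixture(params=["glove_6B_50", "glove_6B_100"])  # hypothetical embedding types
def setup_embedder(setup_parscit_dataset_manager, request):
    ...

def test_embedder_is_built(setup_embedder):
    embedder, data_manager = setup_embedder
    assert embedder is not None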
Example #7
    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=args.dropout,
        hidden_dim=args.hidden_dim,
        bidirectional=args.bidirectional,
        combine_strategy=args.combine_strategy,
        rnn_bias=True,
        device=args.device,
        num_layers=args.num_layers,
        add_projection_layer=args.add_projection_layer,
    )

    attn = DotProductAttention()
    context_embedder = TrainableWordEmbedder(
        embedding_type="glove_6B_300", datasets_manager=data_manager, device=args.device
    )
    lstm2seq_attn_encoder = Lstm2SeqAttnContextEncoder(
        rnn2seqencoder=lstm2seqencoder,
        attn_module=attn,
        context_embedder=context_embedder,
        device=args.device,
    )
    model = RnnSeqCrfTagger(
        rnn2seqencoder=lstm2seq_attn_encoder,
        encoding_dim=600,
        device=args.device,
        tagging_type="BIOUL",
        datasets_manager=data_manager,
        include_start_end_transitions=False,
    )