Example #1
File: i2b2.py Project: yyht/sciwing
    def _get_model(self) -> nn.Module:
        word_embedder = TrainableWordEmbedder(
            embedding_type=self.hparams.get("emb_type"),
            datasets_manager=self.data_manager,
        )

        elmo_embedder = BowElmoEmbedder(datasets_manager=self.data_manager,
                                        layer_aggregation="sum")

        embedder = ConcatEmbedders([word_embedder, elmo_embedder])

        lstm2seqencoder = Lstm2SeqEncoder(
            embedder=embedder,
            hidden_dim=self.hparams.get("hidden_dim"),
            bidirectional=self.hparams.get("bidirectional"),
            combine_strategy=self.hparams.get("combine_strategy"),
            rnn_bias=True,
            dropout_value=self.hparams.get("lstm2seq_dropout", 0.0),
            add_projection_layer=False,
        )
        model = RnnSeqCrfTagger(
            rnn2seqencoder=lstm2seqencoder,
            encoding_dim=2 * self.hparams.get("hidden_dim")
            if self.hparams.get("bidirectional")
            and self.hparams.get("combine_strategy") == "concat"
            else self.hparams.get("hidden_dim"),
            datasets_manager=self.data_manager,
        )

        return model
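The encoding_dim expression above recurs in every example on this page: a bidirectional LSTM with combine_strategy="concat" concatenates the forward and backward hidden states, so the CRF tagger sees an encoding twice as wide as hidden_dim. A minimal helper sketch makes the rule explicit (crf_encoding_dim is hypothetical, not part of sciwing):

def crf_encoding_dim(hidden_dim: int, bidirectional: bool, combine_strategy: str) -> int:
    # Hypothetical helper: width of the Lstm2SeqEncoder output that
    # RnnSeqCrfTagger consumes. Concatenating the forward and backward
    # states doubles the size; any other setting keeps it at hidden_dim.
    if bidirectional and combine_strategy == "concat":
        return 2 * hidden_dim
    return hidden_dim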
Example #2
    def build_model(self):
        embedder = TrainableWordEmbedder(
            embedding_type=self.hparams.get("emb_type"),
            datasets_manager=self.data_manager,
            device=self.hparams.get("device"),
        )

        embedder = ConcatEmbedders([embedder])

        lstm2seqencoder = Lstm2SeqEncoder(
            embedder=embedder,
            dropout_value=self.hparams.get("dropout"),
            hidden_dim=self.hparams.get("hidden_dim"),
            bidirectional=self.hparams.get("bidirectional"),
            combine_strategy=self.hparams.get("combine_strategy"),
            rnn_bias=True,
            device=self.hparams.get("device"),
            num_layers=self.hparams.get("num_layers"),
        )
        model = RnnSeqCrfTagger(
            rnn2seqencoder=lstm2seqencoder,
            encoding_dim=2 * self.hparams.get("hidden_dim")
            if self.hparams.get("bidirectional")
            and self.hparams.get("combine_strategy") == "concat"
            else self.hparams.get("hidden_dim"),
            device=self.hparams.get("device"),
            tagging_type="IOB1",
            datasets_manager=self.data_manager,
        )
        return model
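Note the single-element ConcatEmbedders([embedder]) wrapper in this example: even with one embedder, wrapping it keeps the interface uniform with the multi-embedder examples, so nothing downstream changes when a character or ELMo embedder is added later.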
Example #3
def setup_parscit_tagger(seq_dataset_manager):
    EMBEDDING_DIM = 100
    HIDDEN_DIM = 100
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"
    dataset_manager = seq_dataset_manager

    embedder = WordEmbedder(embedding_type="glove_6B_50")

    char_embedder = CharEmbedder(
        char_embedding_dimension=10,
        hidden_dimension=20,
        datasets_manager=dataset_manager,
    )
    embedder = ConcatEmbedders([embedder, char_embedder])

    encoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
        add_projection_layer=False,
    )

    tagger = RnnSeqCrfTagger(
        rnn2seqencoder=encoder,
        encoding_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIM,
        datasets_manager=dataset_manager,
    )

    return (
        tagger,
        dataset_manager,
        {
            "EMBEDDING_DIM": EMBEDDING_DIM,
            "HIDDEN_DIM": 2 * HIDDEN_DIM
            if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
            else HIDDEN_DIM,
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "EXPECTED_HIDDEN_DIM": 2 * HIDDEN_DIM
            if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL
            else HIDDEN_DIM,
        },
    )
Example #4
def build_science_ie_model(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    data_dir = pathlib.Path(DATA_DIR)
    train_filename = data_dir.joinpath("train_science_ie_conll.txt")
    dev_filename = data_dir.joinpath("dev_science_ie_conll.txt")
    # NOTE: the dev split is reused as the test split here
    test_filename = data_dir.joinpath("dev_science_ie_conll.txt")
    data_manager = CoNLLDatasetManager(
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=test_filename,
        column_names=["TASK", "PROCESS", "MATERIAL"],
    )

    word_embedder = TrainableWordEmbedder(
        embedding_type="glove_6B_100", datasets_manager=data_manager
    )
    char_embedder = CharEmbedder(
        char_embedding_dimension=20, hidden_dimension=25, datasets_manager=data_manager
    )
    embedder = ConcatEmbedders([word_embedder, char_embedder])

    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=embedder,
        hidden_dim=350,
        bidirectional=True,
        combine_strategy="concat",
        rnn_bias=True,
        device=torch.device("cpu"),
        num_layers=2,
    )

    model = RnnSeqCrfTagger(
        rnn2seqencoder=lstm2seqencoder,
        encoding_dim=700,  # 2 * hidden_dim (350) for the bidirectional "concat" encoder
        datasets_manager=data_manager,
        namespace_to_constraints=None,
        tagging_type="BIOUL",
    )

    infer = SequenceLabellingInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints", "best_model.pt")),
        datasets_manager=data_manager,
    )

    return infer
Example #5
def build_ner_biobert_model(hparams, dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    dataset = hparams.get("dataset")
    train_filename = f"./ner/{dataset}/{dataset}.train"
    dev_filename = f"./ner/{dataset}/{dataset}.dev"
    test_filename = f"./ner/{dataset}/{dataset}.test"

    data_manager = BioNERDatasetManager(
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=test_filename,
        column_names=["NER"],
        train_only="ner",
    )

    config = transformers.BertConfig(output_hidden_states=True, vocab_size=28996)

    model = AutoModelWithLMHead.from_pretrained("monologg/biobert_v1.1_pubmed", config=config)
    tokenizer = AutoTokenizer.from_pretrained("monologg/biobert_v1.1_pubmed")

    # TODO: Specify the max length

    biobert2seqencoder = Biobert2SeqEncoder(
        tokenizer=tokenizer,
        model=model,
        device=torch.device(hparams.get("device")),
    )

    model = RnnSeqCrfTagger(
        rnn2seqencoder=biobert2seqencoder,
        encoding_dim=768,  # hidden size of BERT-base / BioBERT
        device=torch.device(hparams.get("device")),
        datasets_manager=data_manager,
    )

    infer = SequenceLabellingInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints", "best_model.pt")),
        datasets_manager=data_manager,
    )

    return infer
Example #6
    def build_model(self):

        word_embedder = TrainableWordEmbedder(
            embedding_type=self.hparams.get("emb_type"),
            datasets_manager=self.data_manager,
        )

        char_embedder = CharEmbedder(
            char_embedding_dimension=self.hparams.get("char_emb_dim"),
            hidden_dimension=self.hparams.get("char_encoder_hidden_dim"),
            datasets_manager=self.data_manager,
        )

        elmo_embedder = BowElmoEmbedder(datasets_manager=self.data_manager)

        embedder = ConcatEmbedders([word_embedder, char_embedder, elmo_embedder])

        lstm2seqencoder = Lstm2SeqEncoder(
            embedder=embedder,
            hidden_dim=self.hparams.get("hidden_dim"),
            bidirectional=self.hparams.get("bidirectional"),
            combine_strategy=self.hparams.get("combine_strategy"),
            rnn_bias=True,
        )

        model = RnnSeqCrfTagger(
            rnn2seqencoder=lstm2seqencoder,
            encoding_dim=2 * self.hparams.get("hidden_dim")
            if self.hparams.get("bidirectional")
            and self.hparams.get("combine_strategy") == "concat"
            else self.hparams.get("hidden_dim"),
            datasets_manager=self.data_manager,
        )

        self.printer.good("Finished Loading the Model")
        return model
Example #7
    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=args.dropout,
        hidden_dim=args.hidden_dim,
        bidirectional=args.bidirectional,
        combine_strategy=args.combine_strategy,
        rnn_bias=True,
        device=torch.device(args.device),
        num_layers=args.num_layers,
        add_projection_layer=False,
    )
    model = RnnSeqCrfTagger(
        rnn2seqencoder=lstm2seqencoder,
        encoding_dim=2 * args.hidden_dim
        if args.bidirectional and args.combine_strategy == "concat"
        else args.hidden_dim,
        device=torch.device(args.device),
        tagging_type="BIOUL",
        datasets_manager=data_manager,
    )

    optimizer = optim.Adam(params=model.parameters(),
                           lr=args.lr,
                           weight_decay=args.reg)

    train_metric = TokenClassificationAccuracy(datasets_manager=data_manager)
    dev_metric = TokenClassificationAccuracy(datasets_manager=data_manager)
    test_metric = TokenClassificationAccuracy(datasets_manager=data_manager)

    engine = Engine(
        model=model,
        datasets_manager=data_manager,
Example #8
    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=word_embedder,
        dropout_value=args.dropout,
        hidden_dim=args.hidden_dim,
        bidirectional=args.bidirectional,
        combine_strategy=args.combine_strategy,
        rnn_bias=True,
        device=args.device,
        num_layers=args.num_layers,
        add_projection_layer=args.add_projection_layer,
    )
    model = RnnSeqCrfTagger(
        rnn2seqencoder=lstm2seqencoder,
        encoding_dim=2 * args.hidden_dim,
        device=args.device,
        tagging_type="BIOUL",
        datasets_manager=data_manager,
        include_start_end_trainsitions=False,
    )

    optimizer = optim.Adam(params=model.parameters(), lr=args.lr)

    train_metric = ConLL2003Metrics(datasets_manager=data_manager)
    dev_metric = ConLL2003Metrics(datasets_manager=data_manager)
    test_metric = ConLL2003Metrics(datasets_manager=data_manager)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer=optimizer,
        factor=0.1,
        mode="max",
        patience=25,
Example #9
def setup_parscit_inference(seq_dataset_manager, tmpdir_factory):
    HIDDEN_DIM = 100
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"
    dataset_manager = seq_dataset_manager

    embedder = WordEmbedder(embedding_type="glove_6B_50")

    char_embedder = CharEmbedder(
        char_embedding_dimension=10,
        hidden_dimension=20,
        datasets_manager=dataset_manager,
    )
    embedder = ConcatEmbedders([embedder, char_embedder])

    encoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
        add_projection_layer=False,
    )

    tagger = RnnSeqCrfTagger(
        rnn2seqencoder=encoder,
        encoding_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIM,
        datasets_manager=dataset_manager,
    )

    train_metric = TokenClassificationAccuracy(datasets_manager=dataset_manager)
    dev_metric = TokenClassificationAccuracy(datasets_manager=dataset_manager)
    test_metric = TokenClassificationAccuracy(datasets_manager=dataset_manager)

    optimizer = torch.optim.Adam(params=tagger.parameters())
    batch_size = 1
    save_dir = tmpdir_factory.mktemp("experiment_1")
    num_epochs = 1
    save_every = 1
    log_train_metrics_every = 10

    engine = Engine(
        model=tagger,
        datasets_manager=dataset_manager,
        optimizer=optimizer,
        batch_size=batch_size,
        save_dir=save_dir,
        num_epochs=num_epochs,
        save_every=save_every,
        log_train_metrics_every=log_train_metrics_every,
        train_metric=train_metric,
        validation_metric=dev_metric,
        test_metric=test_metric,
        track_for_best="macro_fscore",
    )

    engine.run()
    model_filepath = pathlib.Path(save_dir).joinpath("best_model.pt")

    inference_client = SequenceLabellingInference(
        model=tagger, model_filepath=model_filepath, datasets_manager=dataset_manager
    )

    return inference_client
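For orientation, a minimal sketch of exercising the returned client; run_test and report_metrics are assumptions modelled on sciwing's other inference examples and may not match your version:

# Hypothetical usage -- the two method names below are assumed, not confirmed API.
client = setup_parscit_inference(seq_dataset_manager, tmpdir_factory)
client.run_test()        # assumed: run the trained tagger over the test split
client.report_metrics()  # assumed: print the tracked metrics (macro_fscore, etc.)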