Example #1
    def _get_model(self) -> nn.Module:
        embedding_type = self.hparams.get("emb_type")
        word_embedder = WordEmbedder(embedding_type=embedding_type)
        elmo_embedder = ElmoEmbedder(datasets_manager=self.data_manager)
        embedder = ConcatEmbedders([word_embedder, elmo_embedder])

        hidden_dim = self.hparams.get("hidden_dim")
        combine_strategy = self.hparams.get("combine_strategy")
        bidirectional = self.hparams.get("bidirectional")

        encoder = LSTM2VecEncoder(
            embedder=embedder,
            hidden_dim=hidden_dim,
            combine_strategy=combine_strategy,
            bidirectional=bidirectional,
        )

        # The encoding doubles only when the two directional hidden states are concatenated
        classifier_encoding_dim = (
            2 * hidden_dim if bidirectional and combine_strategy == "concat" else hidden_dim
        )
        model = SimpleClassifier(
            encoder=encoder,
            encoding_dim=classifier_encoding_dim,
            num_classes=3,
            classification_layer_bias=True,
            datasets_manager=self.data_manager,
        )
        return model
Example #2
    def _get_model(self):
        elmo_embedder = BowElmoEmbedder(layer_aggregation="sum")

        # instantiate the vanilla embedder
        vanilla_embedder = WordEmbedder(
            embedding_type=self.hparams.get("emb_type"))

        # concat the embeddings
        embedder = ConcatEmbedders([vanilla_embedder, elmo_embedder])

        hidden_dim = self.hparams.get("hidden_dim")
        bidirectional = self.hparams.get("bidirectional")
        combine_strategy = self.hparams.get("combine_strategy")

        encoder = LSTM2VecEncoder(
            embedder=embedder,
            hidden_dim=hidden_dim,
            bidirectional=bidirectional,
            combine_strategy=combine_strategy,
        )

        encoding_dim = (2 * hidden_dim if bidirectional
                        and combine_strategy == "concat" else hidden_dim)

        model = SimpleClassifier(
            encoder=encoder,
            encoding_dim=encoding_dim,
            num_classes=23,
            classification_layer_bias=True,
            datasets_manager=self.data_manager,
        )

        return model
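Every example here sizes the classifier input with the same rule: LSTM2VecEncoder emits 2 * hidden_dim only when the LSTM is bidirectional and the two final hidden states are concatenated, and hidden_dim otherwise. A minimal sketch of that rule as a standalone helper (the name lstm2vec_encoding_dim is ours, not a library API, and "sum" in the checks below is assumed to be an accepted non-concat strategy):

def lstm2vec_encoding_dim(hidden_dim: int, bidirectional: bool, combine_strategy: str) -> int:
    # "concat" joins the forward and backward final hidden states;
    # any other accepted strategy keeps the width at hidden_dim.
    if bidirectional and combine_strategy == "concat":
        return 2 * hidden_dim
    return hidden_dim


assert lstm2vec_encoding_dim(512, True, "concat") == 1024
assert lstm2vec_encoding_dim(512, True, "sum") == 512
assert lstm2vec_encoding_dim(512, False, "concat") == 512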
Example #3
def setup_lstm2vecencoder(request):
    hidden_dimension = 1024
    combine_strategy = request.param[1]
    bidirectional = request.param[0]
    embedder = WordEmbedder(embedding_type="glove_6B_50")

    encoder = LSTM2VecEncoder(
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=hidden_dimension,
        bidirectional=bidirectional,
        combine_strategy=combine_strategy,
        rnn_bias=False,
    )

    texts = ["First sentence", "second sentence"]
    lines = []
    for text in texts:
        line = Line(text=text)
        lines.append(line)

    return (
        encoder,
        {
            "hidden_dim": 2 * hidden_dimension
            if bidirectional and combine_strategy == "concat"
            else hidden_dimension,
            "bidirectional": False,
            "combine_strategy": combine_strategy,
            "lines": lines,
        },
    )
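The fixture above reads its settings from request.param, which implies a parametrized fixture. A hedged sketch of how such a fixture is typically declared and consumed; the (bidirectional, combine_strategy) pairs and the test body are illustrative assumptions, not from the source:

import pytest

@pytest.fixture(params=[(False, "concat"), (True, "concat"), (False, "sum")])
def setup_lstm2vecencoder(request):
    ...  # fixture body as in Example #3

def test_expected_hidden_dim(setup_lstm2vecencoder):
    encoder, options = setup_lstm2vecencoder
    # 2 * 1024 only for the bidirectional "concat" case, per the rule above
    assert options["hidden_dim"] in (1024, 2048)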
Example #4
    def build_model(self):
        word_embedder = WordEmbedder(
            embedding_type=self.hparams.get("embedding_type"),
            device=self.hparams.get("device"))
        elmo_embedder = ElmoEmbedder(device=self.hparams.get("device"))

        embedder = ConcatEmbedders([word_embedder, elmo_embedder])

        encoder = LSTM2VecEncoder(
            embedder=embedder,
            hidden_dim=self.hparams.get("hidden_dim"),
            combine_strategy=self.hparams.get("combine_strategy"),
            bidirectional=self.hparams.get("bidirectional"),
            device=torch.device(self.hparams.get("device")),
        )

        classifier_encoding_dim = (
            2 * self.hparams.get("hidden_dim")
            if self.hparams.get("bidirectional")
            and self.hparams.get("combine_strategy") == "concat"
            else self.hparams.get("hidden_dim")
        )
        model = SimpleClassifier(
            encoder=encoder,
            encoding_dim=classifier_encoding_dim,
            num_classes=self.hparams.get("num_classes"),
            classification_layer_bias=True,
            datasets_manager=self.data_manager,
            device=self.hparams.get("device"),
        )

        return model
Example #5
    def test_raises_error_on_wrong_combine_strategy(self,
                                                    setup_lstm2vecencoder):
        with pytest.raises(AssertionError):

            encoder = LSTM2VecEncoder(
                emb_dim=300,
                embedder=VanillaEmbedder(nn.Embedding(10, 1024),
                                         embedding_dim=1024),
                combine_strategy="add",
            )
Example #6
def build_sectlabel_elmobilstm_model(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    DATA_PATH = pathlib.Path(DATA_DIR)

    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )
    DEVICE = "cpu"
    EMBEDDING_TYPE = "glove_6B_50"
    HIDDEN_DIM = 512
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"

    elmo_embedder = BowElmoEmbedder(
        cuda_device_id=-1 if DEVICE == "cpu" else int(DEVICE.split("cuda:")[1])
    )

    vanilla_embedder = WordEmbedder(embedding_type=EMBEDDING_TYPE)

    embedders = ConcatEmbedders([vanilla_embedder, elmo_embedder])

    encoder = LSTM2VecEncoder(
        embedder=embedders,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        device=torch.device(DEVICE),
    )

    encoding_dim = (2 * HIDDEN_DIM if BIDIRECTIONAL
                    and COMBINE_STRATEGY == "concat" else HIDDEN_DIM)

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=encoding_dim,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )

    inference = ClassificationInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints",
                                                "best_model.pt")),
        datasets_manager=data_manager,
    )
    return inference
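The cuda_device_id expression above maps a device string to the integer id the embedder expects; since the same inline ternary recurs in Examples #12 and #14, here it is as a small standalone helper (the name cuda_device_id_from_str is ours, not a library API):

def cuda_device_id_from_str(device: str) -> int:
    # "cpu" maps to -1; "cuda:N" maps to N, mirroring the inline expression above
    return -1 if device == "cpu" else int(device.split("cuda:")[1])


assert cuda_device_id_from_str("cpu") == -1
assert cuda_device_id_from_str("cuda:0") == 0
assert cuda_device_id_from_str("cuda:3") == 3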
Example #7
def setup_lstm2vecencoder(request):
    emb_dim = 300
    time_steps = 10
    vocab_size = 100
    batch_size = 32
    embedding = nn.Embedding.from_pretrained(torch.zeros([vocab_size,
                                                          emb_dim]))
    hidden_dimension = 1024
    combine_strategy = request.param[1]
    bidirectional = request.param[0]
    tokens = np.random.randint(0,
                               vocab_size - 1,
                               size=(batch_size, time_steps))
    tokens = torch.LongTensor(tokens)

    iter_dict = {"tokens": tokens}
    embedder = VanillaEmbedder(embedding=embedding, embedding_dim=emb_dim)

    encoder = LSTM2VecEncoder(
        emb_dim=emb_dim,
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=hidden_dimension,
        bidirectional=bidirectional,
        combine_strategy=combine_strategy,
        rnn_bias=False,
    )

    return (
        encoder,
        {
            "emb_dim": emb_dim,
            "vocab_size": vocab_size,
            "hidden_dim": 2 * hidden_dimension
            if bidirectional and combine_strategy == "concat"
            else hidden_dimension,
            "bidirectional": bidirectional,
            "combine_strategy": combine_strategy,
            "tokens": tokens,
            "batch_size": batch_size,
            "iter_dict": iter_dict,
        },
    )
Example #8
def get_bilstm_lc_infer_parsect(dirname: str):

    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)

    EMBEDDING_DIM = config["EMBEDDING_DIMENSION"]
    HIDDEN_DIM = config["HIDDEN_DIMENSION"]
    COMBINE_STRATEGY = config["COMBINE_STRATEGY"]
    BIDIRECTIONAL = config["BIDIRECTIONAL"]
    VOCAB_SIZE = config["VOCAB_SIZE"]
    NUM_CLASSES = config["NUM_CLASSES"]
    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]

    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")

    classifier_encoding_dim = (2 * HIDDEN_DIM if BIDIRECTIONAL
                               and COMBINE_STRATEGY == "concat" else HIDDEN_DIM)

    embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
    embedder = VanillaEmbedder(embedding_dim=EMBEDDING_DIM,
                               embedding=embedding)

    encoder = LSTM2VecEncoder(
        emb_dim=EMBEDDING_DIM,
        embedder=embedder,
        hidden_dim=HIDDEN_DIM,
        combine_strategy=COMBINE_STRATEGY,
        bidirectional=BIDIRECTIONAL,
    )

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=classifier_encoding_dim,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    dataset = SectLabelDataset(**test_dataset_args)

    inference = ClassificationInference(model=model,
                                        model_filepath=model_filepath,
                                        dataset=dataset)

    return inference
Example #9
def build_sectlabel_bilstm_model(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    DATA_PATH = pathlib.Path(DATA_DIR)

    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )

    HIDDEN_DIM = 512
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"

    classifier_encoding_dim = 2 * HIDDEN_DIM if BIDIRECTIONAL else HIDDEN_DIM

    embedder = WordEmbedder(embedding_type="glove_6B_50")

    encoder = LSTM2VecEncoder(
        embedder=embedder,
        hidden_dim=HIDDEN_DIM,
        combine_strategy=COMBINE_STRATEGY,
        bidirectional=BIDIRECTIONAL,
    )

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=classifier_encoding_dim,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )

    inference = ClassificationInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints", "best_model.pt")),
        datasets_manager=data_manager,
    )

    return inference
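Builders like build_sectlabel_bilstm_model return a ClassificationInference already wired to the best checkpoint. A hedged usage sketch; run_test and report_metrics are assumptions about the inference object's API, not confirmed by these examples:

inference = build_sectlabel_bilstm_model("path/to/experiment_dir")
inference.run_test()        # assumed API: score the model on the test split
inference.report_metrics()  # assumed API: print the classification metrics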
Example #10
    def __init__(
            self,
            char_embedder: nn.Module,
            char_emb_dim: int,
            hidden_dim: int = 1024,
            bidirectional: bool = False,
            combine_strategy: str = "concat",
            device: torch.device = torch.device("cpu"),
    ):
        """ Encodes character tokens using lstms

        Parameters
        ----------
        char_embedder : nn.Module
            An embedder that embeds character tokens
        char_emb_dim : int
            The embedding of characters
        hidden_dim : int
            Hidden dimension of the LSTM
        bidirectional : bool
            Should the LSTM be bi-directional
        combine_strategy : str
            Combine strategy for the lstm hidden dimensions
        device : torch.device("cpu)
            The device on which the lstm will run
        """
        super(CharLSTMEncoder, self).__init__()
        self.char_embedder = char_embedder
        self.char_emb_dim = char_emb_dim
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional
        self.combine_strategy = combine_strategy
        self.device = device
        self.seq2vecencoder = LSTM2VecEncoder(
            embedder=self.char_embedder,
            emb_dim=char_emb_dim,
            hidden_dim=hidden_dim,
            bidirectional=bidirectional,
            combine_strategy=combine_strategy,
            rnn_bias=True,
            device=device,
        )
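Constructing the encoder above needs only a character embedder and its width. A minimal sketch reusing the VanillaEmbedder pattern from Examples #7 and #8 (library imports elided as in the other examples); the vocabulary and dimension sizes are illustrative:

import torch
import torch.nn as nn

CHAR_VOCAB_SIZE = 128  # illustrative character vocabulary size
CHAR_EMB_DIM = 25      # illustrative character embedding width

char_embedding = nn.Embedding(CHAR_VOCAB_SIZE, CHAR_EMB_DIM)
char_embedder = VanillaEmbedder(embedding=char_embedding, embedding_dim=CHAR_EMB_DIM)

char_encoder = CharLSTMEncoder(
    char_embedder=char_embedder,
    char_emb_dim=CHAR_EMB_DIM,
    hidden_dim=100,
    bidirectional=True,
    combine_strategy="concat",
    device=torch.device("cpu"),
)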
Example #11
    def build_model(self):
        embedder = WordEmbedder(
            embedding_type=self.hparams.get("embedding_type"))

        encoder = LSTM2VecEncoder(
            embedder=embedder,
            hidden_dim=self.hparams.get("hidden_dim"),
            combine_strategy=self.hparams.get("combine_strategy"),
            bidirectional=self.hparams.get("bidirectional"),
        )

        model = SimpleClassifier(
            encoder=encoder,
            encoding_dim=(
                2 * self.hparams.get("hidden_dim")
                if self.hparams.get("bidirectional")
                and self.hparams.get("combine_strategy") == "concat"
                else self.hparams.get("hidden_dim")
            ),
            num_classes=self.hparams.get("num_classes"),
            classification_layer_bias=True,
            datasets_manager=self.data_manager,
        )

        return model
Example #12
        layer_aggregation="sum",
        cuda_device_id=-1 if DEVICE == "cpu" else int(
            DEVICE.split("cuda:")[1]),
    )

    # instantiate the vanilla embedder
    vanilla_embedder = VanillaEmbedder(embedding=embeddings,
                                       embedding_dim=EMBEDDING_DIMENSION)

    # concat the embeddings
    embedder = ConcatEmbedders([vanilla_embedder, elmo_embedder])

    encoder = LSTM2VecEncoder(
        emb_dim=EMBEDDING_DIMENSION + 1024,
        embedder=embedder,
        hidden_dim=HIDDEN_DIMENSION,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        device=torch.device(DEVICE),
    )

    encoding_dim = (2 * HIDDEN_DIMENSION if BIDIRECTIONAL
                    and COMBINE_STRATEGY == "concat" else HIDDEN_DIMENSION)

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=encoding_dim,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
Example #13
    DATA_PATH = pathlib.Path(DATA_DIR)
    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )

    embedder = WordEmbedder(embedding_type=args.emb_type, device=args.device)
    encoder = LSTM2VecEncoder(
        embedder=embedder,
        hidden_dim=args.hidden_dim,
        combine_strategy=args.combine_strategy,
        bidirectional=args.bidirectional,
        device=torch.device(args.device),
    )

    classifier_encoding_dim = (2 * args.hidden_dim
                               if args.bidirectional
                               and args.combine_strategy == "concat"
                               else args.hidden_dim)
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=classifier_encoding_dim,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=data_manager,
        device=args.device,
    )
Example #14
def get_elmo_bilstm_lc_infer(dirname: str):

    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)

    DEVICE = config["DEVICE"]
    EMBEDDING_DIM = config["EMBEDDING_DIMENSION"]
    VOCAB_SIZE = config["VOCAB_SIZE"]
    HIDDEN_DIM = config["HIDDEN_DIMENSION"]
    BIDIRECTIONAL = config["BIDIRECTIONAL"]
    COMBINE_STRATEGY = config["COMBINE_STRATEGY"]
    NUM_CLASSES = config["NUM_CLASSES"]
    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]

    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")

    embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)

    elmo_embedder = BowElmoEmbedder(
        layer_aggregation="sum",
        cuda_device_id=-1 if DEVICE == "cpu" else int(DEVICE.split("cuda:")[1]),
    )

    vanilla_embedder = VanillaEmbedder(embedding=embedding,
                                       embedding_dim=EMBEDDING_DIM)

    embedders = ConcatEmbedders([vanilla_embedder, elmo_embedder])

    encoder = LSTM2VecEncoder(
        emb_dim=EMBEDDING_DIM + 1024,
        embedder=embedders,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        device=torch.device(DEVICE),
    )

    encoding_dim = (2 * HIDDEN_DIM if BIDIRECTIONAL
                    and COMBINE_STRATEGY == "concat" else HIDDEN_DIM)

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=encoding_dim,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    dataset = SectLabelDataset(**test_dataset_args)

    inference = ClassificationInference(model=model,
                                        model_filepath=model_filepath,
                                        dataset=dataset)
    return inference
Example #15
    def test_raises_error_on_wrong_combine_strategy(self, setup_lstm2vecencoder):
        with pytest.raises(AssertionError):

            encoder = LSTM2VecEncoder(
                embedder=WordEmbedder("glove_6B_50"), combine_strategy="add"
            )