示例#1
0
def setup_sectlabel_bow_glove_infer(request, clf_datasets_manager,
                                    tmpdir_factory):
    track_for_best = request.param
    sample_proportion = 0.5
    datasets_manager = clf_datasets_manager
    word_embedder = WordEmbedder(embedding_type="glove_6B_50")
    bow_encoder = BOW_Encoder(embedder=word_embedder)
    classifier = SimpleClassifier(
        encoder=bow_encoder,
        encoding_dim=word_embedder.get_embedding_dimension(),
        num_classes=2,
        classification_layer_bias=True,
        datasets_manager=datasets_manager,
    )
    train_metric = PrecisionRecallFMeasure(datasets_manager=datasets_manager)
    validation_metric = PrecisionRecallFMeasure(
        datasets_manager=datasets_manager)
    test_metric = PrecisionRecallFMeasure(datasets_manager=datasets_manager)

    optimizer = torch.optim.Adam(params=classifier.parameters())
    batch_size = 1
    save_dir = tmpdir_factory.mktemp("experiment_1")
    num_epochs = 1
    save_every = 1
    log_train_metrics_every = 10

    engine = Engine(
        model=classifier,
        datasets_manager=datasets_manager,
        optimizer=optimizer,
        batch_size=batch_size,
        save_dir=save_dir,
        num_epochs=num_epochs,
        save_every=save_every,
        log_train_metrics_every=log_train_metrics_every,
        train_metric=train_metric,
        validation_metric=validation_metric,
        test_metric=test_metric,
        track_for_best=track_for_best,
        sample_proportion=sample_proportion,
    )

    engine.run()
    model_filepath = pathlib.Path(save_dir).joinpath("best_model.pt")
    infer = ClassificationInference(
        model=classifier,
        model_filepath=str(model_filepath),
        datasets_manager=datasets_manager,
    )
    return infer
示例#2
0
def build_sectlabel_bow_elmo_model(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    DATA_PATH = pathlib.Path(DATA_DIR)

    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )

    embedder = BowElmoEmbedder(layer_aggregation="last")
    encoder = BOW_Encoder(aggregation_type="sum", embedder=embedder)

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=1024,
        num_classes=data_manager.num_labels["label"],
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )

    infer_client = ClassificationInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints",
                                                "best_model.pt")),
        datasets_manager=data_manager,
    )

    return infer_client
示例#3
0
def setup_simple_classifier():
    BATCH_SIZE = 1
    NUM_TOKENS = 3
    EMB_DIM = 300
    VOCAB_SIZE = 10
    NUM_CLASSES = 3
    embedding = Embedding.from_pretrained(torch.zeros([VOCAB_SIZE, EMB_DIM]))
    embedder = VanillaEmbedder(embedding_dim=EMB_DIM, embedding=embedding)
    labels = torch.LongTensor([[1]])
    encoder = BOW_Encoder(emb_dim=EMB_DIM,
                          embedder=embedder,
                          dropout_value=0,
                          aggregation_type="sum")
    tokens = np.random.randint(0,
                               VOCAB_SIZE - 1,
                               size=(BATCH_SIZE, NUM_TOKENS))
    tokens = torch.LongTensor(tokens)
    simple_classifier = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMB_DIM,
        num_classes=NUM_CLASSES,
        classification_layer_bias=False,
    )
    iter_dict = {"tokens": tokens, "label": labels}
    return iter_dict, simple_classifier, BATCH_SIZE, NUM_CLASSES
    def _get_model(self) -> nn.Module:
        embedding_type = self.hparams.get("emb_type")
        word_embedder = WordEmbedder(embedding_type=embedding_type)
        elmo_embedder = ElmoEmbedder(datasets_manager=self.data_manager)
        embedder = ConcatEmbedders([word_embedder, elmo_embedder])

        hidden_dim = self.hparams.get("hidden_dim")
        combine_strategy = self.hparams.get("combine_strategy")
        bidirectional = self.hparams.get("bidirectional")

        encoder = LSTM2VecEncoder(
            embedder=embedder,
            hidden_dim=hidden_dim,
            combine_strategy=combine_strategy,
            bidirectional=bidirectional,
        )

        classifier_encoding_dim = 2 * hidden_dim if bidirectional else hidden_dim
        model = SimpleClassifier(
            encoder=encoder,
            encoding_dim=classifier_encoding_dim,
            num_classes=3,
            classification_layer_bias=True,
            datasets_manager=self.data_manager,
        )
        return model
示例#5
0
    def build_model(self):
        word_embedder = WordEmbedder(
            embedding_type=self.hparams.get("embedding_type"),
            device=self.hparams.get("device"))
        elmo_embedder = ElmoEmbedder(device=self.hparams.get("device"))

        embedder = ConcatEmbedders([word_embedder, elmo_embedder])

        encoder = LSTM2VecEncoder(
            embedder=embedder,
            hidden_dim=self.hparams.get("hidden_dim"),
            combine_strategy=self.hparams.get("combine_strategy"),
            bidirectional=self.hparams.get("bidirectional"),
            device=torch.device(self.hparams.get("device")),
        )

        classiier_encoding_dim = (2 * self.hparams.get("hidden_dim")
                                  if self.hparams.get("bidirectional") else
                                  self.hparams.get("hidden_dim"))
        model = SimpleClassifier(
            encoder=encoder,
            encoding_dim=classiier_encoding_dim,
            num_classes=self.hparams.get("num_classes"),
            classification_layer_bias=True,
            datasets_manager=self.data_manager,
            device=self.hparams.get("device"),
        )

        return model
示例#6
0
    def _get_model(self):
        elmo_embedder = BowElmoEmbedder(layer_aggregation="sum")

        # instantiate the vanilla embedder
        vanilla_embedder = WordEmbedder(
            embedding_type=self.hparams.get("emb_type"))

        # concat the embeddings
        embedder = ConcatEmbedders([vanilla_embedder, elmo_embedder])

        hidden_dim = self.hparams.get("hidden_dim")
        bidirectional = self.hparams.get("bidirectional")
        combine_strategy = self.hparams.get("combine_strategy")

        encoder = LSTM2VecEncoder(
            embedder=embedder,
            hidden_dim=hidden_dim,
            bidirectional=bidirectional,
            combine_strategy=combine_strategy,
        )

        encoding_dim = (2 * hidden_dim if bidirectional
                        and combine_strategy == "concat" else hidden_dim)

        model = SimpleClassifier(
            encoder=encoder,
            encoding_dim=encoding_dim,
            num_classes=23,
            classification_layer_bias=True,
            datasets_manager=self.data_manager,
        )

        return model
def build_sectlabel_elmobilstm_model(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    DATA_PATH = pathlib.Path(DATA_DIR)

    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )
    DEVICE = "cpu"
    EMBEDDING_TYPE = "glove_6B_50"
    HIDDEN_DIM = 512
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"

    elmo_embedder = BowElmoEmbedder(cuda_device_id=-1 if DEVICE ==
                                    "cpu" else int(DEVICE.split("cuda:")[1]))

    vanilla_embedder = WordEmbedder(embedding_type=EMBEDDING_TYPE)

    embedders = ConcatEmbedders([vanilla_embedder, elmo_embedder])

    encoder = LSTM2VecEncoder(
        embedder=embedders,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        device=torch.device(DEVICE),
    )

    encoding_dim = (2 * HIDDEN_DIM if BIDIRECTIONAL
                    and COMBINE_STRATEGY == "concat" else HIDDEN_DIM)

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=encoding_dim,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )

    inference = ClassificationInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints",
                                                "best_model.pt")),
        datasets_manager=data_manager,
    )
    return inference
示例#8
0
    def build_model(self):
        embedder = WordEmbedder(embedding_type=self.hparams.get("emb_type"))
        encoder = BOW_Encoder(embedder=embedder)

        model = SimpleClassifier(
            encoder=encoder,
            encoding_dim=self.hparams.get("encoding_dim"),
            num_classes=self.hparams.get("num_classes"),
            classification_layer_bias=True,
            datasets_manager=self.data_manager,
        )

        return model
示例#9
0
def get_bilstm_lc_infer_parsect(dirname: str):

    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)

    EMBEDDING_DIM = config["EMBEDDING_DIMENSION"]
    HIDDEN_DIM = config["HIDDEN_DIMENSION"]
    COMBINE_STRATEGY = config["COMBINE_STRATEGY"]
    BIDIRECTIONAL = config["BIDIRECTIONAL"]
    VOCAB_SIZE = config["VOCAB_SIZE"]
    NUM_CLASSES = config["NUM_CLASSES"]
    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]

    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")

    classifier_encoding_dim = 2 * HIDDEN_DIM if BIDIRECTIONAL else HIDDEN_DIM

    embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
    embedder = VanillaEmbedder(embedding_dim=EMBEDDING_DIM,
                               embedding=embedding)

    encoder = LSTM2VecEncoder(
        emb_dim=EMBEDDING_DIM,
        embedder=embedder,
        hidden_dim=HIDDEN_DIM,
        combine_strategy=COMBINE_STRATEGY,
        bidirectional=BIDIRECTIONAL,
    )

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=classifier_encoding_dim,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    dataset = SectLabelDataset(**test_dataset_args)

    inference = ClassificationInference(model=model,
                                        model_filepath=model_filepath,
                                        dataset=dataset)

    return inference
示例#10
0
def setup_simple_classifier(clf_dataset_manager):
    datasets_manager = clf_dataset_manager
    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder = BOW_Encoder(embedder=embedder)
    classifier = SimpleClassifier(
        encoder=encoder,
        encoding_dim=50,
        num_classes=2,
        datasets_manager=datasets_manager,
        classification_layer_bias=True,
    )
    train_dataset = datasets_manager.train_dataset
    lines, labels = train_dataset.get_lines_labels()

    return classifier, lines, labels
示例#11
0
    def build_model(self):
        embedder = BowElmoEmbedder(
            layer_aggregation=self.hparams.get("layer_aggregation"))

        encoder = BOW_Encoder(
            aggregation_type=self.hparams.get("word_aggregation"),
            embedder=embedder)

        model = SimpleClassifier(
            encoder=encoder,
            encoding_dim=self.hparams.get("encoding_dim"),
            num_classes=self.hparams.get("num_classes"),
            classification_layer_bias=True,
            datasets_manager=self.data_manager,
        )
        return model
示例#12
0
def get_bow_bert_emb_lc_gensect_infer(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)

    EMBEDDING_DIM = config["EMBEDDING_DIMENSION"]
    NUM_CLASSES = config["NUM_CLASSES"]
    BERT_TYPE = config["BERT_TYPE"]

    DEVICE = config["DEVICE"]
    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]

    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")

    embedder = BertEmbedder(
        emb_dim=EMBEDDING_DIM,
        dropout_value=0.0,
        aggregation_type="average",
        bert_type=BERT_TYPE,
        device=torch.device(DEVICE),
    )

    encoder = BOW_Encoder(
        embedder=embedder, emb_dim=EMBEDDING_DIM, aggregation_type="average"
    )

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMBEDDING_DIM,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    dataset = GenericSectDataset(**test_dataset_args)

    parsect_inference = ClassificationInference(
        model=model, model_filepath=model_filepath, dataset=dataset
    )

    return parsect_inference
示例#13
0
def build_sectlabel_bilstm_model(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    DATA_PATH = pathlib.Path(DATA_DIR)

    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )

    HIDDEN_DIM = 512
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"

    classifier_encoding_dim = 2 * HIDDEN_DIM if BIDIRECTIONAL else HIDDEN_DIM

    embedder = WordEmbedder(embedding_type="glove_6B_50")

    encoder = LSTM2VecEncoder(
        embedder=embedder,
        hidden_dim=HIDDEN_DIM,
        combine_strategy=COMBINE_STRATEGY,
        bidirectional=BIDIRECTIONAL,
    )

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=classifier_encoding_dim,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )

    inference = ClassificationInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints", "best_model.pt")),
        datasets_manager=data_manager,
    )

    return inference
示例#14
0
def get_bow_lc_parsect_infer(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)

    EMBEDDING_DIMENSION = config["EMBEDDING_DIMENSION"]
    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]
    VOCAB_SIZE = config["VOCAB_SIZE"]
    NUM_CLASSES = config["NUM_CLASSES"]

    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")

    embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIMENSION)
    embedder = VanillaEmbedder(embedding_dim=EMBEDDING_DIMENSION,
                               embedding=embedding)
    encoder = BOW_Encoder(
        emb_dim=EMBEDDING_DIMENSION,
        embedder=embedder,
        dropout_value=0.0,
        aggregation_type="sum",
    )

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMBEDDING_DIMENSION,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    dataset = SectLabelDataset(**test_dataset_args)

    dataset.print_stats()

    parsect_inference = ClassificationInference(model=model,
                                                model_filepath=model_filepath,
                                                dataset=dataset)

    return parsect_inference
示例#15
0
def build_sectlabel_bow_model(dirname: str):
    """

    Parameters
    ----------
    dirname : The directory where sciwing stores your outputs for the model

    Returns
    -------


    """
    exp_dirpath = pathlib.Path(dirname)
    DATA_PATH = pathlib.Path(DATA_DIR)

    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )

    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder = BOW_Encoder(embedder=embedder)
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=embedder.get_embedding_dimension(),
        num_classes=data_manager.num_labels["label"],
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )

    infer = ClassificationInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints",
                                                "best_model.pt")),
        datasets_manager=data_manager,
    )
    return infer
示例#16
0
def get_elmo_emb_lc_infer_gensect(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)

    EMBEDDING_DIM = config["EMBEDDING_DIMENSION"]

    NUM_CLASSES = config["NUM_CLASSES"]
    EMBEDDING_DIMENSION = config["EMBEDDING_DIMENSION"]
    LAYER_AGGREGATION = config["LAYER_AGGREGATION"]
    WORD_AGGREGATION = config["WORD_AGGREGATION"]

    embedder = BowElmoEmbedder(emb_dim=EMBEDDING_DIMENSION,
                               layer_aggregation=LAYER_AGGREGATION)
    encoder = BOW_Encoder(
        emb_dim=EMBEDDING_DIMENSION,
        embedder=embedder,
        aggregation_type=WORD_AGGREGATION,
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMBEDDING_DIM,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]
    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")

    dataset = GenericSectDataset(**test_dataset_args)

    parsect_inference = ClassificationInference(model=model,
                                                model_filepath=model_filepath,
                                                dataset=dataset)

    return parsect_inference
示例#17
0
    def build_model(self):
        embedder = WordEmbedder(
            embedding_type=self.hparams.get("embedding_type"))

        encoder = LSTM2VecEncoder(
            embedder=embedder,
            hidden_dim=self.hparams.get("hidden_dim"),
            combine_strategy=self.hparams.get("combine_strategy"),
            bidirectional=self.hparams.get("bidirectional"),
        )

        model = SimpleClassifier(
            encoder=encoder,
            encoding_dim=2 * self.hparams.get("hidden_dim"),
            num_classes=self.hparams.get("num_classes"),
            classification_layer_bias=True,
            datasets_manager=self.data_manager,
        )

        return model
示例#18
0
def build_sectlabel_bow_bert(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    DATA_PATH = pathlib.Path(DATA_DIR)

    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )

    embedder = BertEmbedder(
        dropout_value=0.0,
        aggregation_type="average",
        bert_type="bert-base-uncased",
        device=torch.device("cpu"),
    )

    encoder = BOW_Encoder(embedder=embedder, aggregation_type="average")
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=768,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )

    parsect_inference = ClassificationInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints",
                                                "best_model.pt")),
        datasets_manager=data_manager,
    )

    return parsect_inference
示例#19
0
    encoder = LSTM2VecEncoder(
        emb_dim=EMBEDDING_DIMENSION + 1024,
        embedder=embedder,
        hidden_dim=HIDDEN_DIMENSION,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        device=torch.device(DEVICE),
    )

    encoding_dim = (2 * HIDDEN_DIMENSION if BIDIRECTIONAL
                    and COMBINE_STRATEGY == "concat" else HIDDEN_DIMENSION)

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=encoding_dim,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
    metric = PrecisionRecallFMeasure(
        idx2labelname_mapping=train_dataset.idx2classname)

    engine = Engine(
        model=model,
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
        test_dataset=test_dataset,
        optimizer=optimizer,
        batch_size=BATCH_SIZE,
        save_dir=MODEL_SAVE_DIR,
示例#20
0
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )

    # BowElmoEmbedder embeds sentences using ELMO
    embedder = BowElmoEmbedder(layer_aggregation=args.layer_aggregation,
                               device=args.device)

    encoder = BOW_Encoder(embedder=embedder,
                          aggregation_type=args.word_aggregation,
                          device=args.device)

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=1024,
        num_classes=data_manager.num_labels["label"],
        classification_layer_bias=True,
        datasets_manager=data_manager,
        device=args.device,
    )

    optimizer = optim.Adam(params=model.parameters(), lr=args.lr)
    train_metric = PrecisionRecallFMeasure(datasets_manager=data_manager)
    dev_metric = PrecisionRecallFMeasure(datasets_manager=data_manager)
    test_metric = PrecisionRecallFMeasure(datasets_manager=data_manager)

    engine = Engine(
        model=model,
        datasets_manager=data_manager,
        optimizer=optimizer,
        batch_size=args.bs,
        save_dir=args.model_save_dir,
示例#21
0
def setup_engine_test_with_simple_classifier(request, tmpdir_factory):
    MAX_NUM_WORDS = 1000
    MAX_LENGTH = 50
    vocab_store_location = tmpdir_factory.mktemp("tempdir").join("vocab.json")
    DEBUG = True
    BATCH_SIZE = 1
    NUM_TOKENS = 3
    EMB_DIM = 300

    train_dataset = SectLabelDataset(
        filename=SECT_LABEL_FILE,
        dataset_type="train",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        word_vocab_store_location=vocab_store_location,
        debug=DEBUG,
        word_embedding_type="random",
        word_embedding_dimension=EMB_DIM,
    )

    validation_dataset = SectLabelDataset(
        filename=SECT_LABEL_FILE,
        dataset_type="valid",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        word_vocab_store_location=vocab_store_location,
        debug=DEBUG,
        word_embedding_type="random",
        word_embedding_dimension=EMB_DIM,
    )

    test_dataset = SectLabelDataset(
        filename=SECT_LABEL_FILE,
        dataset_type="test",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        word_vocab_store_location=vocab_store_location,
        debug=DEBUG,
        word_embedding_type="random",
        word_embedding_dimension=EMB_DIM,
    )

    VOCAB_SIZE = MAX_NUM_WORDS + len(train_dataset.word_vocab.special_vocab)
    NUM_CLASSES = train_dataset.get_num_classes()
    NUM_EPOCHS = 1
    embedding = Embedding.from_pretrained(torch.zeros([VOCAB_SIZE, EMB_DIM]))
    labels = torch.LongTensor([1])
    metric = PrecisionRecallFMeasure(
        idx2labelname_mapping=train_dataset.idx2classname)
    embedder = VanillaEmbedder(embedding_dim=EMB_DIM, embedding=embedding)
    encoder = BOW_Encoder(emb_dim=EMB_DIM,
                          embedder=embedder,
                          dropout_value=0,
                          aggregation_type="sum")
    tokens = np.random.randint(0,
                               VOCAB_SIZE - 1,
                               size=(BATCH_SIZE, NUM_TOKENS))
    tokens = torch.LongTensor(tokens)
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMB_DIM,
        num_classes=NUM_CLASSES,
        classification_layer_bias=False,
    )

    optimizer = optim.SGD(model.parameters(), lr=0.01)
    engine = Engine(
        model,
        train_dataset,
        validation_dataset,
        test_dataset,
        optimizer=optimizer,
        batch_size=BATCH_SIZE,
        save_dir=tmpdir_factory.mktemp("model_save"),
        num_epochs=NUM_EPOCHS,
        save_every=1,
        log_train_metrics_every=10,
        metric=metric,
        track_for_best=request.param,
    )

    options = {
        "MAX_NUM_WORDS": MAX_NUM_WORDS,
        "MAX_LENGTH": MAX_LENGTH,
        "BATCH_SIZE": BATCH_SIZE,
        "NUM_TOKENS": NUM_TOKENS,
        "EMB_DIM": EMB_DIM,
        "VOCAB_SIZE": VOCAB_SIZE,
        "NUM_CLASSES": NUM_CLASSES,
        "NUM_EPOCHS": NUM_EPOCHS,
    }

    return engine, tokens, labels, options
示例#22
0
def get_elmo_bilstm_lc_infer(dirname: str):

    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)

    DEVICE = config["DEVICE"]
    EMBEDDING_DIM = config["EMBEDDING_DIMENSION"]
    VOCAB_SIZE = config["VOCAB_SIZE"]
    HIDDEN_DIM = config["HIDDEN_DIMENSION"]
    BIDIRECTIONAL = config["BIDIRECTIONAL"]
    COMBINE_STRATEGY = config["COMBINE_STRATEGY"]
    NUM_CLASSES = config["NUM_CLASSES"]
    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]

    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")

    embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)

    elmo_embedder = BowElmoEmbedder(
        layer_aggregation="sum",
        cuda_device_id=-1 if DEVICE == "cpu" else int(
            DEVICE.split("cuda:")[1]),
    )

    vanilla_embedder = VanillaEmbedder(embedding=embedding,
                                       embedding_dim=EMBEDDING_DIM)

    embedders = ConcatEmbedders([vanilla_embedder, elmo_embedder])

    encoder = LSTM2VecEncoder(
        emb_dim=EMBEDDING_DIM + 1024,
        embedder=embedders,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        device=torch.device(DEVICE),
    )

    encoding_dim = (2 * HIDDEN_DIM if BIDIRECTIONAL
                    and COMBINE_STRATEGY == "concat" else HIDDEN_DIM)

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=encoding_dim,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    dataset = SectLabelDataset(**test_dataset_args)

    inference = ClassificationInference(model=model,
                                        model_filepath=model_filepath,
                                        dataset=dataset)
    return inference
示例#23
0
    dev_filename = DATA_DIR.joinpath("genericSect.dev")
    test_filename = DATA_DIR.joinpath("genericSect.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=test_filename,
    )

    embedder = WordEmbedder(embedding_type=args.emb_type)
    encoder = BOW_Encoder(embedder=embedder)

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=50,
        num_classes=12,
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )

    optimizer = optim.Adam(params=model.parameters(), lr=args.lr)
    train_metric = PrecisionRecallFMeasure(datasets_manager=data_manager)
    dev_metric = PrecisionRecallFMeasure(datasets_manager=data_manager)
    test_metric = PrecisionRecallFMeasure(datasets_manager=data_manager)

    engine = Engine(
        datasets_manager=data_manager,
        model=model,
        optimizer=optimizer,
        batch_size=args.bs,
        save_dir=args.model_save_dir,