Exemplo n.º 1
0
def build_sectlabel_bow_elmo_model(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    DATA_PATH = pathlib.Path(DATA_DIR)

    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )

    embedder = BowElmoEmbedder(layer_aggregation="last")
    encoder = BOW_Encoder(aggregation_type="sum", embedder=embedder)

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=1024,
        num_classes=data_manager.num_labels["label"],
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )

    infer_client = ClassificationInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints",
                                                "best_model.pt")),
        datasets_manager=data_manager,
    )

    return infer_client
Exemplo n.º 2
0
 def _get_infer_client(self):
     client = ClassificationInference(
         model=self.model,
         model_filepath=self.final_model_dir.joinpath("best_model.pt"),
         datasets_manager=self.data_manager,
     )
     return client
Exemplo n.º 3
0
 def build_infer(self):
     parsect_inference = ClassificationInference(
         model=self.model,
         model_filepath=self.hparams.get("model_filepath"),
         datasets_manager=self.data_manager,
     )
     return parsect_inference
Exemplo n.º 4
0
def build_sectlabel_elmobilstm_model(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    DATA_PATH = pathlib.Path(DATA_DIR)

    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )
    DEVICE = "cpu"
    EMBEDDING_TYPE = "glove_6B_50"
    HIDDEN_DIM = 512
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"

    elmo_embedder = BowElmoEmbedder(cuda_device_id=-1 if DEVICE ==
                                    "cpu" else int(DEVICE.split("cuda:")[1]))

    vanilla_embedder = WordEmbedder(embedding_type=EMBEDDING_TYPE)

    embedders = ConcatEmbedders([vanilla_embedder, elmo_embedder])

    encoder = LSTM2VecEncoder(
        embedder=embedders,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        device=torch.device(DEVICE),
    )

    encoding_dim = (2 * HIDDEN_DIM if BIDIRECTIONAL
                    and COMBINE_STRATEGY == "concat" else HIDDEN_DIM)

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=encoding_dim,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )

    inference = ClassificationInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints",
                                                "best_model.pt")),
        datasets_manager=data_manager,
    )
    return inference
Exemplo n.º 5
0
def get_bilstm_lc_infer_parsect(dirname: str):

    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)

    EMBEDDING_DIM = config["EMBEDDING_DIMENSION"]
    HIDDEN_DIM = config["HIDDEN_DIMENSION"]
    COMBINE_STRATEGY = config["COMBINE_STRATEGY"]
    BIDIRECTIONAL = config["BIDIRECTIONAL"]
    VOCAB_SIZE = config["VOCAB_SIZE"]
    NUM_CLASSES = config["NUM_CLASSES"]
    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]

    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")

    classifier_encoding_dim = 2 * HIDDEN_DIM if BIDIRECTIONAL else HIDDEN_DIM

    embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
    embedder = VanillaEmbedder(embedding_dim=EMBEDDING_DIM,
                               embedding=embedding)

    encoder = LSTM2VecEncoder(
        emb_dim=EMBEDDING_DIM,
        embedder=embedder,
        hidden_dim=HIDDEN_DIM,
        combine_strategy=COMBINE_STRATEGY,
        bidirectional=BIDIRECTIONAL,
    )

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=classifier_encoding_dim,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    dataset = SectLabelDataset(**test_dataset_args)

    inference = ClassificationInference(model=model,
                                        model_filepath=model_filepath,
                                        dataset=dataset)

    return inference
Exemplo n.º 6
0
def setup_sectlabel_bow_glove_infer(request, clf_datasets_manager,
                                    tmpdir_factory):
    track_for_best = request.param
    sample_proportion = 0.5
    datasets_manager = clf_datasets_manager
    word_embedder = WordEmbedder(embedding_type="glove_6B_50")
    bow_encoder = BOW_Encoder(embedder=word_embedder)
    classifier = SimpleClassifier(
        encoder=bow_encoder,
        encoding_dim=word_embedder.get_embedding_dimension(),
        num_classes=2,
        classification_layer_bias=True,
        datasets_manager=datasets_manager,
    )
    train_metric = PrecisionRecallFMeasure(datasets_manager=datasets_manager)
    validation_metric = PrecisionRecallFMeasure(
        datasets_manager=datasets_manager)
    test_metric = PrecisionRecallFMeasure(datasets_manager=datasets_manager)

    optimizer = torch.optim.Adam(params=classifier.parameters())
    batch_size = 1
    save_dir = tmpdir_factory.mktemp("experiment_1")
    num_epochs = 1
    save_every = 1
    log_train_metrics_every = 10

    engine = Engine(
        model=classifier,
        datasets_manager=datasets_manager,
        optimizer=optimizer,
        batch_size=batch_size,
        save_dir=save_dir,
        num_epochs=num_epochs,
        save_every=save_every,
        log_train_metrics_every=log_train_metrics_every,
        train_metric=train_metric,
        validation_metric=validation_metric,
        test_metric=test_metric,
        track_for_best=track_for_best,
        sample_proportion=sample_proportion,
    )

    engine.run()
    model_filepath = pathlib.Path(save_dir).joinpath("best_model.pt")
    infer = ClassificationInference(
        model=classifier,
        model_filepath=str(model_filepath),
        datasets_manager=datasets_manager,
    )
    return infer
Exemplo n.º 7
0
def get_bow_bert_emb_lc_gensect_infer(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)

    EMBEDDING_DIM = config["EMBEDDING_DIMENSION"]
    NUM_CLASSES = config["NUM_CLASSES"]
    BERT_TYPE = config["BERT_TYPE"]

    DEVICE = config["DEVICE"]
    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]

    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")

    embedder = BertEmbedder(
        emb_dim=EMBEDDING_DIM,
        dropout_value=0.0,
        aggregation_type="average",
        bert_type=BERT_TYPE,
        device=torch.device(DEVICE),
    )

    encoder = BOW_Encoder(
        embedder=embedder, emb_dim=EMBEDDING_DIM, aggregation_type="average"
    )

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMBEDDING_DIM,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    dataset = GenericSectDataset(**test_dataset_args)

    parsect_inference = ClassificationInference(
        model=model, model_filepath=model_filepath, dataset=dataset
    )

    return parsect_inference
Exemplo n.º 8
0
def get_bow_lc_parsect_infer(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)

    EMBEDDING_DIMENSION = config["EMBEDDING_DIMENSION"]
    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]
    VOCAB_SIZE = config["VOCAB_SIZE"]
    NUM_CLASSES = config["NUM_CLASSES"]

    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")

    embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIMENSION)
    embedder = VanillaEmbedder(embedding_dim=EMBEDDING_DIMENSION,
                               embedding=embedding)
    encoder = BOW_Encoder(
        emb_dim=EMBEDDING_DIMENSION,
        embedder=embedder,
        dropout_value=0.0,
        aggregation_type="sum",
    )

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMBEDDING_DIMENSION,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    dataset = SectLabelDataset(**test_dataset_args)

    dataset.print_stats()

    parsect_inference = ClassificationInference(model=model,
                                                model_filepath=model_filepath,
                                                dataset=dataset)

    return parsect_inference
Exemplo n.º 9
0
def build_sectlabel_bilstm_model(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    DATA_PATH = pathlib.Path(DATA_DIR)

    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )

    HIDDEN_DIM = 512
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"

    classifier_encoding_dim = 2 * HIDDEN_DIM if BIDIRECTIONAL else HIDDEN_DIM

    embedder = WordEmbedder(embedding_type="glove_6B_50")

    encoder = LSTM2VecEncoder(
        embedder=embedder,
        hidden_dim=HIDDEN_DIM,
        combine_strategy=COMBINE_STRATEGY,
        bidirectional=BIDIRECTIONAL,
    )

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=classifier_encoding_dim,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )

    inference = ClassificationInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints", "best_model.pt")),
        datasets_manager=data_manager,
    )

    return inference
Exemplo n.º 10
0
def build_sectlabel_bow_model(dirname: str):
    """

    Parameters
    ----------
    dirname : The directory where sciwing stores your outputs for the model

    Returns
    -------


    """
    exp_dirpath = pathlib.Path(dirname)
    DATA_PATH = pathlib.Path(DATA_DIR)

    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )

    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder = BOW_Encoder(embedder=embedder)
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=embedder.get_embedding_dimension(),
        num_classes=data_manager.num_labels["label"],
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )

    infer = ClassificationInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints",
                                                "best_model.pt")),
        datasets_manager=data_manager,
    )
    return infer
Exemplo n.º 11
0
def get_elmo_emb_lc_infer_gensect(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)

    EMBEDDING_DIM = config["EMBEDDING_DIMENSION"]

    NUM_CLASSES = config["NUM_CLASSES"]
    EMBEDDING_DIMENSION = config["EMBEDDING_DIMENSION"]
    LAYER_AGGREGATION = config["LAYER_AGGREGATION"]
    WORD_AGGREGATION = config["WORD_AGGREGATION"]

    embedder = BowElmoEmbedder(emb_dim=EMBEDDING_DIMENSION,
                               layer_aggregation=LAYER_AGGREGATION)
    encoder = BOW_Encoder(
        emb_dim=EMBEDDING_DIMENSION,
        embedder=embedder,
        aggregation_type=WORD_AGGREGATION,
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMBEDDING_DIM,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]
    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")

    dataset = GenericSectDataset(**test_dataset_args)

    parsect_inference = ClassificationInference(model=model,
                                                model_filepath=model_filepath,
                                                dataset=dataset)

    return parsect_inference
Exemplo n.º 12
0
def build_sectlabel_bow_bert(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    DATA_PATH = pathlib.Path(DATA_DIR)

    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )

    embedder = BertEmbedder(
        dropout_value=0.0,
        aggregation_type="average",
        bert_type="bert-base-uncased",
        device=torch.device("cpu"),
    )

    encoder = BOW_Encoder(embedder=embedder, aggregation_type="average")
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=768,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )

    parsect_inference = ClassificationInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints",
                                                "best_model.pt")),
        datasets_manager=data_manager,
    )

    return parsect_inference
Exemplo n.º 13
0
def get_elmo_bilstm_lc_infer(dirname: str):

    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)

    DEVICE = config["DEVICE"]
    EMBEDDING_DIM = config["EMBEDDING_DIMENSION"]
    VOCAB_SIZE = config["VOCAB_SIZE"]
    HIDDEN_DIM = config["HIDDEN_DIMENSION"]
    BIDIRECTIONAL = config["BIDIRECTIONAL"]
    COMBINE_STRATEGY = config["COMBINE_STRATEGY"]
    NUM_CLASSES = config["NUM_CLASSES"]
    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]

    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")

    embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)

    elmo_embedder = BowElmoEmbedder(
        layer_aggregation="sum",
        cuda_device_id=-1 if DEVICE == "cpu" else int(
            DEVICE.split("cuda:")[1]),
    )

    vanilla_embedder = VanillaEmbedder(embedding=embedding,
                                       embedding_dim=EMBEDDING_DIM)

    embedders = ConcatEmbedders([vanilla_embedder, elmo_embedder])

    encoder = LSTM2VecEncoder(
        emb_dim=EMBEDDING_DIM + 1024,
        embedder=embedders,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        device=torch.device(DEVICE),
    )

    encoding_dim = (2 * HIDDEN_DIM if BIDIRECTIONAL
                    and COMBINE_STRATEGY == "concat" else HIDDEN_DIM)

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=encoding_dim,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    dataset = SectLabelDataset(**test_dataset_args)

    inference = ClassificationInference(model=model,
                                        model_filepath=model_filepath,
                                        dataset=dataset)
    return inference