Пример #1
0
def build_sectlabel_bow_elmo_model(dirname: str):
    """Assemble the inference client for the bag-of-words ELMo SectLabel model.

    Parameters
    ----------
    dirname : str
        Experiment directory. The model checkpoint is expected at
        ``<dirname>/checkpoints/best_model.pt``.

    Returns
    -------
    ClassificationInference
        Inference client built from the classifier, the checkpoint path and
        the SectLabel datasets manager.
    """
    checkpoint_path = pathlib.Path(dirname) / "checkpoints" / "best_model.pt"
    data_root = pathlib.Path(DATA_DIR)

    # The datasets manager receives plain string paths for each split.
    datasets_manager = TextClassificationDatasetManager(
        train_filename=str(data_root / "sectLabel.train"),
        dev_filename=str(data_root / "sectLabel.dev"),
        test_filename=str(data_root / "sectLabel.test"),
    )

    elmo_embedder = BowElmoEmbedder(layer_aggregation="last")
    bow_encoder = BOW_Encoder(aggregation_type="sum", embedder=elmo_embedder)

    # The number of output classes is read from the "label" namespace of the
    # datasets manager rather than being hard-coded.
    label_count = datasets_manager.num_labels["label"]
    classifier = SimpleClassifier(
        encoder=bow_encoder,
        encoding_dim=1024,
        num_classes=label_count,
        classification_layer_bias=True,
        datasets_manager=datasets_manager,
    )

    return ClassificationInference(
        model=classifier,
        model_filepath=str(checkpoint_path),
        datasets_manager=datasets_manager,
    )
Пример #2
0
 def _get_data(self):
     """Build the datasets manager for the scicite train/dev/test files.

     Returns
     -------
     TextClassificationDatasetManager
         Manager constructed from the three split files located under
         ``self.data_dir``.
     """
     data_root = self.data_dir
     train_path = data_root.joinpath("scicite.train")
     dev_path = data_root.joinpath("scicite.dev")
     test_path = data_root.joinpath("scicite.test")

     return TextClassificationDatasetManager(
         train_filename=train_path,
         dev_filename=dev_path,
         test_filename=test_path,
     )
Пример #3
0
    def build_dataset(self):
        """Create the datasets manager for the genericSect splits.

        Returns
        -------
        TextClassificationDatasetManager
            Manager built from the ``genericSect.{train,dev,test}`` files
            under ``DATA_DIR``.
        """
        return TextClassificationDatasetManager(
            train_filename=DATA_DIR.joinpath("genericSect.train"),
            dev_filename=DATA_DIR.joinpath("genericSect.dev"),
            test_filename=DATA_DIR.joinpath("genericSect.test"),
        )
Пример #4
0
    def _get_data(self):
        """Build the datasets manager for the SectLabel train/dev/test files.

        Returns
        -------
        TextClassificationDatasetManager
            Manager constructed from the three split files located under
            ``self.data_dir``.
        """
        return TextClassificationDatasetManager(
            train_filename=self.data_dir.joinpath("sectLabel.train"),
            dev_filename=self.data_dir.joinpath("sectLabel.dev"),
            test_filename=self.data_dir.joinpath("sectLabel.test"),
        )
Пример #5
0
def build_sectlabel_elmobilstm_model(dirname: str):
    """Assemble the inference client for the ELMo + word-embedding LSTM model.

    Parameters
    ----------
    dirname : str
        Experiment directory. The model checkpoint is expected at
        ``<dirname>/checkpoints/best_model.pt``.

    Returns
    -------
    ClassificationInference
        Inference client built from the classifier, the checkpoint path and
        the SectLabel datasets manager.
    """
    checkpoint_path = pathlib.Path(dirname) / "checkpoints" / "best_model.pt"
    data_root = pathlib.Path(DATA_DIR)

    datasets_manager = TextClassificationDatasetManager(
        train_filename=str(data_root / "sectLabel.train"),
        dev_filename=str(data_root / "sectLabel.dev"),
        test_filename=str(data_root / "sectLabel.test"),
    )

    # Fixed configuration for this model.
    device_name = "cpu"
    embedding_type = "glove_6B_50"
    hidden_size = 512
    is_bidirectional = True
    merge_strategy = "concat"

    # The ELMo embedder takes an integer device id: -1 for "cpu", otherwise
    # the index that follows the "cuda:" prefix.
    if device_name == "cpu":
        cuda_device_id = -1
    else:
        cuda_device_id = int(device_name.split("cuda:")[1])
    elmo_embedder = BowElmoEmbedder(cuda_device_id=cuda_device_id)

    word_embedder = WordEmbedder(embedding_type=embedding_type)

    combined_embedder = ConcatEmbedders([word_embedder, elmo_embedder])

    lstm_encoder = LSTM2VecEncoder(
        embedder=combined_embedder,
        hidden_dim=hidden_size,
        bidirectional=is_bidirectional,
        combine_strategy=merge_strategy,
        device=torch.device(device_name),
    )

    # Only a bidirectional encoder that concatenates both directions doubles
    # the size of the encoding handed to the classifier.
    if is_bidirectional and merge_strategy == "concat":
        encoding_dim = 2 * hidden_size
    else:
        encoding_dim = hidden_size

    classifier = SimpleClassifier(
        encoder=lstm_encoder,
        encoding_dim=encoding_dim,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=datasets_manager,
    )

    return ClassificationInference(
        model=classifier,
        model_filepath=str(checkpoint_path),
        datasets_manager=datasets_manager,
    )
Пример #6
0
def setup_sectlabel_dataset_manager():
    """Create the datasets manager for the SectLabel splits under ``DATA_DIR``.

    Returns
    -------
    TextClassificationDatasetManager
        Manager built from the ``sectLabel.{train,dev,test}`` files.
    """
    root = pathlib.Path(DATA_DIR)
    return TextClassificationDatasetManager(
        train_filename=root / "sectLabel.train",
        dev_filename=root / "sectLabel.dev",
        test_filename=root / "sectLabel.test",
    )
Пример #7
0
    def build_dataset(self):
        """Create the datasets manager for the coda-19 splits.

        Returns
        -------
        TextClassificationDatasetManager
            Manager built from the coda-19 train/dev/test files.
        """
        # NOTE: the split files are addressed with hard-coded paths that are
        # relative to the current working directory (they start with "./").
        return TextClassificationDatasetManager(
            train_filename="./backend/abstract_tagging/sciwing/coda19_classification_elmo_slower/data/coda-19.train",
            dev_filename="./backend/abstract_tagging/sciwing/coda19_classification_elmo_slower/data/coda-19.dev",
            test_filename="./backend/abstract_tagging/sciwing/coda19_classification_elmo_slower/data/coda-19.test",
        )
Пример #8
0
def clf_dataset_manager(tmpdir_factory, request):
    """Build a datasets manager over three tiny temporary split files.

    Each split file holds two ``text###label`` lines and lives in its own
    temporary directory created through ``tmpdir_factory``.

    Parameters
    ----------
    tmpdir_factory
        Factory used to create the temporary directories.
    request
        Not used inside this function.

    Returns
    -------
    TextClassificationDatasetManager
        Manager pointing at the generated train/dev/test files.
    """

    def _write_split(dirname, filename, contents):
        # Create a fresh temp directory, write the file and return its path.
        target = tmpdir_factory.mktemp(dirname).join(filename)
        target.write(contents)
        return str(target)

    train_path = _write_split(
        "train_data", "train_file.txt", "train_line1###label1\ntrain_line2###label2"
    )
    dev_path = _write_split(
        "dev_data", "dev_file.txt", "dev_line1###label1\ndev_line2###label2"
    )
    # NOTE(review): the test split is written with the same "dev_line" text
    # as the dev split - confirm whether that is intentional.
    test_path = _write_split(
        "test_data", "test_file.txt", "dev_line1###label1\ndev_line2###label2"
    )

    return TextClassificationDatasetManager(
        train_filename=train_path,
        dev_filename=dev_path,
        test_filename=test_path,
    )
Пример #9
0
def build_sectlabel_bilstm_model(dirname: str):
    """Build the inference client for the word-embedding + LSTM SectLabel model.

    Parameters
    ----------
    dirname : str
        Experiment directory. The model checkpoint is expected at
        ``<dirname>/checkpoints/best_model.pt``.

    Returns
    -------
    ClassificationInference
        Inference client built from the classifier, the checkpoint path and
        the SectLabel datasets manager.
    """
    exp_dirpath = pathlib.Path(dirname)
    DATA_PATH = pathlib.Path(DATA_DIR)

    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )

    HIDDEN_DIM = 512
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"

    # The encoding is twice the hidden size only when the encoder is
    # bidirectional AND the two directions are concatenated. Checking the
    # combine strategy as well keeps this correct if COMBINE_STRATEGY is
    # ever changed (assumes non-concat strategies keep the hidden size -
    # confirm against LSTM2VecEncoder).
    classifier_encoding_dim = (
        2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIM
    )

    embedder = WordEmbedder(embedding_type="glove_6B_50")

    encoder = LSTM2VecEncoder(
        embedder=embedder,
        hidden_dim=HIDDEN_DIM,
        combine_strategy=COMBINE_STRATEGY,
        bidirectional=BIDIRECTIONAL,
    )

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=classifier_encoding_dim,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )

    inference = ClassificationInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints", "best_model.pt")),
        datasets_manager=data_manager,
    )

    return inference
Пример #10
0
def build_sectlabel_bow_model(dirname: str):
    """Assemble the inference client for the bag-of-words SectLabel model.

    Parameters
    ----------
    dirname : str
        The directory where sciwing stores your outputs for the model. The
        checkpoint is expected at ``<dirname>/checkpoints/best_model.pt``.

    Returns
    -------
    ClassificationInference
        Inference client built from the classifier, the checkpoint path and
        the SectLabel datasets manager.
    """
    checkpoint_path = pathlib.Path(dirname) / "checkpoints" / "best_model.pt"
    data_root = pathlib.Path(DATA_DIR)

    datasets_manager = TextClassificationDatasetManager(
        train_filename=str(data_root / "sectLabel.train"),
        dev_filename=str(data_root / "sectLabel.dev"),
        test_filename=str(data_root / "sectLabel.test"),
    )

    word_embedder = WordEmbedder(embedding_type="glove_6B_50")
    bow_encoder = BOW_Encoder(embedder=word_embedder)

    # Both the encoding size and the class count are read from the objects
    # built above instead of being hard-coded.
    classifier = SimpleClassifier(
        encoder=bow_encoder,
        encoding_dim=word_embedder.get_embedding_dimension(),
        num_classes=datasets_manager.num_labels["label"],
        classification_layer_bias=True,
        datasets_manager=datasets_manager,
    )

    return ClassificationInference(
        model=classifier,
        model_filepath=str(checkpoint_path),
        datasets_manager=datasets_manager,
    )
Пример #11
0
    def _get_data(self):
        """Resolve the scicite split files and build the datasets manager.

        Each split is passed through ``cached_path`` together with its URL
        (presumably fetching the file when it is not available locally -
        confirm against ``cached_path``).

        Returns
        -------
        TextClassificationDatasetManager
            Manager built from the values returned by ``cached_path``.
        """

        def _resolve(filename, url):
            # Same call shape for every split; archives are never unzipped.
            return cached_path(
                path=self.data_dir.joinpath(filename),
                url=url,
                unzip=False,
            )

        # Splits are resolved in train, dev, test order.
        train_path = _resolve("scicite.train", self.train_data_url)
        dev_path = _resolve("scicite.dev", self.dev_data_url)
        test_path = _resolve("scicite.test", self.test_data_url)

        return TextClassificationDatasetManager(
            train_filename=train_path,
            dev_filename=dev_path,
            test_filename=test_path,
        )
Пример #12
0
def build_sectlabel_bow_bert(dirname: str):
    """Assemble the inference client for the bag-of-words BERT SectLabel model.

    Parameters
    ----------
    dirname : str
        Experiment directory. The model checkpoint is expected at
        ``<dirname>/checkpoints/best_model.pt``.

    Returns
    -------
    ClassificationInference
        Inference client built from the classifier, the checkpoint path and
        the SectLabel datasets manager.
    """
    checkpoint_path = pathlib.Path(dirname) / "checkpoints" / "best_model.pt"
    data_root = pathlib.Path(DATA_DIR)

    datasets_manager = TextClassificationDatasetManager(
        train_filename=str(data_root / "sectLabel.train"),
        dev_filename=str(data_root / "sectLabel.dev"),
        test_filename=str(data_root / "sectLabel.test"),
    )

    # The embedder is created on the CPU device with dropout disabled (0.0).
    bert_embedder = BertEmbedder(
        dropout_value=0.0,
        aggregation_type="average",
        bert_type="bert-base-uncased",
        device=torch.device("cpu"),
    )
    bow_encoder = BOW_Encoder(embedder=bert_embedder, aggregation_type="average")

    classifier = SimpleClassifier(
        encoder=bow_encoder,
        encoding_dim=768,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=datasets_manager,
    )

    return ClassificationInference(
        model=classifier,
        model_filepath=str(checkpoint_path),
        datasets_manager=datasets_manager,
    )
Пример #13
0
    def _get_data(self):
        """Resolve the SectLabel split files and build the datasets manager.

        Each split is passed through ``cached_path`` together with its URL
        (presumably fetching the file when it is not available locally -
        confirm against ``cached_path``).

        Returns
        -------
        TextClassificationDatasetManager
            Manager built from the values returned by ``cached_path``.
        """
        # Splits are resolved in train, dev, test order; nothing is unzipped.
        train_path = cached_path(
            path=self.data_dir.joinpath("sectLabel.train"),
            url=self.train_data_url,
            unzip=False,
        )
        dev_path = cached_path(
            path=self.data_dir.joinpath("sectLabel.dev"),
            url=self.dev_data_url,
            unzip=False,
        )
        test_path = cached_path(
            path=self.data_dir.joinpath("sectLabel.test"),
            url=self.test_data_url,
            unzip=False,
        )

        return TextClassificationDatasetManager(
            train_filename=train_path,
            dev_filename=dev_path,
            test_filename=test_path,
        )
Пример #14
0
    parser.add_argument("--vocab_store_location",
                        help="File in which the vocab is stored")
    parser.add_argument("--sample_proportion",
                        help="Sample proportion for debugging",
                        type=float)

    args = parser.parse_args()

    DATA_PATH = pathlib.Path(DATA_DIR)
    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )

    # BowElmoEmbedder embeds sentences using ELMO
    embedder = BowElmoEmbedder(layer_aggregation=args.layer_aggregation,
                               device=args.device)

    encoder = BOW_Encoder(embedder=embedder,
                          aggregation_type=args.word_aggregation,
                          device=args.device)

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=1024,
        num_classes=data_manager.num_labels["label"],
Пример #15
0
        "Directory where the checkpoints during model training are stored.",
    )
    parser.add_argument("--sample_proportion",
                        help="Sample data size",
                        type=float)

    args = parser.parse_args()

    DATA_DIR = pathlib.Path(DATA_DIR)
    train_filename = DATA_DIR.joinpath("genericSect.train")
    dev_filename = DATA_DIR.joinpath("genericSect.dev")
    test_filename = DATA_DIR.joinpath("genericSect.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=test_filename,
    )

    embedder = BowElmoEmbedder(layer_aggregation=args.layer_aggregation,
                               device=args.device)

    encoder = BOW_Encoder(aggregation_type=args.word_aggregation,
                          embedder=embedder,
                          device=args.device)

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=1024,
        num_classes=12,
        classification_layer_bias=True,