Example #1: a pytest fixture that trains a BOW + GloVe SimpleClassifier for one epoch and returns a ClassificationInference client restored from the best checkpoint
def setup_sectlabel_bow_glove_infer(request, clf_datasets_manager,
                                    tmpdir_factory):
    track_for_best = request.param
    sample_proportion = 0.5
    datasets_manager = clf_datasets_manager
    word_embedder = WordEmbedder(embedding_type="glove_6B_50")
    bow_encoder = BOW_Encoder(embedder=word_embedder)
    classifier = SimpleClassifier(
        encoder=bow_encoder,
        encoding_dim=word_embedder.get_embedding_dimension(),
        num_classes=2,
        classification_layer_bias=True,
        datasets_manager=datasets_manager,
    )
    train_metric = PrecisionRecallFMeasure(datasets_manager=datasets_manager)
    validation_metric = PrecisionRecallFMeasure(
        datasets_manager=datasets_manager)
    test_metric = PrecisionRecallFMeasure(datasets_manager=datasets_manager)

    optimizer = torch.optim.Adam(params=classifier.parameters())
    batch_size = 1
    save_dir = tmpdir_factory.mktemp("experiment_1")
    num_epochs = 1
    save_every = 1
    log_train_metrics_every = 10

    engine = Engine(
        model=classifier,
        datasets_manager=datasets_manager,
        optimizer=optimizer,
        batch_size=batch_size,
        save_dir=save_dir,
        num_epochs=num_epochs,
        save_every=save_every,
        log_train_metrics_every=log_train_metrics_every,
        train_metric=train_metric,
        validation_metric=validation_metric,
        test_metric=test_metric,
        track_for_best=track_for_best,
        sample_proportion=sample_proportion,
    )

    engine.run()
    model_filepath = pathlib.Path(save_dir).joinpath("best_model.pt")
    infer = ClassificationInference(
        model=classifier,
        model_filepath=str(model_filepath),
        datasets_manager=datasets_manager,
    )
    return infer
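
A minimal sketch of a test consuming this fixture. It assumes only that the fixture is registered with @pytest.fixture(params=...) so that request.param supplies track_for_best; no methods on the returned client are exercised here.

def test_sectlabel_bow_glove_infer(setup_sectlabel_bow_glove_infer):
    # The fixture trains for one epoch and returns a ClassificationInference
    # client restored from best_model.pt.
    infer = setup_sectlabel_bow_glove_infer
    assert infer is not None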
Example #2 (truncated excerpt): training a SimpleClassifier with the dataset-based Engine signature, with TensorBoard and wandb logging
        encoding_dim=encoding_dim,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
    metric = PrecisionRecallFMeasure(
        idx2labelname_mapping=train_dataset.idx2classname)

    engine = Engine(
        model=model,
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
        test_dataset=test_dataset,
        optimizer=optimizer,
        batch_size=BATCH_SIZE,
        save_dir=MODEL_SAVE_DIR,
        num_epochs=NUM_EPOCHS,
        save_every=SAVE_EVERY,
        log_train_metrics_every=LOG_TRAIN_METRICS_EVERY,
        tensorboard_logdir=TENSORBOARD_LOGDIR,
        device=torch.device(DEVICE),
        metric=metric,
        use_wandb=True,
        experiment_name=EXP_NAME,
        experiment_hyperparams=config,
        track_for_best="macro_fscore",
    )

    engine.run()
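
This excerpt is truncated at the top: the dangling encoding_dim, num_classes and classification_layer_bias arguments appear to be the tail of a SimpleClassifier construction like the one in Example #1. Note that it uses an older Engine signature, passing train_dataset, validation_dataset and test_dataset plus a single metric, where Examples #1 and #3 pass a datasets_manager with separate train, validation and test metrics.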
Example #3 (truncated excerpt): a command-line training script wiring argparse options into the Engine, with wandb experiment tracking
        datasets_manager=data_manager,
        device=args.device,
    )

    optimizer = optim.Adam(params=model.parameters(), lr=args.lr)
    train_metric = PrecisionRecallFMeasure(datasets_manager=data_manager)
    dev_metric = PrecisionRecallFMeasure(datasets_manager=data_manager)
    test_metric = PrecisionRecallFMeasure(datasets_manager=data_manager)

    engine = Engine(
        model=model,
        datasets_manager=data_manager,
        optimizer=optimizer,
        batch_size=args.bs,
        save_dir=args.model_save_dir,
        num_epochs=args.epochs,
        save_every=args.save_every,
        log_train_metrics_every=args.log_train_metrics_every,
        device=args.device,
        train_metric=train_metric,
        validation_metric=dev_metric,
        test_metric=test_metric,
        use_wandb=True,
        experiment_name=args.exp_name,
        experiment_hyperparams=vars(args),
        track_for_best="macro_fscore",
        sample_proportion=args.sample_proportion,
    )

    engine.run()
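
A hedged sketch of the argparse setup this script presupposes. The flag names mirror the attributes read from args above; the types and defaults are assumptions.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--exp_name", type=str, required=True)
parser.add_argument("--model_save_dir", type=str, required=True)
parser.add_argument("--device", type=str, default="cpu")
parser.add_argument("--bs", type=int, default=32)       # batch size
parser.add_argument("--lr", type=float, default=1e-3)   # learning rate
parser.add_argument("--epochs", type=int, default=10)
parser.add_argument("--save_every", type=int, default=1)
parser.add_argument("--log_train_metrics_every", type=int, default=10)
parser.add_argument("--sample_proportion", type=float, default=1.0)
args = parser.parse_args()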
Example #4 (truncated excerpt): an Engine configured with a learning-rate scheduler and fixed random seeds
        threshold=1e-3,
    )

    engine = Engine(
        model=model,
        datasets_manager=data_manager,
        optimizer=optimizer,
        batch_size=args.bs,
        save_dir=args.model_save_dir,
        num_epochs=args.epochs,
        save_every=args.save_every,
        log_train_metrics_every=args.log_train_metrics_every,
        track_for_best="fscore",
        device=torch.device(args.device),
        train_metric=train_metric,
        validation_metric=dev_metric,
        test_metric=test_metric,
        use_wandb=True,
        experiment_name=args.exp_name,
        experiment_hyperparams=vars(args),
        sample_proportion=args.sample_proportion,
        lr_scheduler=scheduler,
        seeds={
            "random_seed": 17,
            "numpy_seed": 17,
            "pytorch_seed": 17
        },
    )

    engine.run()
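
The snippet opens mid-constructor: the dangling threshold=1e-3 argument is consistent with the ReduceLROnPlateau scheduler built explicitly in Example #5. A hedged reconstruction, in which every value except threshold is an assumption:

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,  # assumed: the optimizer passed to the Engine
    mode="max",           # assumed: track_for_best="fscore" is maximised
    factor=0.1,           # assumed, mirroring Example #5
    patience=2,           # assumed, mirroring Example #5
    threshold=1e-3,       # the surviving argument from the truncated call
)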
Example #5: setup_engine_once, which builds train/validation/test ParscitDataset splits, a ParscitTagger over an Lstm2SeqEncoder with an optional character encoder, and an Engine with a ReduceLROnPlateau scheduler
    def setup_engine_once(
        config_dict: Dict[str, str],
        experiment_name: str,
        train_data_filepath: pathlib.Path,
        test_data_filepath: pathlib.Path,
    ):
        DEBUG = config_dict["DEBUG"]
        DEBUG_DATASET_PROPORTION = config_dict["DEBUG_DATASET_PROPORTION"]
        BATCH_SIZE = config_dict["BATCH_SIZE"]
        LEARNING_RATE = config_dict["LEARNING_RATE"]
        NUM_EPOCHS = config_dict["NUM_EPOCHS"]
        SAVE_EVERY = config_dict["SAVE_EVERY"]
        LOG_TRAIN_METRICS_EVERY = config_dict["LOG_TRAIN_METRICS_EVERY"]
        EMBEDDING_DIMENSION = config_dict["EMBEDDING_DIMENSION"]
        CHAR_EMBEDDING_DIMENSION = config_dict["CHAR_EMBEDDING_DIMENSION"]
        EMBEDDING_TYPE = config_dict["EMBEDDING_TYPE"]
        MAX_NUM_WORDS = config_dict["MAX_NUM_WORDS"]
        MAX_LENGTH = config_dict["MAX_LENGTH"]
        DEVICE = config_dict["DEVICE"]
        HIDDEN_DIM = config_dict["HIDDEN_DIM"]
        BIDIRECTIONAL = config_dict["BIDIRECTIONAL"]
        COMBINE_STRATEGY = config_dict["COMBINE_STRATEGY"]
        MAX_CHAR_LENGTH = config_dict["MAX_CHAR_LENGTH"]
        USE_CHAR_ENCODER = config_dict["USE_CHAR_ENCODER"]
        CHAR_ENCODER_HIDDEN_DIM = config_dict["CHAR_ENCODER_HIDDEN_DIM"]
        DROPOUT = config_dict["DROPOUT"]

        EXP_NAME = experiment_name
        EXP_DIR_PATH = os.path.join(OUTPUT_DIR, EXP_NAME)

        if not os.path.isdir(EXP_DIR_PATH):
            os.mkdir(EXP_DIR_PATH)

        MODEL_SAVE_DIR = os.path.join(EXP_DIR_PATH, "checkpoints")

        if not os.path.isdir(MODEL_SAVE_DIR):
            os.mkdir(MODEL_SAVE_DIR)

        VOCAB_STORE_LOCATION = os.path.join(EXP_DIR_PATH, "vocab.json")
        CHAR_VOCAB_STORE_LOCATION = os.path.join(EXP_DIR_PATH,
                                                 "char_vocab.json")
        CAPITALIZATION_VOCAB_STORE_LOCATION = os.path.join(
            EXP_DIR_PATH, "capitalization_vocab.json")
        CAPITALIZATION_EMBEDDING_DIMENSION = 10
        TENSORBOARD_LOGDIR = os.path.join(".", "runs", EXP_NAME)

        train_dataset = ParscitDataset(
            filename=str(train_data_filepath),
            dataset_type="train",
            max_num_words=MAX_NUM_WORDS,
            max_instance_length=MAX_LENGTH,
            max_char_length=MAX_CHAR_LENGTH,
            word_vocab_store_location=VOCAB_STORE_LOCATION,
            char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
            captialization_vocab_store_location=CAPITALIZATION_VOCAB_STORE_LOCATION,
            capitalization_emb_dim=CAPITALIZATION_EMBEDDING_DIMENSION,
            debug=DEBUG,
            debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
            word_embedding_type=EMBEDDING_TYPE,
            word_embedding_dimension=EMBEDDING_DIMENSION,
            char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
            word_start_token="<SOS>",
            word_end_token="<EOS>",
            word_pad_token="<PAD>",
            word_unk_token="<UNK>",
            word_add_start_end_token=False,
        )

        validation_dataset = ParscitDataset(
            filename=str(test_data_filepath),
            dataset_type="valid",
            max_num_words=MAX_NUM_WORDS,
            max_instance_length=MAX_LENGTH,
            max_char_length=MAX_CHAR_LENGTH,
            word_vocab_store_location=VOCAB_STORE_LOCATION,
            char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
            captialization_vocab_store_location=CAPITALIZATION_VOCAB_STORE_LOCATION,
            capitalization_emb_dim=CAPITALIZATION_EMBEDDING_DIMENSION,
            debug=DEBUG,
            debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
            word_embedding_type=EMBEDDING_TYPE,
            word_embedding_dimension=EMBEDDING_DIMENSION,
            char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
            word_start_token="<SOS>",
            word_end_token="<EOS>",
            word_pad_token="<PAD>",
            word_unk_token="<UNK>",
            word_add_start_end_token=False,
        )

        test_dataset = ParscitDataset(
            filename=str(test_data_filepath),
            dataset_type="test",
            max_num_words=MAX_NUM_WORDS,
            max_instance_length=MAX_LENGTH,
            max_char_length=MAX_CHAR_LENGTH,
            word_vocab_store_location=VOCAB_STORE_LOCATION,
            char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
            captialization_vocab_store_location=CAPITALIZATION_VOCAB_STORE_LOCATION,
            capitalization_emb_dim=CAPITALIZATION_EMBEDDING_DIMENSION,
            debug=DEBUG,
            debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
            word_embedding_type=EMBEDDING_TYPE,
            word_embedding_dimension=EMBEDDING_DIMENSION,
            char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
            word_start_token="<SOS>",
            word_end_token="<EOS>",
            word_pad_token="<PAD>",
            word_unk_token="<UNK>",
            word_add_start_end_token=False,
        )

        train_dataset.print_stats()
        validation_dataset.print_stats()
        test_dataset.print_stats()

        VOCAB_SIZE = train_dataset.word_vocab.get_vocab_len()
        NUM_CLASSES = train_dataset.get_num_classes()
        embedding = train_dataset.word_vocab.load_embedding()
        embedding = nn.Embedding.from_pretrained(embedding, freeze=False)
        char_embedding = train_dataset.char_vocab.load_embedding()
        char_embedding = nn.Embedding.from_pretrained(char_embedding,
                                                      freeze=False)

        embedder = VanillaEmbedder(embedding=embedding,
                                   embedding_dim=EMBEDDING_DIMENSION)

        if USE_CHAR_ENCODER:
            char_embedder = VanillaEmbedder(
                embedding=char_embedding,
                embedding_dim=CHAR_EMBEDDING_DIMENSION)
            char_encoder = CharLSTMEncoder(
                char_emb_dim=CHAR_EMBEDDING_DIMENSION,
                char_embedder=char_embedder,
                bidirectional=True,
                hidden_dim=CHAR_ENCODER_HIDDEN_DIM,
                combine_strategy="concat",
                device=torch.device(DEVICE),
            )
            embedder = ConcatEmbedders([embedder, char_encoder])
            # the char encoder is bidirectional with combine_strategy="concat",
            # so it contributes twice its hidden dimension
            EMBEDDING_DIMENSION += 2 * CHAR_ENCODER_HIDDEN_DIM

        lstm2seqencoder = Lstm2SeqEncoder(
            emb_dim=EMBEDDING_DIMENSION,
            embedder=embedder,
            dropout_value=DROPOUT,
            hidden_dim=HIDDEN_DIM,
            bidirectional=BIDIRECTIONAL,
            combine_strategy=COMBINE_STRATEGY,
            rnn_bias=True,
            device=torch.device(DEVICE),
        )
        model = ParscitTagger(
            rnn2seqencoder=lstm2seqencoder,
            num_classes=NUM_CLASSES,
            hid_dim=2 * HIDDEN_DIM
            if BIDIRECTIONAL and COMBINE_STRATEGY == "concat" else HIDDEN_DIM,
        )

        optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
        metric = TokenClassificationAccuracy(
            idx2labelname_mapping=train_dataset.idx2classname)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer=optimizer, mode="max", factor=0.1, patience=2)

        engine = Engine(
            model=model,
            train_dataset=train_dataset,
            validation_dataset=validation_dataset,
            test_dataset=test_dataset,
            optimizer=optimizer,
            batch_size=BATCH_SIZE,
            save_dir=MODEL_SAVE_DIR,
            num_epochs=NUM_EPOCHS,
            save_every=SAVE_EVERY,
            log_train_metrics_every=LOG_TRAIN_METRICS_EVERY,
            tensorboard_logdir=TENSORBOARD_LOGDIR,
            device=torch.device(DEVICE),
            metric=metric,
            track_for_best="macro_fscore",
            lr_scheduler=scheduler,
        )

        config_dict["VOCAB_STORE_LOCATION"] = VOCAB_STORE_LOCATION
        config_dict["CHAR_VOCAB_STORE_LOCATION"] = CHAR_VOCAB_STORE_LOCATION
        config_dict["MODEL_SAVE_DIR"] = MODEL_SAVE_DIR
        config_dict["VOCAB_SIZE"] = VOCAB_SIZE
        config_dict["NUM_CLASSES"] = NUM_CLASSES

        with open(os.path.join(EXP_DIR_PATH, "config.json"), "w") as fp:
            json.dump(config_dict, fp)

        return engine
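
A hedged sketch of a config_dict that would satisfy setup_engine_once. The keys are exactly those read at the top of the function; all values, the file paths, and the OUTPUT_DIR global the function relies on are assumptions (the embedding type mirrors Example #1).

import pathlib

config = {
    "DEBUG": False,
    "DEBUG_DATASET_PROPORTION": 0.1,
    "BATCH_SIZE": 32,
    "LEARNING_RATE": 1e-3,
    "NUM_EPOCHS": 10,
    "SAVE_EVERY": 1,
    "LOG_TRAIN_METRICS_EVERY": 10,
    "EMBEDDING_DIMENSION": 50,
    "CHAR_EMBEDDING_DIMENSION": 25,
    "EMBEDDING_TYPE": "glove_6B_50",
    "MAX_NUM_WORDS": 10000,
    "MAX_LENGTH": 50,
    "DEVICE": "cpu",
    "HIDDEN_DIM": 256,
    "BIDIRECTIONAL": True,
    "COMBINE_STRATEGY": "concat",
    "MAX_CHAR_LENGTH": 25,
    "USE_CHAR_ENCODER": True,
    "CHAR_ENCODER_HIDDEN_DIM": 100,
    "DROPOUT": 0.5,
}

engine = setup_engine_once(
    config_dict=config,
    experiment_name="parscit_debug",                  # assumed name
    train_data_filepath=pathlib.Path("train.conll"),  # assumed path
    test_data_filepath=pathlib.Path("test.conll"),    # assumed path
)
engine.run()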
Example #6: a pytest fixture that trains an RnnSeqCrfTagger for one epoch and returns a SequenceLabellingInference client
def setup_parscit_inference(seq_dataset_manager, tmpdir_factory):
    HIDDEN_DIM = 100
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"
    dataset_manager = seq_dataset_manager

    embedder = WordEmbedder(embedding_type="glove_6B_50")

    char_embedder = CharEmbedder(
        char_embedding_dimension=10,
        hidden_dimension=20,
        datasets_manager=dataset_manager,
    )
    embedder = ConcatEmbedders([embedder, char_embedder])

    encoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
        add_projection_layer=False,
    )

    tagger = RnnSeqCrfTagger(
        rnn2seqencoder=encoder,
        encoding_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIM,
        datasets_manager=dataset_manager,
    )

    train_metric = TokenClassificationAccuracy(datasets_manager=dataset_manager)
    dev_metric = TokenClassificationAccuracy(datasets_manager=dataset_manager)
    test_metric = TokenClassificationAccuracy(datasets_manager=dataset_manager)

    optimizer = torch.optim.Adam(params=tagger.parameters())
    batch_size = 1
    save_dir = tmpdir_factory.mktemp("experiment_1")
    num_epochs = 1
    save_every = 1
    log_train_metrics_every = 10

    engine = Engine(
        model=tagger,
        datasets_manager=dataset_manager,
        optimizer=optimizer,
        batch_size=batch_size,
        save_dir=save_dir,
        num_epochs=num_epochs,
        save_every=save_every,
        log_train_metrics_every=log_train_metrics_every,
        train_metric=train_metric,
        validation_metric=dev_metric,
        test_metric=test_metric,
        track_for_best="macro_fscore",
    )

    engine.run()
    model_filepath = pathlib.Path(save_dir).joinpath("best_model.pt")

    inference_client = SequenceLabellingInference(
        model=tagger, model_filepath=model_filepath, datasets_manager=dataset_manager
    )

    return inference_client
Example #7: a pytest fixture that builds SectLabelDataset splits and a SimpleClassifier, returning an unrun Engine together with sample tokens, labels and options
def setup_engine_test_with_simple_classifier(request, tmpdir_factory):
    MAX_NUM_WORDS = 1000
    MAX_LENGTH = 50
    vocab_store_location = tmpdir_factory.mktemp("tempdir").join("vocab.json")
    DEBUG = True
    BATCH_SIZE = 1
    NUM_TOKENS = 3
    EMB_DIM = 300

    train_dataset = SectLabelDataset(
        filename=SECT_LABEL_FILE,
        dataset_type="train",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        word_vocab_store_location=vocab_store_location,
        debug=DEBUG,
        word_embedding_type="random",
        word_embedding_dimension=EMB_DIM,
    )

    validation_dataset = SectLabelDataset(
        filename=SECT_LABEL_FILE,
        dataset_type="valid",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        word_vocab_store_location=vocab_store_location,
        debug=DEBUG,
        word_embedding_type="random",
        word_embedding_dimension=EMB_DIM,
    )

    test_dataset = SectLabelDataset(
        filename=SECT_LABEL_FILE,
        dataset_type="test",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        word_vocab_store_location=vocab_store_location,
        debug=DEBUG,
        word_embedding_type="random",
        word_embedding_dimension=EMB_DIM,
    )

    VOCAB_SIZE = MAX_NUM_WORDS + len(train_dataset.word_vocab.special_vocab)
    NUM_CLASSES = train_dataset.get_num_classes()
    NUM_EPOCHS = 1
    embedding = Embedding.from_pretrained(torch.zeros([VOCAB_SIZE, EMB_DIM]))
    labels = torch.LongTensor([1])
    metric = PrecisionRecallFMeasure(
        idx2labelname_mapping=train_dataset.idx2classname)
    embedder = VanillaEmbedder(embedding_dim=EMB_DIM, embedding=embedding)
    encoder = BOW_Encoder(emb_dim=EMB_DIM,
                          embedder=embedder,
                          dropout_value=0,
                          aggregation_type="sum")
    tokens = np.random.randint(0,
                               VOCAB_SIZE - 1,
                               size=(BATCH_SIZE, NUM_TOKENS))
    tokens = torch.LongTensor(tokens)
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMB_DIM,
        num_classes=NUM_CLASSES,
        classification_layer_bias=False,
    )

    optimizer = optim.SGD(model.parameters(), lr=0.01)
    engine = Engine(
        model,
        train_dataset,
        validation_dataset,
        test_dataset,
        optimizer=optimizer,
        batch_size=BATCH_SIZE,
        save_dir=tmpdir_factory.mktemp("model_save"),
        num_epochs=NUM_EPOCHS,
        save_every=1,
        log_train_metrics_every=10,
        metric=metric,
        track_for_best=request.param,
    )

    options = {
        "MAX_NUM_WORDS": MAX_NUM_WORDS,
        "MAX_LENGTH": MAX_LENGTH,
        "BATCH_SIZE": BATCH_SIZE,
        "NUM_TOKENS": NUM_TOKENS,
        "EMB_DIM": EMB_DIM,
        "VOCAB_SIZE": VOCAB_SIZE,
        "NUM_CLASSES": NUM_CLASSES,
        "NUM_EPOCHS": NUM_EPOCHS,
    }

    return engine, tokens, labels, options
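
A minimal sketch of a test consuming this fixture; it assumes the fixture is registered with @pytest.fixture(params=...) so that request.param supplies track_for_best.

def test_engine_with_simple_classifier(setup_engine_test_with_simple_classifier):
    engine, tokens, labels, options = setup_engine_test_with_simple_classifier
    # tokens is a (BATCH_SIZE, NUM_TOKENS) LongTensor of random word ids,
    # labels is a single-element LongTensor, and options echoes the
    # hyperparameters assembled above.
    engine.run()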