def setup_sectlabel_bow_glove_infer(request, clf_datasets_manager, tmpdir_factory):
    # Train a bag-of-words classifier over GloVe word embeddings for a single
    # epoch, then build a ClassificationInference client from the best checkpoint.
    track_for_best = request.param
    sample_proportion = 0.5
    datasets_manager = clf_datasets_manager

    word_embedder = WordEmbedder(embedding_type="glove_6B_50")
    bow_encoder = BOW_Encoder(embedder=word_embedder)
    classifier = SimpleClassifier(
        encoder=bow_encoder,
        encoding_dim=word_embedder.get_embedding_dimension(),
        num_classes=2,
        classification_layer_bias=True,
        datasets_manager=datasets_manager,
    )

    train_metric = PrecisionRecallFMeasure(datasets_manager=datasets_manager)
    validation_metric = PrecisionRecallFMeasure(datasets_manager=datasets_manager)
    test_metric = PrecisionRecallFMeasure(datasets_manager=datasets_manager)

    optimizer = torch.optim.Adam(params=classifier.parameters())
    batch_size = 1
    save_dir = tmpdir_factory.mktemp("experiment_1")
    num_epochs = 1
    save_every = 1
    log_train_metrics_every = 10

    engine = Engine(
        model=classifier,
        datasets_manager=datasets_manager,
        optimizer=optimizer,
        batch_size=batch_size,
        save_dir=save_dir,
        num_epochs=num_epochs,
        save_every=save_every,
        log_train_metrics_every=log_train_metrics_every,
        train_metric=train_metric,
        validation_metric=validation_metric,
        test_metric=test_metric,
        track_for_best=track_for_best,
        sample_proportion=sample_proportion,
    )
    engine.run()

    # Load the best checkpoint written by the engine for inference.
    model_filepath = pathlib.Path(save_dir).joinpath("best_model.pt")
    infer = ClassificationInference(
        model=classifier,
        model_filepath=str(model_filepath),
        datasets_manager=datasets_manager,
    )
    return infer
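# A minimal usage sketch for the fixture above, assuming it is registered with
# @pytest.fixture and parametrized indirectly through request.param. The metric
# name passed here and the run_inference()/report_metrics() calls are assumptions
# about the ClassificationInference API, not facts taken from this file.
import pytest


@pytest.mark.parametrize(
    "setup_sectlabel_bow_glove_infer", ["macro_fscore"], indirect=True
)
def test_sectlabel_bow_glove_infer_runs(setup_sectlabel_bow_glove_infer):
    infer = setup_sectlabel_bow_glove_infer
    infer.run_inference()   # assumed: run the trained model over the test split
    infer.report_metrics()  # assumed: report precision/recall/F-measure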
    encoding_dim=encoding_dim,
    num_classes=NUM_CLASSES,
    classification_layer_bias=True,
)
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
metric = PrecisionRecallFMeasure(idx2labelname_mapping=train_dataset.idx2classname)
engine = Engine(
    model=model,
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
    test_dataset=test_dataset,
    optimizer=optimizer,
    batch_size=BATCH_SIZE,
    save_dir=MODEL_SAVE_DIR,
    num_epochs=NUM_EPOCHS,
    save_every=SAVE_EVERY,
    log_train_metrics_every=LOG_TRAIN_METRICS_EVERY,
    tensorboard_logdir=TENSORBOARD_LOGDIR,
    device=torch.device(DEVICE),
    metric=metric,
    use_wandb=True,
    experiment_name=EXP_NAME,
    experiment_hyperparams=config,
    track_for_best="macro_fscore",
)
engine.run()
    datasets_manager=data_manager,
    device=args.device,
)
optimizer = optim.Adam(params=model.parameters(), lr=args.lr)
train_metric = PrecisionRecallFMeasure(datasets_manager=data_manager)
dev_metric = PrecisionRecallFMeasure(datasets_manager=data_manager)
test_metric = PrecisionRecallFMeasure(datasets_manager=data_manager)
engine = Engine(
    model=model,
    datasets_manager=data_manager,
    optimizer=optimizer,
    batch_size=args.bs,
    save_dir=args.model_save_dir,
    num_epochs=args.epochs,
    save_every=args.save_every,
    log_train_metrics_every=args.log_train_metrics_every,
    device=args.device,
    train_metric=train_metric,
    validation_metric=dev_metric,
    test_metric=test_metric,
    use_wandb=True,
    experiment_name=args.exp_name,
    experiment_hyperparams=vars(args),
    track_for_best="macro_fscore",
    sample_proportion=args.sample_proportion,
)
engine.run()
    threshold=1e-3,
)
engine = Engine(
    model=model,
    datasets_manager=data_manager,
    optimizer=optimizer,
    batch_size=args.bs,
    save_dir=args.model_save_dir,
    num_epochs=args.epochs,
    save_every=args.save_every,
    log_train_metrics_every=args.log_train_metrics_every,
    track_for_best="fscore",
    device=torch.device(args.device),
    train_metric=train_metric,
    validation_metric=dev_metric,
    test_metric=test_metric,
    use_wandb=True,
    experiment_name=args.exp_name,
    experiment_hyperparams=vars(args),
    sample_proportion=args.sample_proportion,
    lr_scheduler=scheduler,
    seeds={"random_seed": 17, "numpy_seed": 17, "pytorch_seed": 17},
)
engine.run()
def setup_engine_once(
    config_dict: Dict[str, str],
    experiment_name: str,
    train_data_filepath: pathlib.Path,
    test_data_filepath: pathlib.Path,
):
    # Read hyperparameters from the config dictionary.
    DEBUG = config_dict["DEBUG"]
    DEBUG_DATASET_PROPORTION = config_dict["DEBUG_DATASET_PROPORTION"]
    BATCH_SIZE = config_dict["BATCH_SIZE"]
    LEARNING_RATE = config_dict["LEARNING_RATE"]
    NUM_EPOCHS = config_dict["NUM_EPOCHS"]
    SAVE_EVERY = config_dict["SAVE_EVERY"]
    LOG_TRAIN_METRICS_EVERY = config_dict["LOG_TRAIN_METRICS_EVERY"]
    EMBEDDING_DIMENSION = config_dict["EMBEDDING_DIMENSION"]
    CHAR_EMBEDDING_DIMENSION = config_dict["CHAR_EMBEDDING_DIMENSION"]
    EMBEDDING_TYPE = config_dict["EMBEDDING_TYPE"]
    MAX_NUM_WORDS = config_dict["MAX_NUM_WORDS"]
    MAX_LENGTH = config_dict["MAX_LENGTH"]
    DEVICE = config_dict["DEVICE"]
    HIDDEN_DIM = config_dict["HIDDEN_DIM"]
    BIDIRECTIONAL = config_dict["BIDIRECTIONAL"]
    COMBINE_STRATEGY = config_dict["COMBINE_STRATEGY"]
    MAX_CHAR_LENGTH = config_dict["MAX_CHAR_LENGTH"]
    USE_CHAR_ENCODER = config_dict["USE_CHAR_ENCODER"]
    CHAR_ENCODER_HIDDEN_DIM = config_dict["CHAR_ENCODER_HIDDEN_DIM"]
    DROPOUT = config_dict["DROPOUT"]
    EXP_NAME = experiment_name

    # Create the experiment and checkpoint directories.
    EXP_DIR_PATH = os.path.join(OUTPUT_DIR, EXP_NAME)
    if not os.path.isdir(EXP_DIR_PATH):
        os.mkdir(EXP_DIR_PATH)
    MODEL_SAVE_DIR = os.path.join(EXP_DIR_PATH, "checkpoints")
    if not os.path.isdir(MODEL_SAVE_DIR):
        os.mkdir(MODEL_SAVE_DIR)

    VOCAB_STORE_LOCATION = os.path.join(EXP_DIR_PATH, "vocab.json")
    CHAR_VOCAB_STORE_LOCATION = os.path.join(EXP_DIR_PATH, "char_vocab.json")
    CAPITALIZATION_VOCAB_STORE_LOCATION = os.path.join(
        EXP_DIR_PATH, "capitalization_vocab.json"
    )
    CAPITALIZATION_EMBEDDING_DIMENSION = 10
    TENSORBOARD_LOGDIR = os.path.join(".", "runs", EXP_NAME)

    # Train/validation/test datasets share the same vocab store locations.
    train_dataset = ParscitDataset(
        filename=str(train_data_filepath),
        dataset_type="train",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        max_char_length=MAX_CHAR_LENGTH,
        word_vocab_store_location=VOCAB_STORE_LOCATION,
        char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
        captialization_vocab_store_location=CAPITALIZATION_VOCAB_STORE_LOCATION,
        capitalization_emb_dim=CAPITALIZATION_EMBEDDING_DIMENSION,
        debug=DEBUG,
        debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
        word_embedding_type=EMBEDDING_TYPE,
        word_embedding_dimension=EMBEDDING_DIMENSION,
        char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
        word_start_token="<SOS>",
        word_end_token="<EOS>",
        word_pad_token="<PAD>",
        word_unk_token="<UNK>",
        word_add_start_end_token=False,
    )
    validation_dataset = ParscitDataset(
        filename=str(test_data_filepath),
        dataset_type="valid",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        max_char_length=MAX_CHAR_LENGTH,
        word_vocab_store_location=VOCAB_STORE_LOCATION,
        char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
        captialization_vocab_store_location=CAPITALIZATION_VOCAB_STORE_LOCATION,
        capitalization_emb_dim=CAPITALIZATION_EMBEDDING_DIMENSION,
        debug=DEBUG,
        debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
        word_embedding_type=EMBEDDING_TYPE,
        word_embedding_dimension=EMBEDDING_DIMENSION,
        char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
        word_start_token="<SOS>",
        word_end_token="<EOS>",
        word_pad_token="<PAD>",
        word_unk_token="<UNK>",
        word_add_start_end_token=False,
    )
    test_dataset = ParscitDataset(
        filename=str(test_data_filepath),
        dataset_type="test",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        max_char_length=MAX_CHAR_LENGTH,
        word_vocab_store_location=VOCAB_STORE_LOCATION,
        char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
        captialization_vocab_store_location=CAPITALIZATION_VOCAB_STORE_LOCATION,
        capitalization_emb_dim=CAPITALIZATION_EMBEDDING_DIMENSION,
        debug=DEBUG,
        debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
        word_embedding_type=EMBEDDING_TYPE,
        word_embedding_dimension=EMBEDDING_DIMENSION,
        char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
        word_start_token="<SOS>",
        word_end_token="<EOS>",
        word_pad_token="<PAD>",
        word_unk_token="<UNK>",
        word_add_start_end_token=False,
    )
    train_dataset.print_stats()
    validation_dataset.print_stats()
    test_dataset.print_stats()

    VOCAB_SIZE = train_dataset.word_vocab.get_vocab_len()
    NUM_CLASSES = train_dataset.get_num_classes()

    # Build word (and optionally character) embedders from the stored vocab.
    embedding = train_dataset.word_vocab.load_embedding()
    embedding = nn.Embedding.from_pretrained(embedding, freeze=False)
    char_embedding = train_dataset.char_vocab.load_embedding()
    char_embedding = nn.Embedding.from_pretrained(char_embedding, freeze=False)
    embedder = VanillaEmbedder(embedding=embedding, embedding_dim=EMBEDDING_DIMENSION)

    if USE_CHAR_ENCODER:
        char_embedder = VanillaEmbedder(
            embedding=char_embedding, embedding_dim=CHAR_EMBEDDING_DIMENSION
        )
        char_encoder = CharLSTMEncoder(
            char_emb_dim=CHAR_EMBEDDING_DIMENSION,
            char_embedder=char_embedder,
            bidirectional=True,
            hidden_dim=CHAR_ENCODER_HIDDEN_DIM,
            combine_strategy="concat",
            device=torch.device(DEVICE),
        )
        embedder = ConcatEmbedders([embedder, char_encoder])
        EMBEDDING_DIMENSION += 2 * CHAR_ENCODER_HIDDEN_DIM

    lstm2seqencoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIMENSION,
        embedder=embedder,
        dropout_value=DROPOUT,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=True,
        device=torch.device(DEVICE),
    )
    model = ParscitTagger(
        rnn2seqencoder=lstm2seqencoder,
        num_classes=NUM_CLASSES,
        hid_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIM,
    )

    optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
    metric = TokenClassificationAccuracy(
        idx2labelname_mapping=train_dataset.idx2classname
    )
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer=optimizer, mode="max", factor=0.1, patience=2
    )
    engine = Engine(
        model=model,
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
        test_dataset=test_dataset,
        optimizer=optimizer,
        batch_size=BATCH_SIZE,
        save_dir=MODEL_SAVE_DIR,
        num_epochs=NUM_EPOCHS,
        save_every=SAVE_EVERY,
        log_train_metrics_every=LOG_TRAIN_METRICS_EVERY,
        tensorboard_logdir=TENSORBOARD_LOGDIR,
        device=torch.device(DEVICE),
        metric=metric,
        track_for_best="macro_fscore",
        lr_scheduler=scheduler,
    )

    # Persist the resolved configuration alongside the experiment.
    config_dict["VOCAB_STORE_LOCATION"] = VOCAB_STORE_LOCATION
    config_dict["CHAR_VOCAB_STORE_LOCATION"] = CHAR_VOCAB_STORE_LOCATION
    config_dict["MODEL_SAVE_DIR"] = MODEL_SAVE_DIR
    config_dict["VOCAB_SIZE"] = VOCAB_SIZE
    config_dict["NUM_CLASSES"] = NUM_CLASSES
    with open(os.path.join(f"{EXP_DIR_PATH}", "config.json"), "w") as fp:
        json.dump(config_dict, fp)

    return engine
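# A hedged driver sketch for setup_engine_once. The config keys mirror the ones
# read inside the function above, but every concrete value, the experiment name,
# and the data file paths are illustrative assumptions rather than values taken
# from this file.
if __name__ == "__main__":
    example_config = {
        "DEBUG": False,
        "DEBUG_DATASET_PROPORTION": 0.01,
        "BATCH_SIZE": 32,
        "LEARNING_RATE": 1e-3,
        "NUM_EPOCHS": 25,
        "SAVE_EVERY": 5,
        "LOG_TRAIN_METRICS_EVERY": 10,
        "EMBEDDING_DIMENSION": 50,
        "CHAR_EMBEDDING_DIMENSION": 25,
        "EMBEDDING_TYPE": "glove_6B_50",
        "MAX_NUM_WORDS": 10000,
        "MAX_LENGTH": 100,
        "DEVICE": "cpu",
        "HIDDEN_DIM": 256,
        "BIDIRECTIONAL": True,
        "COMBINE_STRATEGY": "concat",
        "MAX_CHAR_LENGTH": 25,
        "USE_CHAR_ENCODER": True,
        "CHAR_ENCODER_HIDDEN_DIM": 100,
        "DROPOUT": 0.5,
    }
    engine = setup_engine_once(
        config_dict=example_config,
        experiment_name="parscit_bi_lstm_crf_demo",  # assumed name
        train_data_filepath=pathlib.Path("data/parscit.train"),  # assumed path
        test_data_filepath=pathlib.Path("data/parscit.test"),  # assumed path
    )
    engine.run()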
def setup_parscit_inference(seq_dataset_manager, tmpdir_factory):
    # Train a bidirectional LSTM-CRF tagger over concatenated word and character
    # embeddings for one epoch, then wrap the best checkpoint in a
    # SequenceLabellingInference client.
    HIDDEN_DIM = 100
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"
    dataset_manager = seq_dataset_manager

    embedder = WordEmbedder(embedding_type="glove_6B_50")
    char_embedder = CharEmbedder(
        char_embedding_dimension=10,
        hidden_dimension=20,
        datasets_manager=dataset_manager,
    )
    embedder = ConcatEmbedders([embedder, char_embedder])

    encoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
        add_projection_layer=False,
    )
    tagger = RnnSeqCrfTagger(
        rnn2seqencoder=encoder,
        encoding_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIM,
        datasets_manager=dataset_manager,
    )

    train_metric = TokenClassificationAccuracy(datasets_manager=dataset_manager)
    dev_metric = TokenClassificationAccuracy(datasets_manager=dataset_manager)
    test_metric = TokenClassificationAccuracy(datasets_manager=dataset_manager)

    optimizer = torch.optim.Adam(params=tagger.parameters())
    batch_size = 1
    save_dir = tmpdir_factory.mktemp("experiment_1")
    num_epochs = 1
    save_every = 1
    log_train_metrics_every = 10

    engine = Engine(
        model=tagger,
        datasets_manager=dataset_manager,
        optimizer=optimizer,
        batch_size=batch_size,
        save_dir=save_dir,
        num_epochs=num_epochs,
        save_every=save_every,
        log_train_metrics_every=log_train_metrics_every,
        train_metric=train_metric,
        validation_metric=dev_metric,
        test_metric=test_metric,
        track_for_best="macro_fscore",
    )
    engine.run()

    # Load the best checkpoint written by the engine for inference.
    model_filepath = pathlib.Path(save_dir).joinpath("best_model.pt")
    inference_client = SequenceLabellingInference(
        model=tagger,
        model_filepath=model_filepath,
        datasets_manager=dataset_manager,
    )
    return inference_client
def setup_engine_test_with_simple_classifier(request, tmpdir_factory):
    # Build tiny SectLabel datasets in debug mode with random embeddings, a
    # bag-of-words SimpleClassifier, and an Engine whose best-model tracking
    # criterion comes from request.param; return the engine along with sample
    # tokens, labels, and the sizing options used.
    MAX_NUM_WORDS = 1000
    MAX_LENGTH = 50
    vocab_store_location = tmpdir_factory.mktemp("tempdir").join("vocab.json")
    DEBUG = True
    BATCH_SIZE = 1
    NUM_TOKENS = 3
    EMB_DIM = 300

    train_dataset = SectLabelDataset(
        filename=SECT_LABEL_FILE,
        dataset_type="train",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        word_vocab_store_location=vocab_store_location,
        debug=DEBUG,
        word_embedding_type="random",
        word_embedding_dimension=EMB_DIM,
    )
    validation_dataset = SectLabelDataset(
        filename=SECT_LABEL_FILE,
        dataset_type="valid",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        word_vocab_store_location=vocab_store_location,
        debug=DEBUG,
        word_embedding_type="random",
        word_embedding_dimension=EMB_DIM,
    )
    test_dataset = SectLabelDataset(
        filename=SECT_LABEL_FILE,
        dataset_type="test",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        word_vocab_store_location=vocab_store_location,
        debug=DEBUG,
        word_embedding_type="random",
        word_embedding_dimension=EMB_DIM,
    )

    VOCAB_SIZE = MAX_NUM_WORDS + len(train_dataset.word_vocab.special_vocab)
    NUM_CLASSES = train_dataset.get_num_classes()
    NUM_EPOCHS = 1

    embedding = Embedding.from_pretrained(torch.zeros([VOCAB_SIZE, EMB_DIM]))
    labels = torch.LongTensor([1])
    metric = PrecisionRecallFMeasure(
        idx2labelname_mapping=train_dataset.idx2classname
    )
    embedder = VanillaEmbedder(embedding_dim=EMB_DIM, embedding=embedding)
    encoder = BOW_Encoder(
        emb_dim=EMB_DIM, embedder=embedder, dropout_value=0, aggregation_type="sum"
    )
    tokens = np.random.randint(0, VOCAB_SIZE - 1, size=(BATCH_SIZE, NUM_TOKENS))
    tokens = torch.LongTensor(tokens)
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMB_DIM,
        num_classes=NUM_CLASSES,
        classification_layer_bias=False,
    )
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    engine = Engine(
        model,
        train_dataset,
        validation_dataset,
        test_dataset,
        optimizer=optimizer,
        batch_size=BATCH_SIZE,
        save_dir=tmpdir_factory.mktemp("model_save"),
        num_epochs=NUM_EPOCHS,
        save_every=1,
        log_train_metrics_every=10,
        metric=metric,
        track_for_best=request.param,
    )

    options = {
        "MAX_NUM_WORDS": MAX_NUM_WORDS,
        "MAX_LENGTH": MAX_LENGTH,
        "BATCH_SIZE": BATCH_SIZE,
        "NUM_TOKENS": NUM_TOKENS,
        "EMB_DIM": EMB_DIM,
        "VOCAB_SIZE": VOCAB_SIZE,
        "NUM_CLASSES": NUM_CLASSES,
        "NUM_EPOCHS": NUM_EPOCHS,
    }
    return engine, tokens, labels, options