def setup_sectlabel_bow_glove_infer(request, clf_datasets_manager, tmpdir_factory):
    track_for_best = request.param
    sample_proportion = 0.5
    datasets_manager = clf_datasets_manager

    word_embedder = WordEmbedder(embedding_type="glove_6B_50")
    bow_encoder = BOW_Encoder(embedder=word_embedder)
    classifier = SimpleClassifier(
        encoder=bow_encoder,
        encoding_dim=word_embedder.get_embedding_dimension(),
        num_classes=2,
        classification_layer_bias=True,
        datasets_manager=datasets_manager,
    )

    train_metric = PrecisionRecallFMeasure(datasets_manager=datasets_manager)
    validation_metric = PrecisionRecallFMeasure(datasets_manager=datasets_manager)
    test_metric = PrecisionRecallFMeasure(datasets_manager=datasets_manager)
    optimizer = torch.optim.Adam(params=classifier.parameters())

    batch_size = 1
    save_dir = tmpdir_factory.mktemp("experiment_1")
    num_epochs = 1
    save_every = 1
    log_train_metrics_every = 10

    engine = Engine(
        model=classifier,
        datasets_manager=datasets_manager,
        optimizer=optimizer,
        batch_size=batch_size,
        save_dir=save_dir,
        num_epochs=num_epochs,
        save_every=save_every,
        log_train_metrics_every=log_train_metrics_every,
        train_metric=train_metric,
        validation_metric=validation_metric,
        test_metric=test_metric,
        track_for_best=track_for_best,
        sample_proportion=sample_proportion,
    )
    engine.run()

    model_filepath = pathlib.Path(save_dir).joinpath("best_model.pt")
    infer = ClassificationInference(
        model=classifier,
        model_filepath=str(model_filepath),
        datasets_manager=datasets_manager,
    )
    return infer
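# A minimal consumption sketch (an assumption, not from the source): the
# fixture above reads `request.param` for `track_for_best`, so a test would
# supply that value via pytest's indirect parametrization, assuming the
# function is registered as a pytest fixture. The metric name "loss" is a
# placeholder.
@pytest.mark.parametrize(
    "setup_sectlabel_bow_glove_infer", ["loss"], indirect=True
)
def test_sectlabel_bow_glove_infer_builds(setup_sectlabel_bow_glove_infer):
    infer = setup_sectlabel_bow_glove_infer
    assert infer is not None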
def build_sectlabel_bow_elmo_model(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    DATA_PATH = pathlib.Path(DATA_DIR)
    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )

    embedder = BowElmoEmbedder(layer_aggregation="last")
    encoder = BOW_Encoder(aggregation_type="sum", embedder=embedder)
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=1024,
        num_classes=data_manager.num_labels["label"],
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )

    infer_client = ClassificationInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints", "best_model.pt")),
        datasets_manager=data_manager,
    )
    return infer_client
def setup_simple_classifier():
    BATCH_SIZE = 1
    NUM_TOKENS = 3
    EMB_DIM = 300
    VOCAB_SIZE = 10
    NUM_CLASSES = 3

    embedding = Embedding.from_pretrained(torch.zeros([VOCAB_SIZE, EMB_DIM]))
    embedder = VanillaEmbedder(embedding_dim=EMB_DIM, embedding=embedding)
    labels = torch.LongTensor([[1]])
    encoder = BOW_Encoder(
        emb_dim=EMB_DIM, embedder=embedder, dropout_value=0, aggregation_type="sum"
    )

    tokens = np.random.randint(0, VOCAB_SIZE - 1, size=(BATCH_SIZE, NUM_TOKENS))
    tokens = torch.LongTensor(tokens)

    simple_classifier = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMB_DIM,
        num_classes=NUM_CLASSES,
        classification_layer_bias=False,
    )
    iter_dict = {"tokens": tokens, "label": labels}
    return iter_dict, simple_classifier, BATCH_SIZE, NUM_CLASSES
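# Consumption sketch for the fixture above; only the shapes asserted here
# follow from the construction (tokens: (BATCH_SIZE, NUM_TOKENS), label:
# (1, 1)). How the classifier consumes iter_dict is library-specific and
# deliberately not shown.
iter_dict, classifier, batch_size, num_classes = setup_simple_classifier()
assert iter_dict["tokens"].shape == (batch_size, 3)  # NUM_TOKENS = 3
assert iter_dict["label"].shape == (1, 1)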
def _get_model(self) -> nn.Module:
    embedding_type = self.hparams.get("emb_type")
    word_embedder = WordEmbedder(embedding_type=embedding_type)
    elmo_embedder = ElmoEmbedder(datasets_manager=self.data_manager)
    embedder = ConcatEmbedders([word_embedder, elmo_embedder])

    hidden_dim = self.hparams.get("hidden_dim")
    combine_strategy = self.hparams.get("combine_strategy")
    bidirectional = self.hparams.get("bidirectional")
    encoder = LSTM2VecEncoder(
        embedder=embedder,
        hidden_dim=hidden_dim,
        combine_strategy=combine_strategy,
        bidirectional=bidirectional,
    )

    classifier_encoding_dim = 2 * hidden_dim if bidirectional else hidden_dim
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=classifier_encoding_dim,
        num_classes=3,
        classification_layer_bias=True,
        datasets_manager=self.data_manager,
    )
    return model
def build_model(self):
    word_embedder = WordEmbedder(
        embedding_type=self.hparams.get("embedding_type"),
        device=self.hparams.get("device"),
    )
    elmo_embedder = ElmoEmbedder(device=self.hparams.get("device"))
    embedder = ConcatEmbedders([word_embedder, elmo_embedder])

    encoder = LSTM2VecEncoder(
        embedder=embedder,
        hidden_dim=self.hparams.get("hidden_dim"),
        combine_strategy=self.hparams.get("combine_strategy"),
        bidirectional=self.hparams.get("bidirectional"),
        device=torch.device(self.hparams.get("device")),
    )

    classifier_encoding_dim = (
        2 * self.hparams.get("hidden_dim")
        if self.hparams.get("bidirectional")
        else self.hparams.get("hidden_dim")
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=classifier_encoding_dim,
        num_classes=self.hparams.get("num_classes"),
        classification_layer_bias=True,
        datasets_manager=self.data_manager,
        device=self.hparams.get("device"),
    )
    return model
def _get_model(self):
    elmo_embedder = BowElmoEmbedder(layer_aggregation="sum")

    # instantiate the vanilla embedder
    vanilla_embedder = WordEmbedder(embedding_type=self.hparams.get("emb_type"))

    # concat the embeddings
    embedder = ConcatEmbedders([vanilla_embedder, elmo_embedder])

    hidden_dim = self.hparams.get("hidden_dim")
    bidirectional = self.hparams.get("bidirectional")
    combine_strategy = self.hparams.get("combine_strategy")
    encoder = LSTM2VecEncoder(
        embedder=embedder,
        hidden_dim=hidden_dim,
        bidirectional=bidirectional,
        combine_strategy=combine_strategy,
    )

    encoding_dim = (
        2 * hidden_dim
        if bidirectional and combine_strategy == "concat"
        else hidden_dim
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=encoding_dim,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=self.data_manager,
    )
    return model
def build_sectlabel_elmobilstm_model(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    DATA_PATH = pathlib.Path(DATA_DIR)
    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )

    DEVICE = "cpu"
    EMBEDDING_TYPE = "glove_6B_50"
    HIDDEN_DIM = 512
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"

    elmo_embedder = BowElmoEmbedder(
        cuda_device_id=-1 if DEVICE == "cpu" else int(DEVICE.split("cuda:")[1])
    )
    vanilla_embedder = WordEmbedder(embedding_type=EMBEDDING_TYPE)
    embedders = ConcatEmbedders([vanilla_embedder, elmo_embedder])

    encoder = LSTM2VecEncoder(
        embedder=embedders,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        device=torch.device(DEVICE),
    )

    encoding_dim = (
        2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIM
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=encoding_dim,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )

    inference = ClassificationInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints", "best_model.pt")),
        datasets_manager=data_manager,
    )
    return inference
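# The encoding_dim arithmetic above recurs across these builders: a
# bidirectional LSTM with combine_strategy="concat" concatenates the forward
# and backward final states, doubling the vector handed to the classifier.
# A standalone helper capturing just that rule (illustrative, not part of
# the library):
def _classifier_encoding_dim(
    hidden_dim: int, bidirectional: bool, combine_strategy: str
) -> int:
    if bidirectional and combine_strategy == "concat":
        return 2 * hidden_dim
    return hidden_dim

assert _classifier_encoding_dim(512, True, "concat") == 1024  # HIDDEN_DIM above
assert _classifier_encoding_dim(512, False, "concat") == 512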
def build_model(self):
    embedder = WordEmbedder(embedding_type=self.hparams.get("emb_type"))
    encoder = BOW_Encoder(embedder=embedder)
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=self.hparams.get("encoding_dim"),
        num_classes=self.hparams.get("num_classes"),
        classification_layer_bias=True,
        datasets_manager=self.data_manager,
    )
    return model
def get_bilstm_lc_infer_parsect(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)

    EMBEDDING_DIM = config["EMBEDDING_DIMENSION"]
    HIDDEN_DIM = config["HIDDEN_DIMENSION"]
    COMBINE_STRATEGY = config["COMBINE_STRATEGY"]
    BIDIRECTIONAL = config["BIDIRECTIONAL"]
    VOCAB_SIZE = config["VOCAB_SIZE"]
    NUM_CLASSES = config["NUM_CLASSES"]
    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]
    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")

    classifier_encoding_dim = 2 * HIDDEN_DIM if BIDIRECTIONAL else HIDDEN_DIM

    embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
    embedder = VanillaEmbedder(embedding_dim=EMBEDDING_DIM, embedding=embedding)
    encoder = LSTM2VecEncoder(
        emb_dim=EMBEDDING_DIM,
        embedder=embedder,
        hidden_dim=HIDDEN_DIM,
        combine_strategy=COMBINE_STRATEGY,
        bidirectional=BIDIRECTIONAL,
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=classifier_encoding_dim,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    dataset = SectLabelDataset(**test_dataset_args)
    inference = ClassificationInference(
        model=model, model_filepath=model_filepath, dataset=dataset
    )
    return inference
def setup_simple_classifier(clf_dataset_manager):
    datasets_manager = clf_dataset_manager
    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder = BOW_Encoder(embedder=embedder)
    classifier = SimpleClassifier(
        encoder=encoder,
        encoding_dim=50,
        num_classes=2,
        datasets_manager=datasets_manager,
        classification_layer_bias=True,
    )
    train_dataset = datasets_manager.train_dataset
    lines, labels = train_dataset.get_lines_labels()
    return classifier, lines, labels
def build_model(self):
    embedder = BowElmoEmbedder(
        layer_aggregation=self.hparams.get("layer_aggregation")
    )
    encoder = BOW_Encoder(
        aggregation_type=self.hparams.get("word_aggregation"), embedder=embedder
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=self.hparams.get("encoding_dim"),
        num_classes=self.hparams.get("num_classes"),
        classification_layer_bias=True,
        datasets_manager=self.data_manager,
    )
    return model
def get_bow_bert_emb_lc_gensect_infer(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)

    EMBEDDING_DIM = config["EMBEDDING_DIMENSION"]
    NUM_CLASSES = config["NUM_CLASSES"]
    BERT_TYPE = config["BERT_TYPE"]
    DEVICE = config["DEVICE"]
    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]
    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")

    embedder = BertEmbedder(
        emb_dim=EMBEDDING_DIM,
        dropout_value=0.0,
        aggregation_type="average",
        bert_type=BERT_TYPE,
        device=torch.device(DEVICE),
    )
    encoder = BOW_Encoder(
        embedder=embedder, emb_dim=EMBEDDING_DIM, aggregation_type="average"
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMBEDDING_DIM,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    dataset = GenericSectDataset(**test_dataset_args)
    parsect_inference = ClassificationInference(
        model=model, model_filepath=model_filepath, dataset=dataset
    )
    return parsect_inference
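# For reference, a config.json this loader could consume; every key below is
# read by the function above, while the values are illustrative placeholders:
example_config = {
    "EMBEDDING_DIMENSION": 768,
    "NUM_CLASSES": 12,
    "BERT_TYPE": "bert-base-uncased",
    "DEVICE": "cpu",
    "MODEL_SAVE_DIR": "./model_save",
}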
def build_sectlabel_bilstm_model(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    DATA_PATH = pathlib.Path(DATA_DIR)
    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )

    HIDDEN_DIM = 512
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"
    classifier_encoding_dim = 2 * HIDDEN_DIM if BIDIRECTIONAL else HIDDEN_DIM

    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder = LSTM2VecEncoder(
        embedder=embedder,
        hidden_dim=HIDDEN_DIM,
        combine_strategy=COMBINE_STRATEGY,
        bidirectional=BIDIRECTIONAL,
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=classifier_encoding_dim,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )

    inference = ClassificationInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints", "best_model.pt")),
        datasets_manager=data_manager,
    )
    return inference
def get_bow_lc_parsect_infer(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)

    EMBEDDING_DIMENSION = config["EMBEDDING_DIMENSION"]
    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]
    VOCAB_SIZE = config["VOCAB_SIZE"]
    NUM_CLASSES = config["NUM_CLASSES"]
    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")

    embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIMENSION)
    embedder = VanillaEmbedder(
        embedding_dim=EMBEDDING_DIMENSION, embedding=embedding
    )
    encoder = BOW_Encoder(
        emb_dim=EMBEDDING_DIMENSION,
        embedder=embedder,
        dropout_value=0.0,
        aggregation_type="sum",
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMBEDDING_DIMENSION,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    dataset = SectLabelDataset(**test_dataset_args)
    dataset.print_stats()

    parsect_inference = ClassificationInference(
        model=model, model_filepath=model_filepath, dataset=dataset
    )
    return parsect_inference
def build_sectlabel_bow_model(dirname: str):
    """
    Parameters
    ----------
    dirname : str
        The directory where sciwing stores your outputs for the model

    Returns
    -------
    ClassificationInference
        An inference client loaded with the best model checkpoint
    """
    exp_dirpath = pathlib.Path(dirname)
    DATA_PATH = pathlib.Path(DATA_DIR)
    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )

    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder = BOW_Encoder(embedder=embedder)
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=embedder.get_embedding_dimension(),
        num_classes=data_manager.num_labels["label"],
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )

    infer = ClassificationInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints", "best_model.pt")),
        datasets_manager=data_manager,
    )
    return infer
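# Hypothetical call site for the builder above (the path is a placeholder);
# note that the experiment directory must contain checkpoints/best_model.pt,
# as the model_filepath construction shows.
infer = build_sectlabel_bow_model(dirname="/path/to/sectlabel_bow_experiment")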
def get_elmo_emb_lc_infer_gensect(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)

    # EMBEDDING_DIM and EMBEDDING_DIMENSION originally both read the same
    # config key; a single variable is kept here.
    EMBEDDING_DIMENSION = config["EMBEDDING_DIMENSION"]
    NUM_CLASSES = config["NUM_CLASSES"]
    LAYER_AGGREGATION = config["LAYER_AGGREGATION"]
    WORD_AGGREGATION = config["WORD_AGGREGATION"]

    embedder = BowElmoEmbedder(
        emb_dim=EMBEDDING_DIMENSION, layer_aggregation=LAYER_AGGREGATION
    )
    encoder = BOW_Encoder(
        emb_dim=EMBEDDING_DIMENSION,
        embedder=embedder,
        aggregation_type=WORD_AGGREGATION,
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMBEDDING_DIMENSION,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]
    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")

    dataset = GenericSectDataset(**test_dataset_args)
    parsect_inference = ClassificationInference(
        model=model, model_filepath=model_filepath, dataset=dataset
    )
    return parsect_inference
def build_model(self):
    embedder = WordEmbedder(embedding_type=self.hparams.get("embedding_type"))
    encoder = LSTM2VecEncoder(
        embedder=embedder,
        hidden_dim=self.hparams.get("hidden_dim"),
        combine_strategy=self.hparams.get("combine_strategy"),
        bidirectional=self.hparams.get("bidirectional"),
    )
    # Note: the encoding dim is doubled unconditionally, so this builder
    # assumes a bidirectional encoder whose directions are concatenated.
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=2 * self.hparams.get("hidden_dim"),
        num_classes=self.hparams.get("num_classes"),
        classification_layer_bias=True,
        datasets_manager=self.data_manager,
    )
    return model
def build_sectlabel_bow_bert(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    DATA_PATH = pathlib.Path(DATA_DIR)
    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )

    embedder = BertEmbedder(
        dropout_value=0.0,
        aggregation_type="average",
        bert_type="bert-base-uncased",
        device=torch.device("cpu"),
    )
    encoder = BOW_Encoder(embedder=embedder, aggregation_type="average")
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=768,  # hidden size of bert-base-uncased
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )

    parsect_inference = ClassificationInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints", "best_model.pt")),
        datasets_manager=data_manager,
    )
    return parsect_inference
encoder = LSTM2VecEncoder(
    emb_dim=EMBEDDING_DIMENSION + 1024,  # word embedding concatenated with 1024-dim ELMo
    embedder=embedder,
    hidden_dim=HIDDEN_DIMENSION,
    bidirectional=BIDIRECTIONAL,
    combine_strategy=COMBINE_STRATEGY,
    device=torch.device(DEVICE),
)
encoding_dim = (
    2 * HIDDEN_DIMENSION
    if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
    else HIDDEN_DIMENSION
)
model = SimpleClassifier(
    encoder=encoder,
    encoding_dim=encoding_dim,
    num_classes=NUM_CLASSES,
    classification_layer_bias=True,
)
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
metric = PrecisionRecallFMeasure(idx2labelname_mapping=train_dataset.idx2classname)
engine = Engine(
    model=model,
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
    test_dataset=test_dataset,
    optimizer=optimizer,
    batch_size=BATCH_SIZE,
    save_dir=MODEL_SAVE_DIR,
    dev_filename=str(dev_file),
    test_filename=str(test_file),
)

# BowElmoEmbedder embeds sentences using ELMo
embedder = BowElmoEmbedder(
    layer_aggregation=args.layer_aggregation, device=args.device
)
encoder = BOW_Encoder(
    embedder=embedder, aggregation_type=args.word_aggregation, device=args.device
)
model = SimpleClassifier(
    encoder=encoder,
    encoding_dim=1024,
    num_classes=data_manager.num_labels["label"],
    classification_layer_bias=True,
    datasets_manager=data_manager,
    device=args.device,
)

optimizer = optim.Adam(params=model.parameters(), lr=args.lr)
train_metric = PrecisionRecallFMeasure(datasets_manager=data_manager)
dev_metric = PrecisionRecallFMeasure(datasets_manager=data_manager)
test_metric = PrecisionRecallFMeasure(datasets_manager=data_manager)

engine = Engine(
    model=model,
    datasets_manager=data_manager,
    optimizer=optimizer,
    batch_size=args.bs,
    save_dir=args.model_save_dir,
def setup_engine_test_with_simple_classifier(request, tmpdir_factory):
    MAX_NUM_WORDS = 1000
    MAX_LENGTH = 50
    vocab_store_location = tmpdir_factory.mktemp("tempdir").join("vocab.json")
    DEBUG = True
    BATCH_SIZE = 1
    NUM_TOKENS = 3
    EMB_DIM = 300

    train_dataset = SectLabelDataset(
        filename=SECT_LABEL_FILE,
        dataset_type="train",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        word_vocab_store_location=vocab_store_location,
        debug=DEBUG,
        word_embedding_type="random",
        word_embedding_dimension=EMB_DIM,
    )
    validation_dataset = SectLabelDataset(
        filename=SECT_LABEL_FILE,
        dataset_type="valid",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        word_vocab_store_location=vocab_store_location,
        debug=DEBUG,
        word_embedding_type="random",
        word_embedding_dimension=EMB_DIM,
    )
    test_dataset = SectLabelDataset(
        filename=SECT_LABEL_FILE,
        dataset_type="test",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        word_vocab_store_location=vocab_store_location,
        debug=DEBUG,
        word_embedding_type="random",
        word_embedding_dimension=EMB_DIM,
    )

    VOCAB_SIZE = MAX_NUM_WORDS + len(train_dataset.word_vocab.special_vocab)
    NUM_CLASSES = train_dataset.get_num_classes()
    NUM_EPOCHS = 1

    embedding = Embedding.from_pretrained(torch.zeros([VOCAB_SIZE, EMB_DIM]))
    labels = torch.LongTensor([1])
    metric = PrecisionRecallFMeasure(idx2labelname_mapping=train_dataset.idx2classname)
    embedder = VanillaEmbedder(embedding_dim=EMB_DIM, embedding=embedding)
    encoder = BOW_Encoder(
        emb_dim=EMB_DIM, embedder=embedder, dropout_value=0, aggregation_type="sum"
    )
    tokens = np.random.randint(0, VOCAB_SIZE - 1, size=(BATCH_SIZE, NUM_TOKENS))
    tokens = torch.LongTensor(tokens)
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMB_DIM,
        num_classes=NUM_CLASSES,
        classification_layer_bias=False,
    )
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    engine = Engine(
        model,
        train_dataset,
        validation_dataset,
        test_dataset,
        optimizer=optimizer,
        batch_size=BATCH_SIZE,
        save_dir=tmpdir_factory.mktemp("model_save"),
        num_epochs=NUM_EPOCHS,
        save_every=1,
        log_train_metrics_every=10,
        metric=metric,
        track_for_best=request.param,
    )

    options = {
        "MAX_NUM_WORDS": MAX_NUM_WORDS,
        "MAX_LENGTH": MAX_LENGTH,
        "BATCH_SIZE": BATCH_SIZE,
        "NUM_TOKENS": NUM_TOKENS,
        "EMB_DIM": EMB_DIM,
        "VOCAB_SIZE": VOCAB_SIZE,
        "NUM_CLASSES": NUM_CLASSES,
        "NUM_EPOCHS": NUM_EPOCHS,
    }
    return engine, tokens, labels, options
def get_elmo_bilstm_lc_infer(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)

    DEVICE = config["DEVICE"]
    EMBEDDING_DIM = config["EMBEDDING_DIMENSION"]
    VOCAB_SIZE = config["VOCAB_SIZE"]
    HIDDEN_DIM = config["HIDDEN_DIMENSION"]
    BIDIRECTIONAL = config["BIDIRECTIONAL"]
    COMBINE_STRATEGY = config["COMBINE_STRATEGY"]
    NUM_CLASSES = config["NUM_CLASSES"]
    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]
    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")

    embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
    elmo_embedder = BowElmoEmbedder(
        layer_aggregation="sum",
        cuda_device_id=-1 if DEVICE == "cpu" else int(DEVICE.split("cuda:")[1]),
    )
    vanilla_embedder = VanillaEmbedder(embedding=embedding, embedding_dim=EMBEDDING_DIM)
    embedders = ConcatEmbedders([vanilla_embedder, elmo_embedder])

    encoder = LSTM2VecEncoder(
        emb_dim=EMBEDDING_DIM + 1024,  # word embedding concatenated with 1024-dim ELMo
        embedder=embedders,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        device=torch.device(DEVICE),
    )

    encoding_dim = (
        2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIM
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=encoding_dim,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    dataset = SectLabelDataset(**test_dataset_args)
    inference = ClassificationInference(
        model=model, model_filepath=model_filepath, dataset=dataset
    )
    return inference
dev_filename = DATA_DIR.joinpath("genericSect.dev")
test_filename = DATA_DIR.joinpath("genericSect.test")

data_manager = TextClassificationDatasetManager(
    train_filename=train_filename,
    dev_filename=dev_filename,
    test_filename=test_filename,
)

embedder = WordEmbedder(embedding_type=args.emb_type)
encoder = BOW_Encoder(embedder=embedder)
model = SimpleClassifier(
    encoder=encoder,
    encoding_dim=50,
    num_classes=12,
    classification_layer_bias=True,
    datasets_manager=data_manager,
)

optimizer = optim.Adam(params=model.parameters(), lr=args.lr)
train_metric = PrecisionRecallFMeasure(datasets_manager=data_manager)
dev_metric = PrecisionRecallFMeasure(datasets_manager=data_manager)
test_metric = PrecisionRecallFMeasure(datasets_manager=data_manager)

engine = Engine(
    datasets_manager=data_manager,
    model=model,
    optimizer=optimizer,
    batch_size=args.bs,
    save_dir=args.model_save_dir,